#DiTokeniser.rules#
#diana 28/6/00#
#update 9/7/00#
#Tokeniser rule file
#Each rule should be on one line
#Lines that end with "\" are appended with the next one. This facility
#is used for longer rules that cannot be written on a single line
#
#Lines starting with "#" are treated as comments
//Lines starting with "//" are treated as comments
# Empty lines are ignored.
#A rule has a left hand side (LHS) and a right hand side (RHS);
#the LHS is a regular expression that has to be matched on the input
#the RHS describes the annotations to be added to the AnnotationSet.
#LHS is separated from the RHS by '>'
#LHS knows about the following operators:
# + (1..n)
# * (0..n)
# | (boolean OR)
#
#RHS uses ';' as a separator and has the following format
#{LHS} > {Annotation type};{attribute1}={value1};...;{attribute n}={value n}
#The primitive constructs are:
#UNASSIGNED
#UPPERCASE_LETTER
#LOWERCASE_LETTER
#TITLECASE_LETTER
#MODIFIER_LETTER
#OTHER_LETTER
#NON_SPACING_MARK
#ENCLOSING_MARK
#COMBINING_SPACING_MARK
#DECIMAL_DIGIT_NUMBER
#LETTER_NUMBER
#OTHER_NUMBER
#SPACE_SEPARATOR
#LINE_SEPARATOR
#PARAGRAPH_SEPARATOR
#CONTROL
#FORMAT
#PRIVATE_USE
#SURROGATE
#DASH_PUNCTUATION
#START_PUNCTUATION
#END_PUNCTUATION
#CONNECTOR_PUNCTUATION
#OTHER_PUNCTUATION
#MATH_SYMBOL
#CURRENCY_SYMBOL
#MODIFIER_SYMBOL
#OTHER_SYMBOL
#...representing the corresponding enumerated Unicode category types
#------- The rules start here -----------------
#Western words#
// a word can be any combination of letters, including a dash,
// but excluding other symbols or punctuation, e.g.
// apostrophes
#
# Each rule below has the form  <pattern over Unicode categories> > <annotation>;<attr>=<value>;...
# Note that DASH_PUNCTUATION is only accepted inside lowercase and mixedCaps
# words; the upperInitial and allCaps rules match letters only.

// one uppercase letter followed by zero or more lowercase letters
"UPPERCASE_LETTER" "LOWERCASE_LETTER"* > Token;orth=upperInitial;kind=word;
// two or more uppercase letters
"UPPERCASE_LETTER" "UPPERCASE_LETTER"+ > Token;orth=allCaps;kind=word;
// a lowercase letter followed by any run of lowercase letters and dashes
"LOWERCASE_LETTER" (LOWERCASE_LETTER|DASH_PUNCTUATION)* > Token;orth=lowercase;kind=word;

// MixedCaps is any mixture of caps and small letters that doesn't
// fit in the preceding categories
// The four alternatives are, in order:
//   1. two or more capitals, then lowercase, then anything
//   2. one capital, lowercase, then a capital, then anything
//   3. two or more lowercase, then a capital, then anything
//   4. one or more lowercase, then a capital, then anything
// NOTE(review): alternative 3 looks like a special case of alternative 4;
// it is kept unchanged here - confirm before removing.
("UPPERCASE_LETTER" "UPPERCASE_LETTER"+ "LOWERCASE_LETTER"+ \
    (UPPERCASE_LETTER|LOWERCASE_LETTER|DASH_PUNCTUATION)*)|\
("UPPERCASE_LETTER""LOWERCASE_LETTER"+ "UPPERCASE_LETTER"+ \
    (UPPERCASE_LETTER|LOWERCASE_LETTER|DASH_PUNCTUATION)*)|\
("LOWERCASE_LETTER" "LOWERCASE_LETTER"+"UPPERCASE_LETTER"+ \
    (UPPERCASE_LETTER|LOWERCASE_LETTER|DASH_PUNCTUATION)*)|\
("LOWERCASE_LETTER" "LOWERCASE_LETTER"*"UPPERCASE_LETTER"+\
    (UPPERCASE_LETTER|LOWERCASE_LETTER|DASH_PUNCTUATION)*) \
> Token;orth=mixedCaps;kind=word;

#Bengali rules
// a Bengali word is any run of caseless letters together with the marks
// that attach to them.
// COMBINING_SPACING_MARK is required here: Bengali dependent vowel signs
// such as U+09BE, U+09BF, U+09C0, U+09C7, U+09C8, U+09CB and U+09CC belong
// to that category, and no other rule in this file consumes it, so without
// it a word would be broken at every such vowel sign.
// NOTE(review): MODIFIER_SYMBOL and OTHER_SYMBOL also appear in the symbols
// rule further down, so a run made only of those categories matches both
// rules; which one wins is not visible from this file - confirm against
// the tokeniser implementation.
("OTHER_LETTER" | "MODIFIER_LETTER" | "MODIFIER_SYMBOL" | "OTHER_SYMBOL" | "NON_SPACING_MARK" | \
"COMBINING_SPACING_MARK" | "ENCLOSING_MARK" | "UNASSIGNED")+ > Token;kind=word;

#numbers#
// a number is any combination of digits
"DECIMAL_DIGIT_NUMBER"+ >Token;kind=number;

#whitespace#
// runs of spaces and runs of control characters become SpaceToken
// annotations rather than Token annotations
(SPACE_SEPARATOR)+ >SpaceToken;kind=space;
(CONTROL)+ >SpaceToken;kind=control;

#symbols#
// a run of modifier/math/other symbols is one token; a currency symbol is
// always a single-character token with its own symbolkind attribute
(MODIFIER_SYMBOL|MATH_SYMBOL|OTHER_SYMBOL) + > Token;kind=symbol;
CURRENCY_SYMBOL > Token;kind=symbol;symbolkind=currency;

#punctuation#
// runs of dash/connector/other punctuation are grouped into one token;
// opening and closing punctuation are single-character tokens that record
// their position
(DASH_PUNCTUATION|CONNECTOR_PUNCTUATION|OTHER_PUNCTUATION)+ >Token;kind=punctuation;
"START_PUNCTUATION" >Token;kind=punctuation;position=startpunct;
"END_PUNCTUATION" >Token;kind=punctuation;position=endpunct;