Module gatenlp.lang.en.gatetokenizers

from gatenlp.processing.gazetteer.stringregex import StringRegexAnnotator

# Rules for the default English tokenizer in the StringRegexAnnotator rule
# format: pattern lines start with "|", the following "0 =>" action line
# creates a Token or SpaceToken annotation with the given features for the
# whole match, and "//" lines are comments.
default_tokenizer_rules = r"""
#words#
// a word can be any combination of letters, including hyphens,
// but excluding symbols and punctuation such as apostrophes.
// Note that there is an alternative version of the tokeniser that
// treats hyphens as separate tokens.


|(?:\p{Lu}(?:\p{Mn})*)(?:(?:\p{Ll}(?:\p{Mn})*)(?:(?:\p{Ll}(?:\p{Mn})*)|\p{Pd}|\p{Cf})*)*
0 =>  Token orth="upperInitial", kind="word", 

|(?:\p{Lu}(?:\p{Mn})*)(?:\p{Pd}|\p{Cf})*(?:(?:\p{Lu}(?:\p{Mn})*)|\p{Pd}|\p{Cf})+
0 =>  Token orth="allCaps", kind="word", 

|(?:\p{Ll}(?:\p{Mn})*)(?:(?:\p{Ll}(?:\p{Mn})*)|\p{Pd}|\p{Cf})*
0 =>  Token orth="lowercase", kind="word", 

// MixedCaps is any mixture of caps and small letters that doesn't
// fit in the preceding categories

|(?:(?:\p{Ll}(?:\p{Mn})*)(?:\p{Ll}(?:\p{Mn})*)+(?:\p{Lu}(?:\p{Mn})*)+(?:(?:\p{Lu}(?:\p{Mn})*)|(?:\p{Ll}(?:\p{Mn})*))*)|(?:(?:\p{Ll}(?:\p{Mn})*)(?:\p{Ll}(?:\p{Mn})*)*(?:\p{Lu}(?:\p{Mn})*)+(?:(?:\p{Lu}(?:\p{Mn})*)|(?:\p{Ll}(?:\p{Mn})*)|\p{Pd}|\p{Cf})*)|(?:(?:\p{Lu}(?:\p{Mn})*)(?:\p{Pd})*(?:\p{Lu}(?:\p{Mn})*)(?:(?:\p{Lu}(?:\p{Mn})*)|(?:\p{Ll}(?:\p{Mn})*)|\p{Pd}|\p{Cf})*(?:(?:\p{Ll}(?:\p{Mn})*))+(?:(?:\p{Lu}(?:\p{Mn})*)|(?:\p{Ll}(?:\p{Mn})*)|\p{Pd}|\p{Cf})*)|(?:(?:\p{Lu}(?:\p{Mn})*)(?:\p{Ll}(?:\p{Mn})*)+(?:(?:\p{Lu}(?:\p{Mn})*)+(?:\p{Ll}(?:\p{Mn})*)+)+)|(?:(?:(?:\p{Lu}(?:\p{Mn})*))+(?:(?:\p{Ll}(?:\p{Mn})*))+(?:(?:\p{Lu}(?:\p{Mn})*))+)
0 =>  Token orth="mixedCaps", kind="word", 

|(?:\p{Lo}|\p{Mc}|\p{Mn})+
0 => Token kind="word", type="other", 

#numbers#
// a number is any combination of digits
|\p{Nd}+
0 => Token kind="number", 

|\p{No}+
0 => Token kind="number", 

#whitespace#
|(?:\p{Zs}) 
0 => SpaceToken kind="space", 

|(?:\p{Cc}) 
0 => SpaceToken kind="control", 

#symbols#
|(?:\p{Sk}|\p{Sm}|\p{So}) 
0 =>  Token kind="symbol", 

|\p{Sc} 
0 =>  Token kind="symbol", symbolkind="currency", 

#punctuation#
|(?:\p{Pd}|\p{Cf}) 
0 => Token kind="punctuation", subkind="dashpunct", 

|(?:\p{Pc}|\p{Po})
0 => Token kind="punctuation", 

|(?:\p{Ps}|\p{Pi}) 
0 => Token kind="punctuation", position="startpunct", 

|(?:\p{Pe}|\p{Pf}) 
0 => Token kind="punctuation", position="endpunct", 
"""

# Pre-built default English tokenizer: a StringRegexAnnotator over the rules
# above, using the "regex" module so that Unicode property classes like
# \p{Lu} are available in the patterns.
default_tokenizer = StringRegexAnnotator(
    source=default_tokenizer_rules,
    source_fmt="string",
    select_rules="first",
    skip_longest=True,
    longest_only=True,
    regex_module="regex",
)
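
# Usage sketch (not part of the original module): a minimal example, assuming
# the usual gatenlp API, in which Annotator instances are callable on a
# Document and return the annotated document, with the token annotations
# written to the default annotation set. The sample text is illustrative only.
#
#   from gatenlp import Document
#
#   doc = Document("Barack Obama visited GATE-land on 3 May, 2010.")
#   doc = default_tokenizer(doc)
#   for ann in doc.annset():
#       print(ann.type, repr(doc.text[ann.start:ann.end]), dict(ann.features))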

# Rules for the alternative tokenizer mentioned in the comments above (hyphens
# treated as separate tokens); currently left empty.
alternate_tokenizer_rules = """
"""