# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2010 NLTK Project
# Author: Edward Loper
# URL:
# For license information, see LICENSE.TXT

"""
A regular-expression based word tokenizer that tokenizes sentences
using the conventions used by the Penn Treebank.
"""

import re
from api import *

######################################################################
#{ Regexp-based treebank tokenizer
######################################################################
# (n.b., this isn't derived from RegexpTokenizer)

class TreebankWordTokenizer(TokenizerI):
    """
    A word tokenizer that tokenizes sentences using the conventions
    used by the Penn Treebank.  Contractions, such as "can't", are
    split into two tokens.  E.g.:

      - can't S{->} ca n't
      - he'll S{->} he 'll
      - weren't S{->} were n't

    This tokenizer assumes that the text has already been segmented
    into sentences.  Any periods -- apart from those at the end of a
    string (or directly before a newline) -- are assumed to be part of
    the word they are attached to (e.g. for abbreviations, etc), and
    are not separately tokenized.
    """
    # List of contractions adapted from Robert MacIntyre's tokenizer.
    # Each pattern in CONTRACTIONS2 captures two groups, which
    # tokenize() separates with a single space.
    CONTRACTIONS2 = [
        # Clitics and negation: he'll -> he 'll, can't -> ca n't.
        # Group 1 is the single character preceding the clitic.
        re.compile(r"(?i)(.)('ll|'re|'ve|n't|'s|'m|'d)\b"),
        re.compile(r"(?i)\b(can)(not)\b"),
        re.compile(r"(?i)\b(D)('ye)\b"),
        re.compile(r"(?i)\b(Gim)(me)\b"),
        re.compile(r"(?i)\b(Gon)(na)\b"),
        re.compile(r"(?i)\b(Got)(ta)\b"),
        re.compile(r"(?i)\b(Lem)(me)\b"),
        # NOTE(review): this matches "mor'n", not "more'n" -- confirm
        # against MacIntyre's original script before changing.
        re.compile(r"(?i)\b(Mor)('n)\b"),
        re.compile(r"(?i)\b(T)(is)\b"),
        re.compile(r"(?i)\b(T)(was)\b"),
        re.compile(r"(?i)\b(Wan)(na)\b"),
    ]

    # Each pattern in CONTRACTIONS3 captures three groups, which
    # tokenize() separates with single spaces.
    CONTRACTIONS3 = [
        # Whaddya -> Wha dd ya.  (The first group used to be "Whad",
        # which required three d's and so never matched "Whaddya".)
        re.compile(r"(?i)\b(Wha)(dd)(ya)\b"),
        re.compile(r"(?i)\b(Wha)(t)(cha)\b"),
    ]

    # Punctuation patterns used by tokenize(); compiled once here
    # instead of being handed to re.sub as strings on every call.

    # Any character that is not a word character or one of . ' - / , &
    _PUNCTUATION = re.compile(r"([^\w\.\'\-\/,&])")
    # A comma followed by whitespace (so the comma in "2,500" is kept).
    _COMMA_SPACE = re.compile(r"(,\s)")
    # A single quote followed by whitespace.
    _QUOTE_SPACE = re.compile(r"('\s)")
    # A period, optionally followed by spaces, before a newline or the
    # end of the string.
    _FINAL_PERIOD = re.compile(r"\. *(\n|$)")

    def tokenize(self, text):
        """
        Split a sentence into Penn Treebank style word tokens.

        @param text: The sentence to tokenize.
        @type text: C{str}
        @return: The tokens of C{text}, in order.
        @rtype: C{list} of C{str}
        """
        # Split contractions by inserting spaces between their parts.
        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r'\1 \2', text)
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r'\1 \2 \3', text)

        # Separate most punctuation.
        text = self._PUNCTUATION.sub(r' \1 ', text)

        # Separate commas if they're followed by space.
        # (E.g., don't separate 2,500)
        text = self._COMMA_SPACE.sub(r' \1', text)

        # Separate single quotes if they're followed by a space.
        text = self._QUOTE_SPACE.sub(r' \1', text)

        # Separate periods that come before newline or end of string.
        text = self._FINAL_PERIOD.sub(' . ', text)

        # All token boundaries are now marked by whitespace.
        return text.split()