# Natural Language Toolkit: Simple Tokenizers
#
# Copyright (C) 2001-2010 NLTK Project
# Author: Edward Loper
#         Steven Bird
#         Trevor Cohn
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Tokenizers that divide strings into substrings using the string
C{split()} method.

These tokenizers follow the standard L{TokenizerI} interface, and so
can be used with any code that expects a tokenizer.  For example,
these tokenizers can be used to specify the tokenization conventions
when building a L{CorpusReader}.  But if you are tokenizing a string
yourself, consider using the string C{split()} method directly
instead.
"""

from api import *
from util import *

class SpaceTokenizer(StringTokenizer):
    r"""
    A tokenizer that divides a string into substrings by treating any
    single space character as a separator.  If you are performing the
    tokenization yourself (rather than building a tokenizer to pass to
    some other piece of code), consider using the string C{split()}
    method instead:

        >>> words = s.split(' ')
    """
    _string = ' '

class TabTokenizer(StringTokenizer):
    r"""
    A tokenizer that divides a string into substrings by treating any
    single tab character as a separator.  If you are performing the
    tokenization yourself (rather than building a tokenizer to pass to
    some other piece of code), consider using the string C{split()}
    method instead:

        >>> words = s.split('\t')
    """
    _string = '\t'

class CharTokenizer(StringTokenizer):
    r"""
    A tokenizer that produces individual characters.  If you are
    performing the tokenization yourself (rather than building a
    tokenizer to pass to some other piece of code), consider iterating
    over the characters of the string directly instead:

        for char in string
    """
    def tokenize(self, s):
        return list(s)

    def span_tokenize(self, s):
        # Character i of the string occupies the span (i, i+1).
        for i, j in enumerate(range(1, len(s) + 1)):
            yield i, j

class LineTokenizer(TokenizerI):
    r"""
    A tokenizer that divides a string into substrings by treating any
    single newline character as a separator.  Handling of blank lines
    may be controlled using a constructor parameter.
    """
    def __init__(self, blanklines='discard'):
        """
        @param blanklines: Indicates how blank lines should be handled.
            Valid values are:

              - C{'discard'}: strip blank lines out of the token list
                before returning it.  A line is considered blank if
                it contains only whitespace characters.
              - C{'keep'}: leave all blank lines in the token list.
              - C{'discard-eof'}: if the string ends with a newline,
                then do not generate a corresponding token C{''} after
                that newline.
        """
        valid_blanklines = ('discard', 'keep', 'discard-eof')
        if blanklines not in valid_blanklines:
            raise ValueError('Blank lines must be one of: %s'
                             % ' '.join(valid_blanklines))
        self._blanklines = blanklines

    def tokenize(self, s):
        lines = s.split('\n')
        # If requested, strip off blank lines.
        if self._blanklines == 'discard':
            lines = [l for l in lines if l.rstrip()]
        elif self._blanklines == 'discard-eof':
            if lines and not lines[-1].strip():
                lines.pop()
        return lines

    # XXX: the 'discard-eof' option is not implemented here; it is
    # treated the same as 'discard'.
    def span_tokenize(self, s):
        if self._blanklines == 'keep':
            # The separator is a literal newline, not a regexp.
            for span in string_span_tokenize(s, '\n'):
                yield span
        else:
            for span in regexp_span_tokenize(s, r'\n(\s+\n)*'):
                yield span

######################################################################
#{ Tokenization Functions
######################################################################

def line_tokenize(text, blanklines='discard'):
    """Divide C{text} into lines, using L{LineTokenizer}."""
    return LineTokenizer(blanklines).tokenize(text)
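
######################################################################
#{ Demonstration
######################################################################

# A minimal usage sketch, not part of the original module: it exercises
# only the classes and the function defined above on a small sample
# string.  It assumes this module's own imports (api, util) resolve as
# they do inside the NLTK source tree.  print() is called with a single
# argument so the sketch runs under both Python 2 and Python 3.
def demo():
    s = 'Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks.'
    print(SpaceTokenizer().tokenize(s))          # split on single spaces
    print(TabTokenizer().tokenize('a\tb\tc'))    # -> ['a', 'b', 'c']
    print(CharTokenizer().tokenize('abc'))       # -> ['a', 'b', 'c']
    print(list(CharTokenizer().span_tokenize('abc')))  # -> [(0, 1), (1, 2), (2, 3)]
    print(LineTokenizer(blanklines='discard').tokenize(s))  # blank line dropped
    print(LineTokenizer(blanklines='keep').tokenize(s))     # blank line kept as ''
    print(line_tokenize(s))                      # convenience wrapper

if __name__ == '__main__':
    demo()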