.. Copyright (C) 2001-2010 NLTK Project .. For license information, see LICENSE.TXT ========== Chunking ========== >>> from nltk.chunk import * >>> from nltk.chunk.util import * >>> from nltk.chunk.regexp import * >>> from nltk import Tree >>> tagged_text = "[ The/DT cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] [ the/DT dog/NN ] chewed/VBD ./." >>> gold_chunked_text = tagstr2tree(tagged_text) >>> unchunked_text = gold_chunked_text.flatten() Chunking uses a special regexp syntax for rules that delimit the chunks. These rules must be converted to 'regular' regular expressions before a sentence can be chunked. >>> tag_pattern = "

<DT>?<JJ>*<NN.*>" >>> regexp_pattern = tag_pattern2re_pattern(tag_pattern) >>> regexp_pattern '(<(DT)>)?(<(JJ)>)*(<(NN[^\\{\\}<>]*)>)' Construct some new chunking rules. >>> chunk_rule = ChunkRule("<.*>+", "Chunk everything") >>> chink_rule = ChinkRule("<VBD|IN|\.>", "Chink on verbs/prepositions") >>> split_rule = SplitRule("

<DT><NN>", "<DT><NN>
", ... "Split successive determiner/noun pairs") Create and score a series of chunk parsers, successively more complex. >>> chunk_parser = RegexpChunkParser([chunk_rule], chunk_node='NP') >>> chunked_text = chunk_parser.parse(unchunked_text) >>> print chunked_text (S (NP The/DT cat/NN sat/VBD on/IN the/DT mat/NN the/DT dog/NN chewed/VBD ./.)) >>> chunkscore = ChunkScore() >>> chunkscore.score(gold_chunked_text, chunked_text) >>> chunkscore.precision() 0.0 >>> chunkscore.recall() 0.0 >>> chunkscore.f_measure() 0 >>> for chunk in chunkscore.missed(): print chunk (NP The/DT cat/NN) (NP the/DT mat/NN) (NP the/DT dog/NN) >>> for chunk in chunkscore.incorrect(): print chunk (NP The/DT cat/NN sat/VBD on/IN the/DT mat/NN the/DT dog/NN chewed/VBD ./.) >>> chunk_parser = RegexpChunkParser([chunk_rule, chink_rule], ... chunk_node='NP') >>> chunked_text = chunk_parser.parse(unchunked_text) >>> print chunked_text (S (NP The/DT cat/NN) sat/VBD on/IN (NP the/DT mat/NN the/DT dog/NN) chewed/VBD ./.) >>> assert chunked_text == chunk_parser.parse(list(unchunked_text)) >>> chunkscore = ChunkScore() >>> chunkscore.score(gold_chunked_text, chunked_text) >>> chunkscore.precision() 0.5 >>> chunkscore.recall() 0.33333333333333331 >>> chunkscore.f_measure() 0.40000000000000002 >>> for chunk in chunkscore.missed(): print chunk (NP the/DT mat/NN) (NP the/DT dog/NN) >>> for chunk in chunkscore.incorrect(): print chunk (NP the/DT mat/NN the/DT dog/NN) >>> chunk_parser = RegexpChunkParser([chunk_rule, chink_rule, split_rule], ... chunk_node='NP') >>> chunked_text = chunk_parser.parse(unchunked_text, trace=True) # Input:

 <DT>  <NN>  <VBD>  <IN>  <DT>  <NN>  <DT>  <NN>  <VBD>  <.>
# Chunk everything:
{<DT>  <NN>  <VBD>  <IN>  <DT>  <NN>  <DT>  <NN>  <VBD>  <.>}
# Chink on verbs/prepositions:
{<DT>  <NN>} <VBD>  <IN> {<DT>  <NN>  <DT>  <NN>} <VBD>  <.>
# Split successive determiner/noun pairs:
{<DT>  <NN>} <VBD>  <IN> {<DT>  <NN>}{<DT>  <NN>} <VBD>  <.>
>>> print chunked_text (S (NP The/DT cat/NN) sat/VBD on/IN (NP the/DT mat/NN) (NP the/DT dog/NN) chewed/VBD ./.) >>> chunkscore = ChunkScore() >>> chunkscore.score(gold_chunked_text, chunked_text) >>> chunkscore.precision() 1.0 >>> chunkscore.recall() 1.0 >>> chunkscore.f_measure() 1.0 >>> chunkscore.missed() [] >>> chunkscore.incorrect() [] >>> chunk_parser.rules() # doctest: +NORMALIZE_WHITESPACE [<ChunkRule: '<.*>+'>, <ChinkRule: '<VBD|IN|\\.>'>, <SplitRule: '<DT><NN>', '<DT><NN>'>] Printing parsers: >>> print repr(chunk_parser) <RegexpChunkParser with 3 rules> >>> print chunk_parser RegexpChunkParser with 3 rules: Chunk everything <ChunkRule: '<.*>+'> Chink on verbs/prepositions <ChinkRule: '<VBD|IN|\\.>'> Split successive determiner/noun pairs <SplitRule: '<DT><NN>', '

<DT><NN>'> Regression Tests ~~~~~~~~~~~~~~~~ ChunkParserI ------------ `ChunkParserI` is an abstract interface -- it is not meant to be instantiated directly. >>> ChunkParserI().parse([]) Traceback (most recent call last): . . . AssertionError: ChunkParserI is an abstract interface ChunkString ----------- ChunkString can be built from a tree of tagged tuples, a tree of trees, or a mixed list of both: >>> t1 = Tree('S', [('w%d' % i, 't%d' % i) for i in range(10)]) >>> t2 = Tree('S', [Tree('t0', []), Tree('t1', ['c1'])]) >>> t3 = Tree('S', [('w0', 't0'), Tree('t1', ['c1'])]) >>> ChunkString(t1) <ChunkString: '<t0><t1><t2><t3><t4><t5><t6><t7><t8><t9>'> >>> ChunkString(t2) <ChunkString: '<t0><t1>'> >>> ChunkString(t3) <ChunkString: '<t0><t1>'> Other values generate an error: >>> ChunkString(Tree('S', ['x'])) Traceback (most recent call last): . . . ValueError: chunk structures must contain tagged tokens or trees The `str()` for a chunk string adds spaces to it, which makes it line up with `str()` output for other chunk strings over the same underlying input. >>> cs = ChunkString(t1) >>> print cs
 <t0>  <t1>  <t2>  <t3>  <t4>  <t5>  <t6>  <t7>  <t8>  <t9>
>>> cs.xform('<t3>', '{<t3>}') >>> print cs
 <t0>  <t1>  <t2> {<t3>} <t4>  <t5>  <t6>  <t7>  <t8>  <t9>
The `_verify()` method makes sure that our transforms don't corrupt the chunk string. By setting debug_level=2, `_verify()` will be called at the end of every call to `xform`. >>> cs = ChunkString(t1, debug_level=3) >>> # tag not marked with <...>: >>> cs.xform('<t3>', 't3') Traceback (most recent call last): . . . ValueError: Transformation generated invalid chunkstring: <t0><t1><t2>t3<t4><t5><t6><t7><t8><t9> >>> # brackets not balanced: >>> cs.xform('<t3>', '{<t3>') Traceback (most recent call last): . . . ValueError: Transformation generated invalid chunkstring: <t0><t1><t2>{<t3><t4><t5><t6><t7><t8><t9> >>> # nested brackets: >>> cs.xform('<t3><t4><t5>', '{<t3>{<t4>}<t5>}') Traceback (most recent call last): . . . ValueError: Transformation generated invalid chunkstring: <t0><t1><t2>{<t3>{<t4>}<t5>}<t6><t7><t8><t9> >>> # modified tags: >>> cs.xform('<t3>', '<t9>') Traceback (most recent call last): . . . ValueError: Transformation generated invalid chunkstring: tag changed >>> # added tags: >>> cs.xform('<t9>', '<t9><t10>') Traceback (most recent call last): . . .
ValueError: Transformation generated invalid chunkstring: tag changed Chunking Rules -------------- Test the different rule constructors & __repr__ methods: >>> r1 = RegexpChunkRule('<a|b>'+ChunkString.IN_CHINK_PATTERN, ... '{<a|b>}', 'chunk <a> and <b>') >>> r2 = RegexpChunkRule(re.compile('<a|b>'+ChunkString.IN_CHINK_PATTERN), ... '{<a|b>}', 'chunk <a> and <b>') >>> r3 = ChunkRule('<a|b>', 'chunk <a> and <b>') >>> r4 = ChinkRule('<a|b>', 'chink <a> and <b>') >>> r5 = UnChunkRule('<a|b>', 'unchunk <a> and <b>') >>> r6 = MergeRule('<a>', '<b>', 'merge <a> w/ <b>') >>> r7 = SplitRule('<a>', '<b>', 'split <a> from <b>') >>> r8 = ExpandLeftRule('<a>', '<b>', 'expand left <a> <b>') >>> r9 = ExpandRightRule('<a>', '<b>', 'expand right <a> <b>') >>> for rule in r1, r2, r3, r4, r5, r6, r7, r8, r9: ... print rule <RegexpChunkRule: '<a|b>(?=[^\\}]*(\\{|$))'->'{<a|b>}'> <RegexpChunkRule: '<a|b>(?=[^\\}]*(\\{|$))'->'{<a|b>}'> <ChunkRule: '<a|b>'> <ChinkRule: '<a|b>'> <UnChunkRule: '<a|b>'> <MergeRule: '<a>', '<b>'> <SplitRule: '<a>', '<b>'> <ExpandLeftRule: '<a>', '<b>'> <ExpandRightRule: '<a>', '<b>'> `tag_pattern2re_pattern()` complains if the tag pattern looks problematic: >>> tag_pattern2re_pattern('{}') Traceback (most recent call last): . . . ValueError: Bad tag pattern: '{}' RegexpChunkParser ----------------- A warning is printed when parsing an empty sentence: >>> parser = RegexpChunkParser([ChunkRule('<a>', '')]) >>> parser.parse(Tree('S', [])) Warning: parsing empty text Tree('S', []) RegexpParser ------------ >>> parser = RegexpParser(''' ... NP: {

<DT>? <JJ>* <NN>*} # NP ... P: {<IN>} # Preposition ... V: {<V.*>} # Verb ... PP: {<P> <NP>} # PP -> P NP ... VP: {<V> <NP|PP>*} # VP -> V (NP|PP)* ... ''') >>> print repr(parser) <chunk.RegexpParser with 5 stages> >>> print parser chunk.RegexpParser with 5 stages: RegexpChunkParser with 1 rules: NP <ChunkRule: '<DT>? <JJ>* <NN>*'> RegexpChunkParser with 1 rules: Preposition <ChunkRule: '<IN>'> RegexpChunkParser with 1 rules: Verb <ChunkRule: '<V.*>'> RegexpChunkParser with 1 rules: PP -> P NP <ChunkRule: '<P> <NP>'> RegexpChunkParser with 1 rules: VP -> V (NP|PP)* <ChunkRule: '<V> <NP|PP>*'> >>> print parser.parse(unchunked_text, trace=True) # Input:

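 <DT>  <NN>  <VBD>  <IN>  <DT>  <NN>  <DT>  <NN>  <VBD>  <.>
# NP:
{<DT>  <NN>} <VBD>  <IN> {<DT>  <NN>}{<DT>  <NN>} <VBD>  <.>
# Input:
 <NP>  <VBD>  <IN>  <NP>  <NP>  <VBD>  <.>
# Preposition:
 <NP>  <VBD> {<IN>} <NP>  <NP>  <VBD>  <.>
# Input:
 <NP>  <VBD>  <P>  <NP>  <NP>  <VBD>  <.>
# Verb:
 <NP> {<VBD>} <P>  <NP>  <NP> {<VBD>} <.>
# Input:
 <NP>  <V>  <P>  <NP>  <NP>  <V>  <.>
# PP -> P NP:
 <NP>  <V> {<P>  <NP>} <NP>  <V>  <.>
# Input:
 <NP>  <V>  <PP>  <NP>  <V>  <.>
# VP -> V (NP|PP)*:
 <NP> {<V>  <PP>  <NP>}{<V>} <.>
(S (NP The/DT cat/NN) (VP (V sat/VBD) (PP (P on/IN) (NP the/DT mat/NN)) (NP the/DT dog/NN)) (VP (V chewed/VBD)) ./.) Test parsing of other rule types: >>> print RegexpParser(''' ... X: ... }<a><b>{ # chink rule ... <a>}{<b> # split rule ... <a>{}<b> # merge rule ... <a>{<b>}<c> # chunk rule w/ context ... ''') chunk.RegexpParser with 1 stages: RegexpChunkParser with 4 rules: chink rule <ChinkRule: '<a><b>'> split rule <SplitRule: '<a>', '<b>'> merge rule <MergeRule: '<a>', '<b>'> chunk rule w/ context <ChunkRuleWithContext: '<a>', '<b>', '<c>'> Illegal patterns give an error message: >>> print RegexpParser('X: {<foo>} {<bar>}') Traceback (most recent call last): . . . ValueError: Illegal chunk pattern: {<foo>} {<bar>}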