.. Copyright (C) 2001-2010 NLTK Project
.. For license information, see LICENSE.TXT

===========
Probability
===========

    >>> import nltk
    >>> from nltk.probability import *

FreqDist
--------

    >>> text1 = ['no', 'good', 'fish', 'goes', 'anywhere', 'without', 'a', 'porpoise', '!']
    >>> text2 = ['no', 'good', 'porpoise', 'likes', 'to', 'fish', 'fish', 'anywhere', '.']

    >>> fd1 = nltk.FreqDist(text1)
    >>> fd1.items()
    [('!', 1), ('a', 1), ('anywhere', 1), ('fish', 1), ('goes', 1), ('good', 1), ('no', 1), ('porpoise', 1), ('without', 1)]
    >>> fd1 == nltk.FreqDist(text1)
    True

Note that items are sorted in order of decreasing frequency; items of the
same frequency are sorted alphabetically by their key.

    >>> both = nltk.FreqDist(text1 + text2)
    >>> both.items()
    [('fish', 3), ('anywhere', 2), ('good', 2), ('no', 2), ('porpoise', 2), ('!', 1), ('.', 1), ('a', 1), ('goes', 1), ('likes', 1), ('to', 1), ('without', 1)]
    >>> both == fd1 + nltk.FreqDist(text2)
    True
    >>> fd1 == nltk.FreqDist(text1) # But fd1 is unchanged
    True

    >>> fd2 = nltk.FreqDist(text2)
    >>> fd1.update(fd2)
    >>> fd1 == both
    True

    >>> fd1 = nltk.FreqDist(text1)
    >>> fd1.update(text2)
    >>> fd1 == both
    True

    >>> fd1 = nltk.FreqDist(text1)
    >>> fd2 = nltk.FreqDist(fd1)
    >>> fd2 == fd1
    True

Testing some HMM estimators
---------------------------

We extract a small part (500 sentences) of the Brown corpus:

    >>> corpus = nltk.corpus.brown.tagged_sents(categories='adventure')[:500]
    >>> print len(corpus)
    500

We create an HMM trainer. Note that we need the tags and symbols from the
whole corpus, not just from the training portion:

    >>> tag_set = list(set([tag for sent in corpus for (word,tag) in sent]))
    >>> print len(tag_set)
    92
    >>> symbols = list(set([word for sent in corpus for (word,tag) in sent]))
    >>> print len(symbols)
    1464
    >>> trainer = nltk.HiddenMarkovModelTrainer(tag_set, symbols)

We divide the corpus into 90% training and 10% testing:

    >>> train_corpus = []
    >>> test_corpus = []
    >>> for i in range(len(corpus)):
    ...     if i % 10:
    ...         train_corpus += [corpus[i]]
    ...     else:
    ...         test_corpus += [corpus[i]]
    >>> print len(train_corpus)
    450
    >>> print len(test_corpus)
    50

And now we can test the estimators:

    >>> def train_and_test(est):
    ...     hmm = trainer.train_supervised(train_corpus, estimator=est)
    ...     print '%.2f%%' % (100 * nltk.tag.accuracy(hmm, test_corpus))

Maximum Likelihood Estimation (this resulted in an initialization error
before r7209):

    >>> mle = lambda fd, bins: MLEProbDist(fd)
    >>> train_and_test(mle)
    14.43%

Laplace (= Lidstone with gamma==1):

    >>> train_and_test(LaplaceProbDist)
    66.04%

Expected Likelihood Estimation (= Lidstone with gamma==0.5):

    >>> train_and_test(ELEProbDist)
    73.01%

Lidstone Estimation, for gamma==0.1, 0.5 and 1.0 (the latter two should be
exactly equal to the ELE and Laplace results above):

    >>> def lidstone(gamma):
    ...     return lambda fd, bins: LidstoneProbDist(fd, gamma, bins)
    >>> train_and_test(lidstone(0.1))
    82.51%
    >>> train_and_test(lidstone(0.5))
    73.01%
    >>> train_and_test(lidstone(1.0))
    66.04%
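The agreement of the latter two results with the ELE and Laplace results is
no accident: both are special cases of the Lidstone estimate. As a reminder
of the formula, here is a minimal illustrative sketch (``lidstone_prob`` is
a hypothetical helper written for this example, not an NLTK API, and is not
executed as part of this doctest)::

    def lidstone_prob(count, total, gamma, bins):
        # Lidstone estimate: P(w) = (c(w) + gamma) / (N + gamma * B),
        # where c(w) is the sample's count, N the total number of
        # observations and B the number of bins.
        #   gamma == 0    -> maximum likelihood estimate (MLE)
        #   gamma == 0.5  -> expected likelihood estimate (ELE)
        #   gamma == 1    -> Laplace estimate
        return float(count + gamma) / (total + gamma * bins)

With gamma==0, unseen events receive zero probability, which helps explain
the poor MLE score above.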
Witten-Bell Estimation (this resulted in a ZeroDivisionError before r7209):

    >>> train_and_test(WittenBellProbDist)
    88.12%

Good-Turing Estimation. The accuracy is very poor; see issues #26 and #133:

    >>> gt = lambda fd, bins: GoodTuringProbDist(fd)
    >>> train_and_test(gt)
    0.34%

    >>> gt = lambda fd, bins: SimpleGoodTuringProbDist(fd)
    >>> train_and_test(gt)
    14.43%

Remains to be added:

- Tests for HeldoutProbDist, CrossValidationProbDist and MutableProbDist

Squashed bugs
-------------

Issue 511: override pop and popitem to invalidate the cache:

    >>> fd = nltk.FreqDist('a')
    >>> fd.keys()
    ['a']
    >>> fd.pop('a')
    1
    >>> fd.keys()
    []

Issue 533: access cumulative frequencies with no arguments:

    >>> fd = nltk.FreqDist('aab')
    >>> list(fd._cumulative_frequencies(['a']))
    [2.0]
    >>> list(fd._cumulative_frequencies())
    [2.0, 3.0]
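For reference, ``FreqDist._cumulative_frequencies`` yields a running total
of the counts of the given samples, defaulting to all of the distribution's
samples when called with no arguments. A rough pure-Python sketch of that
behavior (``cumulative_frequencies`` is an illustrative stand-in, not the
actual NLTK implementation)::

    def cumulative_frequencies(fd, samples=None):
        # Yield a running total of the counts of `samples`, defaulting
        # to all of the distribution's samples (most frequent first).
        if samples is None:
            samples = fd.keys()
        total = 0.0
        for sample in samples:
            total += fd[sample]
            yield total

For ``nltk.FreqDist('aab')`` the counts 2 and 1 accumulate to 2.0 and 3.0,
matching the output above.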