diff --git a/AUTHORS.md b/AUTHORS.md
index 6fe886fd20..93283a4a7d 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -209,6 +209,7 @@
 - Prasasto Adi
 - Safwan Kamarrudin
 - Arthur Tilley
+- Vilhjalmur Thorsteinsson
 
 ## Others whose work we've taken and included in NLTK, but who didn't directly contribute it:
 ### Contributors to the Porter Stemmer
diff --git a/nltk/probability.py b/nltk/probability.py
old mode 100644
new mode 100755
index 423eb7272e..bc5a2bd98f
--- a/nltk/probability.py
+++ b/nltk/probability.py
@@ -105,6 +105,9 @@ def __init__(self, samples=None):
         """
         Counter.__init__(self, samples)
 
+        # Cached number of samples in this FreqDist
+        self._N = None
+
     def N(self):
         """
         Return the total number of sample outcomes that have been
@@ -114,7 +117,38 @@ def N(self):
         :rtype: int
         """
-        return sum(self.values())
+        if self._N is None:
+            # Not already cached, or cache has been invalidated
+            self._N = sum(self.values())
+        return self._N
+
+    def __setitem__(self, key, val):
+        """
+        Override ``Counter.__setitem__()`` to invalidate the cached N
+        """
+        self._N = None
+        super(FreqDist, self).__setitem__(key, val)
+
+    def __delitem__(self, key):
+        """
+        Override ``Counter.__delitem__()`` to invalidate the cached N
+        """
+        self._N = None
+        super(FreqDist, self).__delitem__(key)
+
+    def update(self, *args, **kwargs):
+        """
+        Override ``Counter.update()`` to invalidate the cached N
+        """
+        self._N = None
+        super(FreqDist, self).update(*args, **kwargs)
+
+    def setdefault(self, key, val):
+        """
+        Override ``Counter.setdefault()`` to invalidate the cached N
+        """
+        self._N = None
+        super(FreqDist, self).setdefault(key, val)
 
     def B(self):
         """
@@ -192,9 +226,10 @@ def freq(self, sample):
         :type sample: any
         :rtype: float
         """
-        if self.N() == 0:
+        n = self.N()
+        if n == 0:
             return 0
-        return self[sample] / self.N()
+        return self[sample] / n
 
     def max(self):
         """
@@ -1749,6 +1784,7 @@ def __init__(self, cond_samples=None):
         :type cond_samples: Sequence of (condition, sample) tuples
         """
         defaultdict.__init__(self, FreqDist)
+
         if cond_samples:
             for (cond, sample) in cond_samples:
                 self[cond][sample] += 1
diff --git a/nltk/tag/tnt.py b/nltk/tag/tnt.py
index 0347b8df23..63db23a3a0 100755
--- a/nltk/tag/tnt.py
+++ b/nltk/tag/tnt.py
@@ -357,30 +357,24 @@ def _tagword(self, sent, current_states):
         # if word is known
         # compute the set of possible tags
         # and their associated log probabilities
-        if word in self._wd.conditions():
+        if word in self._wd:
             self.known += 1
 
             for (history, curr_sent_logprob) in current_states:
                 logprobs = []
 
                 for t in self._wd[word].keys():
-                    p_uni = self._uni.freq((t,C))
-                    p_bi = self._bi[history[-1]].freq((t,C))
-                    p_tri = self._tri[tuple(history[-2:])].freq((t,C))
-                    p_wd = self._wd[word][t] / self._uni[(t,C)]
+                    tC = (t,C)
+                    p_uni = self._uni.freq(tC)
+                    p_bi = self._bi[history[-1]].freq(tC)
+                    p_tri = self._tri[tuple(history[-2:])].freq(tC)
+                    p_wd = self._wd[word][t] / self._uni[tC]
                     p = self._l1 *p_uni + self._l2 *p_bi + self._l3 *p_tri
                     p2 = log(p, 2) + log(p_wd, 2)
 
-                    logprobs.append(((t,C), p2))
-
-
-                # compute the result of appending each tag to this history
-                for (tag, logprob) in logprobs:
-                    new_states.append((history + [tag],
-                                       curr_sent_logprob + logprob))
-
-
-
+                    # compute the result of appending each tag to this history
+                    new_states.append((history + [tC],
+                                       curr_sent_logprob + p2))
 
         # otherwise a new word, set of possible tags is unknown
         else:
@@ -398,7 +392,7 @@ def _tagword(self, sent, current_states):
                 tag = ('Unk',C)
 
             # otherwise apply the unknown word tagger
-            else :
+            else:
                 [(_w, t)] = list(self._unk.tag([word]))
                 tag = (t,C)
 
@@ -407,8 +401,6 @@
 
             new_states = current_states
 
-
-
        # now have computed a set of possible new_states
 
        # sort states by log prob
@@ -420,7 +412,6 @@
        if len(new_states) > self._N:
            new_states = new_states[:self._N]
 
-
        # compute the tags for the rest of the sentence
        # return the best list of tags for the sentence
        return self._tagword(sent, new_states)
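
# --- Illustrative sketch (not part of the patch): how the cached N() in the
# FreqDist changes above is expected to behave. N() sums the counts once,
# stores the result in _N, and the cache is cleared by __setitem__,
# __delitem__, update() and setdefault(). The expected totals below assume a
# plain FreqDist built from the characters of the sample strings.
from nltk.probability import FreqDist

fd = FreqDist("abracadabra")
assert fd.N() == 11   # first call sums the counts and caches the total

fd["z"] += 1          # Counter arithmetic goes through __setitem__, cache reset
assert fd.N() == 12

fd.update("aa")       # update() also invalidates the cached value
assert fd.N() == 14

del fd["z"]           # __delitem__ removes the sample and its single count
assert fd.N() == 13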