# Probibility.py

from __future__ import division
from collections import defaultdict
import operator 
import json
from collections import Counter
import re
from nltk.corpus import stopwords
import string
# remember to include the other import from the previous post
import math
#from CountingWords import *
#from Term_Cocurrence import *
#from countwords import *
# Each punctuation character from string.punctuation as its own list entry.
punctuation = list(string.punctuation)

# Tokens to ignore: NLTK's English stop words, followed by the punctuation
# characters, followed by the literal tokens 'rt' and 'via'.
stop = list(stopwords.words('english'))
stop.extend(punctuation)
stop.extend(['rt', 'via'])


# Total number of tweets in the corpus. The value is hard-coded, so it has to
# be kept in step with the data the counts below were computed from.
n_docs = 370

# p_t[term] is the count stored for `term` divided by n_docs.
# NOTE(review): count_stop_single is defined outside this section; presumably
# it maps each term to its frequency in the corpus -- confirm.
p_t = {word: count / n_docs for word, count in count_stop_single.items()}

# p_t_com[a][b] is the value stored in com[a][b] divided by n_docs. Both
# levels are defaultdicts, so a pair that was never stored reads back as 0.
# NOTE(review): com is defined outside this section; presumably it holds
# term co-occurrence counts -- confirm.
p_t_com = defaultdict(lambda: defaultdict(int))
for word in count_stop_single:
    neighbours = com[word]
    for other in neighbours:
        p_t_com[word][other] = neighbours[other] / n_docs


# Seed terms treated as positive when scoring a term's associations.
# Open question: should game-specific terms ('triumph', 'triumphal',
# 'triumphant', 'victory', etc.) be added as well?
positive_vocab = [
    'good',
    'nice',
    'great',
    'awesome',
    'outstanding',
    'fantastic',
    'terrific',
    ':)',
    ':-)',
    'like',
    'love',
]

# Seed terms treated as negative when scoring a term's associations.
# The same open question applies here ('defeat', etc.).
negative_vocab = [
    'bad',
    'terrible',
    'crap',
    'useless',
    'hate',
    ':(',
    ':-(',
]

# Pointwise mutual information, in bits, for every pair listed in com:
#     pmi[t1][t2] = log2( p_t_com[t1][t2] / (p_t[t1] * p_t[t2]) )
# Both levels are defaultdicts, so a pair without a stored score reads as 0.
pmi = defaultdict(lambda : defaultdict(int))
for t1 in p_t:
    for t2 in com[t1]:
        joint = p_t_com[t1][t2]
        # .get(): t2 can be listed in com without having an entry in p_t;
        # a plain p_t[t2] lookup would raise KeyError in that case.
        denom = p_t[t1] * p_t.get(t2, 0)
        # The ratio is undefined for a zero denominator and log2 is undefined
        # for a zero ratio (a zero can be stored merely because an earlier
        # lookup on the defaultdict inserted it). Skip such pairs so they
        # keep the default score of 0 instead of aborting the whole run.
        if joint <= 0 or denom <= 0:
            continue
        pmi[t1][t2] = math.log(joint / denom, 2)
 
# Semantic orientation of each term: the sum of its PMI scores against the
# positive seed terms minus the sum against the negative seed terms.
# Note that looking a seed term up in the nested defaultdict stores a 0 entry
# for it in pmi when no score exists yet.
semantic_orientation = {}
for term in p_t:
    term_pmi = pmi[term]
    positive_assoc = sum(term_pmi[tx] for tx in positive_vocab)
    negative_assoc = sum(term_pmi[tx] for tx in negative_vocab)
    semantic_orientation[term] = positive_assoc - negative_assoc

# Rank the (term, score) pairs from highest to lowest semantic orientation.
semantic_sorted = list(semantic_orientation.items())
semantic_sorted.sort(key=operator.itemgetter(1), reverse=True)

# First ten entries of the ranking (highest scores) and last ten (lowest).
top_pos = semantic_sorted[:10]
top_neg = semantic_sorted[-10:]

print(top_pos)
print(top_neg)
# Score of one specific hashtag; raises KeyError if '#Chelsea' was never scored.
print("Chealsea: %f" % semantic_orientation['#Chelsea'])