-
Notifications
You must be signed in to change notification settings - Fork 0
/
Probibility.py
62 lines (51 loc) · 1.77 KB
/
Probibility.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from __future__ import division
from collections import defaultdict
import operator
import json
from collections import Counter
import re
from nltk.corpus import stopwords
import string
# remember to include the other import from the previous post
import math
#from CountingWords import *
#from Term_Cocurrence import *
#from countwords import *
# Tokens to ignore: NLTK's English stop words, every character of
# string.punctuation, and the two literal tokens 'rt' and 'via'.
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via']

# Total number of tweets (documents); hard-coded for this data set.
n_docs = 370

# NOTE(review): count_stop_single and com are not defined in the visible part
# of this file -- presumably they come from the commented-out imports above;
# confirm before running.

# p_t[term]: the term's count divided by the number of documents.
p_t = {term: count / n_docs for term, count in count_stop_single.items()}

# p_t_com[t1][t2]: the (t1, t2) co-occurrence count divided by the number of
# documents. Pairs that are never stored read back as 0.
p_t_com = defaultdict(lambda: defaultdict(int))
for term in count_stop_single:
    row = com[term]
    for other in row:
        p_t_com[term][other] = row[other] / n_docs
# Seed words and emoticons counted as positive sentiment markers.
# Open question: should game-specific terms ('triumph', 'triumphal',
# 'triumphant', 'victory', etc.) be included as well?
positive_vocab = [
    'good',
    'nice',
    'great',
    'awesome',
    'outstanding',
    'fantastic',
    'terrific',
    ':)',
    ':-)',
    'like',
    'love',
]

# Seed words and emoticons counted as negative sentiment markers.
# Game-specific candidates ('defeat', etc.) are likewise not included.
negative_vocab = [
    'bad',
    'terrible',
    'crap',
    'useless',
    'hate',
    ':(',
    ':-(',
]
# Pointwise mutual information for every pair (t1, t2) listed in com[t1]:
#     PMI(t1, t2) = log2( P(t1, t2) / (P(t1) * P(t2)) )
# Pairs that are never stored read back as 0 through the nested defaultdict.
pmi = defaultdict(lambda: defaultdict(int))
for t1 in p_t:
    for t2 in com[t1]:
        # A partner term without its own probability has no defined PMI;
        # indexing p_t[t2] directly would raise KeyError.
        if t2 not in p_t:
            continue
        denom = p_t[t1] * p_t[t2]
        # Read with .get() so the lookup does not insert entries into p_t_com.
        joint = p_t_com.get(t1, {}).get(t2, 0)
        # A zero denominator would raise ZeroDivisionError and a zero joint
        # probability makes math.log raise ValueError; skip such pairs so
        # they keep the default PMI of 0 instead of aborting the whole run.
        if denom <= 0 or joint <= 0:
            continue
        pmi[t1][t2] = math.log(joint / denom, 2)
# Semantic orientation of a term: the sum of its PMI with every positive seed
# word minus the sum of its PMI with every negative seed word.
semantic_orientation = {}
for term in p_t:
    # Read through .get() rather than pmi[term][tx]: indexing the nested
    # defaultdict would insert a zero entry into pmi for every
    # (term, seed word) pair probed here. Missing pairs still count as 0.
    term_pmi = pmi.get(term, {})
    positive_assoc = sum(term_pmi.get(tx, 0) for tx in positive_vocab)
    negative_assoc = sum(term_pmi.get(tx, 0) for tx in negative_vocab)
    semantic_orientation[term] = positive_assoc - negative_assoc

# Rank the terms from the highest to the lowest orientation score.
semantic_sorted = sorted(semantic_orientation.items(),
                         key=operator.itemgetter(1),
                         reverse=True)
#print(semantic_sorted)
# With fewer than 20 scored terms these two slices overlap.
top_pos = semantic_sorted[:10]
top_neg = semantic_sorted[-10:]
print(top_pos)
print(top_neg)
# The hashtag may not have been scored at all; report 0 for it instead of
# raising KeyError at the very end of the run.
print("Chealsea: %f" % semantic_orientation.get('#Chelsea', 0))