# -*- coding: utf-8 -*-
"""
basic_sentiment_analysis
~~~~~~~~~~~~~~~~~~~~~~~~
This module contains the code and examples described in
http://fjavieralba.com/basic-sentiment-analysis-with-python.html
"""
from pprint import pprint

import nltk
import yaml
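
# Setup assumptions (not checked by this script): the NLTK models used below
# must be downloaded first, e.g.
#   python -c "import nltk; nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')"
# PyYAML must be installed, and the dicts/*.yml dictionaries must exist on disk.
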
class Splitter(object):
def __init__(self):
self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
"""
input format: a paragraph of text
output format: a list of lists of words.
e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
"""
sentences = self.nltk_splitter.tokenize(text)
tokenized_sentences = [self.nltk_tokenizer.tokenize(sent) for sent in sentences]
return tokenized_sentences
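
# A sketch of Splitter in use (exact tokens depend on the installed NLTK
# version; note that the tokenizer keeps punctuation as separate tokens):
#   Splitter().split("The food is nice. The room is not.")
#   -> [['The', 'food', 'is', 'nice', '.'], ['The', 'room', 'is', 'not', '.']]
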
class POSTagger(object):
    def __init__(self):
        pass

    def pos_tag(self, sentences):
"""
input format: list of lists of words
e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        output format: list of lists of tagged tokens. Each tagged token has a
        form, a lemma, and a list of tags (no real lemmatization is done here,
        so the form doubles as the lemma)
            e.g.: [[('this', 'this', ['DT']), ('is', 'is', ['VBZ']), ('a', 'a', ['DT']), ('sentence', 'sentence', ['NN'])],
                   [('this', 'this', ['DT']), ('is', 'is', ['VBZ']), ('another', 'another', ['DT']), ('one', 'one', ['CD'])]]
"""
pos = [nltk.pos_tag(sentence) for sentence in sentences]
        # adapt format to (form, lemma, [list of tags]); the form stands in
        # for the lemma since no lemmatizer is applied
        pos = [[(word, word, [postag]) for (word, postag) in sentence] for sentence in pos]
return pos


class DictionaryTagger(object):
def __init__(self, dictionary_paths):
        # yaml.safe_load avoids executing arbitrary YAML tags; the original
        # map(lambda x: x.close(), files) never ran under Python 3 (map is
        # lazy), so context managers close the files instead
        dictionaries = []
        for path in dictionary_paths:
            with open(path, 'r') as dict_file:
                dictionaries.append(yaml.safe_load(dict_file))
self.dictionary = {}
self.max_key_size = 0
for curr_dict in dictionaries:
for key in curr_dict:
if key in self.dictionary:
self.dictionary[key].extend(curr_dict[key])
else:
self.dictionary[key] = curr_dict[key]
                self.max_key_size = max(self.max_key_size, len(key.split()))  # longest key, measured in tokens
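
    # The dictionary files are plain YAML mappings from a word or multiword
    # expression to a list of tags. The exact files accompany the blog post;
    # as an assumption about their shape, dicts/positive.yml could look like:
    #   nice: [positive]
    #   awesome: [positive]
    # and dicts/inc.yml like:
    #   too: [inc]
    # Multiword keys such as 'lack of' are allowed; max_key_size records the
    # longest one so tag_sentence knows how far ahead to scan.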

    def tag(self, postagged_sentences):
        return [self.tag_sentence(sentence) for sentence in postagged_sentences]

    def tag_sentence(self, sentence, tag_with_lemmas=False):
"""
the result is only one tagging of all the possible ones.
The resulting tagging is determined by these two priority rules:
- longest matches have higher priority
- search is made from left to right
"""
tag_sentence = []
N = len(sentence)
if self.max_key_size == 0:
self.max_key_size = N
i = 0
        while i < N:
            j = min(i + self.max_key_size, N)  # don't look past the end of the sentence
            tagged = False
            while j > i:
expression_form = ' '.join([word[0] for word in sentence[i:j]]).lower()
expression_lemma = ' '.join([word[1] for word in sentence[i:j]]).lower()
if tag_with_lemmas:
literal = expression_lemma
else:
literal = expression_form
if literal in self.dictionary:
#self.logger.debug("found: %s" % literal)
is_single_token = j - i == 1
original_position = i
i = j
                    taggings = list(self.dictionary[literal])  # copy, so the dictionary entry itself is never mutated
tagged_expression = (expression_form, expression_lemma, taggings)
if is_single_token: #if the tagged literal is a single token, conserve its previous taggings:
original_token_tagging = sentence[original_position][2]
tagged_expression[2].extend(original_token_tagging)
tag_sentence.append(tagged_expression)
tagged = True
else:
j = j - 1
if not tagged:
tag_sentence.append(sentence[i])
i += 1
return tag_sentence
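
# A sketch of what tag_sentence does with a multiword match (hypothetical
# dictionary contents, for illustration): with 'lack of': [negative] in the
# dictionary, [('lack', 'lack', ['NN']), ('of', 'of', ['IN']), ('atmosphere', 'atmosphere', ['NN'])]
# collapses to [('lack of', 'lack of', ['negative']), ('atmosphere', 'atmosphere', ['NN'])],
# since longer matches beat shorter ones and scanning runs left to right.
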
def value_of(sentiment):
if sentiment == 'positive': return 1
if sentiment == 'negative': return -1
return 0


def sentence_score(sentence_tokens, previous_token, acum_score):
if not sentence_tokens:
return acum_score
else:
current_token = sentence_tokens[0]
tags = current_token[2]
token_score = sum([value_of(tag) for tag in tags])
        if previous_token is not None:
            previous_tags = previous_token[2]
            if 'inc' in previous_tags:    # intensifier (e.g. 'too') doubles the score
                token_score *= 2.0
            elif 'dec' in previous_tags:  # diminisher (e.g. 'barely') halves the score
                token_score /= 2.0
            elif 'inv' in previous_tags:  # inverter (e.g. 'not') flips the sign
                token_score *= -1.0
return sentence_score(sentence_tokens[1:], current_token, acum_score + token_score)
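
# A worked example (assuming a dictionary that tags 'not' as 'inv' and 'bad'
# as 'negative'): in "... is not bad", 'bad' contributes
# value_of('negative') = -1, but the preceding 'inv' tag flips it to +1.0,
# so the double negative scores as mildly positive.
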
def sentiment_score(review):
return sum([sentence_score(sentence, None, 0.0) for sentence in review])
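
# e.g. sentences scoring +1.0, 0.0 and -3.0 give an overall score of -2.0;
# the sign of the total is what matters (negative total = negative review).
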
if __name__ == "__main__":
text = """What can I say about this place. The staff of the restaurant is
nice and the eggplant is not bad. Apart from that, very uninspired food,
    lack of atmosphere and too expensive. I am a staunch vegetarian and was
    sorely disappointed with the veggie options on the menu. Will be the last
    time I visit, I recommend others to avoid."""
splitter = Splitter()
postagger = POSTagger()
dicttagger = DictionaryTagger([ 'dicts/positive.yml', 'dicts/negative.yml',
'dicts/inc.yml', 'dicts/dec.yml', 'dicts/inv.yml'])
splitted_sentences = splitter.split(text)
pprint(splitted_sentences)
pos_tagged_sentences = postagger.pos_tag(splitted_sentences)
pprint(pos_tagged_sentences)
dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)
pprint(dict_tagged_sentences)
print("analyzing sentiment...")
score = sentiment_score(dict_tagged_sentences)
print(score)