import re
import nltk
import names
import random
import markovify
from unidecode import unidecode
from fuzzywuzzy import process
from markovify.chain import BEGIN

DEBUG = False
STATESIZE = 2
DEFAULT_MAX_OVERLAP_RATIO = 0.7
DEFAULT_MAX_OVERLAP_TOTAL = 15
DEFAULT_TRIES = 40
# The two MIN_* values below are defined but not referenced in this module.
DEFAULT_MIN_OVERLAP_RATIO = 0.3
DEFAULT_MIN_OVERLAP_TOTAL = 15

class POSifiedText(markovify.Text):
    # NOTE: declared at class level, so these are shared across instances
    # until an instance reassigns them (e.g. via reset_text()).
    last_sentences = []
    last_words = []
    keyword_tags = []
    last_state = None

    def word_split(self, sentence):
        words = re.split(self.word_split_pattern, sentence)
        words = ["::".join(tag) for tag in nltk.pos_tag(words)]
        return words

    def word_join(self, words):
        sentence = " ".join(word.split("::")[0] for word in words)
        return sentence
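    # Tokens carry their part-of-speech tag inline as "word::TAG", e.g.
    # nltk.pos_tag(["dogs", "bark"]) yields pairs like ("dogs", "NNS"),
    # stored as "dogs::NNS"; word_join() strips the tag back off. Exact
    # tags depend on the tagger data installed.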

    def test_sentence_output(self, words, intext, max_overlap_ratio, max_overlap_total):
        """
        Given a generated list of words, accept or reject it. This one rejects
        sentences that too closely match the original text, namely those that
        contain any identical sequence of words of X length, where X is the
        smaller of (a) `max_overlap_ratio` (default: 0.7) of the total number
        of words and (b) `max_overlap_total` (default: 15).
        """
        # Reject large chunks of similarity
        overlap_ratio = int(round(max_overlap_ratio * len(words)))
        overlap_max = min(max_overlap_total, overlap_ratio)
        overlap_over = overlap_max + 1
        gram_count = max((len(words) - overlap_max), 1)
        grams = [words[i:i + overlap_over] for i in range(gram_count)]
        for g in grams:
            gram_joined = self.word_join(g)
            if gram_joined in intext:
                return False
        return True
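    # Worked example: for a 10-word candidate with the defaults above,
    # overlap_max = min(15, round(0.7 * 10)) = 7, so any 8-word run that
    # also appears verbatim in `intext` rejects the whole sentence.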

    def reset_text(self):
        print("Resetting last_words...")
        self.last_words = []
        self.keyword_tags = []
        random.seed()

    def tags(self, words):
        words = [w.split('::')[0].lower() for w in words
                 if w.split('::')[1] in ['NNP', 'NNPS'] and w.split('::')[0].isalpha()]
        return words

    def keywords(self, words):
        #skipits = ['DT', 'IN', 'CC', 'TO', 'PRP']
        nouns = ['NNP', 'NNPS', 'NN', 'NNS']
        #words = [ w for w in words if w.split('::')[1] not in skipits ]
        words = [w.split('::')[0].rstrip('s').lower() for w in words
                 if w.split('::')[1] in nouns]
        return words
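    # Caveat: rstrip('s') strips *every* trailing 's' as a crude
    # singularizer, so "dogs" -> "dog" but also "boss" -> "bo".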

    def generate_corpus(self, text):
        """
        Given a text string, returns a list of lists; that is, a list of
        "sentences," each of which is a list of words. Before splitting into
        words, the sentences are filtered through `self.test_sentence_input`.
        """
        sentences = self.sentence_split(text)
        cleaned = map(self.clean_sentence, sentences)
        passing = filter(self.test_sentence_input, cleaned)
        runs = map(self.word_split, passing)
        # In Python 3 this is a lazy iterator rather than a list; markovify
        # consumes it once when building the chain.
        return runs

    def clean_sentence(self, sentence):
        replace_pat = re.compile(r"( [\"\'])|([\"\'] )|(\.\.\.)|(Alex Jones Show)|(Engineered Lifestyles)|(\[.*\][ ]?-?[ ]?)|([\(\)])|(\n)")
        # Decode unicode, mainly to normalize fancy quotation marks
        if isinstance(sentence, str):
            decoded = sentence
        else:
            decoded = unidecode(sentence)
        cleaned = replace_pat.sub(' ', decoded)
        cleaned = cleaned.strip()
        return cleaned
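    # Example: clean_sentence('He said ... "hello" [ad] (really)') drops the
    # ellipsis, the stray quotes, the bracketed tag, and the parentheses,
    # leaving roughly 'He said hello really' (modulo extra internal spaces).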

    def test_sentence_input(self, sentence):
        """
        A basic sentence filter. This one rejects sentences that contain
        the type of punctuation that would look strange on its own
        in a randomly-generated sentence.
        """
        reject_pat = re.compile(r"(\.\.\.)|(Tumblr)|(http)|(#\D)|(Infowar)|(Infowars)|(Alex Jones Show)|(^')|('$)|\s'|'\s|[\"(\(\)\[\])]|(\n)|(^\s*$)")
        #reject_pat = re.compile(r"(\.\.\.)|(Tumblr)|(http)|(#\D)|(Alex Jones Show)|(^')|('$)|\s'|'\s|[\"(\(\)\[\])]|(\n)|(^\s*$)")
        #reject_pat = re.compile(r"(Tumblr)|(http)|(#\D)|(^')|('$)|\s'|'\s|[\"(\(\)\[\])]")
        # Decode unicode, mainly to normalize fancy quotation marks
        if isinstance(sentence, str):
            decoded = sentence
        else:
            decoded = unidecode(sentence)
        # Sentence shouldn't contain problematic characters
        if re.search(reject_pat, decoded):
            if DEBUG:
                print("REJECTING:", decoded)
            return False
        return True
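    # Example: test_sentence_input('He said "no"') returns False, because the
    # double quote falls inside the rejected character class above.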

    def seed(self, sentence):
        # Prime last_words with the nouns from an input sentence so that
        # make_sentence() favors output related to it.
        words = self.word_split(sentence)
        self.last_words = self.keywords(words)

    def make_sentence(self, init_state=None, **kwargs):
        """
        Attempts `tries` (default: 40) times to generate a valid sentence,
        based on the model and `test_sentence_output`. Passes `max_overlap_ratio`
        and `max_overlap_total` to `test_sentence_output`.

        If successful, returns the sentence as a string. If not, returns None.

        If `init_state` (a tuple of `self.chain.state_size` words) is not
        specified, this method chooses a sentence start at random, in
        accordance with the model.
        """
        tries = kwargs.get('tries', DEFAULT_TRIES)
        mor = kwargs.get('max_overlap_ratio', DEFAULT_MAX_OVERLAP_RATIO)
        mot = kwargs.get('max_overlap_total', DEFAULT_MAX_OVERLAP_TOTAL)
        maxlen = kwargs.get('maxlen', 9999)
        if not init_state and self.last_state:
            print("last state:", self.last_state)
            init_state = self.last_state
        #else:
        #    print("no last state")
        sentence = None
        words = []
        best_words = []
        best_sentence = ""
        best_ratio = -999
        ratio = -999
        sentences = []
        for _ in range(tries):
            if init_state is not None:
                if init_state[0] == BEGIN:
                    prefix = list(init_state[1:])
                else:
                    prefix = list(init_state)
            else:
                prefix = []
            words = prefix + self.chain.walk(init_state)
            # Make sure the sentence is not too similar to the original corpus
            if not self.test_sentence_output(words, self.rejoined_text, mor, mot):
                continue
            # Now make sure the sentence is not too similar to already-generated
            # text. (Note: last_words is a list, so the `in` check inside
            # test_sentence_output degrades to exact-element matching here.)
            if not self.test_sentence_output(words, self.last_words, mor, mot):
                continue
            sentence = self.word_join(words)
            if len(sentence) > maxlen:
                continue
            sentences.append(sentence)
        if not sentences:
            # Every try was rejected; return None as the docstring promises
            # instead of letting extractOne() choke on an empty list.
            return None
        if DEBUG:
            print("LAST WORDS:", self.last_words)
        joined = self.word_join(self.last_words)
        # Pick the candidate most similar to the accumulated keywords
        best_sentence, best_ratio = process.extractOne(joined, sentences)
        #if best_ratio:
        if DEBUG:
            print("BEST RATIO:", best_ratio)
        best_words = best_sentence.split()
        self.keyword_tags.extend(self.tags(self.word_split(best_sentence)))
        #best_sentence = self.word_join(best_words)
        #self.last_words.extend(self.keywords(best_words))
        self.last_words.extend(best_words)
        return best_sentence
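
# Minimal usage sketch (assumptions: "corpus.txt" is a hypothetical plain-text
# source; nltk's tagger data must be available, e.g. via
# nltk.download('averaged_perceptron_tagger')). Building the model runs
# nltk.pos_tag over every sentence, so large corpora are slow to load.
if __name__ == "__main__":
    with open("corpus.txt") as f:
        text = f.read()
    model = POSifiedText(text, state_size=STATESIZE)
    model.seed("Dogs bark at the mailman.")
    for _ in range(3):
        print(model.make_sentence(maxlen=140))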