-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwords.py
54 lines (36 loc) · 1.43 KB
/
words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import re
import cgi
from chain import Chain, BEGIN, END
class Words(object):
    """Generate made-up words from a letter-level chain built over a corpus.

    The corpus is split into words, each word into a list of characters, and
    those character lists are handed to ``Chain`` (imported from ``chain``).
    ``generate_word`` then joins the letters produced by ``chain.walk()``.
    """

    # Runs of characters to strip before splitting into words: anything that
    # is neither a word character nor whitespace, plus underscores (which
    # ``\w`` would otherwise keep).  The stdlib ``re`` module has no
    # ``\p{...}`` classes, so punctuation is expressed this way instead.
    _PUNCTUATION = re.compile(r"(?:[^\w\s]|_)+", re.UNICODE)

    def __init__(self, corpus, state_size=2, min_word_len=2, chain=None):
        """Store the parameters and build the chain unless one is supplied.

        Args:
            corpus: Text (unicode or UTF-8 encoded bytes) to learn from.
            state_size: Passed through to ``Chain`` as its second argument.
            min_word_len: Minimum number of letters in a generated word.
            chain: Optional pre-built chain; when truthy it is used as-is
                and ``corpus`` is not parsed.

        Raises:
            Exception: If no chain is given and ``corpus`` is empty.
        """
        # Maximum number of chain walks attempted per generated word.
        self.threshold = 100
        self.corpus = corpus
        self.state_size = state_size
        self.min_word_len = min_word_len
        self.chain = chain or Chain(self.split_corpus(corpus), state_size)

    def split_corpus(self, corpus):
        """Return the corpus as a list of words, each a list of characters.

        Punctuation is removed and the text is lowercased before it is split
        on whitespace.

        Args:
            corpus: Unicode text, or bytes holding UTF-8 encoded text.

        Returns:
            A list of character lists, one per word, in corpus order.

        Raises:
            Exception: If ``corpus`` is empty.
        """
        if not corpus:
            raise Exception("Corpus is empty")
        # Decode once, and only when needed: calling ``decode`` on text that
        # is already unicode fails (or does not exist at all on Python 3).
        if isinstance(corpus, bytes):
            corpus = corpus.decode('utf-8')
        corpus_list = []
        for line in corpus.split('\n'):
            cleaned = self._PUNCTUATION.sub("", line).lower()
            corpus_list.extend(list(word) for word in cleaned.split())
        return corpus_list

    def generate_word(self):
        """Walk the chain until a long enough word appears and return it.

        Returns:
            The generated word with ``&``, ``<`` and ``>`` HTML-escaped.

        Raises:
            Exception: If ``self.threshold`` walks all produced words shorter
                than ``self.min_word_len``.
        """
        word = []
        attempts = 0
        while len(word) < self.min_word_len:
            # Check the limit before walking so no walk is wasted on an
            # attempt whose result would be thrown away.
            if attempts >= self.threshold:
                raise Exception('Incorrect parameters. There is no way to generate words long enough with these parameters')
            attempts += 1
            word = []
            for letter in self.chain.walk():
                word.extend(letter)
        return self._escape_html(''.join(word))

    @staticmethod
    def _escape_html(text):
        """Escape ``&``, ``<`` and ``>`` in ``text`` (quotes are left alone)."""
        # ``cgi.escape`` is gone from modern Python; prefer ``html.escape``
        # and only fall back to ``cgi`` on interpreters without ``html``.
        try:
            from html import escape
        except ImportError:
            from cgi import escape
        return escape(text, quote=False)

    def add_word(self, word):
        """Parse ``word`` like a corpus and add the result to the chain.

        Args:
            word: Text containing one or more words to add.

        Returns:
            Whatever ``self.chain.add`` returns.

        Raises:
            Exception: If ``word`` is empty.
        """
        if not word:
            raise Exception("No word was given")
        corpus = self.split_corpus(word)
        return self.chain.add(corpus)