-
Notifications
You must be signed in to change notification settings - Fork 7
/
wiki_rock_train.py
68 lines (56 loc) · 1.85 KB
/
wiki_rock_train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# encoding: utf-8
from __future__ import unicode_literals
import codecs
import glob
import time
import stopword_filtering
import tokenization
import gensim
import dictionary_tokenization
class EnglishSentences(object):
    """Streaming iterable of tokenised sentences for gensim's Word2Vec.

    Each file matching ``globPattern`` is read, split into sentences, and
    every sentence is run through the supplied tokeniser before being
    yielded as one training example.
    """

    def __init__(self, globPattern, tknsr, debug=False):
        self.globPattern = globPattern  # glob matching the corpus files
        self.debug = debug              # when True, print progress every 100 files
        self.tknsr = tknsr              # callable: sentence text -> token list

    def __iter__(self):
        # enumerate(..., 1) replaces the original manual file counter
        for count, path in enumerate(glob.glob(self.globPattern), 1):
            document = get_text(path)
            for sentence in tokenization.segment_to_sentences(document):
                yield self.tknsr(sentence)
            if self.debug and count % 100 == 0:
                print(count)
class Corpus:
    """Streaming bag-of-words corpus over files matched by a glob pattern.

    Iterating yields ``dic.doc2bow(tokens)`` for each matched file, where
    ``tokens`` is the tokeniser's output for that file's full UTF-8 text.
    """

    def __init__(self, globPattern, dic, tokenisr, debug=False):
        self._glob_pattern = globPattern  # glob matching the corpus files
        self._dic = dic                   # object exposing doc2bow(tokens)
        self._debug = debug               # when True, print progress every 100 files
        self._tokeniser = tokenisr        # callable: document text -> token list

    def __iter__(self):
        # enumerate(..., 1) replaces the original manual counter variable
        for seen, path in enumerate(glob.glob(self._glob_pattern), 1):
            handle = codecs.open(path, encoding="utf-8", mode="r")
            contents = handle.read()
            handle.close()
            tokens = self._tokeniser(contents)
            if self._debug and seen % 100 == 0:
                print(seen)
            yield self._dic.doc2bow(tokens)
def get_text(fileName):
    """Read and return the entire contents of *fileName* decoded as UTF-8.

    Fix: the original opened the file and closed it only after ``read()``
    returned, leaking the handle if the read raised (e.g. on a decode
    error). A ``with`` block guarantees the handle is closed on all paths.
    ``codecs.open`` is kept (rather than builtin ``open``) for consistency
    with the rest of the file and Python 2 compatibility.
    """
    with codecs.open(fileName, mode='r', encoding='utf-8') as f:
        return f.read()
def train_and_save(multiword_dic_path, stopword_path, corpus_files_glob_pattern, mode_file_name = 'rock_music.w2v'):
    """Train a Word2Vec model on the corpus and save it to disk.

    The pipeline: multiword-aware tokenisation (driven by the dictionary
    file) -> stopword removal -> gensim Word2Vec with min_count=2.
    NOTE(review): the parameter name ``mode_file_name`` looks like a typo
    for ``model_file_name``, but it is part of the public signature and is
    kept unchanged for caller compatibility.
    """
    finder = dictionary_tokenization.DictionaryBasedMultiwordFinder(dictionary_path=multiword_dic_path)
    stop_filter = stopword_filtering.StopwordFilter(stopword_path)

    # Named inner function replaces the original lambda; same composition:
    # tokenise first, then drop stopwords.
    def tokenize(text):
        return stop_filter.filter(finder.tokenise(text))

    # debug=True matches the original's third positional argument.
    sentences = EnglishSentences(corpus_files_glob_pattern, tokenize, True)
    model = gensim.models.Word2Vec(sentences, min_count=2)
    model.save(mode_file_name)
'''
# Uncomment to train =>
train_and_save('data/wiki_rock_multiword_dic.txt', 'data/stop-words-english1.txt',
'<THE_LOCATION>/wiki_rock_corpus/*.txt')
'''