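"""Tokenize summary/article pairs with Stanza and build a vocabulary file.

Each CSV row holds a summary (column 0) and an article (column 1, or column 2
for wikihowAll.csv). Both are tokenized, token counts are accumulated in a
Counter, and the VOCAB_SIZE most common words are written to
./vocab/vocab-wiki as "word count" lines. The commented-out sections write
length-prefixed serialized tf.Example records (the format used by
pointer-generator-style summarizers) and split them into fixed-size chunks.
"""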
import collections
import csv
import os
import struct

import stanza
# example_pb2 is only needed by the commented-out tf.Example writer below.
from tensorflow.core.example import example_pb2

# Download the English models once; the pipeline itself is built in write().
stanza.download('en')

# Sentence boundary tags used by the pointer-generator data format.
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'
def process(summary, text, nlp):
    """Tokenize a summary/article pair and return them as space-joined strings."""
    s = nlp(summary)
    s_um = []
    for sentence in s.sentences:
        # s_um.append(SENTENCE_START)
        for token in sentence.tokens:
            s_um.append(token.text)
        # s_um.append(SENTENCE_END)
    t = nlp(text)
    t_ex = []
    for sentence in t.sentences:
        for token in sentence.tokens:
            t_ex.append(token.text)
    article = ' '.join(t_ex)
    abstract = ' '.join(s_um)
    return abstract, article
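
# Example call (hypothetical strings, not from any dataset):
#   nlp = stanza.Pipeline(lang='en', processors='tokenize')
#   abstract, article = process("a short summary.", "the full article text.", nlp)
# Tokens come back space-joined, with punctuation split off as its own token
# (e.g. the trailing period in the summary becomes a separate " .").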
def write(file, makevocab=False):
    """Tokenize every row of `file`; if makevocab, count tokens into a vocab file."""
    nlp = stanza.Pipeline(lang='en', processors='tokenize', use_gpu=True)
    if makevocab:
        vocab_counter = collections.Counter()
    with open(file, 'r') as f:
        reader = csv.reader(f)
        count = 0
        for row in reader:
            if count == 0:
                # Skip the CSV header row.
                count += 1
                continue
            # Get the text: wikihowAll.csv stores the article in column 2,
            # the other datasets in column 1; column 0 is always the summary.
            summary = row[0].lower()
            if file == 'wikihowAll.csv':
                if len(row) < 3:
                    continue  # skip malformed rows instead of reusing a stale `text`
                text = row[2].lower()
            else:
                if len(row) < 2:
                    continue
                text = row[1].lower()
            abstract, article = process(summary, text, nlp)
            count += 1
            # Disabled tf.Example writer; needs an open binary `writer` handle
            # (see the commented-out driver code below).
            """
            tf_example = example_pb2.Example()
            tf_example.features.feature['article'].bytes_list.value.extend([article.encode()])
            tf_example.features.feature['abstract'].bytes_list.value.extend([abstract.encode()])
            tf_example_str = tf_example.SerializeToString()
            str_len = len(tf_example_str)
            writer.write(struct.pack('q', str_len))
            writer.write(struct.pack('%ds' % str_len, tf_example_str))
            """
            if makevocab:
                print(count, flush=True)  # progress
                art_tokens = article.split(' ')
                abs_tokens = abstract.split(' ')
                abs_tokens = [t for t in abs_tokens if t not in [SENTENCE_START, SENTENCE_END]]  # keep these tags out of the vocab
                tokens = art_tokens + abs_tokens
                tokens = [t.strip() for t in tokens]  # strip whitespace
                tokens = [t for t in tokens if t != ""]  # drop empty tokens
                vocab_counter.update(tokens)
    if makevocab:
        os.makedirs(vocab_dir, exist_ok=True)
        with open(os.path.join(vocab_dir, "vocab-wiki"), 'w') as writer:
            for word, count in vocab_counter.most_common(VOCAB_SIZE):
                writer.write(word + ' ' + str(count) + '\n')
def chunk_file():
    """Split the length-prefixed records in `in_file` into CHUNK_SIZE-example .bin chunks."""
    reader = open(in_file, "rb")
    chunk = 0
    finished = False
    while not finished:
        chunk_fname = os.path.join(chunks_dir, '%s_%03d.bin' % ('train', chunk))  # new chunk
        with open(chunk_fname, 'wb') as writer:
            for _ in range(CHUNK_SIZE):
                len_bytes = reader.read(8)
                if not len_bytes:
                    finished = True
                    break
                str_len = struct.unpack('q', len_bytes)[0]
                example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
                writer.write(struct.pack('q', str_len))
                writer.write(struct.pack('%ds' % str_len, example_str))
        chunk += 1
    reader.close()
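
# A minimal sketch of reading one record back, assuming a train.bin produced
# by the commented-out tf.Example writer above: each record is an 8-byte
# native-endian length ('q') followed by that many bytes of serialized proto.
#
#   with open('./outdir_news/train.bin', 'rb') as f:
#       str_len = struct.unpack('q', f.read(8))[0]
#       example = example_pb2.Example.FromString(f.read(str_len))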
VOCAB_SIZE = 200000
CHUNK_SIZE = 1000  # number of examples per chunk, for the chunked data
vocab_dir = "./vocab"

# Disabled run over the news dataset:
"""
file = 'cl_news_summary_more.csv'
finish_dir = './outdir_news/'
chunks_dir = './outdirChunks_news/'
in_file = './outdir_news/train.bin'
if not os.path.exists(finish_dir):
    os.makedirs(finish_dir)
if not os.path.exists(chunks_dir):
    os.makedirs(chunks_dir)
write(file, True)
chunk_file()
"""

# Active run: build the vocabulary from the WikiHow dataset.
file = 'wikihowAll.csv'
"""
finish_dir = './data/wiki/main/'
chunks_dir = './data/wiki/chunks/'
if not os.path.exists(finish_dir):
    os.makedirs(finish_dir)
if not os.path.exists(chunks_dir):
    os.makedirs(chunks_dir)
"""
write(file, True)
#chunk_file()

# Disabled runs over the AMI meeting splits:
"""
file = 'AMItrain.csv'
finish_dir = './data/meeting/main/'
if not os.path.exists(finish_dir):
    os.makedirs(finish_dir)
write(file, True)

file = 'AMItest.csv'
finish_dir = './outdir_AMItest/'
if not os.path.exists(finish_dir):
    os.makedirs(finish_dir)
write(file, True)

file = 'AMIval.csv'
finish_dir = './outdir_AMIval/'
if not os.path.exists(finish_dir):
    os.makedirs(finish_dir)
write(file, True)
"""