preprocess.py
'''
Preprocess the original Web service datasets (the 'pw' and 'aws' corpora).
'''
import re
import os
import ast
import json
import random
import joblib
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords

DATA_ROOT = './data/'
DATASETS = ['pw', 'aws']
MIN_COUNT = 6  # a category is kept only if it contains at least this many services
stop_words = set(stopwords.words('english'))
english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
stop_words.update(english_punctuations)
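# Domain terms that must survive tokenization unchanged (the camel-case splitter
# below would otherwise break them apart).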
special_words = {'3d':['3d'],'iot':['iot'],'elearning':['elearning'],'it':['it'],'ecommerce':['ecommerce']}
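# Phrases (mostly category names) rewritten before tokenization so that slashes
# and acronym suffixes map to cleaner tokens.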
need_replace_words = {
    'Text/Captioning': 'Captioning',
    'Edit/Processing-Text': 'Processing-Text',
    'Edit/Processing-Image': 'Processing-Image',
    'Names Entity Recognition - NER': 'Names Entity Recognition',
    'Amazon SageMaker Ground Truth Services': 'Ground Truth',
    'Edit/Processing-Video': 'Processing-Video',
    'Text/OCR': 'OCR',
    'ELT/ETL': 'Extract-Transform-Load',
    'Continuous Integration and Continuous Delivery': 'Continuous Integration Delivery',
}
def _replacer(text):
    '''Expand common English contractions before tokenization.'''
    replacement_patterns = [
        (r'won\'t', 'will not'),
        (r'can\'t', 'cannot'),
        (r'i\'m', 'i am'),
        (r'ain\'t', 'is not'),
        (r'(\w+)\'ll', r'\g<1> will'),
        (r'(\w+)n\'t', r'\g<1> not'),
        (r'(\w+)\'ve', r'\g<1> have'),
        (r'(\w+)\'s', r'\g<1> is'),
        (r'(\w+)\'re', r'\g<1> are'),
        (r'(\w+)\'d', r'\g<1> would')]
    patterns = [(re.compile(regex), repl) for (regex, repl) in replacement_patterns]
    s = text
    for (pattern, repl) in patterns:
        (s, _) = re.subn(pattern, repl, s)
    return s
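# Illustrative behaviour:
#   _replacer("it can't handle what we're sending") -> 'it cannot handle what we are sending'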
def my_tokenize(seq):
    '''Tokenize a description or category name into lower-cased word tokens.'''
    # Drop URLs, expand contractions and normalise known category phrases.
    seq = re.sub(r'https?://\S+', '', seq)
    seq = _replacer(seq)
    for k, v in need_replace_words.items():
        seq = seq.replace(k, v)
    a = word_tokenize(seq)
    b = []
    for word in a:
        if word.lower() in special_words:
            b.extend(special_words[word.lower()])
        else:
            # Split on hyphens, then split each piece at camel-case boundaries,
            # e.g. 'RealTime' -> ['real', 'time'] while 'OCR' stays a single token.
            candidates = [x for x in word.split('-') if x.strip() != '']
            for candidate in candidates:
                pre = ''
                for ch in candidate:
                    if not ch.isalpha():
                        # A non-alphabetic character ends the current token.
                        if pre != '':
                            b.append(pre.lower())
                            pre = ''
                    elif ch.islower():
                        pre += ch
                    elif pre == '' or not pre[-1].islower():
                        # Start of a token, or continuation of an upper-case run.
                        pre += ch
                    else:
                        # lower -> UPPER transition starts a new token.
                        b.append(pre.lower())
                        pre = ch
                if pre != '':
                    b.append(pre.lower())
    # Remove stop words, punctuation tokens and pure numbers.
    b = [x for x in b if x not in stop_words and not x.isdigit()]
    return b
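# Illustrative behaviour (assuming the NLTK 'punkt' and 'stopwords' data are installed):
#   my_tokenize('RealTime 3D Mapping APIs') -> ['real', 'time', '3d', 'mapping', 'apis']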
for dataset in DATASETS:
    assert dataset in ['pw', 'aws']
    print('Processing dataset: %s' % dataset)
    train_indices = set()
    test_indices = set()
    res = []
    test_count = 0
    test_label_count = 0
    train_label_count = 0
    index2label = {}
    label_word_tokens = []
    iterators = None
    if dataset == 'pw':
        MAX_COUNT_TEST = 30
        MAX_COUNT_TRAIN = 100  # max service count per label, just to save running time
        apis = pd.read_csv(os.path.join(DATA_ROOT, 'pw_services.csv'))
        apis.fillna('', inplace=True)
        apis = apis.dropna(subset=['Description'])
        # Categories are stored as stringified Python lists in the CSV.
        apis['Categories'] = apis['Categories'].apply(ast.literal_eval)
        index2line = {}
        label_index = {}
        for index, line in enumerate(apis.itertuples()):
            index2line[index] = line
            for label in line.Categories:
                if label == '':
                    continue
                if label not in label_index:
                    label_index[label] = []
                label_index[label].append(index)
        # Keep only categories with a usable number of services.
        label_index = {k: v for k, v in label_index.items() if len(v) >= MIN_COUNT and len(v) <= MAX_COUNT_TRAIN}
        label_list = list(label_index.keys())
        # Small categories (at most MAX_COUNT_TEST services) are reserved for the test set.
        for v in label_index.values():
            if len(v) <= MAX_COUNT_TEST:
                test_indices.update(v)
        iterators = label_index
    else:
        MAX_COUNT_TEST = 150
        MAX_COUNT_TRAIN = 1000
        apis = json.load(open(os.path.join(DATA_ROOT, 'amazon_services.json'), 'r', encoding='utf8'))
        label_id = {}
        id_item = {}
        for item in apis:
            id_item[item['Id']] = item
            for label in item['Categories']:
                if label == '':
                    continue
                if label not in label_id:
                    label_id[label] = []
                label_id[label].append(item['Id'])
        label_id = {k: v for k, v in label_id.items() if len(v) >= MIN_COUNT and len(v) <= MAX_COUNT_TRAIN}
        label_list = list(label_id.keys())
        for v in label_id.values():
            if len(v) <= MAX_COUNT_TEST:
                test_indices.update(v)
        iterators = label_id
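    # Labels are processed from rarest to most frequent. Labels with at most
    # MAX_COUNT_TEST services become test labels; the remaining labels form the
    # training portion, with any service already reserved for the test set removed
    # so that the two splits do not share services.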
    print('Label count: {}'.format(len(label_list)))
    label = 0
    des2labels = {}
    for k, v in sorted(iterators.items(), key=lambda x: len(x[1])):
        label_tokens = my_tokenize(k)
        label_word_tokens.append(' '.join(label_tokens) + '--------' + k)
        index2label[label] = label_tokens
        if len(v) <= MAX_COUNT_TEST:
            test_count += len(v)
            test_label_count += 1
        else:
            train_label_count += 1
            # Drop services that already belong to a test label.
            v = [x for x in v if x not in test_indices]
            train_indices.update(v)
        if dataset == 'pw':
            for desc_index in v:
                line = index2line[desc_index]
                tmp = {}
                text = my_tokenize(line.Description)
                tmp['name'] = line.Name
                tmp['text'] = text
                tmp['label'] = label
                tmp['raw'] = line.Description
                tmp['index'] = desc_index
                if line.Description not in des2labels:
                    des2labels[line.Description] = set()
                des2labels[line.Description].add(label)
                if len(text) > 0:
                    res.append(tmp)
        else:
            for item_id in v:
                item = id_item[item_id]
                tmp = {}
                text = my_tokenize(item['ShortDescription'])
                tmp['name'] = item['Title']
                tmp['text'] = text
                tmp['label'] = label
                tmp['raw'] = item['ShortDescription']
                tmp['index'] = item_id
                if item['ShortDescription'] not in des2labels:
                    des2labels[item['ShortDescription']] = set()
                des2labels[item['ShortDescription']].add(label)
                if len(text) > 0:
                    res.append(tmp)
        label += 1
    # Attach the full multi-label set for each description (a description can
    # belong to several categories).
    for tmp in res:
        tmp['labels'] = list(des2labels[tmp['raw']])
    print('common service count between train and test set: %d' % len(train_indices & test_indices))
    print('test label count: {}'.format(test_label_count))
    print('train label count: {}'.format(train_label_count))
    # res was filled with the services of test labels first (labels are sorted by size),
    # so test_count splits it into the test and training portions.
    test_sub = res[:test_count]
    train_sub = res[test_count:]
    random.shuffle(test_sub)
    random.shuffle(train_sub)
    res = test_sub + train_sub
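    # Three artefacts are written per dataset: <dataset>.json with one JSON object
    # per service, <dataset>_index2label.pkl mapping label index -> label tokens,
    # and <dataset>_label_tokens.txt listing the tokenized label names.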
    with open(os.path.join(DATA_ROOT, '%s.json' % dataset), 'w') as fw:
        for line in res:
            fw.write(json.dumps(line) + '\n')
    joblib.dump(index2label, os.path.join(DATA_ROOT, '%s_index2label.pkl' % dataset))
    with open(os.path.join(DATA_ROOT, '%s_label_tokens.txt' % dataset), 'w') as fw:
        fw.write('\n'.join(label_word_tokens))
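# A minimal sketch of how the generated artefacts could be consumed downstream
# (hypothetical reader code, not part of this preprocessing script):
#
#   import json, joblib
#   samples = [json.loads(line) for line in open('./data/pw.json', encoding='utf8')]
#   index2label = joblib.load('./data/pw_index2label.pkl')
#   first = samples[0]
#   print(first['name'], first['labels'], index2label[first['label']])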