-
Notifications
You must be signed in to change notification settings - Fork 0
/
eva_nlp_wiki_training_data.py
60 lines (49 loc) · 1.78 KB
/
eva_nlp_wiki_training_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2017-01-15 02:04:57
# @Author : Tom Hu ([email protected])
# @Link : http://h1994st.com
# @Version : 1.0
import os
import math
# Input and output locations for the wiki training-data export.
# NOTE(review): 'channle' is kept verbatim; presumably it matches the
# directory name on disk -- confirm before "fixing" the spelling.
base_dir = '/Users/tomhu/Desktop/Experiment/covert channle/NLP/'
wiki_dir = os.path.join(base_dir, 'wikiSent')
wiki_word_count_dir = os.path.join(wiki_dir, 'wikiWordCount')
wiki_score_dir = os.path.join(wiki_dir, 'wikiSentScoreV2')
output_dir = os.path.join(base_dir, 'output')
wiki_out_file = os.path.join(output_dir, 'wiki_out.csv')
csv_header = 'NN,VB,ADJ,ADV,Total,Score,Class\n'

# A count file holds one "LABEL:VALUE" line per field, in this order:
# NN, VB, ADJ, ADV, TOTAL.
_NUM_COUNT_FIELDS = 5


def _read_score(score_file):
    """Return the float stored as the sole content of *score_file*.

    Raises:
        ValueError: the file is empty or does not contain a number.
    """
    with open(score_file, 'r') as fp:
        return float(fp.read().strip())


def _read_word_counts(count_file):
    """Return the (NN, VB, ADJ, ADV, TOTAL) integers found in *count_file*.

    The first five lines are read positionally, e.g.::

        NN   :6
        VB   :3
        ADJ  :1
        ADV  :0
        TOTAL:16

    Each value is taken from the text after the first ':' rather than from
    a fixed column, so the result does not depend on how the label is
    padded.

    Raises:
        ValueError: fewer than five lines, a line without ':', or a value
            that is not an integer.
    """
    with open(count_file, 'r') as fp:
        lines = fp.readlines()
    if len(lines) < _NUM_COUNT_FIELDS:
        raise ValueError('expected %d count lines, found %d'
                         % (_NUM_COUNT_FIELDS, len(lines)))

    counts = []
    for line in lines[:_NUM_COUNT_FIELDS]:
        _, sep, value = line.partition(':')
        if not sep:
            raise ValueError('missing ":" in count line %r' % line)
        # int() tolerates the surrounding whitespace and trailing newline.
        counts.append(int(value))
    return tuple(counts)


def write_wiki_training_data(score_dir, count_dir, out_file, label='normal'):
    """Join per-sentence score and word-count files into one CSV file.

    Every entry of *score_dir* names a sentence; the file of the same name
    in *count_dir* holds its word counts. One CSV row
    (``NN,VB,ADJ,ADV,Total,Score,Class``) is written per sentence that has
    a non-NaN score and a readable count file. Sentences are processed in
    sorted name order so the output is reproducible.

    Args:
        score_dir: directory with one score file per sentence.
        count_dir: directory with one word-count file per sentence.
        out_file: path of the CSV file to (over)write.
        label: value written to the ``Class`` column of every row.

    Returns:
        The number of data rows written (the header is not counted).
    """
    rows_written = 0
    with open(out_file, 'w') as out_fp:
        out_fp.write(csv_header)

        # os.listdir() returns entries in arbitrary order; sort for
        # deterministic output.
        for sentence_hash in sorted(os.listdir(score_dir)):
            if sentence_hash == '.DS_Store':
                continue

            score_file = os.path.join(score_dir, sentence_hash)
            if not os.path.isfile(score_file):
                continue

            try:
                score = _read_score(score_file)
            except ValueError:
                # Report and skip instead of aborting the whole export.
                print('%s score: unreadable' % sentence_hash)
                continue
            if math.isnan(score):
                print('%s score: NaN' % sentence_hash)
                continue

            count_file = os.path.join(count_dir, sentence_hash)
            if not os.path.isfile(count_file):
                continue

            try:
                counts = _read_word_counts(count_file)
            except ValueError as err:
                print('%s counts: %s' % (sentence_hash, err))
                continue

            line = '%d,%d,%d,%d,%d,%f,%s\n' % (counts + (score, label))

            # Single-argument print() produces the same text on Python 2
            # and Python 3.
            print('%s %s' % (sentence_hash, line.strip()))
            out_fp.write(line)
            rows_written += 1

    return rows_written


# Wiki
if __name__ == '__main__':
    write_wiki_training_data(
        wiki_score_dir, wiki_word_count_dir, wiki_out_file)