-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutils.py
98 lines (77 loc) · 2.56 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import accuracy_score,confusion_matrix,precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import itertools
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing,decomposition,linear_model,naive_bayes
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
def file_parser(in_file):
    """Read a tab-separated, labelled text file into a shuffled DataFrame.

    Every line is split on tab characters. The first field is used as the
    label and the last field as the text, so a line without any tab yields
    the same string for both columns.

    Args:
        in_file: Path of the file to read.

    Returns:
        The result of ``shuffle`` applied to a DataFrame with the columns
        ``text`` and ``label``, one row per input line.
    """
    text = []
    label = []
    with open(in_file) as file:
        for line in file:
            # Strip the line terminator before splitting so it cannot end up
            # inside the label when the line has no tab separator.
            fields = line.rstrip("\n").split("\t")
            label.append(fields[0])
            text.append(fields[-1])
    data = pd.DataFrame({'text': text, 'label': label})
    # fillna returns a new frame; the result has to be kept for the call to
    # have any effect.
    data = data.fillna('None')
    return shuffle(data)
def file_parser_test(in_file):
    """Read an unlabelled text file (one document per line) into a DataFrame.

    Args:
        in_file: Path of the file to read.

    Returns:
        The result of ``shuffle`` applied to a DataFrame with a single
        ``text`` column holding each line without its trailing newline.
    """
    with open(in_file) as file:
        text = [line.rstrip("\n") for line in file]
    data = pd.DataFrame({'text': text})
    # fillna returns a new frame; the result has to be kept for the call to
    # have any effect.
    data = data.fillna('None')
    # NOTE(review): shuffling changes the row order relative to the file;
    # callers that need the original order can sort on the preserved index
    # if shuffle keeps it - confirm against sklearn.utils.shuffle.
    return shuffle(data)
def feature_extractor(text, n_feat):
    """Return character n-gram TF-IDF features for a collection of documents.

    Args:
        text: Iterable of raw document strings.
        n_feat: Upper bound on the vocabulary size, passed to the vectorizer
            as ``max_features``.

    Returns:
        The document-term matrix produced by a ``TfidfVectorizer`` configured
        with ``analyzer='char'`` and ``ngram_range=(2, 3)``, fitted on and
        applied to *text*.
    """
    # Only character-level features are returned. The word-level vectorizer
    # that used to be fitted here was never used for the result, so it is no
    # longer built. ``token_pattern`` is likewise omitted because it only
    # applies when analyzer == 'word'.
    tfidf_char = TfidfVectorizer(
        analyzer='char',
        max_features=n_feat,
        ngram_range=(2, 3),
    )
    return tfidf_char.fit_transform(text)
def label_encoder(labels, inv=False, encoder=None):
    """Encode labels with a label encoder, or decode encoded values back.

    Args:
        labels: Labels to encode, or encoded values to decode when *inv* is
            true. Any sequence accepted by the encoder works (lists included).
        inv: When true, decode *labels* with ``encoder.inverse_transform``
            instead of encoding them.
        encoder: An already fitted encoder exposing ``transform`` and
            ``inverse_transform``. If omitted while encoding, a new
            ``LabelEncoder`` is fitted on the unique values of *labels*.

    Returns:
        ``(encoded, encoder)`` when encoding; only the decoded labels when
        *inv* is true.

    Raises:
        ValueError: If *inv* is true but no encoder was supplied.
    """
    if inv:
        if encoder is None:
            # Decoding is impossible without the mapping learnt at encode time.
            raise ValueError("an encoder is required to invert label encoding")
        return encoder.inverse_transform(labels)

    if encoder is None:
        encoder = sklearn.preprocessing.LabelEncoder()
        encoder.fit(np.unique(np.asarray(labels)))
    return encoder.transform(labels), encoder
def preprocess_features(features, n_feat):
    """Reduce *features* with truncated SVD and normalise the result.

    Args:
        features: Feature matrix accepted by ``TruncatedSVD``.
        n_feat: Number of SVD components to keep.

    Returns:
        The reduced matrix after ``preprocessing.normalize`` (called with its
        default settings).
    """
    svd = decomposition.TruncatedSVD(n_components=n_feat)
    reduced = svd.fit(features).transform(features)
    return preprocessing.normalize(reduced)
def balance_data(data_x, data_y):
    """Oversample the data with SMOTE and shuffle the result.

    Args:
        data_x: Feature matrix.
        data_y: Target labels aligned with *data_x*.

    Returns:
        The value of ``shuffle(x, y)`` for the resampled features and labels.
    """
    sm = SMOTE(random_state=1)
    # ``fit_resample`` is the current imbalanced-learn API; ``fit_sample`` is
    # the old name and no longer exists in recent releases. Fall back to it
    # only for old installations.
    resample = getattr(sm, "fit_resample", None)
    if resample is None:
        resample = sm.fit_sample
    x_data, y_data = resample(data_x, data_y)
    return shuffle(x_data, y_data)
def train_valid(data_x, data_y, test_size):
    """Split data into leading training rows and trailing validation rows.

    The split is positional (no shuffling): the first
    ``int(len(data_y) * (1 - test_size))`` rows form the training part and
    the rest the validation part.

    Args:
        data_x: Two-dimensional, row-sliceable feature matrix.
        data_y: Labels aligned with the rows of *data_x*.
        test_size: Fraction of the rows assigned to the validation part.

    Returns:
        ``(train_x, train_y, valid_x, valid_y)``.
    """
    n_samples = len(data_y)
    cut = int(n_samples * (1 - test_size))
    head = slice(None, cut)
    tail = slice(cut, None)
    return data_x[head, :], data_y[head], data_x[tail, :], data_y[tail]