# feature_extraction.py
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
def TFIDF(X_train, X_test, typ='word', n1=None, n2=None):
    '''
    inputs:
        X_train, X_test: lists of documents, where each document is a list of preprocessed tokens
        typ: 'word' for word n-grams or 'char' for character n-grams
        n1, n2: the n-gram range (defaults: 1-2 for words, 2-3 for characters)
    outputs:
        two 2D arrays with N rows and V cols,
        where N is the number of documents & V is the vocabulary size;
        features[0] holds the TF-IDF weight of every vocabulary term for the first document.
    '''
    if typ == 'word':
        if n1 is None or n2 is None:
            n1, n2 = 1, 2
        # the tokens are already separated, so split on whitespace and keep the original casing
        vectorizer = TfidfVectorizer(token_pattern=r'\S+', lowercase=False, ngram_range=(n1, n2))
    elif typ == 'char':
        if n1 is None or n2 is None:
            n1, n2 = 2, 3
        vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(n1, n2))
    else:
        raise ValueError("typ must be either 'word' or 'char'")
    # join the token lists back into plain strings, as the vectorizer expects raw text
    xtrain = [" ".join(doc) for doc in X_train]
    xtest = [" ".join(doc) for doc in X_test]
    train_features = vectorizer.fit_transform(xtrain)
    test_features = vectorizer.transform(xtest)
    return train_features.toarray(), test_features.toarray()
# print(TFIDF([['a','b','c'],['a','b','c','d']], [['a','b','c'],['a','b','c','d']], typ='word', n1=1, n2=2))
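# A hedged usage sketch of the character n-gram mode (hypothetical tokens; the number of
# columns depends on the character n-gram vocabulary learned from the training split):
# train_tf, test_tf = TFIDF([['hello', 'world'], ['good', 'morning']],
#                           [['hello', 'morning']], typ='char', n1=2, n2=3)
# train_tf.shape[0] == 2 and test_tf.shape[0] == 1, i.e. one row per document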
def BOW(train_documents, test_documents):
    '''
    inputs:
        train_documents, test_documents: lists of documents, where each document is a list of preprocessed tokens
    outputs:
        two 2D arrays with N rows and V cols,
        where N is the number of documents & V is the vocabulary size;
        features[0] holds the count of every vocabulary term in the first document.
    '''
    # CountVectorizer fills each cell with the frequency of a vocabulary term in a document;
    # passing the identity function as the analyzer keeps the documents as the
    # pre-tokenized lists they already are.
    vectorizer = CountVectorizer(analyzer=lambda x: x)
    train_features = vectorizer.fit_transform(train_documents)
    test_features = vectorizer.transform(test_documents)
    return train_features.toarray(), test_features.toarray()
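# A hedged usage sketch with hypothetical token lists; CountVectorizer sorts the vocabulary,
# so the columns below correspond to {'a': 0, 'b': 1, 'c': 2}:
# train_bow, test_bow = BOW([['a', 'b', 'a'], ['b', 'c']], [['a', 'c']])
# train_bow -> [[2, 1, 0], [0, 1, 1]], test_bow -> [[1, 0, 1]]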
def get_mean_vector(word2vec_model, words):
    # average the embeddings of a document's words, skipping out-of-vocabulary words
    words = [word for word in words if word in word2vec_model.wv.key_to_index]
    if len(words) >= 1:
        return np.mean(word2vec_model.wv[words], axis=0)
    else:
        return []
def CBOW(train_documents, test_documents):
    '''
    inputs:
        train_documents, test_documents: lists of documents, where each document is a list of preprocessed tokens
    outputs:
        two 2D arrays with N rows and V cols,
        where N is the number of documents & V is the embedding size;
        features[0] is the mean word vector of the first document.
    '''
    # Create CBOW (Continuous Bag of Words) models.
    # vector_size should be tuned according to the size of the vocabulary; 100 could be enough for the normal dataset.
    vectorizer_train = Word2Vec(train_documents, min_count=1, vector_size=300, window=5)
    vectorizer_test = Word2Vec(test_documents, min_count=1, vector_size=300, window=5)
    # average the word vectors of every sentence into one document vector
    vectors_cbow_train = []
    for sentence in train_documents:
        vec = get_mean_vector(vectorizer_train, sentence)
        if len(vec) > 0:
            vectors_cbow_train.append(vec)
    vectors_cbow_test = []
    for sentence in test_documents:
        vec = get_mean_vector(vectorizer_test, sentence)
        if len(vec) > 0:
            vectors_cbow_test.append(vec)
    return np.array(vectors_cbow_train), np.array(vectors_cbow_test)
# return vectorizer_train.wv[vectorizer_train.wv.index_to_key], vectorizer_test.wv[vectorizer_test.wv.index_to_key]
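# A hedged usage sketch: with the demo sentences defined at the bottom of this file,
# CBOW([doca, docb], [docc]) returns arrays of shape (2, 300) and (1, 300),
# i.e. one averaged sentence vector per document.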
def SG(train_documents, test_documents):
    '''
    inputs:
        train_documents, test_documents: lists of documents, where each document is a list of preprocessed tokens
    outputs:
        two 2D arrays with N rows and V cols,
        where N is the number of documents & V is the embedding size;
        features[0] is the mean word vector of the first document.
    '''
    # create skip-gram models (sg=1)
    # vector_size should be tuned according to the size of the vocabulary; 100 could be enough for the normal dataset.
    vectorizer_train = Word2Vec(train_documents, min_count=1, vector_size=300, window=5, sg=1)
    vectorizer_test = Word2Vec(test_documents, min_count=1, vector_size=300, window=5, sg=1)
    # average the word vectors of every sentence into one document vector
    vectors_sg_train = []
    for sentence in train_documents:
        vec = get_mean_vector(vectorizer_train, sentence)
        if len(vec) > 0:
            vectors_sg_train.append(vec)
    vectors_sg_test = []
    for sentence in test_documents:
        vec = get_mean_vector(vectorizer_test, sentence)
        if len(vec) > 0:
            vectors_sg_test.append(vec)
    return np.array(vectors_sg_train), np.array(vectors_sg_test)
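# A hedged inspection sketch (most_similar is gensim's own API; this would have to run after
# the demo sentences at the bottom of this file are defined, and the query token is just an example):
# sg_model = Word2Vec([doca, docb], min_count=1, vector_size=300, window=5, sg=1)
# print(sg_model.wv.most_similar('انا', topn=3))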
def SGLstm(train_documents, test_documents, max_len):
    '''
    inputs:
        train_documents, test_documents: lists of documents, where each document is a list of preprocessed tokens
        max_len: the target sequence length for the LSTM (padding itself is left to the caller,
                 see the commented demo at the bottom of this file)
    outputs:
        two lists of documents, where each document is a list of 300-dimensional word vectors,
        so features[0][0] is the vector of the first word of the first document.
    '''
    # create skip-gram models (sg=1)
    # vector_size should be tuned according to the size of the vocabulary; 100 could be enough for the normal dataset.
    vectorizer_train = Word2Vec(train_documents, min_count=1, vector_size=300, window=5, sg=1)
    vectorizer_test = Word2Vec(test_documents, min_count=1, vector_size=300, window=5, sg=1)
    # collect the vector of every in-vocabulary word, sentence by sentence
    vectors_sg_train = []
    for sentence in train_documents:
        vectors_sg_train_sent = []
        for word in sentence:
            if word in vectorizer_train.wv.key_to_index:
                vectors_sg_train_sent.append(vectorizer_train.wv[word])
        vectors_sg_train.append(vectors_sg_train_sent)
    vectors_sg_test = []
    for sentence in test_documents:
        vectors_sg_test_sent = []
        for word in sentence:
            if word in vectorizer_test.wv.key_to_index:
                vectors_sg_test_sent.append(vectorizer_test.wv[word])
        vectors_sg_test.append(vectors_sg_test_sent)
    return vectors_sg_train, vectors_sg_test
# return np.array(vectors_sg_train), np.array(vectors_sg_test)
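# A hedged padding sketch (pad_word_vectors is a hypothetical helper, not part of the original
# pipeline): it zero-pads / truncates the variable-length sentences returned by SGLstm to
# max_len rows of 300-dim vectors, mirroring the commented demo at the bottom of this file.
# def pad_word_vectors(sequences, max_len, dim=300):
#     padded = np.zeros((len(sequences), max_len, dim))
#     for i, sentence in enumerate(sequences):
#         for j, vec in enumerate(sentence[:max_len]):
#             padded[i, j] = vec
#     return padded
# train_seqs, test_seqs = SGLstm(train_documents, test_documents, max_len)  # placeholder inputs
# Train_X_sg = pad_word_vectors(train_seqs, max_len)
# Test_X_sg = pad_word_vectors(test_seqs, max_len)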
# def applyPCA(X, n_components=100):
#     X_copy = X.copy()
#     pca = PCA(n_components=n_components)
#     X_copy = pca.fit_transform(X_copy)
#     return X_copy
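# A hedged sketch of how the disabled PCA step above could be wired in without leaking test
# information: fit on the training features only, then reuse the fitted model on the test split
# (train_tfidf / test_tfidf are placeholder names for the dense arrays returned by TFIDF).
# pca = PCA(n_components=100)
# train_reduced = pca.fit_transform(train_tfidf)
# test_reduced = pca.transform(test_tfidf)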
# Just a demo test
doca = 'انا طالبه في هندسة'.split()  # "I am a student in engineering"
docb = 'انا سعيدة جدا'.split()  # "I am very happy"
docc = 'انا هاله'.split()  # "I am Hala"
# print(SG([doca, docb], [docc]))
# print(CBOW([doca, docb], [docc]))
# print(TFIDF([doca, docb], [docc]))
# print(BOW([doca, docb], [docc]))
# SGLstm padding demo
# Train_X_sg = np.zeros((len(train_documents), max_len, 300))
# for i in range(len(train_documents)):
#     for j in range(len(train_documents[i])):
#         if train_documents[i][j] in vectorizer_train.wv.key_to_index:
#             Train_X_sg[i][j] = vectorizer_train.wv[train_documents[i][j]]
#         else:
#             print(train_documents[i][j])
# Test_X_sg = np.zeros((len(test_documents), max_len, 300))
# for i in range(len(test_documents)):
#     for j in range(len(test_documents[i])):
#         if test_documents[i][j] in vectorizer_test.wv.key_to_index:
#             Test_X_sg[i][j] = vectorizer_test.wv[test_documents[i][j]]
#         else:
#             print(test_documents[i][j])