-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmat2tfidf.py
52 lines (43 loc) · 1.27 KB
/
mat2tfidf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
class Mat2Tfidf_Parser:
    """Parse a sparse term-count ``.mat`` text file and derive its tf-idf matrix.

    Layout of the input, as read by this class:

    * first line: a whitespace-separated header whose first two fields are
      the number of samples and the number of features (any further fields
      are ignored);
    * every following line: one sample, written as ``index value`` pairs
      where ``index`` is the 1-based position of the feature.
    """

    def __init__(self, mat_data_file):
        """Remember the input path; nothing is read until ``parse`` is called.

        Args:
            mat_data_file: Path of the file to parse (passed to ``open``).
        """
        self._data_file = mat_data_file
        self._samples_count = 0
        self._features_count = 0
        self._tf_matrix = []
        self._tfidf_matrix = []

    def _tf(self, file_pointer):
        """Append one dense term-count row per line of ``file_pointer``.

        Each line is split on whitespace and read as ``index value`` pairs.
        Positions that are not mentioned stay ``0.0``, so a blank line
        yields an all-zero row. Rows are ``self._features_count`` wide and
        are appended to ``self._tf_matrix``.

        Args:
            file_pointer: Iterable of text lines, positioned after the header.

        Raises:
            IndexError: If a line has an index without a value, or an index
                outside ``1..features_count``.
            ValueError: If an index or a value is not numeric.
        """
        for line in file_pointer:
            data = line.split()
            row = [0.0] * self._features_count
            for i in range(0, len(data), 2):
                value = float(data[i + 1])
                position = int(data[i])
                # Indices in the mat file are 1-based. Reject anything out
                # of range explicitly: after the "- 1" shift, an index of 0
                # (or a negative one) would otherwise wrap around and
                # silently overwrite a column counted from the end.
                if not 1 <= position <= self._features_count:
                    raise IndexError(
                        "feature index {} is outside 1..{}".format(
                            position, self._features_count
                        )
                    )
                row[position - 1] = value
            self._tf_matrix.append(row)

    def parse(self):
        """Read the data file and build the tf and tf-idf matrices.

        The header's first two fields give the samples count and features
        count. The remaining lines are turned into the term-frequency
        matrix, which is then fed to ``TfidfTransformer(norm='l2')``.

        Raises:
            IndexError: If the header has fewer than two fields, or a data
                line is malformed (see ``_tf``).
            ValueError: If a header field or data token is not numeric.
        """
        # Start from a clean matrix so calling parse() again does not stack
        # duplicate rows on top of the previous result.
        self._tf_matrix = []
        # The context manager closes the file even if parsing fails.
        with open(self._data_file, 'r') as f:
            # Header: samples_count features_count [extra fields, unused here]
            header = f.readline().split()
            self._samples_count = int(header[0])
            self._features_count = int(header[1])
            self._tf(f)
        tfidf = TfidfTransformer(norm='l2')
        self._tfidf_matrix = tfidf.fit_transform(self._tf_matrix)

    def get_tfidf_matrix(self):
        """
        Return tf-idf matrix of input.

        This is the result of ``TfidfTransformer.fit_transform`` once
        ``parse`` has run, and an empty list before that.
        """
        return self._tfidf_matrix

    def get_tf_matrix(self):
        """
        Return term-frequency matrix of input.

        The matrix is a list of rows, each a list of floats.
        """
        return self._tf_matrix

    def get_samples_count(self):
        """Return the samples count read from the file header (0 before parse)."""
        return self._samples_count

    def get_features_count(self):
        """Return the features count read from the file header (0 before parse)."""
        return self._features_count