-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
53 lines (47 loc) · 1.93 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import os
import pandas as pd
import warnings
# Install a blanket "ignore" filter: from the moment this module is imported,
# every warning (including those raised by other libraries) is suppressed.
warnings.filterwarnings('ignore')
import nltk
# NOTE(review): word_tokenize presumably needs NLTK's 'punkt' tokenizer data;
# uncomment the download below once on a fresh environment if it is missing.
# nltk.download('punkt')
from nltk.tokenize import word_tokenize
def read_lndoc(path, non_def=1):
    """Load a tab-separated ``.lndoc`` file into a labelled DataFrame.

    The file is read without a header row and its first column is renamed
    to ``texts``. Every row receives the same binary ``labels`` value, which
    is derived from the file's base name (the part before the first dot).

    Args:
        path: Location of the file, as a string or ``os.PathLike`` object.
        non_def: When ``1``, only files named ``'Y'`` or ``'2'`` are
            labelled 1. For any other value, files named ``'N1'`` or ``'1'``
            are labelled 1 as well. All remaining names are labelled 0.

    Returns:
        The DataFrame with exact duplicate rows removed, or ``None`` when
        the file is empty (zero bytes).
    """
    if os.stat(path).st_size == 0:
        # Nothing to parse; pd.read_csv would raise on a zero-byte file.
        return None

    df = pd.read_csv(path, delimiter='\t', header=None)
    df = df.rename(columns={0: 'texts'})

    # os.path.basename honours the platform separator and accepts PathLike
    # objects, unlike a manual split on '/'.
    name = os.path.basename(path).split('.')[0]

    if non_def == 1:
        positive_names = ('Y', '2')
    else:
        positive_names = ('Y', 'N1', '2', '1')
    df['labels'] = 1 if name in positive_names else 0

    return df.drop_duplicates()
def reformat(path, non_def=1):
    """Merge the ``0``, ``1`` and ``2`` ``.lndoc`` files of a directory.

    ``2.lndoc`` rows are labelled 1 and ``0.lndoc`` rows are labelled 0.
    ``1.lndoc`` rows are labelled 0 when ``non_def == 1`` and 1 otherwise.
    After merging, each text is tokenised with ``word_tokenize`` and the
    tokens are re-joined with single spaces.

    Args:
        path: Directory containing ``0.lndoc``, ``1.lndoc`` and ``2.lndoc``.
        non_def: ``1`` groups ``1.lndoc`` with the 0-labelled rows; any
            other value groups it with the 1-labelled rows.

    Returns:
        A DataFrame with a fresh 0..n-1 index holding the 1-labelled rows
        first, followed by the 0-labelled rows.

    Raises:
        ValueError: If all three files are empty.
    """
    not_data = read_lndoc(f'{path}/0.lndoc')
    def_data = read_lndoc(f'{path}/2.lndoc')

    # Row order matches the label grouping: 1-labelled rows first.
    if non_def == 1:
        weak_data = read_lndoc(f'{path}/1.lndoc', 1)
        ordered = [def_data, not_data, weak_data]
    else:
        weak_data = read_lndoc(f'{path}/1.lndoc', 0)
        ordered = [def_data, weak_data, not_data]

    # read_lndoc yields None for an empty file. Concatenating once, after
    # discarding those, avoids failing on an intermediate merge of two empty
    # files while a third one still holds data.
    frames = [frame for frame in ordered if frame is not None]
    if not frames:
        raise ValueError(f'No non-empty .lndoc files found under {path!r}')

    data = pd.concat(frames, axis=0, ignore_index=True)
    data['texts'] = [' '.join(word_tokenize(x)) for x in data['texts']]
    return data
def preprocess_raw_corpus(path, col1, col2, file='csv', non_def=1):
    """Load a raw annotated corpus and normalise it to ``texts``/``labels``.

    Args:
        path: Location of the corpus file.
        col1: Name of the column holding the text; renamed to ``texts``.
        col2: Name of the column holding the raw label; renamed to
            ``labels``.
        file: ``'csv'`` reads with ``pd.read_csv``; any other value reads
            with ``pd.read_excel``.
        non_def: When ``1``, only raw label ``2.0`` maps to 1. For any other
            value, raw labels ``2.0`` and ``1.0`` both map to 1. Everything
            else maps to 0.

    Returns:
        A DataFrame with binary ``labels`` and ``texts`` tokenised by
        ``word_tokenize`` and re-joined with single spaces. Rows missing
        either the text or the label are dropped.
    """
    reader = pd.read_csv if file == 'csv' else pd.read_excel
    df = reader(path)[[col1, col2]]
    df = df.rename(columns={col1: 'texts', col2: 'labels'})

    # A missing text cell is a float NaN, which the tokenizer cannot handle,
    # so rows lacking text are dropped together with unlabelled rows.
    df = df.dropna(subset=['texts', 'labels'])

    positive = (2.0,) if non_def == 1 else (2.0, 1.0)
    df['labels'] = [1 if x in positive else 0 for x in df['labels']]

    # str() guards against non-string cells (e.g. numeric spreadsheet cells).
    df['texts'] = [' '.join(word_tokenize(str(x))) for x in df['texts']]
    return df
def get_value_counts(df, names):
    """Tabulate how often each value of ``df['labels']`` occurs.

    Args:
        df: DataFrame containing a ``labels`` column.
        names: Name to give the column that holds the counts.

    Returns:
        A DataFrame with two columns: ``labels`` (the distinct label values,
        most frequent first) and ``names`` (their counts).
    """
    counts = df['labels'].value_counts()
    # Name the index and the value column explicitly instead of renaming
    # whatever reset_index() generates: those default column names differ
    # between pandas 1.x ('index'/'labels') and 2.x ('labels'/'count').
    return counts.rename_axis('labels').reset_index(name=names)