#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 28 11:32:13 2017
@author: zaheerbabar
"""
import sys
import signal
import re
import string

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
try:
    from nltk import wordpunct_tokenize
    from nltk.corpus import stopwords
    import nltk
except ImportError:
    sys.stderr.write('Error! You need to install nltk (http://nltk.org/index.html)\n')
    sys.exit(1)

# my library
import textTools


def signal_handler(signal, frame):
    """Exit cleanly when the user presses Ctrl+C."""
    sys.stderr.write('You pressed Ctrl+C!\n')
    sys.exit(0)


def is_uppercase(astr):
    """Return True if the stripped string contains only uppercase ASCII letters and digits."""
    s = astr.strip()
    return all(c in string.ascii_uppercase or c in string.digits for c in s)
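# e.g. is_uppercase("MS") -> True, is_uppercase("A4") -> True,
#      is_uppercase("Ms") -> False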
"""
Class that represent a patient folder with an ID and splitted
and cleaned in 3 parts: patient , clinical , pathological
"""
def DocsToFMatrix(docs):
# initialize the vectorizer
vectorizer = CountVectorizer(min_df=15)
x1 = vectorizer.fit_transform(docs)
# create dataFrame
df = pd.DataFrame(x1.toarray().transpose(), index=vectorizer.get_feature_names())
return df
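# A minimal usage sketch, mirroring the commented-out code in main()
# (with a small corpus you would need to lower min_df):
#   docs = pd.read_csv('Dataset/clean_disease.csv')['clean_abstract'].astype(str)
#   Tm = DocsToFMatrix(docs).transpose()  # documents as rows, terms as columns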
class Folder:
    """
    Represents a patient folder with an ID, split and cleaned into three
    parts: patient, clinical, pathological.
    """

    def __init__(self, plain_text, black_list=None, debug=False):
        """
        Instantiate the Folder class from the folder's plain text.
        @param plain_text: the plain folder text
        @type plain_text: str
        """
        self.debug = debug
        self.plain_text = plain_text
        # avoid a mutable default argument
        self.black_list = black_list if black_list is not None else []
        self.unknown_initials = []
    def process(self):
        """
        Process the plain text: clean it, then tokenize and stem it.
        """
        self.semi_clean_text = self.clean(self.plain_text)
        self.clean_text = textTools.doStem(nltk.word_tokenize(self.semi_clean_text))
    def clean(self, text, onlyAlphaNum=True):
        """
        Clean the input of non-letter symbols and of unwanted lines.
        @param text: the text to clean
        @type text: str
        @param onlyAlphaNum: True to keep only letters, digits and spaces
        @type onlyAlphaNum: bool
        @return: clean_lines, the cleaned text
        @rtype: str
        """
        clean_lines = []
        for l in text.split("\n"):
            l = l.strip()
            # optional per-line steps, currently disabled: removeDates, isEdssLine
            if onlyAlphaNum:
                l = ''.join(e for e in l if e.isalnum() or e == " ")
                l = l.strip()
            # skip empty lines and known boilerplate headers
            if (not l or l.startswith("Page")
                    or l.startswith("Neuropathological code")
                    or l.startswith("Medication")):
                continue
            # optional per-line steps, currently disabled: removeCommon, initialsToWord
            clean_lines.append(l)
        return " ".join(clean_lines)
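    # Example: clean("Page 1\nCause of death: AMI.") drops the "Page 1" line
    # and returns "Cause of death AMI" (punctuation stripped).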
    def removeDates(self, astr):
        """
        Remove dates from the text.
        @param astr: the string to analyze
        @type astr: str
        @return: clean_str, the cleaned string
        @rtype: str
        """
        clean_words = []
        for w in re.split(" +", astr):
            parts = re.split("/|-", w)
            # drop empty tokens and tokens whose first two /- separated
            # pieces are digits, i.e. things that look like dates
            if not w or (len(parts) > 1 and parts[0].isdigit() and parts[1].isdigit()):
                continue
            clean_words.append(w)
        return " ".join(clean_words)
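    # Example: removeDates("seen on 12/05/2017 for follow-up")
    # returns "seen on for follow-up".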
    def removeCommon(self, line):
        """
        Remove common unwanted strings from a line.
        @param line: the line to analyze
        @type line: str
        @return: clean_str, the cleaned string
        @rtype: str
        """
        common_strings = ["Cause of death"]
        clean_str = line
        for c in common_strings:
            position = line.lower().find(c.lower())
            if position != -1:
                # keep only the text after the matched marker
                clean_str = line[position + len(c):]
        return clean_str
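    # Example: removeCommon("Cause of death: pneumonia") returns ": pneumonia".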
    def isEdssLine(self, line):
        """Return 'edss<score>' if the line reports an EDSS score, else ''."""
        ret_line = ''
        if 'edss' in line.lower():
            strEdss = line.split(" ")
            if len(strEdss) >= 3:
                # assumes the line starts with 'EDSS <score> ...'
                ret_line = 'edss' + strEdss[1]
        return ret_line
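    # Example: isEdssLine("EDSS 6.5 at last visit") returns "edss6.5".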
    def initialsToWord(self, line):
        """Expand well-known initials (e.g. F, M, MS) into full words."""
        wellknown_initials = ["F", "M", "MS"]
        wellknown_initials_words = ["Female", "Male", "Multiple Sclerosis"]
        ret_words = []
        for w in line.split(" "):
            word = w.strip()
            if not word:
                continue
            if is_uppercase(word):
                try:
                    index_word = wellknown_initials.index(word)
                    word = wellknown_initials_words[index_word]
                except ValueError:
                    # unrecognized initials are recorded and dropped
                    if word not in self.unknown_initials:
                        self.unknown_initials.append(word)
                    continue
            ret_words.append(word)
        return " ".join(ret_words)
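    # Example: initialsToWord("patient F with MS") returns
    # "patient Female with Multiple Sclerosis"; unknown initials such as "ABC"
    # are dropped and collected in self.unknown_initials.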
    def _calculate_languages_ratios(self, text):
        """
        Estimate how likely the given text is to be written in each language,
        returning a dictionary such as {'french': 2, 'spanish': 4, 'english': 0}.
        @param text: text whose language is to be detected
        @type text: str
        @return: dictionary mapping each language to the number of its unique
                 stopwords seen in the analyzed text
        @rtype: dict
        """
        languages_ratios = {}
        # nltk.wordpunct_tokenize() splits all punctuation into separate tokens:
        # >>> wordpunct_tokenize("That's thirty minutes away. I'll be there in ten.")
        # ['That', "'", 's', 'thirty', 'minutes', 'away', '.', 'I', "'", 'll', 'be', 'there', 'in', 'ten', '.']
        tokens = wordpunct_tokenize(text)
        words_set = set(word.lower() for word in tokens)
        # for each language shipped with nltk, count the unique stopwords of
        # that language appearing in the analyzed text
        for language in stopwords.fileids():
            stopwords_set = set(stopwords.words(language))
            common_elements = words_set.intersection(stopwords_set)
            languages_ratios[language] = len(common_elements)  # language "score"
        return languages_ratios
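    # A minimal sketch of how the ratios could be used (this helper call is an
    # assumption; the method is not invoked elsewhere in this script):
    #   ratios = self._calculate_languages_ratios(text)
    #   most_likely_language = max(ratios, key=ratios.get)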
#----------------------------------------------------------------------
def main(argv):
    # install the Ctrl+C handler defined above
    signal.signal(signal.SIGINT, signal_handler)
    clean_abstract = []
    # words to ignore; further candidates ('eds', 'type', 'reveal', 'serie',
    # 'column', 'define', 'string', 'contains', 'till', 'patient', 'give',
    # 'notice') are left disabled
    black_list = ['show',
                  'year',
                  'many',
                  'due']
    if len(argv) > 1:
        filename = argv[1]
    else:
        filename = "Dataset/All_Diseases.csv"
    try:
        Disease_Data = pd.read_csv(filename)
        abstracts = np.array(Disease_Data['abstract'])
        for content in abstracts:
            content = str(content)
            if not content.strip():
                clean_abstract.append("")
            else:
                current = Folder(content, black_list)
                current.process()
                clean_abstract.append(current.clean_text)
        Disease_Data['clean_abstract'] = clean_abstract
        Disease_Data.to_csv('Dataset/clean_disease.csv', encoding='utf-8', index=False)
        # optional: build and save the term-frequency matrix
        # Tm = DocsToFMatrix(clean_abstract).transpose()
        # Tm['Class'] = Disease_Data['Class']
        # print(Tm)
        # Tm.to_csv('Disease_Data.csv', index=False)
    except IOError:
        print("Cannot find file", filename)
# content='Epidemiologic studies of acute myocardial infarction(AMI) have described gender differences in the time of death after infarction, with greater numbers of men dying before hospitalization than women.However, in controlled, hospital-based clinical trials, women die at higher rates than men.We hypothesized that evidence of a gender difference in the time of death following AMI may be found in controlled studies of hospitalized AMI patients.We performed a retrospective analysis of the Global Utilization of Streptokinase and Tissue Plasminogen Activator for Occluded Coronary Arteries (GUSTO-1) and International Joint Efficacy Comparison of Thrombolytics (INJECT) trial databases using logistic regression modeling and time-to-death analyses.The age-adjusted female-to-male odds ratio for mortality was 1.4 (95 % confidence interval 1.3 to 1.5) in GUSTO-1 and 1.5 (95 % confidence interval 1.3 to 1.8) in INJECT.GUSTO-1 showed that among patients dying during the first 24 hours after symptom onset, men died an average of 1.7 hours earlier than women (p < 0.001).This difference was due to earlier deaths among men < or =65 years of age.Furthermore, in GUSTO-1, the analysis of time to death in hour increments demonstrated that greater proportions of men died at earlier time points than women and a disproportionate number of early deaths occurred among younger men than among women of any age or older men.In INJECT, where time to death could only be analyzed in 1-day increments, no gender differences were evident.These results raise the possibility that the pattern of earlier death for men in thrombolytic clinical trials represents the continuation of a gender-specific mortality pattern that began before hospitalization.The death of a disproportionate number of men before hospitalization may represent an inherent gender bias for clinical studies enrolling only hospitalized patients.More high-risk men would be excluded from these studies than women because of death before hospitalization.Hence, gender comparisons of in -hospital mortality rates may artificially inflate values for women.'
# current = Folder(content, [])
# current.process()
# print(current.clean_text)
if __name__ == '__main__':
    main(sys.argv)