-
Notifications
You must be signed in to change notification settings - Fork 4
/
NLP.py
182 lines (172 loc) · 6.86 KB
/
NLP.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import streamlit as st
import pyautogui
import nltk
from autocorrect import Speller
from gensim.summarization import summarize as g_sumn
# Top-of-page widgets: a "CLEAR" button (its value is tested further down
# to decide whether to call Clear()) and the text box holding the text to
# be processed.
clear = st.button(label="CLEAR")
inputtext = st.text_area(label="Input", height=200)
def _lower_case(text):
    """Return *text* converted to lower case."""
    return text.lower()


def _sent_tokenize(text):
    """Return the NLTK sentence list of *text*, rendered as a string."""
    # remove str() if you want the output as list
    return str(nltk.sent_tokenize(text))


def _word_tokenize(text):
    """Return the NLTK word-token list of *text*, rendered as a string."""
    # remove str() if you want the output as list
    return str(nltk.word_tokenize(text))


def _lemmatize(text):
    """Lemmatize every word token with WordNet and re-join with spaces."""
    from nltk.stem import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()
    return " ".join(
        lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)
    )


def _stem(text):
    """Stem every word token with the Porter stemmer and re-join with spaces."""
    # This operation previously ran the lemmatizer, which made it an exact
    # duplicate of 'Lemmatize'; use a real stemmer instead.
    from nltk.stem import PorterStemmer

    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(token) for token in nltk.word_tokenize(text))


def _remove_numbers(text):
    """Drop every character for which ``str.isdigit`` is true."""
    return "".join(char for char in text if not char.isdigit())


def _remove_punctuation(text):
    """Drop every character listed in ``string.punctuation``."""
    from string import punctuation

    # One pass over the text instead of a per-character membership test.
    return text.translate(str.maketrans("", "", punctuation))


def _spell_check(text):
    """Run the English autocorrect speller over each word token."""
    spell = Speller(lang='en')
    return " ".join(spell(token) for token in nltk.word_tokenize(text))


def _remove_stopwords(text):
    """Remove English stop words from *text*, ignoring letter case."""
    from nltk.corpus import stopwords

    # A set gives O(1) membership tests. Tokens are lower-cased for the
    # comparison so that capitalised stop words ("The", "And", ...) are
    # removed as well; the surviving tokens keep their original casing.
    stop_words = set(stopwords.words('english'))
    return " ".join(
        token
        for token in nltk.word_tokenize(text)
        if token.lower() not in stop_words
    )


def _keyword(text):
    """Return the first named entity found by ``nltk.ne_chunk``, or ``""``."""
    tagged = nltk.pos_tag(nltk.word_tokenize(text))
    entities = [
        " ".join(word for word, _tag in node)
        for node in nltk.ne_chunk(tagged)
        if isinstance(node, nltk.Tree)
    ]
    # Text without any named entity used to raise IndexError here.
    return entities[0] if entities else ""


def _summarize(text):
    """Return up to the first two sentences of the gensim summary of *text*."""
    if len(nltk.sent_tokenize(text)) < 2:
        # NOTE(review): the message mentions 3 sentences while the check
        # only rejects fewer than 2 - confirm which one is intended.
        return "please pass more than 3 sentences to summarize the text"
    summary_sentences = nltk.sent_tokenize(g_sumn(text))
    return " ".join(summary_sentences[:2])


def _remove_tags(text):
    """Strip ``<...>`` tags, then collapse runs of spaces to a single space."""
    import re

    without_tags = re.sub(r'<[^<]+?>', '', text)
    return re.sub(r' +', ' ', without_tags)


# Maps each supported operation name to the helper implementing it.
_OPERATIONS = {
    'Lower Case': _lower_case,
    'Sent Tokenize': _sent_tokenize,
    'Word Tokenize': _word_tokenize,
    'Lemmatize': _lemmatize,
    'Stemming': _stem,
    'Remove Numbers': _remove_numbers,
    'Remove Punctuation': _remove_punctuation,
    'Spell Check': _spell_check,
    'Remove Stopwords': _remove_stopwords,
    'Keyword': _keyword,
    'Summarize': _summarize,
    'Remove Tags': _remove_tags,
}


def nlp(text, operation):
    """Apply one named text-processing operation to *text*.

    Args:
        text: The string to process.
        operation: One of 'Lower Case', 'Sent Tokenize', 'Word Tokenize',
            'Lemmatize', 'Stemming', 'Remove Numbers', 'Remove Punctuation',
            'Spell Check', 'Remove Stopwords', 'Keyword', 'Summarize' or
            'Remove Tags'.

    Returns:
        The processed text as a string, or ``None`` when *operation* is not
        one of the names above.
    """
    handler = _OPERATIONS.get(operation)
    if handler is None:
        # Unknown names fall through without a result.
        return None
    return handler(text)
def Clear():
    """Send a fixed keystroke sequence through pyautogui to clear the I/O.

    The sequence is Tab, then the Ctrl+A+Del hotkey, then Tab again, each
    call using the same 0.15 second interval.
    """
    # NOTE(review): presumably the first Tab moves keyboard focus into the
    # input box so that select-all + delete empties it - this depends on
    # where the focus is when the function runs; confirm.
    key_interval = 0.15
    pyautogui.press("tab", interval=key_interval)
    pyautogui.hotkey("ctrl", "a", "del", interval=key_interval)
    pyautogui.press("tab", interval=key_interval)
# Operation names offered in the sidebar. Each one is handed to nlp()
# unchanged, so every entry must be a name that nlp() recognises.
NLP_FUNCTIONS = (
    'Lower Case', 'Lemmatize', 'Summarize', 'Stemming',
    'Keyword', 'Spell Check', 'Remove Tags',
    'Sent Tokenize', 'Word Tokenize',
    'Remove Punctuation', 'Remove Numbers',
    'Remove Stopwords',
)
choice = st.sidebar.radio("SELECT NLP FUNCTION", NLP_FUNCTIONS)

# The selected label is itself the operation name, so a single call covers
# every option. No output box is rendered while the input is empty.
if inputtext:
    output = nlp(inputtext, choice)
    st.text_area("Output", output, height=200)

# Clear I/O
if clear:
    Clear()