KeywordAnalysis.py
import requests
from bs4 import BeautifulSoup
from collections import Counter
from nltk import word_tokenize, pos_tag, RegexpParser, Tree


def text_grab(pmc):
    """Fetch the full text of a PubMed Central article given its PMC ID."""
    url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC" + pmc
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    article_text = ""
    # Grabs all the article text while ignoring the references section
    paragraphs = soup.find_all("div", "tsec sec")
    for i in paragraphs[:-1]:
        article_text += ' ' + i.text
    return article_text


def get_continuous_chunks(article_text):
    """Extract the most common adjective+noun keyword phrases from the text."""
    stopwords = ['et', 'al.', 'n', 'deviation', 'windowFigure', 'windowFig', ' ]',
                 'additional', 'data', 'file', ' ', 'distribution', 'significant',
                 'clinical', 'adverse', 'sample', 'studies', 'significance',
                 '<', '>', '=', 'window', 't', 't-test', 'supplementary',
                 'Supplementary', 'important', 'experimental', 'study',
                 'subjects', 'conditions', 'experiments',
                 'control', 'panel']
    words = word_tokenize(article_text)
    filtered_text = [w for w in words if w not in stopwords]
    processed_text = pos_tag(filtered_text)
    # Regex to grammatically identify a set of: adjective+noun(+noun)
    # From NLTK: JJ* = adjective/numeral/ordinal/comparative/superlative, NN* = Nouns
    chunk_gram = r"Chunk: {<JJ.?><NN.?><NN.?>?}"
    chunk_parser = RegexpParser(chunk_gram)
    chunked = chunk_parser.parse(processed_text)
    keywords = [i.leaves() for i in chunked if isinstance(i, Tree)]
    # NLTK returns chunks in the form of lists of tuples of the word and its POS tag
    newkeys = []
    for chunk in keywords:
        if len(chunk) < 2:
            # Single-word chunk: keep the word itself
            newkeys.append(chunk[0][0])
        else:
            # Multi-word chunk: join the words into a single phrase
            temp = []
            for pair in chunk:
                temp.append(pair[0])
            newkeys.append(" ".join(temp))
    counted = Counter(newkeys).most_common(11)
    keywords = [i[0] for i in counted]
    return keywords
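

# Minimal usage sketch: fetch an article and print its top keyword phrases.
# Assumes the required NLTK data packages have been downloaded, e.g.:
#   import nltk; nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')
# The PMC ID below is a hypothetical placeholder.
if __name__ == "__main__":
    article = text_grab("1234567")  # hypothetical PMC ID
    print(get_continuous_chunks(article))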