utils.py (forked from findoctor/Keyphrase-extraction)

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

from . import config as cfg


def read_from_txt(path):
    # Return the raw text of the file as a single string; a period is
    # inserted before each newline so that line breaks end sentences.
    with open(path, 'r') as file:
        text = file.read().replace('\n', '.\n')
    return text


def extract_phrases(my_tree, phrase):
    """
    Helper function for generating candidate phrases:
    recursively collect every subtree whose label matches `phrase`.
    """
    my_phrases = []
    if my_tree.label() == phrase:
        my_phrases.append(my_tree.copy(True))
    for child in my_tree:
        if isinstance(child, nltk.Tree):
            list_of_phrases = extract_phrases(child, phrase)
            if len(list_of_phrases) > 0:
                my_phrases.extend(list_of_phrases)
    return my_phrases


def generate_candidate_phrase(test_sentence, grammars=cfg.GRAMMER):
    # Chunk the POS-tagged sentence with each grammar and collect the
    # lower-cased noun-phrase candidates.
    words = nltk.word_tokenize(test_sentence)
    tags = nltk.pos_tag(words)
    candidate_phrases = set()
    for grammar in grammars:
        parser = nltk.RegexpParser(grammar)
        tree = parser.parse(tags)
        candidate_trees = extract_phrases(tree, 'NP')
        for phrase in candidate_trees:
            candidate_phrases.add(" ".join([x[0].lower() for x in phrase.leaves()]))  # lower case
    return list(candidate_phrases)
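
# Illustrative usage (a sketch, not part of the original module): assuming
# cfg.GRAMMER holds chunk patterns such as r"NP: {<JJ>*<NN.*>+}", a call like
#
#     generate_candidate_phrase("Deep neural networks learn useful representations.")
#
# would return lower-cased noun-phrase candidates along the lines of
# ['deep neural networks', 'useful representations'].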


# Adjectives or nouns as graph nodes (used for TPR and PageRank).
def generate_graph_nodes(test_sentence, grammar=cfg.adj_or_noun):
    words = nltk.word_tokenize(test_sentence)
    tags = nltk.pos_tag(words)
    candidate_words = set()
    parser = nltk.RegexpParser(grammar)
    tree = parser.parse(tags)
    candidate_trees = extract_phrases(tree, 'NP')
    for phrase in candidate_trees:
        # Assumes each matched chunk is a single word (adjective or noun),
        # so the leaves are joined without a separator.
        candidate_words.add("".join([x[0].lower() for x in phrase.leaves()]))
    return list(candidate_words)


def clean_text(doc):
    # Strip punctuation and remove English stopwords, returning the
    # remaining tokens joined back into a single string.
    stop_words = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')
    result = tokenizer.tokenize(doc)
    new_sentence = []
    for w in result:
        if w not in stop_words:
            new_sentence.append(w)
    return " ".join(new_sentence)
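
# Illustrative usage (a sketch, not part of the original module): because the
# stopword list is lower-case and the comparison is case-sensitive, a call like
#
#     clean_text("This is a simple test of the tokenizer")
#
# would return roughly "This simple test tokenizer" (the capitalised "This"
# survives, while "is", "a", "of" and "the" are removed).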


def generate_window_words(doc):
    # Tokenize the document into the word sequence used for co-occurrence
    # windows. The MWETokenizer below was left as an option to keep
    # contractions such as "He's" from being split into "He" + "'s".
    # tokenizer = nltk.tokenize.MWETokenizer()
    return nltk.word_tokenize(doc)
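

# Minimal smoke test (a sketch added for illustration; the sample sentence is
# an assumption, NLTK's punkt / averaged_perceptron_tagger / stopwords data
# must be downloaded, and cfg.GRAMMER and cfg.adj_or_noun must be defined in
# the accompanying config module). Because of the relative config import, run
# it as a module, e.g. `python -m <package>.utils`.
if __name__ == "__main__":
    sample = "Keyphrase extraction identifies the most important phrases in a document."
    print("candidate phrases:", generate_candidate_phrase(sample))
    print("graph nodes:", generate_graph_nodes(sample))
    print("cleaned text:", clean_text(sample))
    print("window words:", generate_window_words(sample))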