-
Notifications
You must be signed in to change notification settings - Fork 4
/
ngram_model.py
82 lines (68 loc) · 2.37 KB
/
ngram_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 25 09:01:45 2020
@author: Jie Chen
"""
import csv
import json
import spacy
import en_core_web_sm
# Load the en_core_web_sm spaCy pipeline once at import time; the script
# below calls it on every abstract to split the text into tokens.
nlp = en_core_web_sm.load()
def get_ngrams(sequence, n):
    """Return the padded n-grams of *sequence* as a list of tuples.

    One n-gram is produced for every token of ``sequence`` plus one for a
    trailing ``'STOP'`` marker. Each n-gram ends at that token and is
    left-padded with ``'START'`` wherever it would reach before the first
    token.

    Args:
        sequence: Iterable of tokens (list, tuple, generator, ...). It is
            copied, never modified.
        n: Size of each n-gram.

    Returns:
        A list of ``n``-tuples, or an empty list when ``n`` is smaller than
        1 or larger than the number of tokens.

    >>> get_ngrams(["a", "b"], 2)
    [('START', 'a'), ('a', 'b'), ('b', 'STOP')]
    """
    # Materialize first so any iterable works and the caller's object is
    # left untouched.
    tokens = list(sequence)
    if not 1 <= n <= len(tokens):
        return []
    # n - 1 leading markers let the first n-gram end on the first token;
    # the single 'STOP' yields the one extra n-gram that closes the sequence.
    padded = ['START'] * (n - 1) + tokens + ['STOP']
    return [tuple(padded[i:i + n]) for i in range(len(tokens) + 1)]
def get_neg_words(filename, n):
    """Read up to the first *n* words from a negative-word CSV file.

    The first row of the file is treated as a header and skipped. Every
    following row must have exactly three columns; the word is taken from
    the second one. The function does not sort: it returns the first rows
    of the file, so "most common n words" only holds if the file is already
    ordered by frequency (presumably it is -- verify against the data file).

    Args:
        filename: Path of the CSV file.
        n: Maximum number of words to return.

    Returns:
        A list with at most ``n`` words, in file order. It is shorter than
        ``n`` when the file has fewer data rows, and empty when ``n`` is not
        positive or the file is empty.

    Raises:
        ValueError: If a data row does not have exactly three columns.
    """
    negwords = []
    if n <= 0:
        return negwords
    with open(filename, newline='') as file:
        reader = csv.reader(file)
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        for _index, word, _count in reader:
            negwords.append(word)
            if len(negwords) >= n:
                break
    # Always hand back a list, even when the file ran out before n words.
    return negwords
# The 42 most common negative words. `or []` guards against a loader that
# returns None instead of a list, so the loops below never iterate over None.
negwords = get_neg_words("data/neg_wordlist.csv", 42) or []
# Set copy for O(1) membership tests when looking for trigram centres.
neg_lookup = set(negwords)

# Get abstract corpus: the JSON file holds a list of records, each with a
# 'data' object that carries the 'abstract' text.
with open("data/json_summary.json") as file:
    summary = json.load(file)
data = [record['data'] for record in summary]
abstract = [record['abstract'] for record in data]
corpus = abstract

# bigrams_neg: every bigram (as a tuple) that contains a negative word,
# appended once per matching entry of negwords.
# trigrams_neg: "previous negword next" strings, one per occurrence of a
# negative word, in the format of string for better output vision.
bigrams_neg = []
trigrams_neg = []
for text in abstract:
    doc = nlp(text)
    token = [tok.text for tok in doc]
    bigrams = get_ngrams(token, 2)

    for tup in bigrams:
        for negw in negwords:
            if negw in tup:
                bigrams_neg.append(tup)

    # Build each trigram from two adjacent bigrams of the SAME document:
    # (prev, word) followed by (word, next). Pairing up entries of the flat
    # bigrams_neg list instead breaks as soon as a bigram is recorded an odd
    # number of times (e.g. the same negative word twice in a row), which
    # either raises on unpacking or misaligns every later pair.
    for before, after in zip(bigrams, bigrams[1:]):
        if before[1] in neg_lookup:
            trigrams_neg.append(before[0] + " " + before[1] + " " + after[1])

# Disabled export of the trigrams, one per line. It is kept inside a string
# literal so it does not run; remove the quotes to write the file.
"""
with open('data/neg_trigrams.txt', 'w') as file:
    for grams in trigrams_neg:
        file.write(grams + '\n')
"""