flask with question_generator
taehunkim115 committed Feb 25, 2020
1 parent 3ddcb3c commit 83e18f9
Showing 1,266 changed files with 275 additions and 205,428 deletions.
44 changes: 44 additions & 0 deletions Product/server/app/Coreference.py
@@ -0,0 +1,44 @@
## Environment Setup:
## pip install spacy==2.1.0
## pip install neuralcoref
## python -m spacy download en


def fix_pronouns(text):

    # import pip    # only needed by the commented-out installer below
    import spacy
    import neuralcoref

    ## Doesn't work, need to install a specific version of spacy:
    ## def import_or_install(package):
    ##     try:
    ##         __import__(package)
    ##     except ImportError:
    ##         pip.main(['install', package])
    ##
    ##
    ## import_or_install('neuralcoref')
    ## import_or_install('spacy')


    #print(spacy.__version__)
    nlp = spacy.load('en')
    neuralcoref.add_to_pipe(nlp, greedyness=0.5, max_dist=100, blacklist=False)

    doc = nlp(text)
    clusters = doc._.coref_clusters     # mention clusters, useful for debugging
    resolved_coref = doc._.coref_resolved

    return resolved_coref

if __name__ == "__main__":
    fixed = fix_pronouns('The dog came back home for dinner. It was happy.')
    print(fixed)
## Parameters for neuralcoref (keyword arguments):
## greedyness (float): A number between 0 and 1 determining how greedy the model is about making coreference decisions (more greedy means more coreference links). The default value is 0.5.
## max_dist (int): How many mentions back to look when considering possible antecedents of the current mention. Decreasing the value will cause the system to run faster but less accurately. The default value is 50.
## max_dist_match (int): The system will consider linking the current mention to a preceding one further than max_dist away if they share a noun or proper noun. In this case, it looks max_dist_match away instead. The default value is 500.
## blacklist (boolean): Should the system resolve coreferences for pronouns in the following list: ["i", "me", "my", "you", "your"]. The default value is True (coreference resolved).
## store_scores (boolean): Should the system store the scores for the coreferences in annotations. The default value is True.
## conv_dict (dict(str, list(str))): A conversion dictionary that you can use to replace the embeddings of rare words (keys) by an average of the embeddings of a list of common words (values). Ex: conv_dict={"Angela": ["woman", "girl"]} will help resolve coreferences for Angela by using the embeddings for the more common "woman" and "girl" instead of the embedding of Angela. This currently only works for single words (not for word groups).
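
For reference, a minimal sketch of how these keyword arguments can be combined when adding neuralcoref to the pipeline; the conv_dict mapping echoes the Angela example above, and the input sentence is illustrative:

import spacy
import neuralcoref

nlp = spacy.load('en')
# Look further back for antecedents and remap a rare name onto the
# embeddings of common words, per the parameter notes above.
neuralcoref.add_to_pipe(nlp,
                        greedyness=0.5,
                        max_dist=100,
                        max_dist_match=500,
                        blacklist=True,
                        conv_dict={'Angela': ['woman', 'girl']})

doc = nlp('Angela came back home for dinner. She was happy.')
print(doc._.coref_resolved)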
190 changes: 190 additions & 0 deletions Product/server/app/Extractor.py
@@ -0,0 +1,190 @@
import sys

import lexnlp.nlp.en.segments.sentences as lex_sentences
import lexnlp.nlp.en.segments.sections as lex_sections
import lexnlp.nlp.en.segments.paragraphs as lex_paragraphs
import lexnlp.nlp.en.segments.pages as lex_pages
import lexnlp.extract.en.dates as lex_dates
import lexnlp.extract.en.courts as lex_courts
import lexnlp.extract.en.definitions as lex_definitions
import lexnlp.extract.en.regulations as lex_regulations
import lexnlp.extract.en.trademarks as lex_trademarks
import lexnlp.extract.en.entities.nltk_maxent as lex_entities
import Coreference as pronouns



class Extractor:

    def __init__(self, text):
        # Set placeholders first; init_tokenize() fills them in. (Setting them
        # to None after tokenizing would throw the tokenization away.)
        self.sections = None
        self.pages = None
        self.paragraphs = None
        self.sentences = None
        self.text = self.init_preprocess(text)
        self.init_tokenize()
        return



    '''
    Functions: Prep the Document
    '''
    # Pre-process Document
    def init_preprocess(self, text = None):
        if not text:
            text = self.text
        return lex_sentences.pre_process_document(text)


    # Tokenize the Document
    def init_tokenize(self, text = None):
        if not text:
            text = self.text
        self.sections = self.get_sections(text)
        self.paragraphs = self.get_paragraphs(text)
        self.sentences = self.get_sentences(text)
        self.pages = self.get_pages(text)
        return


    # Give Extractor new document
    def update_text(self, text = None):
        if not text:
            text = self.text
        self.text = self.init_preprocess(text)
        self.init_tokenize()   # re-tokenize so the cached lists match the new text
        return

    # Fix Coreference Issues: rewrite the text with pronouns resolved,
    # then refresh the sentence list to match.
    def fix_pronouns(self, text=None, silence=0):
        if not text:
            text = self.text
        self.text = pronouns.fix_pronouns(text)
        self.sentences = self.get_sentences(self.text)
        if silence != 0:
            print(self.sentences)
        return

    '''
    Functions: Tokenize The Document
    '''
    # Returns list of sections
    def get_sections(self, text = None):
        if not text:
            text = self.text
        return list(lex_sections.get_sections(text))

    # Returns list of pages
    def get_pages(self, text = None):
        if not text:
            text = self.text
        return list(lex_pages.get_pages(text))

    # Returns list of paragraphs
    def get_paragraphs(self, text = None):
        if not text:
            text = self.text
        return list(lex_paragraphs.get_paragraphs(text))

    # Returns list of sentences
    def get_sentences(self, text = None):
        if not text:
            text = self.text
        return list(lex_sentences.get_sentence_list(text))



    '''
    Functions: Extract Entities
    '''
    # Returns list of dates
    def extract_dates(self, text = None):
        if not text:
            text = self.text
        return list(lex_dates.get_dates(text))

    # Returns list of companies
    def extract_companies(self, text = None):
        if not text:
            text = self.text
        return list(lex_entities.get_companies(text))

    # Returns list of geopolitical entities
    def extract_geopolitical(self, text = None):
        if not text:
            text = self.text
        return list(lex_entities.get_geopolitical(text))

    # Returns list of persons
    def extract_persons(self, text = None):
        if not text:
            text = self.text
        return list(lex_entities.get_persons(text))

    # Returns list of courts
    def extract_courts(self, text = None):
        if not text:
            text = self.text
        return list(lex_courts.get_courts(text))

    # Returns list of definitions
    def extract_definitions(self, text = None):
        if not text:
            text = self.text
        return list(lex_definitions.get_definitions(text))


    # Returns list of regulations
    def extract_regulations(self, text = None):
        if not text:
            text = self.text
        return list(lex_regulations.get_regulations(text))

    # Returns list of trademarks (was a duplicate extract_regulations that
    # shadowed the method above)
    def extract_trademarks(self, text = None):
        if not text:
            text = self.text
        return list(lex_trademarks.get_trademarks(text))



    '''
    Functions: Locate Entities
    '''
    # Returns page locations for entity
    def locate_pages(self, entity):
        if not self.pages:
            # Text was already pre-processed in __init__; just re-tokenize.
            self.init_tokenize()
        if not isinstance(entity, str):
            # Extractor results may be tuples/lists; join into a single string.
            entity = " ".join(entity)
        result = []
        for page_index, page_content in enumerate(self.pages):
            count = page_content.count(entity)
            for x in range(count):
                # TODO: Needs adjusting for Roman Numeral pages (i, ii, ...) before page 1
                result.append((page_index, entity))

        return result


    # Returns section locations for entity
    def locate_sections(self, entity):
        if not self.sections:
            self.init_tokenize()
        if not isinstance(entity, str):
            entity = " ".join(entity)
        result = []
        for section_index, section_content in enumerate(self.sections):
            count = section_content.count(entity)
            for x in range(count):
                # TODO: Need to get section heading for the sections
                result.append((section_index, entity))

        return result
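
As a point of reference, a short usage sketch for the class above; the file path and variable names are illustrative, not part of this commit:

# Hypothetical driver for Extractor: load a document, resolve pronouns,
# then pull entities and find the pages they appear on.
with open('sample_brief.txt', 'r') as f:      # illustrative path
    raw_text = f.read()

doc = Extractor(raw_text)
doc.fix_pronouns()        # rewrite the text with pronouns resolved

for person in doc.extract_persons():
    # locate_pages returns (page_index, entity) tuples, one per occurrence
    print(person, doc.locate_pages(person))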

30 changes: 30 additions & 0 deletions Product/server/app/Extractor_to_QuestionGen.py
@@ -0,0 +1,30 @@
import Extractor as ex
import question_generator as gen

# To speed up this script, start the parser servers first:
##   bash runStanfordParserServer.sh
##   bash runSSTServer.sh



#Dish sample
#direct_path = ""

#Apple Brief
direct_path = ''

with open(direct_path, 'r') as file:
    brief = file.read()

test = ex.Extractor(brief)
qGen = gen.QuestionGenerator()
test.fix_pronouns(silence=1)
sentences = test.get_sentences()

for sentence in sentences:
    flashcard = qGen.generate_question(sentence)
    if flashcard:
        #print(type(flashcard), type(flashcard[0]))
        print("Question: {}\n\nAnswer: {}\n-------------".format(flashcard[0]['Q'], flashcard[0]['A']))


11 changes: 9 additions & 2 deletions Product/server/app/routes.py
@@ -1,10 +1,16 @@
from app import app
from flask import request
import lexnlp.nlp.en.segments.sentences as lex_sentences
import lexnlp.extract.en.dates as lex_dates

@app.route('/')
@app.route('/index')
def index():
    return "this is a test ayy"

@app.route('/date')
def date():

    sample = '''
STATEMENT OF ISSUES
1. As the Board recognized, the most important bargaining issue between DISH and the union was QPC. DISH was clear that it would not agree to retain QPC in any form. Following lengthy and unsuccessful negotiations, the union insisted on keeping QPC. DISH made and then implemented its last, best, and final offer.
@@ -42,7 +48,8 @@ def index():
    r = ''

    for fact in facts:
-       r += "Question:\nWhy is {} significant?\n\nAnswer:\n{}".format(str(fact[0]), fact[1])
+       r += '\n'
+       # print('i ran')
+
+       r = r + "Question:\nWhy is {} significant?\n\nAnswer:\n{}".format(str(fact[0]), fact[1])

    return r
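
Once the Flask app is running, the new /date route can be exercised from a separate shell; the host and port below are Flask's development-server defaults and may differ in deployment:

import requests

# Assumes `flask run` is serving the app locally on the default port.
resp = requests.get('http://127.0.0.1:5000/date')
print(resp.text)    # flashcard-style Question/Answer text built by the route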
1 change: 1 addition & 0 deletions Product/server/nltk.txt
@@ -0,0 +1 @@
all
1 change: 1 addition & 0 deletions Product/server/question-generation
Submodule question-generation added at e027a1


39 changes: 0 additions & 39 deletions Product/venv/Lib/site-packages/Click-7.0.dist-info/LICENSE.txt

This file was deleted.

