Commit 83e18f9 (1 parent: 3ddcb3c)
Showing 1,266 changed files with 275 additions and 205,428 deletions.
Coreference.py
@@ -0,0 +1,44 @@
## Environment Setup:
## pip install spacy==2.1.0
## pip install neuralcoref
## python -m spacy download en


def fix_pronouns(text):
    import spacy
    import neuralcoref

    ## Doesn't work: a specific version of spacy must be installed first.
    ## import pip
    ## def import_or_install(package):
    ##     try:
    ##         __import__(package)
    ##     except ImportError:
    ##         pip.main(['install', package])
    ##
    ## import_or_install('neuralcoref')
    ## import_or_install('spacy')

    # print(spacy.__version__)
    nlp = spacy.load('en')
    neuralcoref.add_to_pipe(nlp, greedyness=0.5, max_dist=100, blacklist=False)

    doc = nlp(text)
    clusters = doc._.coref_clusters        # coreference clusters found in the text
    resolved_coref = doc._.coref_resolved  # text with each mention replaced by its main mention

    return resolved_coref


if __name__ == "__main__":
    fixed = fix_pronouns('The dog came back home for dinner. It was happy.')
    print(fixed)

## Parameters for neuralcoref (keyword arguments):
## greedyness      float                 A number between 0 and 1 determining how greedy the model is about making coreference decisions (more greedy means more coreference links). Default: 0.5.
## max_dist        int                   How many mentions back to look when considering possible antecedents of the current mention. Decreasing the value makes the system run faster but less accurately. Default: 50.
## max_dist_match  int                   The system will consider linking the current mention to a preceding one further than max_dist away if they share a noun or proper noun; in that case it looks max_dist_match mentions back instead. Default: 500.
## blacklist       bool                  Whether the system should resolve coreferences for pronouns in the list ["i", "me", "my", "you", "your"]. Default: True (coreferences resolved).
## store_scores    bool                  Whether the system should store the coreference scores in annotations. Default: True.
## conv_dict       dict(str, list(str))  A conversion dictionary for replacing the embeddings of rare words (keys) with an average of the embeddings of a list of common words (values). E.g. conv_dict={"Angela": ["woman", "girl"]} helps resolve coreferences for Angela by using the embeddings of the more common "woman" and "girl" instead of the embedding of "Angela". This currently only works for single words (not word groups).
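For reference, a minimal sketch of passing several of these keyword arguments together, assuming the spacy==2.1.0 / neuralcoref setup above; the sample sentence and the conv_dict entry for "Angela" are illustrative, not from the original code:

import spacy
import neuralcoref

nlp = spacy.load('en')
neuralcoref.add_to_pipe(
    nlp,
    greedyness=0.5,                          # default greediness
    max_dist=50,                             # look at most 50 mentions back
    conv_dict={"Angela": ["woman", "girl"]}  # illustrative: map a rare name to common-word embeddings
)

doc = nlp("Angela said she would bring her notes.")
print(doc._.coref_clusters)   # clusters linking "she"/"her" back to "Angela"
print(doc._.coref_resolved)   # text with the pronouns replaced by their main mention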
Extractor.py
@@ -0,0 +1,190 @@
import lexnlp.nlp.en.segments.sentences as lex_sentences
import lexnlp.nlp.en.segments.sections as lex_sections
import lexnlp.nlp.en.segments.paragraphs as lex_paragraphs
import lexnlp.nlp.en.segments.pages as lex_pages
import lexnlp.extract.en.dates as lex_dates
import lexnlp.extract.en.courts as lex_courts
import lexnlp.extract.en.definitions as lex_definitions
import lexnlp.extract.en.regulations as lex_regulations
import lexnlp.extract.en.trademarks as lex_trademarks
import lexnlp.extract.en.entities.nltk_maxent as lex_entities
import Coreference as pronouns


class Extractor:

    def __init__(self, text):
        self.sections = None
        self.pages = None
        self.paragraphs = None
        self.sentences = None
        self.text = self.init_preprocess(text)
        self.init_tokenize()  # tokenize after setting the defaults so they are not overwritten with None

    '''
    Functions: Prep the Document
    '''
    # Pre-process the document
    def init_preprocess(self, text=None):
        return lex_sentences.pre_process_document(text)

    # Tokenize the document
    def init_tokenize(self, text=None):
        if not text:
            text = self.text
        self.sections = self.get_sections(text)
        self.paragraphs = self.get_paragraphs(text)
        self.sentences = self.get_sentences(text)
        self.pages = self.get_pages(text)

    # Give Extractor a new document
    def update_text(self, text=None):
        if not text:
            text = self.text
        self.text = self.init_preprocess(text)
        self.init_tokenize()  # re-tokenize so the token lists match the new text

    # Fix coreference issues
    def fix_pronouns(self, text=None, verbose=0):
        if not text:
            text = self.text
        self.text = pronouns.fix_pronouns(text)  # store the resolved text and re-tokenize
        self.init_tokenize()
        if verbose != 0:
            print(self.sentences)

    '''
    Functions: Tokenize the Document
    '''
    # Returns list of sections
    def get_sections(self, text=None):
        if not text:
            text = self.text
        return list(lex_sections.get_sections(text))

    # Returns list of pages
    def get_pages(self, text=None):
        if not text:
            text = self.text
        return list(lex_pages.get_pages(text))

    # Returns list of paragraphs
    def get_paragraphs(self, text=None):
        if not text:
            text = self.text
        return list(lex_paragraphs.get_paragraphs(text))

    # Returns list of sentences
    def get_sentences(self, text=None):
        if not text:
            text = self.text
        return list(lex_sentences.get_sentence_list(text))

    '''
    Functions: Extract Entities
    '''
    # Returns list of dates
    def extract_dates(self, text=None):
        if not text:
            text = self.text
        return list(lex_dates.get_dates(text))

    # Returns list of companies
    def extract_companies(self, text=None):
        if not text:
            text = self.text
        return list(lex_entities.get_companies(text))

    # Returns list of geopolitical entities
    def extract_geopolitical(self, text=None):
        if not text:
            text = self.text
        return list(lex_entities.get_geopolitical(text))

    # Returns list of persons
    def extract_persons(self, text=None):
        if not text:
            text = self.text
        return list(lex_entities.get_persons(text))

    # Returns list of courts
    def extract_courts(self, text=None):
        if not text:
            text = self.text
        return list(lex_courts.get_courts(text))

    # Returns list of definitions
    def extract_definitions(self, text=None):
        if not text:
            text = self.text
        return list(lex_definitions.get_definitions(text))

    # Returns list of regulations
    def extract_regulations(self, text=None):
        if not text:
            text = self.text
        return list(lex_regulations.get_regulations(text))  # was lex_definitions, which has no get_regulations

    # Returns list of trademarks
    def extract_trademarks(self, text=None):  # was a second extract_regulations, which shadowed the method above
        if not text:
            text = self.text
        return list(lex_trademarks.get_trademarks(text))

    '''
    Functions: Locate Entities
    '''
    # Returns page locations for an entity
    def locate_pages(self, entity):
        if not self.pages:
            self.init_tokenize()
        if not isinstance(entity, str):  # join multi-word entities given as a tuple or list;
            entity = " ".join(entity)    # len(entity) > 1 on a plain string would split it into characters
        result = []
        for page_index, page_content in enumerate(self.pages):
            count = page_content.count(entity)
            for x in range(count):
                # TODO: Needs adjusting for Roman-numeral pages (i, ii, ...) before page 1
                result.append((page_index, entity))
        return result

    # Returns section locations for an entity
    def locate_sections(self, entity):
        if not self.sections:
            self.init_tokenize()
        if not isinstance(entity, str):
            entity = " ".join(entity)
        result = []
        for section_index, section_content in enumerate(self.sections):
            count = section_content.count(entity)
            for x in range(count):
                # TODO: Need to get the section heading for each section
                result.append((section_index, entity))
        return result
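A short usage sketch of the class above, assuming lexnlp is installed and the file is saved as Extractor.py (matching the import in the driver script below); the sample text is illustrative only:

import Extractor as ex

doc = ex.Extractor("Apple Inc. filed its brief on January 4, 2019. "
                   "The court granted the motion.")
print(doc.extract_dates())             # e.g. [datetime.date(2019, 1, 4)]
print(doc.extract_companies())         # company mentions found by the nltk_maxent extractor
print(doc.locate_pages("Apple Inc."))  # one (page_index, entity) tuple per occurrence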
@@ -0,0 +1,30 @@
import Extractor as ex
import question_generator as gen

# To speed up the script, start the servers first:
##   bash runStanfordParserServer.sh
##   bash runSSTServer.sh


# Dish sample
# direct_path = ""

# Apple brief
direct_path = ''

with open(direct_path, 'r') as file:
    brief = file.read()

test = ex.Extractor(brief)
qGen = gen.QuestionGenerator()
test.fix_pronouns(verbose=1)
sentences = test.get_sentences()

for sentence in sentences:
    flashcard = qGen.generate_question(sentence)
    if flashcard:
        # print(type(flashcard), type(flashcard[0]))
        print("Question: {}\n\nAnswer: {}\n-------------".format(flashcard[0]['Q'], flashcard[0]['A']))
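The question-generation code itself lives in the submodule added below (e027a1). To dry-run this driver without it, a hypothetical stand-in matching only the interface the loop expects (generate_question returning a list of dicts keyed 'Q' and 'A', or a falsy value when no question can be made) could look like:

class QuestionGenerator:
    # Hypothetical stub, not the real question_generator: it mirrors the
    # interface used above so the loop can run end to end.
    def generate_question(self, sentence):
        if not sentence.strip():
            return None
        # Trivially turn the sentence itself into a recall prompt.
        return [{'Q': "What does the source say here?", 'A': sentence}]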
@@ -0,0 +1 @@
all
Submodule question-generation added at e027a1
This file was deleted.
Product/venv/Lib/site-packages/Click-7.0.dist-info/LICENSE.txt (39 changes: 0 additions & 39 deletions)
This file was deleted.