Commit 83e18f9 (1 parent: 3ddcb3c)
Showing 1,266 changed files with 275 additions and 205,428 deletions.
Coreference.py
@@ -0,0 +1,44 @@
## Environment Setup:
## pip install spacy==2.1.0
## pip install neuralcoref
## python -m spacy download en


def fix_pronouns(text):
    import spacy
    import neuralcoref

    ## Doesn't work: a specific version of spacy must be installed first.
    ## import pip
    ## def import_or_install(package):
    ##     try:
    ##         __import__(package)
    ##     except ImportError:
    ##         pip.main(['install', package])
    ##
    ## import_or_install('neuralcoref')
    ## import_or_install('spacy')

    # print(spacy.__version__)
    nlp = spacy.load('en')
    neuralcoref.add_to_pipe(nlp, greedyness=0.5, max_dist=100, blacklist=False)

    doc = nlp(text)
    clusters = doc._.coref_clusters        # coreference clusters found in the text
    resolved_coref = doc._.coref_resolved  # text with each mention replaced by its main mention

    return resolved_coref


if __name__ == "__main__":
    fixed = fix_pronouns('The dog came back home for dinner. It was happy.')
    print(fixed)

## Parameters for neuralcoref (keyword arguments):
## greedyness      float                 A number between 0 and 1 determining how greedy the model is about making coreference decisions (more greedy means more coreference links). Default: 0.5.
## max_dist        int                   How many mentions back to look when considering possible antecedents of the current mention. Decreasing the value makes the system run faster but less accurately. Default: 50.
## max_dist_match  int                   The system will consider linking the current mention to a preceding one further than max_dist away if they share a noun or proper noun; in that case it looks max_dist_match mentions back instead. Default: 500.
## blacklist       bool                  Whether the system should resolve coreferences for pronouns in the list ["i", "me", "my", "you", "your"]. Default: True (coreferences resolved).
## store_scores    bool                  Whether the system should store the coreference scores in annotations. Default: True.
## conv_dict       dict(str, list(str))  A conversion dictionary for replacing the embeddings of rare words (keys) with an average of the embeddings of a list of common words (values). E.g. conv_dict={"Angela": ["woman", "girl"]} helps resolve coreferences for Angela by using the embeddings of the more common "woman" and "girl" instead of the embedding of "Angela". This currently only works for single words (not word groups).
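For reference, a minimal sketch of passing several of these keyword arguments together, assuming the spacy==2.1.0 / neuralcoref setup above; the sample sentence and the conv_dict entry for "Angela" are illustrative, not from the original code:

import spacy
import neuralcoref

nlp = spacy.load('en')
neuralcoref.add_to_pipe(
    nlp,
    greedyness=0.5,                          # default greediness
    max_dist=50,                             # look at most 50 mentions back
    conv_dict={"Angela": ["woman", "girl"]}  # illustrative: map a rare name to common-word embeddings
)

doc = nlp("Angela said she would bring her notes.")
print(doc._.coref_clusters)   # clusters linking "she"/"her" back to "Angela"
print(doc._.coref_resolved)   # text with the pronouns replaced by their main mention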
Extractor.py
@@ -0,0 +1,190 @@
import lexnlp.nlp.en.segments.sentences as lex_sentences
import lexnlp.nlp.en.segments.sections as lex_sections
import lexnlp.nlp.en.segments.paragraphs as lex_paragraphs
import lexnlp.nlp.en.segments.pages as lex_pages
import lexnlp.extract.en.dates as lex_dates
import lexnlp.extract.en.courts as lex_courts
import lexnlp.extract.en.definitions as lex_definitions
import lexnlp.extract.en.regulations as lex_regulations
import lexnlp.extract.en.trademarks as lex_trademarks
import lexnlp.extract.en.entities.nltk_maxent as lex_entities
import Coreference as pronouns


class Extractor:

    def __init__(self, text):
        self.sections = None
        self.pages = None
        self.paragraphs = None
        self.sentences = None
        self.text = self.init_preprocess(text)
        self.init_tokenize()  # tokenize after setting the defaults so they are not overwritten with None

    '''
    Functions: Prep the Document
    '''
    # Pre-process the document
    def init_preprocess(self, text=None):
        return lex_sentences.pre_process_document(text)

    # Tokenize the document
    def init_tokenize(self, text=None):
        if not text:
            text = self.text
        self.sections = self.get_sections(text)
        self.paragraphs = self.get_paragraphs(text)
        self.sentences = self.get_sentences(text)
        self.pages = self.get_pages(text)

    # Give Extractor a new document
    def update_text(self, text=None):
        if not text:
            text = self.text
        self.text = self.init_preprocess(text)
        self.init_tokenize()  # re-tokenize so the token lists match the new text

    # Fix coreference issues
    def fix_pronouns(self, text=None, verbose=0):
        if not text:
            text = self.text
        self.text = pronouns.fix_pronouns(text)  # store the resolved text and re-tokenize
        self.init_tokenize()
        if verbose != 0:
            print(self.sentences)

    '''
    Functions: Tokenize the Document
    '''
    # Returns list of sections
    def get_sections(self, text=None):
        if not text:
            text = self.text
        return list(lex_sections.get_sections(text))

    # Returns list of pages
    def get_pages(self, text=None):
        if not text:
            text = self.text
        return list(lex_pages.get_pages(text))

    # Returns list of paragraphs
    def get_paragraphs(self, text=None):
        if not text:
            text = self.text
        return list(lex_paragraphs.get_paragraphs(text))

    # Returns list of sentences
    def get_sentences(self, text=None):
        if not text:
            text = self.text
        return list(lex_sentences.get_sentence_list(text))

    '''
    Functions: Extract Entities
    '''
    # Returns list of dates
    def extract_dates(self, text=None):
        if not text:
            text = self.text
        return list(lex_dates.get_dates(text))

    # Returns list of companies
    def extract_companies(self, text=None):
        if not text:
            text = self.text
        return list(lex_entities.get_companies(text))

    # Returns list of geopolitical entities
    def extract_geopolitical(self, text=None):
        if not text:
            text = self.text
        return list(lex_entities.get_geopolitical(text))

    # Returns list of persons
    def extract_persons(self, text=None):
        if not text:
            text = self.text
        return list(lex_entities.get_persons(text))

    # Returns list of courts
    def extract_courts(self, text=None):
        if not text:
            text = self.text
        return list(lex_courts.get_courts(text))

    # Returns list of definitions
    def extract_definitions(self, text=None):
        if not text:
            text = self.text
        return list(lex_definitions.get_definitions(text))

    # Returns list of regulations
    def extract_regulations(self, text=None):
        if not text:
            text = self.text
        return list(lex_regulations.get_regulations(text))  # was lex_definitions, which has no get_regulations

    # Returns list of trademarks
    def extract_trademarks(self, text=None):  # was a second extract_regulations, which shadowed the method above
        if not text:
            text = self.text
        return list(lex_trademarks.get_trademarks(text))

    '''
    Functions: Locate Entities
    '''
    # Returns page locations for an entity
    def locate_pages(self, entity):
        if not self.pages:
            self.init_tokenize()
        if not isinstance(entity, str):  # join multi-word entities given as a tuple or list;
            entity = " ".join(entity)    # len(entity) > 1 on a plain string would split it into characters
        result = []
        for page_index, page_content in enumerate(self.pages):
            count = page_content.count(entity)
            for x in range(count):
                # TODO: Needs adjusting for Roman-numeral pages (i, ii, ...) before page 1
                result.append((page_index, entity))
        return result

    # Returns section locations for an entity
    def locate_sections(self, entity):
        if not self.sections:
            self.init_tokenize()
        if not isinstance(entity, str):
            entity = " ".join(entity)
        result = []
        for section_index, section_content in enumerate(self.sections):
            count = section_content.count(entity)
            for x in range(count):
                # TODO: Need to get the section heading for each section
                result.append((section_index, entity))
        return result
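A short usage sketch of the class above, assuming lexnlp is installed and the file is saved as Extractor.py (matching the import in the driver script below); the sample text is illustrative only:

import Extractor as ex

doc = ex.Extractor("Apple Inc. filed its brief on January 4, 2019. "
                   "The court granted the motion.")
print(doc.extract_dates())             # e.g. [datetime.date(2019, 1, 4)]
print(doc.extract_companies())         # company mentions found by the nltk_maxent extractor
print(doc.locate_pages("Apple Inc."))  # one (page_index, entity) tuple per occurrence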
@@ -0,0 +1,30 @@
import Extractor as ex
import question_generator as gen

# To speed up the script, start the servers first:
##   bash runStanfordParserServer.sh
##   bash runSSTServer.sh


# Dish sample
# direct_path = ""

# Apple brief
direct_path = ''

with open(direct_path, 'r') as file:
    brief = file.read()

test = ex.Extractor(brief)
qGen = gen.QuestionGenerator()
test.fix_pronouns(verbose=1)
sentences = test.get_sentences()

for sentence in sentences:
    flashcard = qGen.generate_question(sentence)
    if flashcard:
        # print(type(flashcard), type(flashcard[0]))
        print("Question: {}\n\nAnswer: {}\n-------------".format(flashcard[0]['Q'], flashcard[0]['A']))
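The question-generation code itself lives in the submodule added below (e027a1). To dry-run this driver without it, a hypothetical stand-in matching only the interface the loop expects (generate_question returning a list of dicts keyed 'Q' and 'A', or a falsy value when no question can be made) could look like:

class QuestionGenerator:
    # Hypothetical stub, not the real question_generator: it mirrors the
    # interface used above so the loop can run end to end.
    def generate_question(self, sentence):
        if not sentence.strip():
            return None
        # Trivially turn the sentence itself into a recall prompt.
        return [{'Q': "What does the source say here?", 'A': sentence}]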
@@ -0,0 +1 @@
all
Submodule question-generation added at e027a1
This file was deleted.
Product/venv/Lib/site-packages/Click-7.0.dist-info/LICENSE.txt (39 changes: 0 additions & 39 deletions)
This file was deleted.