Skip to content

Commit

Permalink
Implemented a new question-answer-pair filter:
Browse files Browse the repository at this point in the history
    * Complementing our specific data structure (see "get_dict.py")
    * "Get_dict.py" and any txt files are for demo purpose only
    * Currently having 3 semantic structure rules
  • Loading branch information
ZhiliWang committed Mar 17, 2020
1 parent b83becf commit 16eaddc
Show file tree
Hide file tree
Showing 19 changed files with 139 additions and 1,325 deletions.
Binary file removed Flashcard Generator/qa/Example2.xlsx
Binary file not shown.
Binary file removed Flashcard Generator/qa/Example3.xlsx
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file removed Flashcard Generator/qa/__pycache__/xl_ls.cpython-36.pyc
Binary file not shown.
89 changes: 0 additions & 89 deletions Flashcard Generator/qa/demo.py

This file was deleted.

15 changes: 0 additions & 15 deletions Flashcard Generator/qa/demo2.py

This file was deleted.

16 changes: 16 additions & 0 deletions Flashcard Generator/qa/filtered_qa.txt

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions Flashcard Generator/qa/get_dict.py

Large diffs are not rendered by default.

Binary file removed Flashcard Generator/qa/merge.xlsx
Binary file not shown.
93 changes: 0 additions & 93 deletions Flashcard Generator/qa/q_filter.py

This file was deleted.

111 changes: 111 additions & 0 deletions Flashcard Generator/qa/qa_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
### Potential Rule-based Question Filters ###
# Created by Zhili Wang
# Gathering rule-based filtering functions
# Potentially implement these in main.py
# all inputs and outputs are in the form of a q:a dictionary:
# {heading: {q1:a1, q2:a2, ..., q230:a230}}
# might need to be careful with heading data type
import nltk
import lexnlp
import lexnlp.nlp.en.tokens
from get_dict import get_dict


def dict_zip(dict, sections):
    """Split one section's question/answer mapping into two parallel lists.

    Looks up ``dict[sections]``, takes its first element, and returns that
    mapping's keys and its values as two lists in matching order.
    """
    # NOTE: the parameter named ``dict`` hides the builtin inside this body.
    qa_map = dict[sections][0]
    questions = list(qa_map)
    answers = list(qa_map.values())
    return questions, answers


# I am taking a different approach to keep every filter short
# and deal with the difficult structure of our json in 'assemble' function
def if_2_short(question):
    """Report whether *question* is long enough to be worth keeping.

    Despite the name, the result is ``True`` for an acceptable question and
    ``False`` for one that is too short. A question is too short when it has
    at most 8 characters (worst case: "Who is ?") or at most 2
    whitespace-separated terms (e.g. "What 345?").

    Args:
        question: The question text.

    Returns:
        ``True`` if the question is longer than 8 characters and has more
        than 2 terms, otherwise ``False``.
    """
    # Both limits must be cleared. Joining them with ``or`` let a two-term
    # question such as "What 345?" through on its character count alone.
    return len(question) > 8 and len(question.split()) > 2


# isolating this function for now
def rm_not_quote(qna, heading):
    """Normalise the question keys stored under one heading.

    Works on the mapping at ``qna[heading][0]``, i.e. the same nested
    mapping that ``dict_zip`` reads. Each question key is cleaned in two
    steps:

    1. every double backtick is replaced by a plain double quote;
    2. the text is cut at its first "?" and given exactly one trailing "?"
       (a question with no "?" simply gets one appended).

    Single backticks are left untouched. If two questions normalise to the
    same text, the later answer wins.

    Args:
        qna: Mapping of heading -> sequence whose first element is a
            ``{question: answer}`` dict.
        heading: Key in ``qna`` selecting the section to clean.

    Returns:
        The same ``qna`` object, with the nested mapping updated in place.
    """
    pairs = qna[heading][0]

    # Build the cleaned mapping separately: renaming keys of a dict while
    # iterating over it is unsafe, and rebuilding keeps the original order.
    cleaned = {}
    for ques, answer in pairs.items():
        newq = ques.replace("``", '"')
        newq = newq.split("?", 1)[0] + "?"
        cleaned[newq] = answer

    # The keys belong to the nested mapping, not to ``qna`` itself, so the
    # rename has to be applied there; popping them from ``qna`` raises
    # KeyError.
    pairs.clear()
    pairs.update(cleaned)
    return qna


def if_bad_question(question):  # (qna, heading):
    """Flag three-word "What-is-[bad-word]" style questions.

    A question is considered bad when it has exactly three
    whitespace-separated words, the second word is tagged as a verb and the
    third word (ignoring trailing question marks) is not tagged as a noun.

    Args:
        question: The question text.

    Returns:
        ``True`` if the question matches the bad pattern, otherwise
        ``False``. Questions that are not exactly three words long are
        never flagged.
    """
    words = question.split()
    if len(words) != 3:
        # Only the three-word pattern is checked; skip the tagger entirely.
        return False

    nouns = set(lexnlp.nlp.en.tokens.get_nouns(question))
    # Verbs must come from get_verbs; building this from get_nouns compared
    # the second word against nouns instead.
    verbs = set(lexnlp.nlp.en.tokens.get_verbs(question))

    # The last word still carries the "?" after a whitespace split; drop it
    # so the word can be compared against the noun tokens.
    last_word = words[2].rstrip("?")
    return words[1] in verbs and last_word not in nouns


def get_sections(dict_json):
    """Return the section names (top-level keys) of the q&a data as a list."""
    return list(dict_json.keys())


def assemble(dict1=None):
    """Run the question filters over every section of the q&a data.

    Each section maps to a sequence of pairs, where a pair is a mapping
    with a ``"question"`` entry. A pair is kept only when its question is
    long enough (``if_2_short`` returns ``True``) and is not flagged by
    ``if_bad_question``.

    Args:
        dict1: Section -> sequence of pairs. When omitted, the data is
            loaded with ``get_dict()``.

    Returns:
        The same mapping, with every non-empty section replaced by the list
        of pairs that passed the filters. The input is modified in place.
    """
    if dict1 is None:
        dict1 = get_dict()

    sections = get_sections(dict1)
    print("Here are the sections: {}".format(sections))

    for sec in sections:
        if not dict1[sec]:
            # Nothing in this section; leave it untouched.
            continue
        # Collect survivors for the whole section in one list, so earlier
        # pairs are not lost when a later pair is examined.
        dict1[sec] = [
            pair
            for pair in dict1[sec]
            if if_2_short(pair["question"])
            and not if_bad_question(pair["question"])
        ]
    return dict1


if __name__ == "__main__":
    # Demo entry point: hand the sample data from get_dict() to assemble()
    # explicitly and print the filtered result.
    print(assemble(get_dict()))
33 changes: 0 additions & 33 deletions Flashcard Generator/qa/ques_filter.py

This file was deleted.

47 changes: 0 additions & 47 deletions Flashcard Generator/qa/question_filter.py

This file was deleted.

Loading

0 comments on commit 16eaddc

Please sign in to comment.