Merge pull request #15 from Knox-AAU/refactor

Refactor

JonasGLund99 authored Nov 29, 2023
2 parents e082c30 + 0c187a5 commit 62b1e41
Showing 24 changed files with 29,987 additions and 29,378 deletions.
58,384 changes: 29,192 additions & 29,192 deletions DBpedia_Ont.ttl

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions Dockerfile
@@ -5,6 +5,6 @@ WORKDIR /code
 COPY requirements.txt ./
 RUN pip install --no-cache-dir -r requirements.txt
 
-COPY ./server ./server
+COPY . .
 
-CMD [ "python", "./server/server.py", "--host", "0.0.0.0", "--port", "8000", "--reload"]
+CMD ["python", "-u", "-m", "server.server", "--host", "0.0.0.0", "--port", "80", "--reload"]
2 changes: 1 addition & 1 deletion README.md
@@ -145,7 +145,7 @@ Docker should be installed <a href="https://www.docker.com/products/docker-deskt
 
 ### Run docker container using this command
 
-`docker-compose up`
+`docker-compose up --build`
 
 You can also do it manually:
8 changes: 5 additions & 3 deletions docker-compose.yml
@@ -1,7 +1,9 @@
 services:
   server:
-    build: .
+    build: . # Dockerfile location
     container_name: server-container
-    command: python ./server/server.py --host 0.0.0.0 --port 8000 --reload
+    command: python -u -m server.server --host 0.0.0.0 --port 80 --reload
+    volumes:
+      - .:/code # Mount current directory to /code in the image
     ports:
-      - 8000:8000
+      - "8000:80"
16 changes: 0 additions & 16 deletions getRel.py

This file was deleted.

9 changes: 0 additions & 9 deletions output.py

This file was deleted.

1 change: 1 addition & 0 deletions relation_extraction/AlleRelations.txt

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions relation_extraction/LessNaive/lessNaive.py
@@ -1,12 +1,12 @@
 from .openie import POST_corenlp
 import json
 import sys
-ontology_file_path = 'DBpedia_Ont.ttl'
 
 import urllib.parse
 from strsimpy.normalized_levenshtein import NormalizedLevenshtein
+from rapidfuzz.distance import Levenshtein
-from output import format_output
-from getRel import extract_specific_relations
+from relation_extraction.output import format_output
+from relation_extraction.get_relations import extract_specific_relations
 
 
 def find_best_ontology_match(api_relation, ontology_relations):
@@ -72,7 +72,7 @@ def do_relation_extraction(data, ontology_relations):
     return tuples
 
 def main():
-    ontology_relations = extract_specific_relations(ontology_file_path)
+    ontology_relations = extract_specific_relations()
     do_relation_extraction(json.load(open("inputSentences.json")), ontology_relations)
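
The import changes track the refactor: the deleted top-level getRel.py and output.py now live inside the relation_extraction package as get_relations and output, and extract_specific_relations no longer takes the ontology path as an argument. The newly imported rapidfuzz metric is a compiled, much faster counterpart to the strsimpy one used here; a side-by-side sketch (the swap itself is illustrative, the diff only adds the import):

from strsimpy.normalized_levenshtein import NormalizedLevenshtein
from rapidfuzz.distance import Levenshtein

# Both return a similarity in [0, 1]; 1 edit over 8 characters -> 0.875.
print(NormalizedLevenshtein().similarity("producer", "produces"))  # 0.875
print(Levenshtein.normalized_similarity("producer", "produces"))   # 0.875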
62 changes: 37 additions & 25 deletions relation_extraction/NaiveMVP/main.py
@@ -2,13 +2,12 @@
 import strsimpy
 import sys
 from strsimpy.normalized_levenshtein import NormalizedLevenshtein
-from output import format_output
-from getRel import extract_specific_relations
+from relation_extraction.output import send_to_database_component
+from relation_extraction.get_relations import extract_specific_relations
 import datetime
 import multiprocessing as mp
 from functools import partial
 
-ontology_file_path = 'DBpedia_Ont.ttl'
 threshold = 0
 normalized_levenshtein = NormalizedLevenshtein()
 
@@ -17,53 +16,46 @@ def find_best_match(token, relations):
     "Finds the best match given a token and a set of relations"
     best_relation_match = ""
     highest_similarity = 0
-    dt = datetime.datetime.now()
     for relation in relations:
         similarity = normalized_levenshtein.similarity(token, relation)
         highest_similarity = similarity if similarity > highest_similarity else highest_similarity
         best_relation_match = relation if similarity == highest_similarity else best_relation_match
-    # print(f"find_best_match: {(datetime.datetime.now()-dt).total_seconds()}")
 
     return {'similarity': highest_similarity, 'predicted_relation': best_relation_match}
 
 def filter_tokens(tokens, entity_mentions):
     "Filters out tokens that are substrings of the entity mentions"
-
-    filtered_tokens = []
-
-    for entity_mention in entity_mentions:
-        for token in tokens:
-            if token not in entity_mention["name"]:
-                filtered_tokens.append(token)
-
-    return filtered_tokens
+    ems = [em["name"] for em in entity_mentions]
+    return [token for token in tokens if token not in ems]
 
 def find_best_triple(sentence, relations):
     "Finds the best triple by comparing each token in a sentence to every relation and returning the triple where the similarity was highest"
     entity_mentions = sentence["entity_mentions"]
-    dt = datetime.datetime.now()
     filtered_tokens = filter_tokens(sentence["tokens"], entity_mentions)
-    #print(f"filter_tokens: {(datetime.datetime.now()-dt).total_seconds()}")
     best_triple = []
     highest_similarity = 0
-    dt = datetime.datetime.now()
     for token in filtered_tokens:
         result = find_best_match(token, relations)
         if result["similarity"] > highest_similarity and result["similarity"] > threshold: #Only supporting 2 entity mentions per sentence
             highest_similarity = result["similarity"]
-            best_triple = [entity_mentions[0]["name"], result["predicted_relation"], entity_mentions[1]["name"]]
+            best_triple = [entity_mentions[0]["iri"], result["predicted_relation"], entity_mentions[1]["iri"]]
     if highest_similarity == 0:
-        best_triple = [entity_mentions[0]["name"], "---", entity_mentions[1]["name"]]
+        best_triple = [entity_mentions[0]["iri"], "---", entity_mentions[1]["iri"]]
     return best_triple
 
 def parse_data(data, relations):
     "Parses JSON data and converts it into a dictionary with information on sentence, tokens, and entity mentions"
     output = []
     for file in data:
         file_name = file["fileName"]
         sentences_in_data = file["sentences"]
 
         for sentence_object in sentences_in_data:
+            for i, em in enumerate(sentence_object["entityMentions"]): #remove all entity mentions with iri=null
+                if em["iri"] is None:
+                    sentence_object["entityMentions"].pop(i)
+                    print(f"Removed entity because iri=null: {em}")
+            if len(sentence_object["entityMentions"]) < 2: #skip if less than 2 entity mentions
+                continue
             tokens = sentence_object["sentence"].split(" ")
             entity_mentions = sentence_object["entityMentions"]
 
@@ -73,17 +65,37 @@ def parse_data(data, relations):
                 'entity_mentions': entity_mentions
             }
 
-            output.append(find_best_triple(sentence, relations))
+            output.append([elem.replace(" ","_") for elem in find_best_triple(sentence, relations)])
 
     return output
 
+def handle_relation_post_request(data):
+    try:
+        relations = extract_specific_relations()
+    except Exception as E:
+        print(f"Exception during retrieval of relations: {str(E)}")
+        raise Exception(f"Exception during retrieval of relations")
+
+    try:
+        parsed_data = parse_data(data, relations)
+    except Exception as E:
+        print(f"Exception during parse of data {str(E)}")
+        raise Exception("Incorrectly formatted input. Exception during parsing")
+
+    try:
+        send_to_database_component(parsed_data)
+    except Exception as E:
+        print(f"Exception during request to database. {str(E)}")
+        raise Exception("Data was not sent to database due to connection error")
+
 
 def main():
-    relations = extract_specific_relations(ontology_file_path)
+    relations = extract_specific_relations()
     # Opening JSON file
-    with open('relation_extraction/inputSentences.json', 'r') as f:
+    with open('inputSentences.json', 'r') as f:
         # returns JSON object as a dictionary
         data = json.load(f)
-    format_output(parse_data(data, relations))
+    send_to_database_component(parse_data(data, relations))
 
 if __name__ == "__main__":
     main()
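
Usage sketch for the new handler, with input shaped like the test objects built in evaluation.py (fileName / sentences / entityMentions carrying the new "iri" field); all values here are made up for illustration:

from relation_extraction.NaiveMVP.main import handle_relation_post_request

data = [{
    "fileName": "path/to/Artikel.txt",
    "sentences": [{
        "sentence": "Hjertevarme produced by Mikael Rasmussen",
        "entityMentions": [
            {"name": "Hjertevarme", "startIndex": 0, "endIndex": 0, "iri": "Hjertevarme"},
            {"name": "Mikael Rasmussen", "startIndex": 0, "endIndex": 0, "iri": "Mikael_Rasmussen"},
        ],
    }],
}]

# Matches each non-entity token against the ontology relations, then sends
# [subject_iri, relation, object_iri] triples to the database component;
# raises with a user-facing message on any failure.
handle_relation_post_request(data)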
Empty file added relation_extraction/__init__.py
Empty file.
100 changes: 100 additions & 0 deletions relation_extraction/evaluation/DanskEvaluering.xml
@@ -0,0 +1,100 @@
<?xml version='1.0' encoding='utf-8'?>
<benchmark>
<entries>
<entry category="MusicalWork" eid="Id11" shape="(X (X) (X) (X))" shape_type="sibling" size="3">
<originaltripleset>
<otriple>Hjertevarme | runtime | 42.5</otriple>
<otriple>Hjertevarme | producer | Mikael Rasmussen</otriple>
<otriple>Hjertevarme | subsequentWork | Solopgang</otriple>
</originaltripleset>
<modifiedtripleset>
<mtriple>Hjertevarme | runtime | 42.5</mtriple>
<mtriple>Hjertevarme | producer | Mikael Rasmussen</mtriple>
<mtriple>Hjertevarme | followedBy | Solopgang</mtriple>
</modifiedtripleset>
<lex lang="da" lid="1">Hjertevarme er et album på 42,5 minutter produceret af Mikael Rasmussen, efterfulgt af albummet Solopgang.</lex>
</entry>
<entry category="Company" eid="Id12" shape="(X (X))" shape_type="NA" size="1">
<originaltripleset>
<otriple>HyggeHjem | location | Aarhus</otriple>
</originaltripleset>
<modifiedtripleset>
<mtriple>HyggeHjem | location | Aarhus</mtriple>
</modifiedtripleset>
<lex lang="da" lid="1">HyggeHjem har sin placering i Aarhus.</lex>
</entry>
<entry category="City" eid="Id13" shape="(X (X) (X) (X) (X) (X) (X) (X))" shape_type="sibling" size="7">
<originaltripleset>
<otriple>Odense | populationMetro | 178329</otriple>
<otriple>Odense | leaderTitle | Borgmester</otriple>
<otriple>Odense | type | By</otriple>
<otriple>Odense | populationDensity | 1500.0</otriple>
<otriple>Odense | governmentType | Kommunalregering</otriple>
<otriple>Odense | utcOffset | +1</otriple>
<otriple>Odense | isPartOf | Fyn</otriple>
</originaltripleset>
<modifiedtripleset>
<mtriple>Odense | populationMetro | 178329</mtriple>
<mtriple>Odense | leaderTitle | "Borgmester"</mtriple>
<mtriple>Odense | type | By</mtriple>
<mtriple>Odense | populationDensity | 1500.0</mtriple>
<mtriple>Odense | governmentType | Kommunalregering</mtriple>
<mtriple>Odense | utcOffset | +1</mtriple>
<mtriple>Odense | isPartOf | Fyn</mtriple>
</modifiedtripleset>
<lex lang="da" lid="1">Byen Odense, en del af Fyn med en befolkningstæthed på 1500,0 og en metro-befolkning på 178.329, har en UTC-offset på +1. Regeringstypen i Odense er kommunalregering, og borgmesteren er en af lederne.</lex>
</entry>
<entry category="MeanOfTransportation" eid="Id14" shape="(X (X) (X))" shape_type="sibling" size="2">
<originaltripleset>
<otriple>DanskeFartøjer | powerType | Elektrisk</otriple>
<otriple>DanskeFartøjer | length | "24000"^^<http://dbpedia.org/datatype/millimetre></otriple>
</originaltripleset>
<modifiedtripleset>
<mtriple>DanskeFartøjer | powerType | Elektrisk</mtriple>
<mtriple>DanskeFartøjer | length | 24000 (millimeter)</mtriple>
</modifiedtripleset>
<lex lang="da" lid="1">DanskeFartøjer er 24.000 millimeter langt og har en elektrisk motortype.</lex>
</entry>
<entry category="Building" eid="Id15" shape="(X (X) (X) (X) (X (X)))" shape_type="mixed" size="5">
<originaltripleset>
<otriple>København_Tårn | architect | Lars Mikkelsen</otriple>
<otriple>København_Tårn | address | "Købmagergade 52"@da</otriple>
<otriple>København_Tårn | currentTenants | Danmarks Radio</otriple>
<otriple>København_Tårn | location | København</otriple>
<otriple>Danmarks Radio | country | Denmark</otriple>
</originaltripleset>
<modifiedtripleset>
<mtriple>København_Tårn | architect | Lars Mikkelsen</mtriple>
<mtriple>København_Tårn | address | "Købmagergade 52"</mtriple>
<mtriple>København_Tårn | currentTenants | Danmarks Radio</mtriple>
<mtriple>København_Tårn | location | København</mtriple>
<mtriple>Danmarks Radio | country | Denmark</mtriple>
</modifiedtripleset>
<lex lang="da" lid="1">København Tårn, beliggende i København, Danmark, er designet af Lars Mikkelsen. Adressen på tårnet er "Købmagergade 52" og nuværende lejere er Danmarks Radio.</lex>
</entry>
<entry category="Artist" eid="Id16" shape="(X (X) (X (X)))" shape_type="mixed" size="3">
<originaltripleset>
<otriple>Jens_Larsen | birthPlace | Aalborg</otriple>
<otriple>Jens_Larsen | deathPlace | København</otriple>
<otriple>København | ethnicGroup | Dansk</otriple>
</originaltripleset>
<modifiedtripleset>
<mtriple>Jens_Larsen | birthPlace | Aalborg</mtriple>
<mtriple>Jens_Larsen | deathPlace | København</mtriple>
<mtriple>København | ethnicGroup | Dansk</mtriple>
</modifiedtripleset>
<lex lang="da" lid="1">Jens Larsen blev født i Aalborg og døde i København. Etniske grupper i København inkluderer danskere.</lex>
</entry>
<entry category="Film" eid="Id17" shape="(X (X))" shape_type="NA" size="1">
<originaltripleset>
<otriple>Det_Tabte_Rige | editing | Anna Jørgensen</otriple>
</originaltripleset>
<modifiedtripleset>
<mtriple>Det_Tabte_Rige | editing | Anna Jørgensen</mtriple>
</modifiedtripleset>
<lex lang="da" lid="1">Det Tabte Rige er en film redigeret af Anna Jørgensen.</lex>
</entry>
</entries>
</benchmark>
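
How a benchmark file like this is consumed, as a sketch mirroring convert_testdata_to_input_format in evaluation.py (the path here points at this new Danish set rather than the testdataMini.xml the code currently reads):

import xml.etree.ElementTree as ET

tree = ET.parse('relation_extraction/evaluation/DanskEvaluering.xml')
for entry in tree.getroot().findall('.//entry'):
    sentence = entry.findall('lex')[0].text                              # lexicalised sentence
    triples = [mt.text.split(' | ') for mt in entry.findall('.//mtriple')]  # [subject, relation, object]
    print(sentence, triples)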
13 changes: 7 additions & 6 deletions relation_extraction/evaluation/evaluation.py
@@ -3,7 +3,7 @@
 from relation_extraction.LessNaive.lessNaive import do_relation_extraction
 from relation_extraction.NaiveMVP.main import parse_data
 import re
-from getRel import extract_specific_relations
+from relation_extraction.get_relations import extract_specific_relations
 import datetime
 import json
 
@@ -19,7 +19,7 @@ def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 3,
 
 def convert_testdata_to_input_format():
     objs = []
-    tree = ET.parse('relation_extraction/Evaluation/testdata.xml')
+    tree = ET.parse('relation_extraction/Evaluation/testdataMini.xml')
     root = tree.getroot()
     for entry in root.findall('.//entry'):
         sentence = entry.findall('lex')[0].text
@@ -54,11 +54,11 @@ def calculate_metrics(data):
 def main():
     input_objs = convert_testdata_to_input_format()
     print("testdata converted successfully")
-    ontology_relations = extract_specific_relations(ontology_file_path)
+    ontology_relations = extract_specific_relations()
 
 
     solutions_to_test = {
-        #"less_naive": do_relation_extraction,
+        # "less_naive": do_relation_extraction
         "naive": parse_data
     }
     evaluation_results = dict() #dictionary to hold results of tests
@@ -74,13 +74,14 @@
         expected_triples = obj["triples"]
         total_triples += len(expected_triples)
         ems = []
-        for triple in expected_triples:
+        for j, triple in enumerate(expected_triples):
             ems.append(triple[0])
             ems.append(triple[2])
+            expected_triples[j] = [expected_triples[j][0].replace(" ", "_"), expected_triples[j][1], expected_triples[j][2].replace(" ", "_")]
 
         ems = list(dict.fromkeys(ems)) #remove duplicate ems
 
-        entity_mentions = [{ "name": em, "startIndex": 0, "endIndex": 0 } for em in ems]
+        entity_mentions = [{ "name": em, "startIndex": 0, "endIndex": 0, "iri": em.replace(" ", "_") } for em in ems]
         input_obj = [{
             "fileName": "path/to/Artikel.txt",
             "sentences": [
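
The two underscore replacements are symmetric on purpose: the gold triples and the predicted triples (built from the new "iri" field in parse_data) are normalized the same way, so they compare equal as plain strings. A minimal illustration:

ems = ["Mikael Rasmussen", "Hjertevarme", "Mikael Rasmussen"]
ems = list(dict.fromkeys(ems))  # de-duplicate while preserving order

entity_mentions = [{"name": em, "startIndex": 0, "endIndex": 0,
                    "iri": em.replace(" ", "_")} for em in ems]
print(entity_mentions[0]["iri"])  # Mikael_Rasmussen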
0 comments on commit 62b1e41