diff --git a/Dockerfile b/Dockerfile index 8296b8e..35d5d4a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,12 +4,6 @@ WORKDIR /code COPY . . -# Install necessary build tools and dependencies -RUN apt-get update && apt-get install -y \ - build-essential \ - cmake \ - && rm -rf /var/lib/apt/lists/* - RUN pip install --no-cache-dir -r requirements_docker.txt CMD ["python", "-u", "-m", "server.server", "--host", "0.0.0.0", "--port", "4444", "--reload"] \ No newline at end of file diff --git a/concept_linking/requirements.txt b/concept_linking/requirements.txt index 054f287..9592a41 100644 --- a/concept_linking/requirements.txt +++ b/concept_linking/requirements.txt @@ -1,9 +1,9 @@ # Tools #Requirements for LlamaServer --r tools/LlamaServer/requirements.txt +#-r tools/LlamaServer/requirements.txt #Requirements for OntologyGraphBuilder --r tools/OntologyGraphBuilder/requirements.txt +#-r tools/OntologyGraphBuilder/requirements.txt # Solutions #Requirements for MachineLearning diff --git a/concept_linking/solutions/PromptEngineering/main.py b/concept_linking/solutions/PromptEngineering/main.py index 7145898..074a227 100644 --- a/concept_linking/solutions/PromptEngineering/main.py +++ b/concept_linking/solutions/PromptEngineering/main.py @@ -8,13 +8,16 @@ from relation_extraction.knowledge_graph_messenger import KnowledgeGraphMessenger from concept_linking.tools.triple_helper import * -# Local API url +# Local API url python api_url = "http://127.0.0.1:5000/llama" +# Local API url docker +# api_url = "http://llama-cpu-server:5000/llama" + # Remote API url # api_url = "http://knox-proxy01.srv.aau.dk/llama-api/llama" -headers = {"Content-Type": "application/json"} +headers = {"Access-Authorization": os.getenv("ACCESS_SECRET"), "Content-Type": "application/json"} PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) @@ -181,7 +184,7 @@ def perform_entity_type_classification(post_json, output_file_path=None, output_ if __name__ == '__main__': - input_file = os.path.join(PROJECT_ROOT, "data/files/EvaluationData/evaluationSet_EN.json") + input_file = os.path.join(PROJECT_ROOT, "data/files/EvaluationData/evaluationSet_EN_small.json") output_file = os.path.join(PROJECT_ROOT, "data/files/PromptEngineering/output.json") f = open(input_file, encoding="utf-8") diff --git a/concept_linking/tools/LlamaServer/Dockerfile b/concept_linking/tools/LlamaServer/Dockerfile index eba484c..489a33b 100644 --- a/concept_linking/tools/LlamaServer/Dockerfile +++ b/concept_linking/tools/LlamaServer/Dockerfile @@ -1,6 +1,5 @@ # Use python as base image -FROM python - +FROM python:3.11-slim # Set the working directory in the container WORKDIR /app @@ -8,15 +7,20 @@ WORKDIR /app COPY llama_cpu_server.py . COPY requirements.txt . +#Install necessary build tools and dependencies for running C++(llama_cpp) +# This can be removed when app is in production and remote llama api server is reliable and used instead of local llama +# Install dependencies and curl +RUN apt-get update && apt-get install -y build-essential cmake curl && rm -rf /var/lib/apt/lists/* + + # Install dependencies RUN pip install --no-cache-dir -r requirements.txt -# Check if the model file exists, and if not, download it using the provided function -RUN python -c "from llama_cpu_server import download_model; download_model('llama-2-7b-chat.Q2_K.gguf', 'https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q2_K.gguf?download=true')" - +# Download the model file from the URL if it doesn't exist +RUN test -e /app/llama-2-7b-chat.Q2_K.gguf || curl -o llama-2-7b-chat.Q2_K.gguf -LJO 'https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q2_K.gguf?download=true' # Expose port 5000 outside of the container EXPOSE 5000 # Run llama_cpu_server.py when the container launches -CMD ["python", "llama_cpu_server.py"] +CMD ["python", "-u", "-m", "llama_cpu_server", "--host", "0.0.0.0", "--port", "5000", "--reload"] diff --git a/concept_linking/tools/LlamaServer/docker-compose.yml b/concept_linking/tools/LlamaServer/docker-compose.yml index 42bc3b1..18ab729 100644 --- a/concept_linking/tools/LlamaServer/docker-compose.yml +++ b/concept_linking/tools/LlamaServer/docker-compose.yml @@ -1,11 +1,9 @@ -version: '3' - services: llama-cpu-server: - build: - context: . - dockerfile: Dockerfile - ports: - - "5000:5000" + build: . + container_name: llama-server + command: python -u -m llama_cpu_server --host 0.0.0.0 --port 5000 --reload volumes: - ./concept_linking/tools/LlamaServer/llama-2-7b-chat.Q2_K.gguf:/app/concept_linking/tools/LlamaServer/llama-2-7b-chat.Q2_K.gguf + ports: + - "5000:5000" \ No newline at end of file diff --git a/relation_extraction/evaluation/DanskEvaluering.xml b/relation_extraction/evaluation/DanskEvaluering.xml index f87b77c..fc51d20 100644 --- a/relation_extraction/evaluation/DanskEvaluering.xml +++ b/relation_extraction/evaluation/DanskEvaluering.xml @@ -48,7 +48,7 @@ DanskeFartøjer | powerType | Elektrisk - DanskeFartøjer | length | "24000"^^ + DanskeFartøjer | length | "24000"^^<http://dbpedia.org/datatype/millimetre> @@ -61,14 +61,12 @@ København_Tårn | architect | Lars Mikkelsen København_Tårn | address | "Købmagergade 52"@da - København_Tårn | currentTenants | Danmarks Radio København_Tårn | location | København Danmarks Radio | country | Denmark København_Tårn | architect | Lars Mikkelsen København_Tårn | address | "Købmagergade 52" - København_Tårn | currentTenants | Danmarks Radio København_Tårn | location | København Danmarks Radio | country | Denmark diff --git a/relation_extraction/evaluation/evaluation.py b/relation_extraction/evaluation/evaluation.py index ed817b7..4958bc3 100644 --- a/relation_extraction/evaluation/evaluation.py +++ b/relation_extraction/evaluation/evaluation.py @@ -5,6 +5,7 @@ from relation_extraction.NaiveMVP.main import parse_data from relation_extraction.multilingual.llm_messenger import LLMMessenger import re +import copy import datetime import json @@ -21,7 +22,7 @@ def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 3, def convert_testdata_to_input_format(): objs = [] - tree = ET.parse('relation_extraction/Evaluation/testdataMini.xml') + tree = ET.parse('relation_extraction/evaluation/DanskEvaluering.xml') root = tree.getroot() for entry in root.findall('.//entry'): sentence = entry.findall('lex')[0].text @@ -42,7 +43,13 @@ def calculate_metrics(data): FP = 0 FN = 0 - for element in data["triples"]: + data_without_duplicates = copy.deepcopy(data) + + for triples in data_without_duplicates["triples"]: + triples["triples_from_solution"] = set(tuple(triple) for triple in triples["triples_from_solution"]) + triples["triples_from_solution"] = list(list(triple) for triple in triples["triples_from_solution"]) + + for element in data_without_duplicates["triples"]: TP += element["contains_hits"] FP += len(element["triples_from_solution"]) - element["contains_hits"] FN += len(element["expected_triples"]) - element["contains_hits"] @@ -99,27 +106,32 @@ def main(): split_relations = [ontology_relations[i:i + chunk_size] for i in range(0, len(ontology_relations), chunk_size)] #Split the relations into lists of size chunk_size res = [] for split_relation in split_relations: - res.append(solution(input_obj, split_relation, ontology_relations)) + part_res = solution(input_obj, split_relation, ontology_relations) + for triple in part_res: + triple[1] = triple[1].replace("http://dbpedia.org/ontology/", "") + res.extend(part_res) res_hits = 0 - for triple in res: + convert_to_set_res = set(tuple(triples) for triples in res) + removed_duplicates_res = list(list(triples) for triples in convert_to_set_res) + for triple in removed_duplicates_res: if triple in expected_triples: res_hits += 1 hits +=1 - + evaluation_result_triples.append({"sentence":sentence, "triples_from_solution": res, "expected_triples": expected_triples, "contains_hits": res_hits}) eta = round((((datetime.datetime.now()-dt).total_seconds()/60)/((i+1)/len(input_objs)))*(1-((i+1)/len(input_objs))),5) progress_suffix = f"Complete. Timeusage: {round((datetime.datetime.now()-dt).total_seconds()/60,5)} minutes. Eta {eta} minutes." printProgressBar(i + 1, len(input_objs), prefix = 'Progress:', suffix = progress_suffix, length = 50) - print(f"Solution {name} finished. Hit {hits}/{total_triples}. Hit percentage: {(hits/total_triples)*100}%") - evaluation_results[name] = { - "triples": evaluation_result_triples, - "result": {"total_expected_triples": total_triples, "hits": hits, "hit_percentage": hits/total_triples}, - "score": calculate_metrics({"triples": evaluation_result_triples}) - } + print(f"Solution {name} finished. Hit {hits}/{total_triples}. Hit percentage: {(hits/total_triples)*100}%") + evaluation_results[name] = { + "triples": evaluation_result_triples, + "result": {"total_expected_triples": total_triples, "hits": hits, "hit_percentage": hits/total_triples}, + "score": calculate_metrics({"triples": evaluation_result_triples}) + } - with open("relation_extraction/Evaluation/evaluation_results.json", "w") as f: - json.dump(evaluation_results, f, indent=4) + with open("relation_extraction/Evaluation/evaluation_results.json", "w") as f: + json.dump(evaluation_results, f, indent=4) diff --git a/relation_extraction/evaluation/evaluation_results.json b/relation_extraction/evaluation/evaluation_results.json index 55a5c22..6529d65 100644 --- a/relation_extraction/evaluation/evaluation_results.json +++ b/relation_extraction/evaluation/evaluation_results.json @@ -1,133 +1,297 @@ { - "naive": { + "multilingual": { "triples": [ { - "sentence": "Turn Me On is a 35.1 minute long album produced by Wharton Tiers that was followed by the album entitled Take it Off.", + "sentence": "Hjertevarme er et album p\u00e5 42,5 minutter produceret af Mikael Rasmussen, efterfulgt af albummet Solopgang.", "triples_from_solution": [ [ - "Turn_Me_On", - "album", - "35.1" + "Mikael_Rasmussen", + "producer", + "Hjertevarme" + ], + [ + "Mikael_Rasmussen", + "producer", + "Hjertevarme" + ], + [ + "Hjertevarme", + "producedBy", + "Mikael_Rasmussen" + ], + [ + "Solopgang", + "follows", + "Hjertevarme" + ], + [ + "Hjertevarme", + "producedBy", + "Mikael_Rasmussen" + ], + [ + "Solopgang", + "follows", + "Hjertevarme" ] ], "expected_triples": [ [ - "Turn_Me_On", + "Hjertevarme", "runtime", - "35.1" + "42.5" ], [ - "Turn_Me_On", + "Hjertevarme", "producer", - "Wharton_Tiers" + "Mikael_Rasmussen" ], [ - "Turn_Me_On", + "Hjertevarme", "followedBy", - "Take_It_Off!" + "Solopgang" ] ], "contains_hits": 0 }, { - "sentence": "The location of Trane is Swords, Dublin.", + "sentence": "HyggeHjem har sin placering i Aarhus.", "triples_from_solution": [ [ - "Trane", + "HyggeHjem", "location", - "Swords,_Dublin" + "Aarhus" + ], + [ + "HyggeHjem", + "location", + "Aarhus" + ], + [ + "HyggeHjem", + "location", + "Aarhus" ] ], "expected_triples": [ [ - "Trane", + "HyggeHjem", "location", - "Swords,_Dublin" + "Aarhus" ] ], "contains_hits": 1 }, { - "sentence": "The Ciudad Ayala city, a part of Morelos with population density and population of 1604.0 and 1,777,539 respectively, has a UTC offset of -6. The government type of Ciudad Ayala is council-manager government and City Manager is one of the leaders.", + "sentence": "Byen Odense, en del af Fyn med en befolkningst\u00e6thed p\u00e5 1500,0 og en metro-befolkning p\u00e5 178.329, har en UTC-offset p\u00e5 +1. Regeringstypen i Odense er kommunalregering, og borgmesteren er en af lederne.", "triples_from_solution": [ [ - "Ciudad_Ayala", - "part", - "1777539" + "Odense", + "location", + "Fyn" + ], + [ + "178329", + "population", + "Odense" ] ], "expected_triples": [ [ - "Ciudad_Ayala", + "Odense", "populationMetro", - "1777539" + "178329" ], [ - "Ciudad_Ayala", + "Odense", "leaderTitle", - "\"City_Manager\"" + "\"Borgmester\"" ], [ - "Ciudad_Ayala", + "Odense", "type", - "City" + "By" ], [ - "Ciudad_Ayala", + "Odense", "populationDensity", - "1604.0" + "1500.0" ], [ - "Ciudad_Ayala", + "Odense", "governmentType", - "Council-manager_government" + "Kommunalregering" ], [ - "Ciudad_Ayala", + "Odense", "utcOffset", - "\u22126" + "+1" ], [ - "Ciudad_Ayala", + "Odense", "isPartOf", - "Morelos" + "Fyn" ] ], "contains_hits": 0 }, { - "sentence": "The 17068.8 millimeter long ALCO RS-3 has a diesel-electric transmission.", + "sentence": "DanskeFart\u00f8jer er 24.000 millimeter langt og har en elektrisk motortype.", "triples_from_solution": [ [ - "ALCO_RS-3", - "transmission", - "Diesel-electric_transmission" + "DanskeFart\u00f8jer", + "length", + "24000" + ], + [ + "DanskeFart\u00f8jer", + "location", + "24000" + ], + [ + "DanskeFart\u00f8jer", + "location", + "24000" + ], + [ + "DanskeFart\u00f8jer", + "length", + "24000" + ], + [ + "DanskeFart\u00f8jer", + "length", + "24000" + ], + [ + "DanskeFart\u00f8jer", + "length", + "24000" ] ], "expected_triples": [ [ - "ALCO_RS-3", + "DanskeFart\u00f8jer", "powerType", - "Diesel-electric_transmission" + "Elektrisk" ], [ - "ALCO_RS-3", + "DanskeFart\u00f8jer", "length", - "17068.8" + "24000" + ] + ], + "contains_hits": 1 + }, + { + "sentence": "K\u00f8benhavn T\u00e5rn, beliggende i K\u00f8benhavn, Danmark, er designet af Lars Mikkelsen. Adressen p\u00e5 t\u00e5rnet er \"K\u00f8bmagergade 52\" og nuv\u00e6rende lejere er Danmarks Radio.", + "triples_from_solution": [ + [ + "K\u00f8benhavn_T\u00e5rn", + "location", + "K\u00f8benhavn" + ], + [ + "Lars_Mikkelsen", + "designer", + "K\u00f8benhavn_T\u00e5rn" + ] + ], + "expected_triples": [ + [ + "K\u00f8benhavn_T\u00e5rn", + "architect", + "Lars_Mikkelsen" + ], + [ + "K\u00f8benhavn_T\u00e5rn", + "address", + "\"K\u00f8bmagergade_52\"" + ], + [ + "K\u00f8benhavn_T\u00e5rn", + "location", + "K\u00f8benhavn" + ], + [ + "Danmarks_Radio", + "country", + "Denmark" + ] + ], + "contains_hits": 1 + }, + { + "sentence": "Jens Larsen blev f\u00f8dt i Aalborg og d\u00f8de i K\u00f8benhavn. Etniske grupper i K\u00f8benhavn inkluderer danskere.", + "triples_from_solution": [ + [ + "Jens_Larsen", + "location", + "Aalborg" + ], + [ + "Jens_Larsen", + "location", + "Aalborg" + ] + ], + "expected_triples": [ + [ + "Jens_Larsen", + "birthPlace", + "Aalborg" + ], + [ + "Jens_Larsen", + "deathPlace", + "K\u00f8benhavn" + ], + [ + "K\u00f8benhavn", + "ethnicGroup", + "Dansk" + ] + ], + "contains_hits": 0 + }, + { + "sentence": "Det Tabte Rige er en film redigeret af Anna J\u00f8rgensen.", + "triples_from_solution": [ + [ + "Det_Tabte_Rige", + "editor", + "Anna_J\u00f8rgensen" + ], + [ + "Det_Tabte_Rige", + "editor", + "Anna_J\u00f8rgensen" + ], + [ + "Det_Tabte_Rige", + "film", + "Anna_J\u00f8rgensen" + ] + ], + "expected_triples": [ + [ + "Det_Tabte_Rige", + "editing", + "Anna_J\u00f8rgensen" ] ], "contains_hits": 0 } ], "result": { - "total_expected_triples": 13, - "hits": 1, - "hit_percentage": 0.07692307692307693 + "total_expected_triples": 21, + "hits": 3, + "hit_percentage": 0.14285714285714285 }, "score": { - "precision": 0.25, - "recall": 0.07692307692307693, - "F1_score": 0.11764705882352941 + "precision": 0.125, + "recall": 0.14285714285714285, + "F1_score": 0.13333333333333333 } } } \ No newline at end of file diff --git a/relation_extraction/evaluation/testdataMini.xml b/relation_extraction/evaluation/testdataMini.xml index 45aee75..d40d704 100644 --- a/relation_extraction/evaluation/testdataMini.xml +++ b/relation_extraction/evaluation/testdataMini.xml @@ -55,5 +55,119 @@ The 17068.8 millimeter long ALCO RS-3 has a diesel-electric transmission. + + + Alan_B._Miller_Hall | architect | Robert_A._M._Stern + Alan_B._Miller_Hall | address | "101 Ukrop Way"@en + Alan_B._Miller_Hall | currentTenants | Mason_School_of_Business + Alan_B._Miller_Hall | location | Virginia + Mason_School_of_Business | country | United_States + + + Alan_B._Miller_Hall | architect | Robert_A._M._Stern + Alan_B._Miller_Hall | address | "101 Ukrop Way" + Alan_B._Miller_Hall | currentTenants | Mason_School_of_Business + Alan_B._Miller_Hall | location | Virginia + Mason_School_of_Business | country | United_States + + Alan B. Miller Hall, in Virginia, USA, was designed by Robert A.M. Stern. The address of the hall is "101 Ukrop Way" and the current tenants are the Mason School of Business. + + + + Liselotte_Grschebina | birthPlace | Karlsruhe + Liselotte_Grschebina | deathPlace | Israel + Israel | ethnicGroup | Arab_citizens_of_Israel + + + Liselotte_Grschebina | birthPlace | Karlsruhe + Liselotte_Grschebina | deathPlace | Israel + Israel | ethnicGroup | Arab_citizens_of_Israel + + Liselotte Grschebina was born in Karlsruhe and died in Israel. Ethnic groups in Israel include Arabs. + + + + It's_Great_to_Be_Young_(1956_film) | editing | Max_Benedict + + + It's_Great_to_Be_Young_(1956_film) | editing | Max_Benedict + + It’s Great to Be Young is a film edited by Max Benedict. + + + + Turkey | leaderTitle | President + Nurhan_Atasoy | birthPlace | Turkey + + + Turkey | leaderTitle | President + Nurhan_Atasoy | birthPlace | Turkey + + Nurhan Atasoy was born in Turkey led by the President. + + + + Agremiação_Sportiva_Arapiraquense | league | Campeonato_Brasileiro_Série_C + Campeonato_Brasileiro_Série_C | country | Brazil + Agremiação_Sportiva_Arapiraquense | capacity | "17000"^^xsd:nonNegativeInteger + Agremiação_Sportiva_Arapiraquense | manager | Vica + + + Agremiação_Sportiva_Arapiraquense | league | Campeonato_Brasileiro_Série_C + Campeonato_Brasileiro_Série_C | country | Brazil + Agremiação_Sportiva_Arapiraquense | numberOfMembers | 17000 + Agremiação_Sportiva_Arapiraquense | manager | Vica + + Agremiação Sportiva Arapiraquense managed by Vica has 17000 members and play in the Campeonato Brasileiro Série C league which is from Brazil. + + + + Bananaman | creator | Steve_Bright + Bananaman | network | BBC + Bananaman | firstAired | "1983-10-03"^^xsd:date + + + Bananaman | creator | Steve_Bright + Bananaman | broadcastedBy | BBC + Bananaman | firstAired | "1983-10-03" + + Bananaman first aired on the 10th of March, 1983 and was created by Steve Bright. It was broadcast by the BBC. + + + + English_Without_Tears | cinematography | Bernard_Knowles + English_Without_Tears | writer | Terence_Rattigan + English_Without_Tears | musicComposer | Nicholas_Brodszky + English_Without_Tears | producer | Anatole_de_Grunwald + English_Without_Tears | director | Harold_French + + + English_Without_Tears | cinematography | Bernard_Knowles + English_Without_Tears | writer | Terence_Rattigan + English_Without_Tears | musicComposer | Nicholas_Brodszky + English_Without_Tears | producer | Anatole_de_Grunwald + English_Without_Tears | director | Harold_French + + The movie English Without Tears is written by Terence Rattigan and directed by Harold French. Anatole de Grunwald is the producer and Bernard Knowles is the cinematographer. Nicholas Brodszky was a composer of the songs. + + + + 11th_Mississippi_Infantry_Monument | established | 2000 + 11th_Mississippi_Infantry_Monument | region | Adams_County,_Pennsylvania + 11th_Mississippi_Infantry_Monument | municipality | Gettysburg,_Pennsylvania + 11th_Mississippi_Infantry_Monument | category | Contributing_property + Adams_County,_Pennsylvania | north | Cumberland_County,_Pennsylvania + 11th_Mississippi_Infantry_Monument | country | "United States"@en + + + 11th_Mississippi_Infantry_Monument | established | 2000 + 11th_Mississippi_Infantry_Monument | location | Adams_County,_Pennsylvania + 11th_Mississippi_Infantry_Monument | municipality | Gettysburg,_Pennsylvania + 11th_Mississippi_Infantry_Monument | category | Contributing_property + Adams_County,_Pennsylvania | hasToItsNorth | Cumberland_County,_Pennsylvania + 11th_Mississippi_Infantry_Monument | country | "United States" + + The 11th Mississippi Infantry Monument, built in 2000, is placed in the municipality of Gettysburg in Pennsylvania which is in Adams County, USA. The 11th Mississippi Infantry Monument is classified as a Contributing Property. Cumberland county, Pennsylvania is to the north of Adams County. + diff --git a/relation_extraction/multilingual/llm_messenger.py b/relation_extraction/multilingual/llm_messenger.py index e5415dd..783bc4e 100644 --- a/relation_extraction/multilingual/llm_messenger.py +++ b/relation_extraction/multilingual/llm_messenger.py @@ -11,21 +11,6 @@ def API_endpoint(): def send_request(request): HEADERS = {"Access-Authorization": os.getenv("ACCESS_SECRET")} response = requests.post(url=LLMMessenger.API_endpoint(), json=request, headers=HEADERS) - - # # Put the location of to the GGUF model that you've download from HuggingFace (https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q2_K.gguf?download=true) here - # model_path = "./relation_extraction/multilingual/llama-2-7b-chat.Q2_K.gguf" - - # # Create a llama model - # model = Llama(model_path=model_path, n_ctx=4096) - - # prompt = f"""[INST] <> - # {request["system_message"]} - # <> - # {request["user_message"]} [/INST]""" - - # # Run the model - # output = model(prompt, max_tokens=request["max_tokens"], echo=True) - return response def process_message(response): diff --git a/relation_extraction/ontology_messenger.py b/relation_extraction/ontology_messenger.py index 6faa8e5..da76701 100644 --- a/relation_extraction/ontology_messenger.py +++ b/relation_extraction/ontology_messenger.py @@ -1,8 +1,6 @@ import requests import re import os -from dotenv import load_dotenv - from relation_extraction.API_handler import APIHandler class OntologyMessenger(APIHandler): @@ -10,7 +8,6 @@ def API_endpoint(): return "http://knox-proxy01.srv.aau.dk/knox-api/triples" def send_request(): - load_dotenv() "Function to extract relations based on the specified pattern" print("Getting relations from online ontology...") relations = []