Merge remote-tracking branch 'origin/main'
denBruneBarone committed Dec 15, 2023
2 parents cef947f + 03f0934 commit 2964686
Showing 11 changed files with 375 additions and 106 deletions.
6 changes: 0 additions & 6 deletions Dockerfile
@@ -4,12 +4,6 @@ WORKDIR /code

 COPY . .
 
-# Install necessary build tools and dependencies
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    cmake \
-    && rm -rf /var/lib/apt/lists/*
-
 RUN pip install --no-cache-dir -r requirements_docker.txt
 
 CMD ["python", "-u", "-m", "server.server", "--host", "0.0.0.0", "--port", "4444", "--reload"]
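
(The apt-get layer removed here installed build-essential and cmake, presumably only needed to compile llama_cpp for the local Llama server; an equivalent apt-get line is added to concept_linking/tools/LlamaServer/Dockerfile below, so the toolchain now lives in that image instead of the main server image.)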
4 changes: 2 additions & 2 deletions concept_linking/requirements.txt
@@ -1,9 +1,9 @@
 # Tools
 #Requirements for LlamaServer
--r tools/LlamaServer/requirements.txt
+#-r tools/LlamaServer/requirements.txt
 
 #Requirements for OntologyGraphBuilder
--r tools/OntologyGraphBuilder/requirements.txt
+#-r tools/OntologyGraphBuilder/requirements.txt
 
 # Solutions
 #Requirements for MachineLearning
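
(pip resolves -r includes recursively, so commenting these out means a top-level pip install -r concept_linking/requirements.txt no longer pulls in the LlamaServer and OntologyGraphBuilder dependencies; anyone still running those tools locally has to install their requirements files directly.)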
9 changes: 6 additions & 3 deletions concept_linking/solutions/PromptEngineering/main.py
@@ -8,13 +8,16 @@
 from relation_extraction.knowledge_graph_messenger import KnowledgeGraphMessenger
 from concept_linking.tools.triple_helper import *
 
-# Local API url
+# Local API url python
 api_url = "http://127.0.0.1:5000/llama"
 
+# Local API url docker
+# api_url = "http://llama-cpu-server:5000/llama"
+
 # Remote API url
 # api_url = "http://knox-proxy01.srv.aau.dk/llama-api/llama"
 
-headers = {"Content-Type": "application/json"}
+headers = {"Access-Authorization": os.getenv("ACCESS_SECRET"), "Content-Type": "application/json"}
 
 PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))

@@ -181,7 +184,7 @@ def perform_entity_type_classification(post_json, output_file_path=None, output_


 if __name__ == '__main__':
-    input_file = os.path.join(PROJECT_ROOT, "data/files/EvaluationData/evaluationSet_EN.json")
+    input_file = os.path.join(PROJECT_ROOT, "data/files/EvaluationData/evaluationSet_EN_small.json")
     output_file = os.path.join(PROJECT_ROOT, "data/files/PromptEngineering/output.json")
 
     f = open(input_file, encoding="utf-8")
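
For context on the headers change above: requests to the Llama endpoint now carry an Access-Authorization token read from the ACCESS_SECRET environment variable. A minimal sketch of such a call, assuming a requests-based client; the payload field names are illustrative, not taken from this diff:

import os
import requests

api_url = "http://127.0.0.1:5000/llama"
headers = {
    # Token for the Knox proxy; os.getenv returns None when unset, and requests drops None-valued headers
    "Access-Authorization": os.getenv("ACCESS_SECRET"),
    "Content-Type": "application/json",
}

# Hypothetical payload shape -- the real prompt format is assembled in main.py
payload = {"prompt": "Classify the entity types in: Aalborg is a city.", "max_tokens": 64}

response = requests.post(api_url, json=payload, headers=headers)
response.raise_for_status()
print(response.json())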
16 changes: 10 additions & 6 deletions concept_linking/tools/LlamaServer/Dockerfile
@@ -1,22 +1,26 @@
 # Use python as base image
-FROM python
-
+FROM python:3.11-slim
 # Set the working directory in the container
 WORKDIR /app
 
 # Copy only the necessary files
 COPY llama_cpu_server.py .
 COPY requirements.txt .
 
+# Install necessary build tools and dependencies for running C++ (llama_cpp)
+# This can be removed when the app is in production and the remote llama api server is reliable and used instead of local llama
+# Install dependencies and curl
+RUN apt-get update && apt-get install -y build-essential cmake curl && rm -rf /var/lib/apt/lists/*
+
+
 # Install dependencies
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Check if the model file exists, and if not, download it using the provided function
-RUN python -c "from llama_cpu_server import download_model; download_model('llama-2-7b-chat.Q2_K.gguf', 'https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q2_K.gguf?download=true')"
+# Download the model file from the URL if it doesn't exist
+RUN test -e /app/llama-2-7b-chat.Q2_K.gguf || curl -o llama-2-7b-chat.Q2_K.gguf -LJO 'https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q2_K.gguf?download=true'
 
 # Expose port 5000 outside of the container
 EXPOSE 5000
 
 # Run llama_cpu_server.py when the container launches
-CMD ["python", "llama_cpu_server.py"]
+CMD ["python", "-u", "-m", "llama_cpu_server", "--host", "0.0.0.0", "--port", "5000", "--reload"]
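
The deleted RUN line imported a download_model helper from llama_cpu_server.py at image-build time; the test -e || curl line performs the same exists-or-download check in shell, without needing the Python module importable during the build. As a rough sketch, such a helper might look like this (assumed signature; the actual implementation in llama_cpu_server.py is not shown in this diff):

import os
import urllib.request

def download_model(filename, url):
    # Only fetch the multi-gigabyte model file when it is not already present
    if not os.path.exists(filename):
        urllib.request.urlretrieve(url, filename)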
12 changes: 5 additions & 7 deletions concept_linking/tools/LlamaServer/docker-compose.yml
@@ -1,11 +1,9 @@
-version: '3'
-
 services:
   llama-cpu-server:
-    build:
-      context: .
-      dockerfile: Dockerfile
-    ports:
-      - "5000:5000"
+    build: .
+    container_name: llama-server
+    command: python -u -m llama_cpu_server --host 0.0.0.0 --port 5000 --reload
     volumes:
       - ./concept_linking/tools/LlamaServer/llama-2-7b-chat.Q2_K.gguf:/app/concept_linking/tools/LlamaServer/llama-2-7b-chat.Q2_K.gguf
+    ports:
+      - "5000:5000"
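
(One caveat worth checking: Compose resolves relative host paths in volumes against the directory containing the compose file, so ./concept_linking/tools/LlamaServer/... as written from within concept_linking/tools/LlamaServer/ would resolve to a doubled path; the mapping appears to assume the compose file is invoked from, or located at, the repository root.)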
4 changes: 1 addition & 3 deletions relation_extraction/evaluation/DanskEvaluering.xml
@@ -48,7 +48,7 @@
 <originaltripleset>
 <otriple>DanskeFartøjer | powerType | Elektrisk</otriple>
 <otriple>
-DanskeFartøjer | length | "24000"^^<http://dbpedia.org/datatype/millimetre>
+DanskeFartøjer | length | "24000"^^&lt;http://dbpedia.org/datatype/millimetre&gt;
 </otriple>
 </originaltripleset>
 <modifiedtripleset>
@@ -61,14 +61,12 @@
 <originaltripleset>
 <otriple>København_Tårn | architect | Lars Mikkelsen</otriple>
 <otriple>København_Tårn | address | "Købmagergade 52"@da</otriple>
-<otriple>København_Tårn | currentTenants | Danmarks Radio</otriple>
 <otriple>København_Tårn | location | København</otriple>
 <otriple>Danmarks Radio | country | Denmark</otriple>
 </originaltripleset>
 <modifiedtripleset>
 <mtriple>København_Tårn | architect | Lars Mikkelsen</mtriple>
 <mtriple>København_Tårn | address | "Købmagergade 52"</mtriple>
-<mtriple>København_Tårn | currentTenants | Danmarks Radio</mtriple>
 <mtriple>København_Tårn | location | København</mtriple>
 <mtriple>Danmarks Radio | country | Denmark</mtriple>
 </modifiedtripleset>
38 changes: 25 additions & 13 deletions relation_extraction/evaluation/evaluation.py
@@ -5,6 +5,7 @@
 from relation_extraction.NaiveMVP.main import parse_data
 from relation_extraction.multilingual.llm_messenger import LLMMessenger
 import re
+import copy
 import datetime
 import json

@@ -21,7 +22,7 @@ def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 3,

 def convert_testdata_to_input_format():
     objs = []
-    tree = ET.parse('relation_extraction/Evaluation/testdataMini.xml')
+    tree = ET.parse('relation_extraction/evaluation/DanskEvaluering.xml')
     root = tree.getroot()
     for entry in root.findall('.//entry'):
         sentence = entry.findall('lex')[0].text
@@ -42,7 +43,13 @@ def calculate_metrics(data):
     FP = 0
     FN = 0
 
-    for element in data["triples"]:
+    data_without_duplicates = copy.deepcopy(data)
+
+    for triples in data_without_duplicates["triples"]:
+        triples["triples_from_solution"] = set(tuple(triple) for triple in triples["triples_from_solution"])
+        triples["triples_from_solution"] = list(list(triple) for triple in triples["triples_from_solution"])
+
+    for element in data_without_duplicates["triples"]:
         TP += element["contains_hits"]
         FP += len(element["triples_from_solution"]) - element["contains_hits"]
         FN += len(element["expected_triples"]) - element["contains_hits"]
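
The set/tuple round-trip above is the usual idiom for deduplicating a list of lists, since lists are unhashable and cannot be put in a set directly; in isolation:

# Deduplicate triples by converting each list to a hashable tuple and back
triples = [["A", "rel", "B"], ["A", "rel", "B"], ["C", "rel", "D"]]
unique = list(list(t) for t in set(tuple(t) for t in triples))
# unique holds two triples; note that set iteration order is arbitrary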
@@ -99,27 +106,32 @@ def main():
             split_relations = [ontology_relations[i:i + chunk_size] for i in range(0, len(ontology_relations), chunk_size)] #Split the relations into lists of size chunk_size
             res = []
             for split_relation in split_relations:
-                res.append(solution(input_obj, split_relation, ontology_relations))
+                part_res = solution(input_obj, split_relation, ontology_relations)
+                for triple in part_res:
+                    triple[1] = triple[1].replace("http://dbpedia.org/ontology/", "")
+                res.extend(part_res)
             res_hits = 0
-            for triple in res:
+            convert_to_set_res = set(tuple(triples) for triples in res)
+            removed_duplicates_res = list(list(triples) for triples in convert_to_set_res)
+            for triple in removed_duplicates_res:
                 if triple in expected_triples:
                     res_hits += 1
+                    hits +=1
 
             evaluation_result_triples.append({"sentence":sentence, "triples_from_solution": res, "expected_triples": expected_triples, "contains_hits": res_hits})
             eta = round((((datetime.datetime.now()-dt).total_seconds()/60)/((i+1)/len(input_objs)))*(1-((i+1)/len(input_objs))),5)
             progress_suffix = f"Complete. Timeusage: {round((datetime.datetime.now()-dt).total_seconds()/60,5)} minutes. Eta {eta} minutes."
             printProgressBar(i + 1, len(input_objs), prefix = 'Progress:', suffix = progress_suffix, length = 50)
 
-    print(f"Solution {name} finished. Hit {hits}/{total_triples}. Hit percentage: {(hits/total_triples)*100}%")
-    evaluation_results[name] = {
-        "triples": evaluation_result_triples,
-        "result": {"total_expected_triples": total_triples, "hits": hits, "hit_percentage": hits/total_triples},
-        "score": calculate_metrics({"triples": evaluation_result_triples})
-    }
+        print(f"Solution {name} finished. Hit {hits}/{total_triples}. Hit percentage: {(hits/total_triples)*100}%")
+        evaluation_results[name] = {
+            "triples": evaluation_result_triples,
+            "result": {"total_expected_triples": total_triples, "hits": hits, "hit_percentage": hits/total_triples},
+            "score": calculate_metrics({"triples": evaluation_result_triples})
+        }
 
-    with open("relation_extraction/Evaluation/evaluation_results.json", "w") as f:
-        json.dump(evaluation_results, f, indent=4)
+        with open("relation_extraction/Evaluation/evaluation_results.json", "w") as f:
+            json.dump(evaluation_results, f, indent=4)
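
For reference, calculate_metrics presumably derives precision, recall, and F1 from the TP/FP/FN tallies built above; its body is outside this diff, but the standard computation would be:

def precision_recall_f1(TP, FP, FN):
    # Guard the denominators: a solution may return no triples at all
    precision = TP / (TP + FP) if TP + FP else 0.0
    recall = TP / (TP + FN) if TP + FN else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return {"precision": precision, "recall": recall, "f1": f1}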



(4 of the 11 changed files are not shown in this view.)