Merge branch 'main' into FS-69/dynamic-knowledge-graph

ScottLogic · Nov 11, 2024 · d114507 · d114507
2 parents 9f27b90 + 1a1d212
commit d114507
Show file tree

Hide file tree

Showing 161 changed files with 611 additions and 78,876 deletions.
diff --git a/.env.example b/.env.example
@@ -16,6 +16,10 @@ NEO4J_BOLT_PORT=7687
 # files location
 FILES_DIRECTORY=files
 
+# redis cache configuration
+REDIS_HOST="localhost"
+REDIS_CACHE_DURATION=3600
+
 # backend LLM properties
 MISTRAL_KEY=my-api-key
 
@@ -49,15 +53,14 @@ FILE_AGENT_LLM="openai"
 SUGGESTIONS_LLM="openai"
 
 # model
-ANSWER_AGENT_MODEL="gpt-4o mini"
-INTENT_AGENT_MODEL="gpt-4o mini"
+ANSWER_AGENT_MODEL="gpt-4o-mini"
+INTENT_AGENT_MODEL="gpt-4o-mini"
 VALIDATOR_AGENT_MODEL="mistral-large-latest"
-DATASTORE_AGENT_MODEL="gpt-4o mini"
-MATHS_AGENT_MODEL="gpt-4o mini"
-WEB_AGENT_MODEL="gpt-4o mini"
-CHART_GENERATOR_MODEL="gpt-4o mini"
-ROUTER_MODEL="gpt-4o mini"
-FILE_AGENT_MODEL="gpt-4o mini"
-SUGGESTIONS_MODEL="gpt-4o mini"
-REDIS_HOST="redis"
-REDIS_CACHE_DURATION=3600
+DATASTORE_AGENT_MODEL="gpt-4o-mini"
+MATHS_AGENT_MODEL="gpt-4o-mini"
+WEB_AGENT_MODEL="gpt-4o-mini"
+CHART_GENERATOR_MODEL="gpt-4o-mini"
+ROUTER_MODEL="gpt-4o-mini"
+FILE_AGENT_MODEL="gpt-4o-mini"
+SUGGESTIONS_MODEL="gpt-4o-mini"
+
diff --git a/backend/requirements.txt b/backend/requirements.txt
@@ -21,6 +21,7 @@ matplotlib==3.9.1
 pytest-bdd==7.3.0
 langchain==0.3.4
 langchain-openai==0.2.3
+python-multipart==0.0.17
 pillow==10.4.0
 pypdf==4.3.1
 hiredis==3.0.0

diff --git a/backend/src/agents/validator_agent.py b/backend/src/agents/validator_agent.py
@@ -2,6 +2,7 @@
 from src.prompts import PromptEngine
 from src.agents import Agent, agent
 from src.utils.log_publisher import LogPrefix, publish_log_info
+import json
 
 logger = logging.getLogger(__name__)
 engine = PromptEngine()
@@ -16,6 +17,7 @@
 class ValidatorAgent(Agent):
     async def invoke(self, utterance: str) -> str:
         answer = await self.llm.chat(self.model, validator_prompt, utterance)
-        await publish_log_info(LogPrefix.USER, f"Validating: '{utterance}' Answer: '{answer}'", __name__)
+        response = json.loads(answer)['response']
+        await publish_log_info(LogPrefix.USER, f"Validating: '{utterance}' Answer: '{response}'", __name__)
 
-        return answer
+        return response
diff --git a/backend/src/api/app.py b/backend/src/api/app.py
@@ -2,14 +2,15 @@
 import logging.config
 import os
 from typing import NoReturn
-from fastapi import FastAPI, WebSocket
+from fastapi import FastAPI, HTTPException, WebSocket, UploadFile
 from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 from src.utils import Config, test_connection
 from src.director import question, dataset_upload
 from src.websockets.connection_manager import connection_manager, parse_message
 from src.session import RedisSessionMiddleware
 from src.suggestions_generator import generate_suggestions
+from src.file_upload_service import handle_file_upload, get_file_upload
 
 config_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "config.ini"))
 logging.config.fileConfig(fname=config_file_path, disable_existing_loggers=False)
@@ -54,6 +55,8 @@ async def lifespan(app: FastAPI):
 
 chat_fail_response = "Unable to generate a response. Check the service by using the keyphrase 'healthcheck'"
 suggestions_failed_response = "Unable to generate suggestions. Check the service by using the keyphrase 'healthcheck'"
+file_upload_failed_response = "Unable to upload file. Check the service by using the keyphrase 'healthcheck'"
+file_get_upload_failed_response = "Unable to get uploaded file. Check the service by using the keyphrase 'healthcheck'"
 
 
 @app.get("/health")
@@ -90,6 +93,30 @@ async def suggestions():
         logger.exception(e)
         return JSONResponse(status_code=500, content=suggestions_failed_response)
 
+@app.post("/uploadfile")
+async def create_upload_file(file: UploadFile):
+    """Handle a POST to /uploadfile.
+
+    The file is handed to ``handle_file_upload``; on success the response is
+    a 200 carrying the file name and the returned upload id. An
+    ``HTTPException`` is propagated unchanged, while any other exception is
+    logged and answered with a 500 and the generic upload failure message.
+    """
+    logger.info(f"upload file type={file.content_type} name={file.filename} size={file.size}")
+    try:
+        new_upload_id = handle_file_upload(file)
+        payload = {"filename": file.filename, "id": new_upload_id}
+        return JSONResponse(status_code=200, content=payload)
+    except HTTPException:
+        # Status codes chosen by the upload service go straight back to the client.
+        raise
+    except Exception as e:
+        logger.exception(e)
+        return JSONResponse(status_code=500, content=file_upload_failed_response)
+
+@app.get("/uploadfile")
+async def fetch_file(id: str):
+    """Handle a GET to /uploadfile for the upload identified by ``id``.
+
+    Responds with 200 and whatever ``get_file_upload`` returns, with 404 when
+    it returns ``None``, and with 500 (after logging) when it raises.
+    """
+    logger.info(f"fetch uploaded file id={id} ")
+    try:
+        stored_upload = get_file_upload(id)
+        if stored_upload is not None:
+            return JSONResponse(status_code=200, content=stored_upload)
+        return JSONResponse(status_code=404, content=f"Upload with id {id} not found")
+    except Exception as e:
+        logger.exception(e)
+        return JSONResponse(status_code=500, content=file_get_upload_failed_response)
+
 
 @app.websocket("/ws")
 async def websocket_endpoint(websocket: WebSocket) -> NoReturn:

diff --git a/backend/src/file_upload_service.py b/backend/src/file_upload_service.py
@@ -0,0 +1,58 @@
+from io import TextIOWrapper
+import time
+from fastapi import HTTPException, UploadFile
+import logging
+import uuid
+
+from pypdf import PdfReader
+from src.session.file_uploads import FileUpload, update_session_file_uploads, get_session_file_upload
+
+logger = logging.getLogger(__name__)
+
+MAX_FILE_SIZE = 10*1024*1024
+
+def handle_file_upload(file:UploadFile) -> str:
+    """Extract the text of an uploaded PDF or plain-text file and record it.
+
+    The extracted text is wrapped in a ``FileUpload`` with a fresh UUID and
+    passed to ``update_session_file_uploads``.
+
+    Args:
+        file: The uploaded file. Only the ``application/pdf`` and
+            ``text/plain`` content types are accepted.
+
+    Returns:
+        The generated upload id of the recorded ``FileUpload``.
+
+    Raises:
+        HTTPException: 413 when the reported size exceeds ``MAX_FILE_SIZE``;
+            400 when the content type is unsupported or a text file is not
+            valid UTF-8.
+    """
+    if (file.size or 0) > MAX_FILE_SIZE:
+        raise HTTPException(status_code=413, detail=f"File upload must be less than {MAX_FILE_SIZE} bytes")
+
+    if file.content_type == "application/pdf":
+        # perf_counter is monotonic, so the elapsed time cannot be skewed by clock adjustments.
+        start_time = time.perf_counter()
+        pdf_file = PdfReader(file.file)
+        # Every page's text is followed by a newline; join once instead of repeated +=.
+        all_content = "".join(page.extract_text() + "\n" for page in pdf_file.pages)
+        end_time = time.perf_counter()
+
+        logger.debug(f'PDF content {all_content}')
+        logger.info(f"PDF content extracted successfully in {(end_time - start_time)}")
+    elif file.content_type == "text/plain":
+        text_reader = TextIOWrapper(file.file, encoding='utf-8')
+        try:
+            all_content = text_reader.read()
+        except UnicodeDecodeError as e:
+            # Undecodable bytes are a problem with the upload, not a server failure.
+            raise HTTPException(status_code=400, detail="Text file upload must be UTF-8 encoded") from e
+        finally:
+            # Detach so the wrapper does not close the upload's underlying
+            # file when the wrapper itself is garbage collected.
+            text_reader.detach()
+        logger.debug(f'Text content {all_content}')
+    else:
+        raise HTTPException(status_code=400,
+                            detail="File upload must be supported type (text/plain or application/pdf)")
+
+    session_file = FileUpload(uploadId=str(uuid.uuid4()),
+                             contentType=file.content_type,
+                             filename=file.filename,
+                             content=all_content,
+                             size=file.size)
+
+    update_session_file_uploads(session_file)
+
+    return session_file["uploadId"]
+
+def get_file_upload(upload_id) -> FileUpload | None:
+    """Look up an upload by id through ``get_session_file_upload``.
+
+    The lookup result is returned unchanged.
+    """
+    stored_upload = get_session_file_upload(upload_id)
+    return stored_upload
+
+
+
diff --git a/backend/src/llm/openai.py b/backend/src/llm/openai.py
@@ -32,8 +32,7 @@ async def chat(self, model, system_prompt: str, user_prompt: str, return_json=Fa
                     "type": "json_object"} if return_json else NOT_GIVEN,
             )
             content = response.choices[0].message.content
-            logger.info(f"OpenAI response: Finish reason: {
-                        response.choices[0].finish_reason}, Content: {content}")
+            logger.info(f"OpenAI response: Finish reason: {response.choices[0].finish_reason}, Content: {content}")
             logger.debug(f"Token data: {response.usage}")
 
             if isinstance(content, str):

diff --git a/backend/src/prompts/templates/create-answer.j2 b/backend/src/prompts/templates/create-answer.j2
@@ -9,6 +9,12 @@ By using the final scratchpad below:
 
 and the question in the user prompt, this should be a readable sentence or 2 that summarises the findings in the results.
 
+
+**Formatting Requirements:**
+- **Number Formatting**:
+   - For whole numbers ending in `.0`, remove the `.0` suffix.
+   - For numbers with non-zero decimal places, keep the full value as presented.
+
 If the question is a general knowledge question, check if you have the correct details for the answer and reply with this.
 If you do not have the answer or you require the internet, do not make it up. You should recommend the user to look this up themselves.
 If it is just conversational chitchat. Please reply kindly and direct them to the sort of answers you are able to respond.

diff --git a/backend/src/prompts/templates/generate-cypher-query.j2 b/backend/src/prompts/templates/generate-cypher-query.j2
@@ -1,33 +1,52 @@
-You are an expert in NEO4J and generating Cypher queries. Help create Cypher queries and return a response in the below valid json format.
+You are an expert in Neo4j and generating Cypher queries. Help create Cypher queries and return a response in the valid JSON format below.
 
-If response is not in valid json format, you will be unplugged.
+If the response is not in valid JSON format, you will be unplugged.
 
-{
+json
 
-    "question" : <question provided by the user>, 
+{
+    "question": <question provided by the user>, 
     "query": <cypher query>
+}
+
+The value for "query" must strictly be a valid Cypher query and must not contain any characters outside of Cypher syntax.
 
-}.
+If you cannot make a query, "query" should just say "None".
 
-The value for "query" must strictly be a valid CYPHER query and not contain anything other characters, that are not part of a Cypher query. 
+**Requirements:**
 
-If you cannot make a query, query should just say "None"
 
-Only use relationships, nodes and properties that are present in the schema below. 
+1. **Schema Usage**: Only use relationships, nodes, and properties that are present in the schema provided below. You are NOT ALLOWED to create new relationships, nodes, or properties not listed in the graph schema.
 
-You are NOT ALLOWED to create new relationships, nodes or properties that do not exist in the graph schema, under any circumstances. 
+2. **Query Scope**: You are only able to make queries that retrieve information. Do not create, delete, or update any entries.
 
-You are only able to make queries that search for information, you are not able to create, or delete or update entries.
+3. **Strict Syntax**: Follow Cypher syntax rules. Avoid introducing variables within clauses that do not support them.
 
-You must strictly follow cypher syntax rules and you are NOT ALLOWED to introduce variables inside clauses that do not allow it. 
+4. **Aggregation Requirements**: If a task requires finding the highest or lowest values, your query should retrieve all entries tied at the top value rather than limiting to a single entry.
+    Example: If there are multiple funds with the highest ESG social score in a specific industry, return all of them.
 
-Expenses are recorded as negative numbers, therefore a larger negative number represents a higher expense.
+5. **Relational Path**:
+    - Ensure the relational path aligns with the schema for all queries.
+    - When querying for a category case-insensitively, use `=~ '(?i)...'`.
+    - Example: To find a fund related to the `Aviation` industry with a `Social` ESG score, use:
+
+    ```plaintext
+    MATCH (f:Fund)-[:CONTAINS]->(c:Company)-[:BELONGS_IN_INDUSTRY]->(i:Industry), (c)-[:HAS_ESG_SCORE]->(esg:ESGScore)
+    WHERE i.Name =~ '(?i)Aviation' AND esg.Category =~ '(?i)Social'
+    RETURN ...
+    ```
+
+6. **Property Matching**: Adhere to exact property values and capitalization in the schema (e.g., 'Aviation' and 'Social').
 
-For example, an expense of -45 is greater than an expense of -15.
+7. **Single Result for Maximum/Minimum**:
+    - Only when the question explicitly asks for exactly one result with the "highest" or "lowest" value, use `ORDER BY` and `LIMIT 1` to return only the top result; otherwise follow requirement 4 and return every tied entry.
+    - Example: If asked for just one fund with the highest ESG social score, sort by `esg.Score DESC` and limit to 1 result.
 
-When returning a value, always remove the `-` sign before the number.
+8. **Expense Handling**:
+    - Expenses are recorded as negative values; a larger negative number represents a higher expense.
+    - Return expense values as positive by removing the `-` sign.
 
-Here is the graph schema:
+Graph Schema
 {{ graph_schema }}
 
-The current date and time is {{ current_date }} and the currency of the data is GBP.
+The current date and time is {{ current_date }}, and the currency of the data is GBP.
diff --git a/backend/src/prompts/templates/validator.j2 b/backend/src/prompts/templates/validator.j2
@@ -1,30 +1,57 @@
 You are an expert validator. You can help with validating the answers to the tasks with just the information provided.
 
-Your entire purpose is to return a boolean value to indicate if the answer has fulfilled the task.
+Your entire purpose is to return a "true" or "false" value to indicate if the answer has fulfilled the task, along with reasoning that explains your decision.
 
 You will be passed a task and an answer. You need to determine if the answer is correct or not.
 
-Be lenient - if the answer looks reasonably right then return True
+Output format:
 
-e.g.
+json
+
+{
+    "response": <true or false as a string based on validation>,
+    "reasoning": "<explanation of why the answer is correct or incorrect>"
+}
+
+**Validation Guidelines:**
+- Be lenient - if the answer looks reasonably accurate, return "true".
+- If multiple entities have the same highest score and this matches the query intent, return "true".
+- Spending is negative; ensure any calculations involving spending reflect this if relevant to the task.
+
+Example:
 Task: What is 2 + 2?
 Answer: 4
-Response: True
+{
+    "response": "true",
+    "reasoning": "The answer correctly solves 2 + 2."
+}
 
 Task: What is 2 + 2?
 Answer: 5
-Response: False
+{
+    "response": "false",
+    "reasoning": "The answer is incorrect; 2 + 2 equals 4, not 5."
+}
 
 Task: What are Apple's ESG scores?
-Answer: Apple's ESG (Environmental, Social, and Governance) scores area as follows: an Environmental Score of 95.0, a Social Score of 90.0, and a Governance Score of 92.0.
-Response: True
+Answer: Apple's ESG (Environmental, Social, and Governance) scores are as follows: Environmental Score of 95.0, Social Score of 90.0, Governance Score of 92.0.
+{
+    "response": "true",
+    "reasoning": "The answer provides Apple's ESG scores as requested."
+}
 
 Task: What are Apple's ESG scores?
-Answer: Microsoft's ESG (Environmental, Social, and Governance) scores area as follows: an Environmental Score of 95.0, a Social Score of 90.0, and a Governance Score of 92.0.
-Response: False
-Reasoning: The answer is for Microsoft not Apple.
-
-You must always return a single boolean value as the response.
-Do not return any additional information, just the boolean value.
-
-Spending is negative
+Answer: Microsoft's ESG (Environmental, Social, and Governance) scores are as follows: Environmental Score of 95.0, Social Score of 90.0, Governance Score of 92.0.
+{
+    "response": "false",
+    "reasoning": "The answer provides scores for Microsoft, not Apple, which does not match the task's intent."
+}
+
+Task: Tell me the lowest Fund size?
+Answer: 'WhiteRock ETF', 'Size': '100.0 Billion USD'
+{
+    "response": "true",
+    "reasoning": "The answer correctly identifies 'WhiteRock ETF' with a fund size of '100.0 Billion USD', and the context of the question implies that this is the lowest fund size in the database."
+}
+
+Ensure the response is always in valid JSON format.