pull all changes across
IMladjenovic committed Nov 11, 2024
1 parent 1a1d212 commit 45ff842
Showing 14 changed files with 906 additions and 110 deletions.
3 changes: 3 additions & 0 deletions backend/Dockerfile
@@ -7,6 +7,9 @@ WORKDIR /backend
# Copy just the requirements into the working directory so it gets cached by itself
COPY ./requirements.txt ./requirements.txt

# Copy the datasets directory; its location should match what a local run of the application expects
COPY ./datasets/ ./datasets/

# Install the dependencies from the requirements file
RUN pip install --no-cache-dir --upgrade -r /backend/requirements.txt

14 changes: 14 additions & 0 deletions backend/datasets/README.md
@@ -0,0 +1,14 @@
# Datasets

This is a temporary folder for preloading the ESG dataset into InferESG.

This will be replaced by file upload from the UI.

## Bloomberg.csv
This was downloaded from https://data.mendeley.com/datasets/tgmppk9kkt/1

## Bloomberg_2.csv
This was generated by ChatGPT from the original bloomberg.csv.

## esg_poc.csv
This was hand-crafted by the original InferESG PoC team.
721 changes: 721 additions & 0 deletions backend/datasets/bloomberg.csv

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions backend/src/agents/__init__.py
@@ -9,6 +9,7 @@
 from .answer_agent import AnswerAgent
 from .chart_generator_agent import ChartGeneratorAgent
 from .file_agent import FileAgent
+from .knowledge_graph_generator_agent import KnowledgeGraphAgent


config = Config()
@@ -26,6 +27,10 @@ def get_answer_agent() -> Agent:
     return AnswerAgent(config.answer_agent_llm, config.answer_agent_model)


+def get_knowledge_graph_agent() -> KnowledgeGraphAgent:  # Something about this feels wrong: unlike the other factories, it is annotated with the concrete agent type
+    return KnowledgeGraphAgent(config.knowledge_graph_agent_llm, config.knowledge_graph_agent_model)
+
+
 def agent_details(agent) -> dict:
     return {"name": agent.name, "description": agent.description}

@@ -55,6 +60,7 @@ def get_agent_details():
     "get_intent_agent",
     "get_available_agents",
     "get_validator_agent",
+    "get_knowledge_graph_agent",
     "Parameter",
     "tool",
 ]
39 changes: 39 additions & 0 deletions backend/src/agents/knowledge_graph_generator_agent.py
@@ -0,0 +1,39 @@
import json
import logging

from src.prompts import PromptEngine
from src.agents import Agent, agent

logger = logging.getLogger(__name__)
engine = PromptEngine()


@agent(
    name="KnowledgeGraphAgent",
    description="This agent is responsible for generating knowledge graphs to import csv datasets",
    tools=[],
)
class KnowledgeGraphAgent(Agent):
    async def generate_knowledge_graph(self, file_path: str, csv_data: list[list[str]]) -> dict[str, str]:
        # file_path is currently unused; only the first 50 rows (header included)
        # are sent to the LLM to keep the prompts small.
        reduced_data_set = csv_data[:50]

        # Step 1: ask the LLM to infer a graph model from the sample rows.
        create_model = engine.load_prompt(
            "generate-knowledge-graph-model",
            csv_input=reduced_data_set,
        )

        model_response = await self.llm.chat(self.model, create_model, user_prompt="")

        model = json.loads(model_response)["model"]

        # Step 2: ask the LLM for a Cypher import query that follows the model.
        create_query = engine.load_prompt(
            "generate-knowledge-graph-query",
            csv_input=reduced_data_set,
            model_input=model,
        )

        query_response = await self.llm.chat(self.model, create_query, user_prompt="")

        query = json.loads(query_response)["cypher_query"]
        return {"cypher_query": query, "model": model}
26 changes: 6 additions & 20 deletions backend/src/api/app.py
@@ -1,19 +1,15 @@
 from contextlib import asynccontextmanager
 import json

Check failure (GitHub Actions / Linting Backend) on line 2 in backend/src/api/app.py: Ruff (F401) `json` imported but unused

 import logging
 import logging.config
 import os
-from azure.storage.blob import BlobServiceClient
 from typing import NoReturn
 from fastapi import FastAPI, HTTPException, WebSocket, UploadFile
 from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
-from src.utils.graph_db_utils import populate_db
 from src.utils import Config, test_connection
-from src.director import question
+from src.director import question, dataset_upload
 from src.websockets.connection_manager import connection_manager, parse_message
 from src.session import RedisSessionMiddleware
-from src.utils.cyper_import_data_from_csv import import_data_from_csv_script
 from src.suggestions_generator import generate_suggestions
 from src.file_upload_service import handle_file_upload, get_file_upload

@@ -27,22 +23,9 @@
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     try:
-        if (
-            config.azure_storage_connection_string is None
-            or config.azure_storage_container_name is None
-            or config.azure_initial_data_filename is None
-        ):
-            raise Exception("Missing Azure Environment variables. Please check the README.md for guidance.")
-
-        blob_service_client = BlobServiceClient.from_connection_string(config.azure_storage_connection_string)
-        container_client = blob_service_client.get_container_client(config.azure_storage_container_name)
-        blob_client = container_client.get_blob_client(config.azure_initial_data_filename)
-        download_stream = blob_client.download_blob()
-        annual_transactions = download_stream.readall().decode("utf-8")
-        populate_db(import_data_from_csv_script, json.loads(annual_transactions))
+        await dataset_upload()
     except Exception as e:
-        logger.exception(f"Failed to populate database with initial data from Azure: {e}")
-        populate_db(import_data_from_csv_script, {})
+        logger.exception(f"Failed to populate database with initial data from file: {e}")
     yield


@@ -85,6 +68,7 @@ async def health_check():
    finally:
        return response


@app.get("/chat")
async def chat(utterance: str):
logger.info(f"Chat method called with utterance: {utterance}")
@@ -106,6 +90,7 @@ async def suggestions():
        logger.exception(e)
        return JSONResponse(status_code=500, content=suggestions_failed_response)


@app.post("/uploadfile")
async def create_upload_file(file: UploadFile):
logger.info(f"upload file type={file.content_type} name={file.filename} size={file.size}")
@@ -118,6 +103,7 @@ async def create_upload_file(file: UploadFile):
        logger.exception(e)
        return JSONResponse(status_code=500, content=file_upload_failed_response)


@app.get("/uploadfile")
async def fetch_file(id: str):
logger.info(f"fetch uploaded file id={id} ")
17 changes: 16 additions & 1 deletion backend/src/director.py
@@ -2,10 +2,11 @@
 import logging
 from src.utils import clear_scratchpad, update_scratchpad, get_scratchpad
 from src.session import update_session_chat
-from src.agents import get_intent_agent, get_answer_agent
+from src.agents import get_intent_agent, get_answer_agent, get_knowledge_graph_agent
 from src.prompts import PromptEngine
 from src.supervisors import solve_all
 from src.utils import Config
+from src.utils.graph_db_utils import populate_db
 from src.websockets.connection_manager import connection_manager
logger = logging.getLogger(__name__)
@@ -42,3 +43,17 @@ async def question(question: str) -> str:
    clear_scratchpad()

    return final_answer


async def dataset_upload() -> None:
    dataset_file = "./datasets/bloomberg.csv"

    with open(dataset_file, 'r') as file:
        # Split each line on commas; the first row holds the headers.
        csv_data = [line.strip('\n').split(",") for line in file]

    knowledge_graph_config = await get_knowledge_graph_agent().generate_knowledge_graph(dataset_file, csv_data)

Check failure (GitHub Actions / Type Checking Backend) on line 56 in backend/src/director.py: Cannot access attribute "generate_knowledge_graph" for class "Agent". Attribute "generate_knowledge_graph" is unknown (reportAttributeAccessIssue)
    populate_db(knowledge_graph_config["cypher_query"], csv_data)
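One possible way to address the type-checking failure above, sketched under the assumption that the @agent decorator (or the agent registry) widens the factory's return type to the base Agent class, is to narrow the type at the call site. The name dataset_upload_typed is hypothetical:

from typing import cast

from src.agents import get_knowledge_graph_agent
from src.agents.knowledge_graph_generator_agent import KnowledgeGraphAgent
from src.utils.graph_db_utils import populate_db


async def dataset_upload_typed(dataset_file: str, csv_data: list[list[str]]) -> None:
    # Hypothetical variant of dataset_upload: the cast tells the checker which
    # concrete agent the factory returns, so generate_knowledge_graph is visible.
    kg_agent = cast(KnowledgeGraphAgent, get_knowledge_graph_agent())
    knowledge_graph_config = await kg_agent.generate_knowledge_graph(dataset_file, csv_data)
    populate_db(knowledge_graph_config["cypher_query"], csv_data)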
45 changes: 45 additions & 0 deletions backend/src/prompts/templates/generate-knowledge-graph-model.j2
@@ -0,0 +1,45 @@
You are an expert in Neo4j and Cypher. I need your help to analyze and model ESG data based on the structure of the provided input.

The input data is formatted as JSON:
{
  "all_data": {{ csv_input }}
}
Here, all_data is a list of lists, where each inner list corresponds to a row of data, and the first row contains the headers.

# Objective:
Analyze the structure of this data to produce an intuitive Neo4j model, focusing on identifying core entities, attributes, and relationships. Then generate a Cypher query to import the data according to this model.

## Data Model Inference
Output a comprehensive model of the data in the "model" field of the JSON response.

## Data Structure:
* Report: Each row in the dataset represents an ESG report about a company. This can be represented as a node.
* Identify Key Entities: Based on the data headers, determine the main entity types (e.g., Company, Fund, or other core entities in the dataset) and map out each unique entity's attributes. Favour full names over abbreviations.
* Identify Common Categories: Based on the data, determine common categories that appear in the data. Look for recurring values that appear in the same csv column and map these out as nodes with relationships to the main entity types.
* Determine Relationships: Define the relationships between these entities, such as associating entities with reports, linking entities to specific time periods, or establishing hierarchical or categorical groupings within the data.
* Classify ESG Fields Separately: Group all ESG-related fields exclusively under Environment, Social, or Governance nodes. Ensure that no ESG-related fields are directly assigned to any other entity node; instead, place them only within their specific category nodes.
* Attribute Assignment: Ensure that each attribute is assigned to only one appropriate entity or category node without duplicating fields across nodes.
## Output Model Structure:

## Describe Entities:
* Example: Company: Represents each company with attributes like name and identifier.
* Example: Year: Represents each distinct time period, based on date-related fields.
* Example: Environment, Social, Governance: Each node represents one ESG category, containing only fields relevant to that specific category (e.g., CO2 emissions for Environment, injury rate for Social, and shareholder rights for Governance).
## Describe Relationships:
* Example: (Entity1)-[:HAS_ENTITY2]->(Entity2): Links between main entities, such as companies, years, or categories.
* Example: (MainEntity)-[:HAS_ENVIRONMENT]->(Environment): Connects main entities to the Environment category node for environment-specific metrics.
* Example: (MainEntity)-[:HAS_SOCIAL]->(Social): Links main entities to the Social category node for social-specific metrics.
* Example: (MainEntity)-[:HAS_GOVERNANCE]->(Governance): Links main entities to the Governance category node for governance-specific metrics.
* Example: (MainEntity)-[:BELONGS_TO]->(CategoryNode): Links main entities to a categorical node for any recurring category (such as industries or sectors).
Please provide the inferred model structure in the "model" field of the JSON output, specifying entities, attributes, and relationships. The output must explicitly link each header in the input data to the corresponding part of the model.

## Expected Output Format:
{ model: "The model identifies 'Company' as the primary entity with unique identifiers for each company. Each data row is represented by a 'Report' node, with 'Year' nodes representing temporal relationships, and 'Industry' nodes for each unique industry type. ESG metrics are grouped into separate 'Environment', 'Social', and 'Governance' nodes. Relationships include: (Report)-[:IS_A]->(Company), (Report)-[:IN_YEAR]->(Year), (Company)-[:IN_INDUSTRY]->(Industry), (Report)-[:HAS_ENVIRONMENT]->(Environment), (Report)-[:HAS_SOCIAL]->(Social), and (Report)-[:HAS_GOVERNANCE]->(Governance). \n\nThe attributes for each entity are as follows:\n\n* Company: Identifier (RIC), Company Name\n\n* Year: Date\n\n* Industry: Name\n\n* Report: ESG_score, BVPS, Market_cap, Shares, Net_income, RETURN_ON_ASSET, QUICK_RATIO, ASSET_GROWTH, FNCL_LVRG, PE_RATIO\n\n* Environment: Env_score, Scope_1, Scope_2, CO2_emissions, Energy_use, Water_use, Water_recycle, Toxic_chem_red\n\n* Social: Social_score, Injury_rate, Women_Employees, Human_Rights, Strikes, Turnover_empl, Board_Size, Bribery, Recycling_Initiatives\n\n* Governance: Gov_score, Shareholder_Rights, Board_gen_div\n\nRelationships:\n\n* (Report)-[:IS_A]->(Company): Links each report to the relevant company.\n\n* (Report)-[:IN_YEAR]->(Year): Links each report to a specific year.\n\n* (Company)-[:IN_INDUSTRY]->(Industry): Links each company to its industry node.\n\n* (Report)-[:HAS_ENVIRONMENT]->(Environment): Links each report to an Environment node containing environmental metrics.\n\n* (Report)-[:HAS_SOCIAL]->(Social): Links each report to a Social node containing social metrics.\n\n* (Report)-[:HAS_GOVERNANCE]->(Governance): Links each report to a Governance node containing governance metrics.\n\nEach header in the input data is assigned to one of these nodes based on the model’s grouping structure." }

## Important Notes:
* Ensure the model clearly identifies which header field relates to which part of the Neo4j graph.
* Do not assign ESG-specific fields (e.g., emissions, board diversity) directly to any main entity nodes. Instead, place them within their respective category nodes (Environment, Social, Governance).
* Identify any recurring categories in the data and create nodes for them instead of representing them as individual attributes (e.g., sector, industry, type, fund). Link main entities to these nodes with appropriate relationships.
* Avoid duplicating fields across nodes. Assign each field only to the most appropriate entity or category node.
* Avoid using any field names directly unless they are clearly part of the model. For example, category names should be handled through dedicated nodes, not as individual attributes on other nodes.
* The output should be valid JSON with no line breaks or markdown. Do not add line breaks or markdown to the output.
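For a quick sanity check of this template's inputs, rendering it from Python might look like the sketch below. It assumes PromptEngine.load_prompt resolves templates by file name, as the agent code above does; the CSV rows are invented, with header names borrowed from the expected-output example.

from src.prompts import PromptEngine

engine = PromptEngine()

# Invented sample rows; header names follow the expected-output example above.
csv_rows = [
    ["RIC", "Company Name", "Date", "Industry", "ESG_score", "Env_score"],
    ["ACME.N", "Acme Corp", "2023-12-31", "Utilities", "71.5", "64.2"],
]
prompt = engine.load_prompt("generate-knowledge-graph-model", csv_input=csv_rows)
# prompt now holds the rendered template text that is sent to the LLM.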
49 changes: 49 additions & 0 deletions backend/src/prompts/templates/generate-knowledge-graph-query.j2
@@ -0,0 +1,49 @@
You are an expert in Cypher and Neo4j. I need your help to generate a Cypher query for importing ESG data into a Neo4j graph database.

## Inputs:

1. CSV Data: You will receive the CSV data, which consists of a list of lists where the first list contains the headers, and the subsequent lists represent the rows of data.
2. Model Description: You will receive a description of the model structure to create from the data. This model will define the core entities, attributes, and relationships based on the headers from the provided data.

The format will be:
{
  "model": {{ model_input }},
  "all_data": {{ csv_input }}
}

all_data is the raw CSV data with headers.

# Objective:
Generate a Cypher query based on the provided model structure and data.

## Cypher Query Generation Steps:
1. Process Data:
* Use data.all_data[0] as headers to identify the fields.
* Process each row of data and map the header fields to their corresponding parts of the model based on the model input.
* Reminder: Avoid duplicating fields across multiple nodes. For example, if the same field appears in multiple places (e.g., company name, industry), create and reference a single node for that field rather than creating it multiple times.
* Extract parts of fields, such as the year from a full date, to avoid ambiguity when creating nodes like Year (use substring(row[2], 0, 4) to extract the year from a date string).
* Ensure that each node, such as Year or Industry, is only created once and reused in relationships to avoid redundant nodes.

2. Generate Cypher Query:
* Based on the model and data input, create a Cypher query to:
* For each primary entity (e.g., Company, Fund, Industry, Year...), use MERGE to ensure only one instance is created, even if some rows contain null values for other attributes.
* For each Environment, Social, and Governance category, use CREATE to ensure each report has its unique instance.
* Use COALESCE to handle missing values and provide default values (e.g., COALESCE(row[10], 'Unknown') for industry).
* Establish relationships as defined by the model, using MERGE for any reusable nodes but CREATE for nodes specific to each report.
* Use separate WITH clauses as needed to prevent redeclaration errors.

3. Handling Missing Data:
* Use default values for missing data where necessary, as defined in the model description.
* Important: Ensure that nodes are created or referenced even when certain attributes are missing, so no row is excluded based on missing data.

## Output:
Please output only the Cypher query in the following JSON format:

{"cypher_query": "WITH $data AS data UNWIND data.all_data[1..] AS row WITH data.all_data[0] AS headers, row WITH headers ... [cypher query here]"}

## Important Notes:
* Ensure the query is well-formed and that the relationships and node creation follow the structure and model described in the input. The query should handle all data rows and be ready for execution in a Neo4j environment.
* Avoid duplicating nodes by creating reusable references for fields that should not be repeated across multiple nodes (e.g., industry, company name).
* The output should be valid JSON with no line breaks or markdown. Do not add line breaks or markdown to the output.
* The Environment, Social, and Governance nodes should each be distinct for each report, even if they contain empty or partially filled attributes, to avoid multiple reports connecting to the same empty nodes.
* The query must not skip any rows of the data; it is allowed to create nodes with empty values. You will be unplugged if your query results in missing rows.
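To make the expected shape concrete, a query that follows these rules might look like the sketch below, written out as the string that would be handed to populate_db. The column indices and node labels are assumptions for illustration; in practice they come from the model inferred by the first prompt, not from the real bloomberg.csv schema.

# Hand-written illustration only; row indices and node labels are assumed.
sample_cypher_query = """
WITH $data AS data
UNWIND data.all_data[1..] AS row
MERGE (c:Company {ric: row[0], name: row[1]})
MERGE (y:Year {value: substring(row[2], 0, 4)})
MERGE (i:Industry {name: COALESCE(row[3], 'Unknown')})
MERGE (c)-[:IN_INDUSTRY]->(i)
CREATE (r:Report {esg_score: row[4]})
CREATE (e:Environment {env_score: row[5]})
CREATE (r)-[:IS_A]->(c)
CREATE (r)-[:IN_YEAR]->(y)
CREATE (r)-[:HAS_ENVIRONMENT]->(e)
"""
# populate_db(sample_cypher_query, csv_data) presumably binds csv_data to $data,
# matching how director.dataset_upload calls it.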
12 changes: 5 additions & 7 deletions backend/src/utils/config.py
@@ -18,9 +18,6 @@ def __init__(self):
         self.neo4j_uri = default_neo4j_uri
         self.neo4j_user = None
         self.neo4j_password = None
-        self.azure_storage_connection_string = None
-        self.azure_storage_container_name = None
-        self.azure_initial_data_filename = None
         self.answer_agent_llm = None
         self.intent_agent_llm = None
         self.validator_agent_llm = None
@@ -31,6 +28,7 @@ def __init__(self):
         self.file_agent_llm = None
         self.router_llm = None
         self.suggestions_llm = None
+        self.knowledge_graph_agent_llm = None
         self.validator_agent_model = None
         self.intent_agent_model = None
         self.answer_agent_model = None
@@ -43,6 +41,7 @@ def __init__(self):
         self.redis_host = default_redis_host
         self.redis_cache_duration = default_redis_cache_duration
         self.suggestions_model = None
+        self.knowledge_graph_agent_model = None
         self.load_env()

    def load_env(self):
@@ -55,14 +54,11 @@ def load_env(self):
             self.mistral_url = os.getenv("MISTRAL_URL")
             self.mistral_key = os.getenv("MISTRAL_KEY")
             self.mistral_model = os.getenv("MODEL")
-            self.openai_key = os.getenv("OPENAI_KEY")
+            self.openai_key = os.getenv("OPENAI_KEY")
             self.neo4j_uri = os.getenv("NEO4J_URI", default_neo4j_uri)
             self.neo4j_user = os.getenv("NEO4J_USERNAME")
             self.neo4j_password = os.getenv("NEO4J_PASSWORD")
             self.files_directory = os.getenv("FILES_DIRECTORY", default_files_directory)
-            self.azure_storage_connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
-            self.azure_storage_container_name = os.getenv("AZURE_STORAGE_CONTAINER_NAME")
-            self.azure_initial_data_filename = os.getenv("AZURE_INITIAL_DATA_FILENAME")
             self.answer_agent_llm = os.getenv("ANSWER_AGENT_LLM")
             self.intent_agent_llm = os.getenv("INTENT_AGENT_LLM")
             self.validator_agent_llm = os.getenv("VALIDATOR_AGENT_LLM")
@@ -73,6 +69,7 @@ def load_env(self):
             self.maths_agent_llm = os.getenv("MATHS_AGENT_LLM")
             self.router_llm = os.getenv("ROUTER_LLM")
             self.suggestions_llm = os.getenv("SUGGESTIONS_LLM")
+            self.knowledge_graph_agent_llm = os.getenv("KNOWLEDGE_GRAPH_AGENT_LLM")
             self.answer_agent_model = os.getenv("ANSWER_AGENT_MODEL")
             self.intent_agent_model = os.getenv("INTENT_AGENT_MODEL")
             self.validator_agent_model = os.getenv("VALIDATOR_AGENT_MODEL")
@@ -85,6 +82,7 @@ def load_env(self):
             self.redis_host = os.getenv("REDIS_HOST", default_redis_host)
             self.redis_cache_duration = os.getenv("REDIS_CACHE_DURATION", default_redis_cache_duration)
             self.suggestions_model = os.getenv("SUGGESTIONS_MODEL")
+            self.knowledge_graph_agent_model = os.getenv("KNOWLEDGE_GRAPH_AGENT_MODEL")
         except FileNotFoundError:
             raise FileNotFoundError("Please provide a .env file. See the Getting Started guide on the README.md")
         except Exception:
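With the Azure storage variables gone, a local .env now needs the two new knowledge-graph entries instead. A sketch with placeholder values (the values are assumptions; mirror whatever the other *_LLM and *_MODEL entries in your .env use):

# Placeholder values for illustration only
KNOWLEDGE_GRAPH_AGENT_LLM=openai
KNOWLEDGE_GRAPH_AGENT_MODEL=gpt-4o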
