pull all changes across
IMladjenovic committed Nov 11, 2024
1 parent 1a1d212 commit 45ff842
Showing 14 changed files with 906 additions and 110 deletions.
3 changes: 3 additions & 0 deletions backend/Dockerfile
@@ -7,6 +7,9 @@ WORKDIR /backend
# Copy just the requirements into the working directory so it gets cached by itself
COPY ./requirements.txt ./requirements.txt

# Copy the datasets directory; its location should match what a local run of the application expects
COPY ./datasets/ ./datasets/

# Install the dependencies from the requirements file
RUN pip install --no-cache-dir --upgrade -r /backend/requirements.txt

14 changes: 14 additions & 0 deletions backend/datasets/README.md
@@ -0,0 +1,14 @@
# Datasets

This is a temporary folder for preloading the ESG dataset into InferESG.

This will be replaced by file upload from the UI.

## Bloomberg.csv
This was downloaded from https://data.mendeley.com/datasets/tgmppk9kkt/1

## Bloomberg_2.csv
This was generated by ChatGPT from the original bloomberg.csv.

## esg_poc.csv
This was hand-crafted by the original InferESG PoC team.
721 changes: 721 additions & 0 deletions backend/datasets/bloomberg.csv

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions backend/src/agents/__init__.py
@@ -9,6 +9,7 @@
 from .answer_agent import AnswerAgent
 from .chart_generator_agent import ChartGeneratorAgent
 from .file_agent import FileAgent
+from .knowledge_graph_generator_agent import KnowledgeGraphAgent


config = Config()
@@ -26,6 +27,10 @@ def get_answer_agent() -> Agent:
     return AnswerAgent(config.answer_agent_llm, config.answer_agent_model)


+def get_knowledge_graph_agent() -> KnowledgeGraphAgent:  # Something about this feels wrong: unlike the other factories, it is annotated with the concrete agent type
+    return KnowledgeGraphAgent(config.knowledge_graph_agent_llm, config.knowledge_graph_agent_model)
+
+
 def agent_details(agent) -> dict:
     return {"name": agent.name, "description": agent.description}

@@ -55,6 +60,7 @@ def get_agent_details():
     "get_intent_agent",
     "get_available_agents",
     "get_validator_agent",
+    "get_knowledge_graph_agent",
     "Parameter",
     "tool",
 ]
39 changes: 39 additions & 0 deletions backend/src/agents/knowledge_graph_generator_agent.py
@@ -0,0 +1,39 @@
import json
import logging

from src.prompts import PromptEngine
from src.agents import Agent, agent

logger = logging.getLogger(__name__)
engine = PromptEngine()


@agent(
    name="KnowledgeGraphAgent",
    description="This agent is responsible for generating knowledge graphs to import csv datasets",
    tools=[],
)
class KnowledgeGraphAgent(Agent):
    async def generate_knowledge_graph(self, file_path: str, csv_data: list[list[str]]) -> dict[str, str]:
        # file_path is currently unused; only the first 50 rows (header included)
        # are sent to the LLM to keep the prompts small.
        reduced_data_set = csv_data[:50]

        # Step 1: ask the LLM to infer a graph model from the sample rows.
        create_model = engine.load_prompt(
            "generate-knowledge-graph-model",
            csv_input=reduced_data_set,
        )

        model_response = await self.llm.chat(self.model, create_model, user_prompt="")

        model = json.loads(model_response)["model"]

        # Step 2: ask the LLM for a Cypher import query that follows the model.
        create_query = engine.load_prompt(
            "generate-knowledge-graph-query",
            csv_input=reduced_data_set,
            model_input=model,
        )

        query_response = await self.llm.chat(self.model, create_query, user_prompt="")

        query = json.loads(query_response)["cypher_query"]
        return {"cypher_query": query, "model": model}
26 changes: 6 additions & 20 deletions backend/src/api/app.py
@@ -1,19 +1,15 @@
 from contextlib import asynccontextmanager
 import json

Check failure (GitHub Actions / Linting Backend) on line 2 in backend/src/api/app.py: Ruff (F401) `json` imported but unused

 import logging
 import logging.config
 import os
-from azure.storage.blob import BlobServiceClient
 from typing import NoReturn
 from fastapi import FastAPI, HTTPException, WebSocket, UploadFile
 from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
-from src.utils.graph_db_utils import populate_db
 from src.utils import Config, test_connection
-from src.director import question
+from src.director import question, dataset_upload
 from src.websockets.connection_manager import connection_manager, parse_message
 from src.session import RedisSessionMiddleware
-from src.utils.cyper_import_data_from_csv import import_data_from_csv_script
 from src.suggestions_generator import generate_suggestions
 from src.file_upload_service import handle_file_upload, get_file_upload

@@ -27,22 +23,9 @@
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     try:
-        if (
-            config.azure_storage_connection_string is None
-            or config.azure_storage_container_name is None
-            or config.azure_initial_data_filename is None
-        ):
-            raise Exception("Missing Azure Environment variables. Please check the README.md for guidance.")
-
-        blob_service_client = BlobServiceClient.from_connection_string(config.azure_storage_connection_string)
-        container_client = blob_service_client.get_container_client(config.azure_storage_container_name)
-        blob_client = container_client.get_blob_client(config.azure_initial_data_filename)
-        download_stream = blob_client.download_blob()
-        annual_transactions = download_stream.readall().decode("utf-8")
-        populate_db(import_data_from_csv_script, json.loads(annual_transactions))
+        await dataset_upload()
     except Exception as e:
-        logger.exception(f"Failed to populate database with initial data from Azure: {e}")
-        populate_db(import_data_from_csv_script, {})
+        logger.exception(f"Failed to populate database with initial data from file: {e}")
     yield


@@ -85,6 +68,7 @@ async def health_check():
    finally:
        return response


@app.get("/chat")
async def chat(utterance: str):
logger.info(f"Chat method called with utterance: {utterance}")
@@ -106,6 +90,7 @@ async def suggestions():
        logger.exception(e)
        return JSONResponse(status_code=500, content=suggestions_failed_response)


@app.post("/uploadfile")
async def create_upload_file(file: UploadFile):
logger.info(f"upload file type={file.content_type} name={file.filename} size={file.size}")
@@ -118,6 +103,7 @@ async def create_upload_file(file: UploadFile):
        logger.exception(e)
        return JSONResponse(status_code=500, content=file_upload_failed_response)


@app.get("/uploadfile")
async def fetch_file(id: str):
logger.info(f"fetch uploaded file id={id} ")
17 changes: 16 additions & 1 deletion backend/src/director.py
@@ -2,10 +2,11 @@
 import logging
 from src.utils import clear_scratchpad, update_scratchpad, get_scratchpad
 from src.session import update_session_chat
-from src.agents import get_intent_agent, get_answer_agent
+from src.agents import get_intent_agent, get_answer_agent, get_knowledge_graph_agent
 from src.prompts import PromptEngine
 from src.supervisors import solve_all
 from src.utils import Config
+from src.utils.graph_db_utils import populate_db
 from src.websockets.connection_manager import connection_manager
logger = logging.getLogger(__name__)
@@ -42,3 +43,17 @@ async def question(question: str) -> str:
    clear_scratchpad()

    return final_answer


async def dataset_upload() -> None:
    dataset_file = "./datasets/bloomberg.csv"

    with open(dataset_file, 'r') as file:
        # Split each line on commas; the first row holds the headers.
        csv_data = [line.strip('\n').split(",") for line in file]

    knowledge_graph_config = await get_knowledge_graph_agent().generate_knowledge_graph(dataset_file, csv_data)

Check failure (GitHub Actions / Type Checking Backend) on line 56 in backend/src/director.py: Cannot access attribute "generate_knowledge_graph" for class "Agent". Attribute "generate_knowledge_graph" is unknown (reportAttributeAccessIssue)
    populate_db(knowledge_graph_config["cypher_query"], csv_data)
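One possible way to address the type-checking failure above, sketched under the assumption that the @agent decorator (or the agent registry) widens the factory's return type to the base Agent class, is to narrow the type at the call site. The name dataset_upload_typed is hypothetical:

from typing import cast

from src.agents import get_knowledge_graph_agent
from src.agents.knowledge_graph_generator_agent import KnowledgeGraphAgent
from src.utils.graph_db_utils import populate_db


async def dataset_upload_typed(dataset_file: str, csv_data: list[list[str]]) -> None:
    # Hypothetical variant of dataset_upload: the cast tells the checker which
    # concrete agent the factory returns, so generate_knowledge_graph is visible.
    kg_agent = cast(KnowledgeGraphAgent, get_knowledge_graph_agent())
    knowledge_graph_config = await kg_agent.generate_knowledge_graph(dataset_file, csv_data)
    populate_db(knowledge_graph_config["cypher_query"], csv_data)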
45 changes: 45 additions & 0 deletions backend/src/prompts/templates/generate-knowledge-graph-model.j2
@@ -0,0 +1,45 @@
You are an expert in Neo4j and Cypher. I need your help to analyze and model ESG data based on the structure of the provided input.

The input data is formatted as JSON:
{
  "all_data": {{ csv_input }}
}
Here, all_data is a list of lists, where each inner list corresponds to a row of data, and the first row contains the headers.

# Objective:
Analyze the structure of this data to produce an intuitive Neo4j model, focusing on identifying core entities, attributes, and relationships. Then generate a Cypher query to import the data according to this model.

## Data Model Inference
Output a comprehensive model of the data in the "model" field of the JSON response.

## Data Structure:
* Report: Each row in the dataset represents an ESG report about a company. This can be represented as a node.
* Identify Key Entities: Based on the data headers, determine the main entity types (e.g., Company, Fund, or other core entities in the dataset) and map out each unique entity's attributes. Favour full names over abbreviations.
* Identify Common Categories: Based on the data, determine common categories that appear in the data. Look for recurring values that appear in the same csv column and map these out as nodes with relationships to the main entity types.
* Determine Relationships: Define the relationships between these entities, such as associating entities with reports, linking entities to specific time periods, or establishing hierarchical or categorical groupings within the data.
* Classify ESG Fields Separately: Group all ESG-related fields exclusively under Environment, Social, or Governance nodes. Ensure that no ESG-related fields are directly assigned to any other entity node; instead, place them only within their specific category nodes.
* Attribute Assignment: Ensure that each attribute is assigned to only one appropriate entity or category node without duplicating fields across nodes.
## Output Model Structure:

## Describe Entities:
* Example: Company: Represents each company with attributes like name and identifier.
* Example: Year: Represents each distinct time period, based on date-related fields.
* Example: Environment, Social, Governance: Each node represents one ESG category, containing only fields relevant to that specific category (e.g., CO2 emissions for Environment, injury rate for Social, and shareholder rights for Governance).
## Describe Relationships:
* Example: (Entity1)-[:HAS_ENTITY2]->(Entity2): Links between main entities, such as companies, years, or categories.
* Example: (MainEntity)-[:HAS_ENVIRONMENT]->(Environment): Connects main entities to the Environment category node for environment-specific metrics.
* Example: (MainEntity)-[:HAS_SOCIAL]->(Social): Links main entities to the Social category node for social-specific metrics.
* Example: (MainEntity)-[:HAS_GOVERNANCE]->(Governance): Links main entities to the Governance category node for governance-specific metrics.
* Example: (MainEntity)-[:BELONGS_TO]->(CategoryNode): Links main entities to a categorical node for any recurring category (such as industries or sectors).
Please provide the inferred model structure in the "model" field of the JSON output, specifying entities, attributes, and relationships. The output must explicitly link each header in the input data to the corresponding part of the model.

## Expected Output Format:
{ model: "The model identifies 'Company' as the primary entity with unique identifiers for each company. Each data row is represented by a 'Report' node, with 'Year' nodes representing temporal relationships, and 'Industry' nodes for each unique industry type. ESG metrics are grouped into separate 'Environment', 'Social', and 'Governance' nodes. Relationships include: (Report)-[:IS_A]->(Company), (Report)-[:IN_YEAR]->(Year), (Company)-[:IN_INDUSTRY]->(Industry), (Report)-[:HAS_ENVIRONMENT]->(Environment), (Report)-[:HAS_SOCIAL]->(Social), and (Report)-[:HAS_GOVERNANCE]->(Governance). \n\nThe attributes for each entity are as follows:\n\n* Company: Identifier (RIC), Company Name\n\n* Year: Date\n\n* Industry: Name\n\n* Report: ESG_score, BVPS, Market_cap, Shares, Net_income, RETURN_ON_ASSET, QUICK_RATIO, ASSET_GROWTH, FNCL_LVRG, PE_RATIO\n\n* Environment: Env_score, Scope_1, Scope_2, CO2_emissions, Energy_use, Water_use, Water_recycle, Toxic_chem_red\n\n* Social: Social_score, Injury_rate, Women_Employees, Human_Rights, Strikes, Turnover_empl, Board_Size, Bribery, Recycling_Initiatives\n\n* Governance: Gov_score, Shareholder_Rights, Board_gen_div\n\nRelationships:\n\n* (Report)-[:IS_A]->(Company): Links each report to the relevant company.\n\n* (Report)-[:IN_YEAR]->(Year): Links each report to a specific year.\n\n* (Company)-[:IN_INDUSTRY]->(Industry): Links each company to its industry node.\n\n* (Report)-[:HAS_ENVIRONMENT]->(Environment): Links each report to an Environment node containing environmental metrics.\n\n* (Report)-[:HAS_SOCIAL]->(Social): Links each report to a Social node containing social metrics.\n\n* (Report)-[:HAS_GOVERNANCE]->(Governance): Links each report to a Governance node containing governance metrics.\n\nEach header in the input data is assigned to one of these nodes based on the model’s grouping structure." }

## Important Notes:
* Ensure the model clearly identifies which header field relates to which part of the Neo4j graph.
* Do not assign ESG-specific fields (e.g., emissions, board diversity) directly to any main entity nodes. Instead, place them within their respective category nodes (Environment, Social, Governance).
* Identify any recurring categories in the data and create nodes for them instead of representing them as individual attributes (e.g., sector, industry, type, fund). Link main entities to these nodes with appropriate relationships.
* Avoid duplicating fields across nodes. Assign each field only to the most appropriate entity or category node.
* Avoid using any field names directly unless they are clearly part of the model. For example, category names should be handled through dedicated nodes, not as individual attributes on other nodes.
* The output should be valid JSON with no line breaks or markdown. Do not add line breaks or markdown to the output.
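For a quick sanity check of this template's inputs, rendering it from Python might look like the sketch below. It assumes PromptEngine.load_prompt resolves templates by file name, as the agent code above does; the CSV rows are invented, with header names borrowed from the expected-output example.

from src.prompts import PromptEngine

engine = PromptEngine()

# Invented sample rows; header names follow the expected-output example above.
csv_rows = [
    ["RIC", "Company Name", "Date", "Industry", "ESG_score", "Env_score"],
    ["ACME.N", "Acme Corp", "2023-12-31", "Utilities", "71.5", "64.2"],
]
prompt = engine.load_prompt("generate-knowledge-graph-model", csv_input=csv_rows)
# prompt now holds the rendered template text that is sent to the LLM.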
49 changes: 49 additions & 0 deletions backend/src/prompts/templates/generate-knowledge-graph-query.j2
@@ -0,0 +1,49 @@
You are an expert in Cypher and Neo4j. I need your help to generate a Cypher query for importing ESG data into a Neo4j graph database.

## Inputs:

1. CSV Data: You will receive the CSV data, which consists of a list of lists where the first list contains the headers, and the subsequent lists represent the rows of data.
2. Model Description: You will receive a description of the model structure to create from the data. This model will define the core entities, attributes, and relationships based on the headers from the provided data.

The format will be:
{
  "model": {{ model_input }},
  "all_data": {{ csv_input }}
}

all_data is the raw CSV data with headers.

# Objective:
Generate a Cypher query based on the provided model structure and data.

## Cypher Query Generation Steps:
1. Process Data:
* Use data.all_data[0] as headers to identify the fields.
* Process each row of data and map the header fields to their corresponding parts of the model based on the model input.
* Reminder: Avoid duplicating fields across multiple nodes. For example, if the same field appears in multiple places (e.g., company name, industry), create and reference a single node for that field rather than creating it multiple times.
* Extract parts of fields, such as the year from a full date, to avoid ambiguity when creating nodes like Year (use substring(row[2], 0, 4) to extract the year from a date string).
* Ensure that each node, such as Year or Industry, is only created once and reused in relationships to avoid redundant nodes.

2. Generate Cypher Query:
* Based on the model and data input, create a Cypher query to:
* For each primary entity (e.g., Company, Fund, Industry, Year...), use MERGE to ensure only one instance is created, even if some rows contain null values for other attributes.
* For each Environment, Social, and Governance category, use CREATE to ensure each report has its unique instance.
* Use COALESCE to handle missing values and provide default values (e.g., COALESCE(row[10], 'Unknown') for industry).
* Establish relationships as defined by the model, using MERGE for any reusable nodes but CREATE for nodes specific to each report.
* Use separate WITH clauses as needed to prevent redeclaration errors.

3. Handling Missing Data:
* Use default values for missing data where necessary, as defined in the model description.
* Important: Ensure that nodes are created or referenced even when certain attributes are missing, so no row is excluded based on missing data.

## Output:
Please output only the Cypher query in the following JSON format:

{"cypher_query": "WITH $data AS data UNWIND data.all_data[1..] AS row WITH data.all_data[0] AS headers, row WITH headers ... [cypher query here]"}

## Important Notes:
* Ensure the query is well-formed and that the relationships and node creation follow the structure and model described in the input. The query should handle all data rows and be ready for execution in a Neo4j environment.
* Avoid duplicating nodes by creating reusable references for fields that should not be repeated across multiple nodes (e.g., industry, company name).
* The output should be valid JSON with no line breaks or markdown. Do not add line breaks or markdown to the output.
* The Environment, Social, and Governance nodes should each be distinct for each report, even if they contain empty or partially filled attributes, to avoid multiple reports connecting to the same empty nodes.
* The query must not skip any rows of the data; it is allowed to create nodes with empty values. You will be unplugged if your query results in missing rows.
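To make the expected shape concrete, a query that follows these rules might look like the sketch below, written out as the string that would be handed to populate_db. The column indices and node labels are assumptions for illustration; in practice they come from the model inferred by the first prompt, not from the real bloomberg.csv schema.

# Hand-written illustration only; row indices and node labels are assumed.
sample_cypher_query = """
WITH $data AS data
UNWIND data.all_data[1..] AS row
MERGE (c:Company {ric: row[0], name: row[1]})
MERGE (y:Year {value: substring(row[2], 0, 4)})
MERGE (i:Industry {name: COALESCE(row[3], 'Unknown')})
MERGE (c)-[:IN_INDUSTRY]->(i)
CREATE (r:Report {esg_score: row[4]})
CREATE (e:Environment {env_score: row[5]})
CREATE (r)-[:IS_A]->(c)
CREATE (r)-[:IN_YEAR]->(y)
CREATE (r)-[:HAS_ENVIRONMENT]->(e)
"""
# populate_db(sample_cypher_query, csv_data) presumably binds csv_data to $data,
# matching how director.dataset_upload calls it.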
12 changes: 5 additions & 7 deletions backend/src/utils/config.py
@@ -18,9 +18,6 @@ def __init__(self):
         self.neo4j_uri = default_neo4j_uri
         self.neo4j_user = None
         self.neo4j_password = None
-        self.azure_storage_connection_string = None
-        self.azure_storage_container_name = None
-        self.azure_initial_data_filename = None
         self.answer_agent_llm = None
         self.intent_agent_llm = None
         self.validator_agent_llm = None
@@ -31,6 +28,7 @@ def __init__(self):
         self.file_agent_llm = None
         self.router_llm = None
         self.suggestions_llm = None
+        self.knowledge_graph_agent_llm = None
         self.validator_agent_model = None
         self.intent_agent_model = None
         self.answer_agent_model = None
@@ -43,6 +41,7 @@ def __init__(self):
         self.redis_host = default_redis_host
         self.redis_cache_duration = default_redis_cache_duration
         self.suggestions_model = None
+        self.knowledge_graph_agent_model = None
         self.load_env()

    def load_env(self):
@@ -55,14 +54,11 @@ def load_env(self):
             self.mistral_url = os.getenv("MISTRAL_URL")
             self.mistral_key = os.getenv("MISTRAL_KEY")
             self.mistral_model = os.getenv("MODEL")
-            self.openai_key = os.getenv("OPENAI_KEY")
+            self.openai_key = os.getenv("OPENAI_KEY")
             self.neo4j_uri = os.getenv("NEO4J_URI", default_neo4j_uri)
             self.neo4j_user = os.getenv("NEO4J_USERNAME")
             self.neo4j_password = os.getenv("NEO4J_PASSWORD")
             self.files_directory = os.getenv("FILES_DIRECTORY", default_files_directory)
-            self.azure_storage_connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
-            self.azure_storage_container_name = os.getenv("AZURE_STORAGE_CONTAINER_NAME")
-            self.azure_initial_data_filename = os.getenv("AZURE_INITIAL_DATA_FILENAME")
             self.answer_agent_llm = os.getenv("ANSWER_AGENT_LLM")
             self.intent_agent_llm = os.getenv("INTENT_AGENT_LLM")
             self.validator_agent_llm = os.getenv("VALIDATOR_AGENT_LLM")
@@ -73,6 +69,7 @@ def load_env(self):
             self.maths_agent_llm = os.getenv("MATHS_AGENT_LLM")
             self.router_llm = os.getenv("ROUTER_LLM")
             self.suggestions_llm = os.getenv("SUGGESTIONS_LLM")
+            self.knowledge_graph_agent_llm = os.getenv("KNOWLEDGE_GRAPH_AGENT_LLM")
             self.answer_agent_model = os.getenv("ANSWER_AGENT_MODEL")
             self.intent_agent_model = os.getenv("INTENT_AGENT_MODEL")
             self.validator_agent_model = os.getenv("VALIDATOR_AGENT_MODEL")
@@ -85,6 +82,7 @@ def load_env(self):
             self.redis_host = os.getenv("REDIS_HOST", default_redis_host)
             self.redis_cache_duration = os.getenv("REDIS_CACHE_DURATION", default_redis_cache_duration)
             self.suggestions_model = os.getenv("SUGGESTIONS_MODEL")
+            self.knowledge_graph_agent_model = os.getenv("KNOWLEDGE_GRAPH_AGENT_MODEL")
         except FileNotFoundError:
             raise FileNotFoundError("Please provide a .env file. See the Getting Started guide on the README.md")
         except Exception:
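With the Azure storage variables gone, a local .env now needs the two new knowledge-graph entries instead. A sketch with placeholder values (the values are assumptions; mirror whatever the other *_LLM and *_MODEL entries in your .env use):

# Placeholder values for illustration only
KNOWLEDGE_GRAPH_AGENT_LLM=openai
KNOWLEDGE_GRAPH_AGENT_MODEL=gpt-4o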
