PR comments. Moved KGAgent back into agents/__init__.py to follow the agent getter model of the other agents. Updated the prompts to describe relationships more accurately, and made the generate-cypher-query prompt less restrictive by removing the copied example that was causing bad Cypher queries to be generated. Added one BDD test based on the new dataset.
IMladjenovic committed Nov 12, 2024
1 parent 5f20c45 commit 3f404e5
Showing 7 changed files with 58 additions and 52 deletions.
9 changes: 3 additions & 6 deletions .env.example
@@ -35,11 +35,6 @@ BACKEND_URL=http://localhost:8250
# websockets url to connect to backend websocket endpoint
WS_URL=ws://localhost:8250/ws

# Azure
AZURE_STORAGE_CONNECTION_STRING="my-connection-string"
AZURE_STORAGE_CONTAINER_NAME=my-container-name
AZURE_INITIAL_DATA_FILENAME=test-data.json

# llm
ANSWER_AGENT_LLM="openai"
INTENT_AGENT_llm="openai"
@@ -51,6 +46,8 @@ CHART_GENERATOR_LLM="openai"
ROUTER_LLM="openai"
FILE_AGENT_LLM="openai"
SUGGESTIONS_LLM="openai"
KNOWLEDGE_GRAPH_AGENT_LLM="openai"


# model
ANSWER_AGENT_MODEL="gpt-4o-mini"
@@ -63,4 +60,4 @@ CHART_GENERATOR_MODEL="gpt-4o-mini"
ROUTER_MODEL="gpt-4o-mini"
FILE_AGENT_MODEL="gpt-4o-mini"
SUGGESTIONS_MODEL="gpt-4o-mini"

KNOWLEDGE_GRAPH_AGENT_MODEL="gpt-4o-mini"
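The two new settings pair with the existing per-agent LLM/model variables. As a minimal sketch (not code from the repo, which presumably reads these through `src.utils.Config`), the values could be picked up like this; the variable names are taken from the diff, the defaults are illustrative:

```python
import os

# Hypothetical illustration of reading the new settings; the repo's Config
# class presumably exposes them as config.knowledge_graph_agent_llm and
# config.knowledge_graph_agent_model (see backend/src/agents/__init__.py below).
knowledge_graph_agent_llm = os.getenv("KNOWLEDGE_GRAPH_AGENT_LLM", "openai")
knowledge_graph_agent_model = os.getenv("KNOWLEDGE_GRAPH_AGENT_MODEL", "gpt-4o-mini")
```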
23 changes: 14 additions & 9 deletions backend/src/agents/__init__.py
@@ -1,14 +1,15 @@
from typing import List
from src.utils import Config
from .agent import Agent, agent
from .datastore_agent import DatastoreAgent
from .web_agent import WebAgent
from .intent_agent import IntentAgent
from .tool import tool, Parameter
from .validator_agent import ValidatorAgent
from .answer_agent import AnswerAgent
from .chart_generator_agent import ChartGeneratorAgent
from .file_agent import FileAgent
from src.agents.agent import Agent, agent
from src.agents.datastore_agent import DatastoreAgent
from src.agents.web_agent import WebAgent
from src.agents.intent_agent import IntentAgent
from src.agents.tool import tool, Parameter
from src.agents.validator_agent import ValidatorAgent
from src.agents.answer_agent import AnswerAgent
from src.agents.chart_generator_agent import ChartGeneratorAgent
from src.agents.file_agent import FileAgent
from src.agents.knowledge_graph_generator_agent import KnowledgeGraphAgent


config = Config()
@@ -26,6 +27,10 @@ def get_answer_agent() -> Agent:
return AnswerAgent(config.answer_agent_llm, config.answer_agent_model)


def get_knowledge_graph_agent() -> KnowledgeGraphAgent:
return KnowledgeGraphAgent(config.knowledge_graph_agent_llm, config.knowledge_graph_agent_model)


def agent_details(agent) -> dict:
return {"name": agent.name, "description": agent.description}

6 changes: 2 additions & 4 deletions backend/src/director.py
@@ -2,8 +2,7 @@
import logging
from src.utils import clear_scratchpad, update_scratchpad, get_scratchpad
from src.session import update_session_chat
from src.agents import get_intent_agent, get_answer_agent
from src.agents.knowledge_graph_generator_agent import KnowledgeGraphAgent
from src.agents import get_intent_agent, get_answer_agent, get_knowledge_graph_agent
from src.prompts import PromptEngine
from src.supervisors import solve_all
from src.utils import Config
@@ -14,7 +13,6 @@
config = Config()
engine = PromptEngine()
director_prompt = engine.load_prompt("director")
knowledge_graph_agent = KnowledgeGraphAgent(config.knowledge_graph_agent_llm, config.knowledge_graph_agent_model)


async def question(question: str) -> str:
@@ -56,6 +54,6 @@ async def dataset_upload() -> None:
for line in file
]

knowledge_graph_config = await knowledge_graph_agent.generate_knowledge_graph(csv_data) # type: ignore
knowledge_graph_config = await get_knowledge_graph_agent().generate_knowledge_graph(csv_data) # type: ignore

populate_db(knowledge_graph_config["cypher_query"], csv_data)
6 changes: 0 additions & 6 deletions backend/src/prompts/templates/generate-cypher-query.j2
@@ -30,12 +30,6 @@ If you cannot make a query, "query" should just say "None".
- When querying for a category case-insensitively, use `=~ '(?i)...'`.
- Example: To find a fund related to the `Aviation` industry with a `Social` ESG score, use:

```plaintext
MATCH (f:Fund)-[:CONTAINS]->(c:Company)-[:BELONGS_IN_INDUSTRY]->(i:Industry), (c)-[:HAS_ESG_SCORE]->(esg:ESGScore)
WHERE i.Name =~ '(?i)Aviation' AND esg.Category =~ '(?i)Social'
RETURN ...
```

6. **Property Matching**: Adhere to exact property values and capitalization in the schema (e.g., 'Aviation' and 'Social').

7. **Single Result for Maximum/Minimum**:
@@ -25,11 +25,11 @@ Describe Entities:

Describe Relationships:
* Example: (Entity1)-[:HAS_ENTITY2]->(Entity2): Links between main entities, such as companies, years, or categories.
* Example: (MainEntity)-[:REPORTED_ON {date: "YYYY-MM-DD"}]->(Report) or (MainEntity)-[:REPORTED_ON {year: "YYYY"}]->(Report): Links each report to its main entity, using the REPORTED_ON relationship with an attribute for the report date.
* Example: (MainEntity)-[:HAS_REPORT {date: "YYYY-MM-DD"}]->(Report) or (MainEntity)-[:HAS_REPORT {year: "YYYY"}]->(Report): Links each report to its main entity, using the HAS_REPORT relationship with an attribute for the report date.
* Example: (MainEntity)-[:BELONGS_TO]->(CategoryNode): Links main entities to a categorical node for any recurring category (such as industries or sectors).
* Example: (Report)-[:HAS_ENVIRONMENT]->(Environment): Connects main entities to the Environment category node for environment-specific metrics.
* Example: (Report)-[:HAS_SOCIAL]->(Social): Links main entities to the Social category node for social-specific metrics.
* Example: (Report)-[:HAS_GOVERNANCE]->(Governance): Links main entities to the Governance category node for governance-specific metrics.
* Example: (Report)-[:HAS_ESG_ENVIRONMENTAL_METRICS]->(Environment): Connects main entities to the Environment category node for environment-specific metrics.
* Example: (Report)-[:HAS_ESG_SOCIAL_METRICS]->(Social): Links main entities to the Social category node for social-specific metrics.
* Example: (Report)-[:HAS_ESG_GOVERNANCE_METRICS]->(Governance): Links main entities to the Governance category node for governance-specific metrics.

Please provide the inferred model structure in the "model" field of the JSON output, specifying entities, attributes, and relationships. The output must explicitly link each header in the input data to the corresponding part of the model.
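
As an illustration of the model structure this prompt asks for, here is a hypothetical Python rendering of one possible output for a company ESG dataset. The entity and relationship names mirror the examples above, but the exact shape is an assumption, not the repo's schema:

```python
# Hypothetical "model" value for a dataset of company ESG reports.
# Relationship types follow the template's examples (HAS_REPORT, BELONGS_TO,
# HAS_ESG_*_METRICS); attribute lists are invented for illustration.
model = {
    "entities": {
        "Company": ["Identifier", "Name"],
        "Industry": ["Name"],
        "Report": ["Date"],
        "Environment": ["Score"],
        "Social": ["Score"],
        "Governance": ["Score"],
    },
    "relationships": [
        "(Company)-[:HAS_REPORT {date: 'YYYY-MM-DD'}]->(Report)",
        "(Company)-[:BELONGS_TO]->(Industry)",
        "(Report)-[:HAS_ESG_ENVIRONMENTAL_METRICS]->(Environment)",
        "(Report)-[:HAS_ESG_SOCIAL_METRICS]->(Social)",
        "(Report)-[:HAS_ESG_GOVERNANCE_METRICS]->(Governance)",
    ],
}
```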

49 changes: 29 additions & 20 deletions backend/src/prompts/templates/generate-knowledge-graph-query.j2
@@ -17,33 +17,42 @@ all_data is the raw CSV data with headers.
Generate a Cypher query based on the provided model structure and data.

## Cypher Query Generation Steps:
1. Process Data:
* Use data.all_data[0] as headers to identify the fields.
1. **Process Data**:
* Use `data.all_data[0]` as headers to identify the fields.
* Process each row of data and map the header fields to their corresponding parts of the model based on the model input.
* Primary Entity Uniqueness: For each primary entity (e.g., Company, Fund, Industry), use the MERGE command on a unique identifier (e.g., Identifier for Company, Name for Industry) to ensure only one instance of each primary entity is created. If a node for a primary entity already exists in the graph, it should be reused rather than creating a duplicate.
* Reminder: Use MERGE on primary entities to avoid duplication, even if some rows contain null values for other attributes. Nodes for these entities should not be recreated for each row and should be reused across relationships.
* Environment, Social, and Governance Categories: For each report, use CREATE to generate unique nodes for Environment, Social, and Governance, even if they contain only default or empty values. This ensures each report links to its own ESG nodes rather than sharing them across multiple reports.

2. Generate Cypher Query:
* **Detect and Convert Data Types**: Infer the correct data types for each field in `all_data` by examining sample values. Ensure that fields expected to be numerical are converted to `Integer` or `Float` types, booleans are detected accurately, and string values remain as strings.
* **Conversion Logic**:
- If a value is entirely numeric and does not contain decimals, cast it as `Integer`.
- If a value contains decimal points, cast it as `Float`.
- If a value is "true" or "false" (case-insensitive), cast it as `Boolean`.
- Otherwise, treat it as `String`.
* **Example**: Use Cypher functions such as `toInteger()`, `toFloat()`, or `toBoolean()` as needed based on these inferred types for each field to ensure accurate data representation.

2. **Primary Entity Uniqueness**: For each primary entity (e.g., Company, Fund, Industry), use the `MERGE` command on a unique identifier (e.g., Identifier for Company, Name for Industry) to ensure only one instance of each primary entity is created. If a node for a primary entity already exists in the graph, it should be reused rather than creating a duplicate.
- **Reminder**: Use `MERGE` on primary entities to avoid duplication, even if some rows contain null values for other attributes. Nodes for these entities should not be recreated for each row and should be reused across relationships.
- **Environment, Social, and Governance Categories**: For each report, use `CREATE` to generate unique nodes for Environment, Social, and Governance, even if they contain only default or empty values. This ensures each report links to its own ESG nodes rather than sharing them across multiple reports.

3. **Generate Cypher Query**:
Based on the model and data input, create a Cypher query to:
* Primary Entities: For primary entities (Company, Industry, etc.), use MERGE to ensure each primary entity has a single node instance, with reusable references to avoid redundant nodes.
* Category-Specific Nodes: For each ESG report, use CREATE for Environment, Social, and Governance nodes so that each report has its own unique instance of these nodes.
* Use COALESCE to handle missing values and provide default values (e.g., COALESCE(row[10], 'Unknown') for industry).
* Establish relationships as defined by the model, using MERGE for any reusable nodes but CREATE for nodes specific to each report.
* Use separate WITH clauses as needed to prevent redeclaration errors
* Define and Preserve Variables: Ensure all variables, especially unique identifiers like identifier, are declared in WITH clauses and preserved across WITH clauses as needed. This prevents referencing undefined variables in MERGE or other clauses. Any variable declared in a previous WITH should not be redeclared.

3. Handling Missing Data:
* **Primary Entities**: For primary entities (Company, Industry, etc.), use `MERGE` to ensure each primary entity has a single node instance, with reusable references to avoid redundant nodes.
* **Category-Specific Nodes**: For each ESG report, use `CREATE` for Environment, Social, and Governance nodes so that each report has its own unique instance of these nodes.
* Use `COALESCE` to handle missing values and provide default values (e.g., `COALESCE(row[10], null)` for industry).
* Establish relationships as defined by the model, using `MERGE` for any reusable nodes but `CREATE` for nodes specific to each report.
* Use separate `WITH` clauses as needed to prevent redeclaration errors.
* **Define and Preserve Variables**: Ensure all variables, especially unique identifiers like `identifier`, are declared in `WITH` clauses and preserved across `WITH` clauses as needed. This prevents referencing undefined variables in `MERGE` or other clauses. Any variable declared in a previous `WITH` should not be redeclared.

4. **Handling Missing Data**:
* Use default values for missing data where necessary, as defined in the model description.
* Ensure that nodes are created or referenced even when certain attributes are missing, so no row is excluded based on missing data.

## Output:
5. **Output**:
Please output only the Cypher query in the following JSON format:

{"cypher_query": "WITH $data AS data UNWIND data.all_data[1..] AS row WITH data.all_data[0] AS headers, row WITH headers ... [cypher query here]"}

Important Notes:
**Important Notes**:
* Ensure the query is well-formed, and that the relationships and node creation follow the structure and model described in the input. The query should handle all data rows and be ready for execution in a Neo4j environment.
* Avoid Duplicating Primary Entity Nodes: For fields that should not be repeated across multiple nodes (e.g., Industry, Company name), use MERGE to create a single node instance and reference it throughout the query.
* Unique ESG Nodes per Report: Ensure that each report links to distinct Environment, Social, and Governance nodes, even if they are empty or partially filled, to avoid multiple reports connecting to the same empty nodes.
* Handle all data rows: The query must not skip any rows of the data. It is allowed to create nodes with empty values, and you will be unplugged if your query results in missing rows.
* **Avoid Duplicating Primary Entity Nodes**: For fields that should not be repeated across multiple nodes (e.g., Industry, Company name), use `MERGE` to create a single node instance and reference it throughout the query.
* **Unique ESG Nodes per Report**: Ensure that each report links to distinct Environment, Social, and Governance nodes, even if they are empty or partially filled, to avoid multiple reports connecting to the same empty nodes.
* **Handle all data rows**: The query must not skip any rows of the data. It is allowed to create nodes with empty values, and you will be unplugged if your query results in missing rows.
* The output should be valid JSON with no line breaks or markdown. Do not add line breaks or markdown to the output.
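
The conversion logic described in step 1 is simple to express in code. A minimal Python sketch (not part of the repo; the sample headers and values are invented) of how a value's Cypher cast could be inferred:

```python
import re

def infer_cypher_cast(column_ref: str, sample_value: str) -> str:
    """Return a Cypher expression casting `column_ref` per the rules in step 1:
    whole numbers -> toInteger, decimals -> toFloat, true/false -> toBoolean,
    anything else stays a string."""
    value = sample_value.strip()
    if re.fullmatch(r"-?\d+", value):
        return f"toInteger({column_ref})"
    if re.fullmatch(r"-?\d+\.\d+", value):
        return f"toFloat({column_ref})"
    if value.lower() in ("true", "false"):
        return f"toBoolean({column_ref})"
    return column_ref  # leave strings untouched

headers = ["Company", "Year", "EnvironmentalScore", "IsActive"]   # invented sample
sample_row = ["Acme Corp", "2023", "69.67", "true"]               # invented sample
casts = {h: infer_cypher_cast(f"row[{i}]", v)
         for i, (h, v) in enumerate(zip(headers, sample_row))}
# -> {'Company': 'row[0]', 'Year': 'toInteger(row[1])',
#     'EnvironmentalScore': 'toFloat(row[2])', 'IsActive': 'toBoolean(row[3])'}
```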
@@ -5,9 +5,12 @@ Scenario Outline: When a user asks InferESG for information about their transact
Then the response to this '<prompt>' should match the '<expected_response>'
Examples:
|prompt |expected_response |
|Check the database and tell me the average ESG score (Environmental) for the WhiteRock ETF fund |The average ESG score (Environmental) for the WhiteRock ETF fund is approximately 69.67|
|Check the database and tell me the fund with the highest ESG social score |Dynamic Industries with a score of 91|
|Check the database and tell me the fund with the lowest Governance ESG score |Dynamic Industries, which has a score of 60|
|Check the database and tell me the average ESG score for the American Airlines Group Inc company|The average ESG score for American Airlines Group Inc is approximately 60.37|

# prompts based on the esg_poc.csv which is no longer loaded into the datastore by default
# |Check the database and tell me the average ESG score (Environmental) for the WhiteRock ETF fund |The average ESG score (Environmental) for the WhiteRock ETF fund is approximately 69.67|
# |Check the database and tell me the fund with the highest ESG social score |Dynamic Industries with a score of 91|
# |Check the database and tell me the fund with the lowest Governance ESG score |Dynamic Industries, which has a score of 60|
# |Check the database and tell me the fund with the lowest ESG score |Dynamic Industries with a score of 50|
# |Check the database and tell me the largest fund |The largest fund is the Global Energy Fund, which has a size of 1,500|
# |Check the database and tell me which funds contain Shell |Funds containing Shell are European Growth Fund, Global Energy Fund, Silverman Global ETF and WhiteRock ETF|
