From 3ddb47c9e4e42aade6ee759d0fb39021e7d72e60 Mon Sep 17 00:00:00 2001 From: "Ivan Mladjenovic (He/Him)" Date: Tue, 12 Nov 2024 14:26:42 +0000 Subject: [PATCH] add datasets from abondned branch. slight improvements to prompts --- backend/datasets/bloomberg_2.csv | 25 +++++++++++++++++++ {data => backend}/datasets/esg_poc.csv | 0 .../generate-knowledge-graph-model.j2 | 6 +++-- .../generate-knowledge-graph-query.j2 | 23 +++++++++-------- 4 files changed, 41 insertions(+), 13 deletions(-) create mode 100644 backend/datasets/bloomberg_2.csv rename {data => backend}/datasets/esg_poc.csv (100%) diff --git a/backend/datasets/bloomberg_2.csv b/backend/datasets/bloomberg_2.csv new file mode 100644 index 00000000..7b1991a9 --- /dev/null +++ b/backend/datasets/bloomberg_2.csv @@ -0,0 +1,25 @@ +Identifier (RIC),Company Name,Date,ESG_score,Social_score,Gov_score,Env_score,BVPS,Market_cap,Shares,Industry,Net_income,RETURN_ON_ASSET,QUICK_RATIO,ASSET_GROWTH,FNCL_LVRG,PE_RATIO,Scope_1,Scope_2,CO2_emissions,Energy_use,Water_use,Water_recycle,Toxic_chem_red,Injury_rate,Women_Employees,Human_Rights,Strikes,Turnover_empl,Board_Size,Shareholder_Rights,Board_gen_div,Bribery,Recycling_Initiatives,Total_assets +AAPL,Apple Inc,2021,85.3,88.5,82.1,85.3,5.04,2410530000000,16442000000,Technology,94680000000,25.31,1.1,13.77,1.25,32.45,1234567,89101112,123456789,1234567890,12345678900,123456789000,12.345,12.345,1,1,1,1,1,1,1,1,1,1,1234567890000 +GOOG,Alphabet Inc,2021,87.4,90.1,84.7,87.4,5.21,1797000000000,6949000000,Technology,40780000000,14.04,1.2,14.56,1.3,29.78,2345678,91011112,23456789,234567890,2345678900,23456789000,23.456,23.456,1,1,1,1,1,1,1,1,1,1,2345678900000 +AMZN,Amazon.com Inc,2021,78.9,81.2,76.6,78.9,4.87,1676000000000,5031000000,Retail,21330000000,6.2,1.0,15.34,1.4,35.21,3456789,10111112,34567890,345678900,3456789000,34567890000,34.567,34.567,1,1,1,1,1,1,1,1,1,1,3456789000000 +MSFT,Microsoft 
Corporation,2021,86.7,89.3,84.1,86.7,4.98,2032000000000,7534000000,Technology,61270000000,29.12,1.1,14.89,1.2,31.54,4567890,11111112,45678901,456789010,4567890100,45678901000,45.678,45.678,1,1,1,1,1,1,1,1,1,1,4567890100000 +FB,Facebook Inc,2021,75.6,78.2,73.0,75.6,3.74,927000000000,2834000000,Technology,29150000000,31.52,1.0,16.45,1.5,27.89,5678901,11111112,56789012,567890120,5678901200,56789012000,56.789,56.789,1,1,1,1,1,1,1,1,1,1,5678901200000 +TSLA,Tesla Inc,2021,72.8,75.4,69.2,72.8,4.13,780000000000,9631000000,Automotive,23210000000,11.76,1.2,17.67,1.6,34.56,6789012,11111112,67890123,678901230,6789012300,67890123000,67.890,67.890,1,1,1,1,1,1,1,1,1,1,6789012300000 +NFLX,Netflix Inc,2021,74.5,77.1,71.9,74.5,3.92,240000000000,4571000000,Media,28610000000,12.67,1.1,18.78,1.7,31.23,7890123,11111112,78901234,789012340,7890123400,78901234000,78.901,78.901,1,1,1,1,1,1,1,1,1,1,7890123400000 +NVDA,NVIDIA Corporation,2021,82.3,84.9,79.7,82.3,4.35,309000000000,6142000000,Technology,43320000000,16.89,1.0,19.89,1.8,37.45,8901234,11111112,89012345,890123450,8901234500,89012345000,89.012,89.012,1,1,1,1,1,1,1,1,1,1,8901234500000 +AAPL,Apple Inc,2020,84.2,87.6,80.8,84.2,4.91,2258000000000,16123000000,Technology,89860000000,24.78,1.1,14.12,1.3,31.89,9012345,11111112,90123456,901234560,9012345600,90123456000,90.123,90.123,1,1,1,1,1,1,1,1,1,1,9012345600000 +GOOG,Alphabet Inc,2020,86.5,89.2,83.8,86.5,5.12,1676000000000,6849000000,Technology,38270000000,13.56,1.2,15.01,1.4,28.97,1234567,11111112,12345678,123456780,1234567800,12345678000,12.345,12.345,1,1,1,1,1,1,1,1,1,1,1234567800000 +AMZN,Amazon.com Inc,2020,77.8,80.1,75.5,77.8,4.76,1567000000000,4931000000,Retail,19780000000,5.98,1.0,16.02,1.5,34.12,2345678,11111112,23456789,234567890,2345678900,23456789000,23.456,23.456,1,1,1,1,1,1,1,1,1,1,2345678900000 +MSFT,Microsoft 
Corporation,2020,85.6,88.2,83.0,85.6,4.87,1923000000000,7434000000,Technology,59120000000,28.01,1.1,15.43,1.3,30.78,3456789,11111112,34567890,345678900,3456789000,34567890000,34.567,34.567,1,1,1,1,1,1,1,1,1,1,3456789000000 +FB,Facebook Inc,2020,74.5,77.1,71.9,74.5,3.63,876000000000,2734000000,Technology,27890000000,30.45,1.0,17.12,1.6,26.98,4567890,11111112,45678901,456789010,4567890100,45678901000,45.678,45.678,1,1,1,1,1,1,1,1,1,1,4567890100000 +TSLA,Tesla Inc,2020,71.7,74.3,69.1,71.7,3.98,720000000000,9531000000,Automotive,21980000000,10.98,1.2,18.23,1.7,33.78,5678901,11111112,56789012,567890120,5678901200,56789012000,56.789,56.789,1,1,1,1,1,1,1,1,1,1,5678901200000 +NFLX,Netflix Inc,2020,73.4,76.0,70.8,73.4,3.81,228000000000,4471000000,Media,27450000000,11.98,1.1,19.34,1.8,30.56,6789012,11111112,67890123,678901230,6789012300,67890123000,67.890,67.890,1,1,1,1,1,1,1,1,1,1,6789012300000 +NVDA,NVIDIA Corporation,2020,81.2,83.8,78.6,81.2,4.24,298000000000,6042000000,Technology,42130000000,15.98,1.0,20.45,1.9,36.78,7890123,11111112,78901234,789012340,7890123400,78901234000,78.901,78.901,1,1,1,1,1,1,1,1,1,1,7890123400000 +AAPL,Apple Inc,2019,83.1,86.5,79.7,83.1,4.80,2145000000000,15923000000,Technology,87650000000,23.89,1.1,14.56,1.4,30.98,8901234,11111112,89012345,890123450,8901234500,89012345000,89.012,89.012,1,1,1,1,1,1,1,1,1,1,8901234500000 +GOOG,Alphabet Inc,2019,85.4,88.1,82.7,85.4,5.01,1597000000000,6749000000,Technology,37120000000,12.98,1.2,15.56,1.5,27.89,1234567,11111112,12345678,123456780,1234567800,12345678000,12.345,12.345,1,1,1,1,1,1,1,1,1,1,1234567800000 +AMZN,Amazon.com Inc,2019,76.7,79.0,74.4,76.7,4.65,1489000000000,4831000000,Retail,18970000000,5.87,1.0,16.56,1.6,33.45,2345678,11111112,23456789,234567890,2345678900,23456789000,23.456,23.456,1,1,1,1,1,1,1,1,1,1,2345678900000 +MSFT,Microsoft 
Corporation,2019,84.5,87.1,81.9,84.5,4.76,1834000000000,7334000000,Technology,57890000000,27.01,1.1,15.98,1.4,30.12,3456789,11111112,34567890,345678900,3456789000,34567890000,34.567,34.567,1,1,1,1,1,1,1,1,1,1,3456789000000 +FB,Facebook Inc,2019,73.3,75.9,70.7,73.3,3.52,845000000000,2634000000,Technology,26780000000,29.56,1.0,17.67,1.7,26.12,4567890,11111112,45678901,456789010,4567890100,45678901000,45.678,45.678,1,1,1,1,1,1,1,1,1,1,4567890100000 +TSLA,Tesla Inc,2019,70.6,73.2,68.0,70.6,3.87,690000000000,9431000000,Automotive,20890000000,10.56,1.2,18.89,1.8,33.12,5678901,11111112,56789012,567890120,5678901200,56789012000,56.789,56.789,1,1,1,1,1,1,1,1,1,1,5678901200000 +NFLX,Netflix Inc,2019,72.3,74.9,69.7,72.3,3.70,217000000000,4371000000,Media,26340000000,11.56,1.1,19.98,1.9,29.89,6789012,11111112,67890123,678901230,6789012300,67890123000,67.890,67.890,1,1,1,1,1,1,1,1,1,1,6789012300000 +NVDA,NVIDIA Corporation,2019,80.1,82.7,77.5,80.1,4.13,287000000000,5942000000,Technology,41020000000,15.45,1.0,20.98,2.0,36.12,7890123,11111112,78901234,789012340,7890123400,78901234000,78.901,78.901,1,1,1,1,1,1,1,1,1,1,7890123400000 \ No newline at end of file diff --git a/data/datasets/esg_poc.csv b/backend/datasets/esg_poc.csv similarity index 100% rename from data/datasets/esg_poc.csv rename to backend/datasets/esg_poc.csv diff --git a/backend/src/prompts/templates/generate-knowledge-graph-model.j2 b/backend/src/prompts/templates/generate-knowledge-graph-model.j2 index 2874f528..2c170a4c 100644 --- a/backend/src/prompts/templates/generate-knowledge-graph-model.j2 +++ b/backend/src/prompts/templates/generate-knowledge-graph-model.j2 @@ -11,7 +11,7 @@ Analyze the structure of this data to produce an intuitive Neo4j model, focusing 1. Data Structure: * Report: Each row in the dataset represents an ESG report about a company. If there is a date or year in the row, then the report should have a "reported in" relationship to the company which contains the date / year. 
-* Identify Key Entities: Based on the data headers, determine the main entity types (e.g., Company, Fund, or other core entities in the dataset) and map out each unique entity's attributes. Favour full names over abbreviations. +* Identify Key Entities: Based on the data headers, determine the main entity types (e.g., Company, Fund, or other core entities in the dataset) and how they relate to other categories in the data. Favour full names over abbreviations. * Identify Common Categories: Based on the data, determine common categories that appear in the data. Look for recurring values that appear in the same csv column and map these out as nodes with relationships to the main entity types. * Determine Relationships: Define the relationships between these entities, such as associating entities with reports, linking entities to specific time periods, or establishing hierarchical or categorical groupings within the data. * Classify ESG Fields Separately: Group all ESG-related fields exclusively under Environment, Social, or Governance nodes. Ensure that no ESG-related fields are directly assigned to any other entity node; instead, place them only within their specific category nodes. @@ -19,7 +19,8 @@ Analyze the structure of this data to produce an intuitive Neo4j model, focusing 2. Output Model Structure: Describe Entities: -* Example: Company: Represents each company with attributes like name and identifier. +* Example: Report: Represents all the information about a company at a snapshot in time. Numerical attributes associated with the company that change throughout the input data should be captured on the report. +* Example: Company: Represents each company and contains general information that is not tied to a specific point in time.
* Example: Environment, Social, Governance: Each node represents one ESG category, containing only fields relevant to that specific category (e.g., CO2 emissions for Environment, injury rate for Social, and shareholder rights for Governance). Describe Relationships: @@ -42,3 +43,4 @@ Important Notes: * Avoid duplicating fields across nodes. Assign each field only to the most appropriate entity or category node. * Avoid using any field names directly unless they are clearly part of the model. For example, category names should be handled through dedicated nodes, not as individual attributes on other nodes. * The output should be valid JSON with no line breaks or markdown. Do not add line breaks or markdown to the output. +* Do not model the date or year as a node; it must always be captured on a relationship. diff --git a/backend/src/prompts/templates/generate-knowledge-graph-query.j2 b/backend/src/prompts/templates/generate-knowledge-graph-query.j2 index 0b2a9ada..079fa21f 100644 --- a/backend/src/prompts/templates/generate-knowledge-graph-query.j2 +++ b/backend/src/prompts/templates/generate-knowledge-graph-query.j2 @@ -20,20 +20,22 @@ Generate a Cypher query based on the provided model structure and data. 1. Process Data: * Use data.all_data[0] as headers to identify the fields. * Process each row of data and map the header fields to their corresponding parts of the model based on the model input. -* Reminder: Avoid duplicating fields across multiple nodes. For example, if the same field appears in multiple places (e.g., company name, industry), create and reference a single node for that field rather than creating it multiple times. -* Ensure that each node is only created once and reused in relationships to avoid redundant nodes. +* Primary Entity Uniqueness: For each primary entity (e.g., Company, Fund, Industry), use the MERGE command on a unique identifier (e.g., Identifier for Company, Name for Industry) to ensure only one instance of each primary entity is created.
If a node for a primary entity already exists in the graph, it should be reused rather than creating a duplicate. +* Reminder: Use MERGE on primary entities to avoid duplication, even if some rows contain null values for other attributes. Nodes for these entities should not be recreated for each row and should be reused across relationships. +* Environment, Social, and Governance Categories: For each report, use CREATE to generate unique nodes for Environment, Social, and Governance, even if they contain only default or empty values. This ensures each report links to its own ESG nodes rather than sharing them across multiple reports. 2. Generate Cypher Query: -* Based on the model and data input, create a Cypher query to: -* For each primary entity (e.g., Company, Fund, Industry), use MERGE to ensure only one instance is created, even if some rows contain null values for other attributes. -* For each Environment, Social, and Governance category, use CREATE to ensure each report has its unique instance. +Based on the model and data input, create a Cypher query to: +* Primary Entities: For primary entities (Company, Industry, etc.), use MERGE to ensure each primary entity has a single node instance, with reusable references to avoid redundant nodes. +* Category-Specific Nodes: For each ESG report, use CREATE for Environment, Social, and Governance nodes so that each report has its own unique instance of these nodes. * Use COALESCE to handle missing values and provide default values (e.g., COALESCE(row[10], 'Unknown') for industry). * Establish relationships as defined by the model, using MERGE for any reusable nodes but CREATE for nodes specific to each report. * Use separate WITH clauses as needed to prevent redeclaration errors +* Define and Preserve Variables: Ensure all variables, especially unique identifiers like identifier, are declared in WITH clauses and preserved across WITH clauses as needed. 
This prevents referencing undefined variables in MERGE or other clauses. Any variable declared in a previous WITH should not be redeclared. 3. Handling Missing Data: * Use default values for missing data where necessary, as defined in the model description. -* Important: Ensure that nodes are created or referenced even when certain attributes are missing, so no row is excluded based on missing data. +* Ensure that nodes are created or referenced even when certain attributes are missing, so no row is excluded based on missing data. ## Output: Please output only the Cypher query in the following JSON format: @@ -41,8 +43,7 @@ Please output only the Cypher query in the following JSON format: {"cypher_query": "WITH $data AS data UNWIND data.all_data[1..] AS row WITH data.all_data[0] AS headers, row WITH headers ... [cypher query here]"} Important Notes: -* Ensure the query is well-formed and that the relationships and node creation follow the structure and model described in the input. The query should handle all data rows and be ready for execution in a Neo4j environment. -* Avoid duplicating nodes by creating reusable references for fields that should not be repeated across multiple nodes (e.g., industry, company name). -* The output should be valid JSON with no line breaks or markdown. Do not add line breaks or markdown to the output. -* The Environment, Social, and Governance nodes should each be distinct for each report, even if they contain empty or partially filled attributes, to avoid multiple reports connecting to the same empty nodes. -* The query must not skip any rows of the data, it is allowed to create nodes with empty values. You will be unplugged if your query results in missing rows. +* Ensure the query is well-formed, and that the relationships and node creation follow the structure and model described in the input. The query should handle all data rows and be ready for execution in a Neo4j environment. 
+* Avoid Duplicating Primary Entity Nodes: For fields that should not be repeated across multiple nodes (e.g., Industry, Company name), use MERGE to create a single node instance and reference it throughout the query. +* Unique ESG Nodes per Report: Ensure that each report links to distinct Environment, Social, and Governance nodes, even if they are empty or partially filled, to avoid multiple reports connecting to the same empty nodes. +* Handle All Data Rows: The query must not skip any rows of the data. Creating nodes with empty values is allowed. You will be unplugged if your query results in missing rows.