From 56fb9c8d8344f1c45b216357a21f893d7e71ffc2 Mon Sep 17 00:00:00 2001 From: natalie <77713883+n-h-diaz@users.noreply.github.com> Date: Fri, 20 Dec 2024 07:05:24 -0800 Subject: [PATCH] Update spanner queries (#1480) * Updates node queries to use only GQL. This helps improve performance for querying with filters * Removes extra join in observation with contained in place query. After speaking with Mingtian, he recommends adding observation to the graph to remove additional joins. This will require updating the graph schema, and there are some implications for write throughput, but we can potentially explore this (can discuss more later) * Updates some search tests (it's unclear if this is just flaky, so if it starts to fail again, we can consider updating the test to support potentially inconsistent ordering) --- .../golden/query/search_nodes_with_type.json | 28 +- .../query/search_nodes_without_type.json | 42 +-- internal/server/spanner/query.go | 303 +++++++----------- 3 files changed, 158 insertions(+), 215 deletions(-) diff --git a/internal/server/spanner/golden/query/search_nodes_with_type.json b/internal/server/spanner/golden/query/search_nodes_with_type.json index 5b8a4e606..96280acce 100644 --- a/internal/server/spanner/golden/query/search_nodes_with_type.json +++ b/internal/server/spanner/golden/query/search_nodes_with_type.json @@ -349,6 +349,13 @@ "StatisticalVariable" ] }, + { + "SubjectID": "oecd/IDD_INCACTOTAL_TOT_CURRENT_METH2011", + "Name": "Income Distribution Database: All age groups: mean disposable income (current prices), Total population, Current definition, Income definition until 2011", + "Types": [ + "StatisticalVariable" + ] + }, { "SubjectID": "oecd/IDD_INCCTOTAL_WA_INCOMPARABLE_METH2012", "Name": "Income Distribution Database: Mean disposable income (current prices), Working age population: 18-65, Previous definition - without overlap year, New income definition since 2012", @@ -370,6 +377,13 @@ "StatisticalVariable" ] }, + { + "SubjectID": "oecd/IDD_P90P10_TOT_CURRENT_METH2012", + "Name": "Income Distribution Database: P90/P10 disposable income decile ratio, Total population, Current definition, New income definition since 2012", + "Types": [ + "StatisticalVariable" + ] + }, { "SubjectID": "oecd/IDD_P90P50_TOT_INCOMPARABLE_METH2012", "Name": "Income Distribution Database: P90/P50 disposable income decile ratio, Total population, Previous definition - without overlap year, New income definition since 2012", @@ -391,20 +405,6 @@ "StatisticalVariable" ] }, - { - "SubjectID": "oecd/IDD_INCACTOTAL_TOT_CURRENT_METH2011", - "Name": "Income Distribution Database: All age groups: mean disposable income (current prices), Total population, Current definition, Income definition until 2011", - "Types": [ - "StatisticalVariable" - ] - }, - { - "SubjectID": "oecd/IDD_P90P10_TOT_CURRENT_METH2012", - "Name": "Income Distribution Database: P90/P10 disposable income decile ratio, Total population, Current definition, New income definition since 2012", - "Types": [ - "StatisticalVariable" - ] - }, { "SubjectID": "oecd/IMW_1EARNERC2C_2_1_0_0_0_1", "Name": "Incomes of minimum wage earners: Couple with 2 children - partner is out of work, Family income, divided by the income of an otherwise identical family working at the average wage, Net income (family level), Full-time employees working in ISIC sectors C-K (see metadata for details), Weighted average of minimum wage amounts throughout the calendar year, No, Yes", diff --git a/internal/server/spanner/golden/query/search_nodes_without_type.json b/internal/server/spanner/golden/query/search_nodes_without_type.json index 6575bef9d..4e2f77c7c 100644 --- a/internal/server/spanner/golden/query/search_nodes_without_type.json +++ b/internal/server/spanner/golden/query/search_nodes_without_type.json @@ -405,6 +405,13 @@ "StatisticalVariable" ] }, + { + "SubjectID": "dc/g/Household_HouseholdType-MarriedCoupleFamilyHousehold_Income_IncomeStatus-WithIncome", + "Name": "Household With Household Type = Married Couple Family Household, Income, Income Status = With Income", + "Types": [ + "StatVarGroup" + ] + }, { "SubjectID": "dc/g/Household_HouseholdType-NonfamilyHousehold_Income_IncomeStatus-WithIncome", "Name": "Household With Household Type = Nonfamily Household, Income, Income Status = With Income", @@ -447,6 +454,13 @@ "StatisticalVariable" ] }, + { + "SubjectID": "oecd/IDD_INCACTOTAL_TOT_CURRENT_METH2011", + "Name": "Income Distribution Database: All age groups: mean disposable income (current prices), Total population, Current definition, Income definition until 2011", + "Types": [ + "StatisticalVariable" + ] + }, { "SubjectID": "oecd/IDD_INCCTOTAL_WA_INCOMPARABLE_METH2012", "Name": "Income Distribution Database: Mean disposable income (current prices), Working age population: 18-65, Previous definition - without overlap year, New income definition since 2012", @@ -468,6 +482,13 @@ "StatisticalVariable" ] }, + { + "SubjectID": "oecd/IDD_P90P10_TOT_CURRENT_METH2012", + "Name": "Income Distribution Database: P90/P10 disposable income decile ratio, Total population, Current definition, New income definition since 2012", + "Types": [ + "StatisticalVariable" + ] + }, { "SubjectID": "oecd/IDD_P90P50_TOT_INCOMPARABLE_METH2012", "Name": "Income Distribution Database: P90/P50 disposable income decile ratio, Total population, Previous definition - without overlap year, New income definition since 2012", @@ -489,27 +510,6 @@ "StatisticalVariable" ] }, - { - "SubjectID": "dc/g/Household_HouseholdType-MarriedCoupleFamilyHousehold_Income_IncomeStatus-WithIncome", - "Name": "Household With Household Type = Married Couple Family Household, Income, Income Status = With Income", - "Types": [ - "StatVarGroup" - ] - }, - { - "SubjectID": "oecd/IDD_INCACTOTAL_TOT_CURRENT_METH2011", - "Name": "Income Distribution Database: All age groups: mean disposable income (current prices), Total population, Current definition, Income definition until 2011", - "Types": [ - "StatisticalVariable" - ] - }, - { - "SubjectID": "oecd/IDD_P90P10_TOT_CURRENT_METH2012", - "Name": "Income Distribution Database: P90/P10 disposable income decile ratio, Total population, Current definition, New income definition since 2012", - "Types": [ - "StatisticalVariable" - ] - }, { "SubjectID": "oecd/IMW_1EARNERC2C_2_1_0_0_0_1", "Name": "Incomes of minimum wage earners: Couple with 2 children - partner is out of work, Family income, divided by the income of an otherwise identical family working at the average wage, Net income (family level), Full-time employees working in ISIC sectors C-K (see metadata for details), Weighted average of minimum wage amounts throughout the calendar year, No, Yes", diff --git a/internal/server/spanner/query.go b/internal/server/spanner/query.go index 87031ef4c..a1f609075 100644 --- a/internal/server/spanner/query.go +++ b/internal/server/spanner/query.go @@ -58,193 +58,144 @@ var statements = struct { searchNodesByQueryAndTypes string }{ getPropsBySubjectID: ` - SELECT - DISTINCT subject_id, - predicate - FROM - Edge - WHERE - subject_id IN UNNEST(@ids) - ORDER BY - subject_id, - predicate + GRAPH DCGraph MATCH -[e:Edge + WHERE + e.subject_id IN UNNEST(@ids)]-> + RETURN DISTINCT + e.subject_id, + e.predicate `, getPropsByObjectID: ` - SELECT - DISTINCT object_id AS subject_id, - predicate - FROM - Edge - WHERE - object_id IN UNNEST(@ids) - AND object_value IS NULL - ORDER BY - subject_id, - predicate + GRAPH DCGraph MATCH -[e:Edge + WHERE + e.object_id IN UNNEST(@ids) + AND e.object_value IS NULL + ]-> + RETURN DISTINCT + e.object_id AS subject_id, + e.predicate + ORDER BY + subject_id, + predicate `, getEdgesBySubjectID: ` - SELECT - result.subject_id, - result.predicate, - COALESCE(result.object_id, '') AS object_id, - COALESCE(result.object_value, '') AS object_value, - COALESCE(result.provenance, '') AS provenance, - COALESCE(result.name, '') AS name, - COALESCE(result.types, []) AS types - FROM ( - SELECT - * - FROM - GRAPH_TABLE ( - DCGRAPH MATCH -[e:Edge - WHERE - e.subject_id IN UNNEST(@ids) - AND e.object_value IS NULL - AND e.subject_id != e.object_id%[1]s]->(n:Node) - RETURN e.subject_id, - e.predicate, - e.object_id, - '' as object_value, - e.provenance, - n.name, - n.types - ) + GRAPH DCGraph MATCH -[e:Edge + WHERE + e.subject_id IN UNNEST(@ids) + AND e.object_value IS NULL%[1]s]->(n:Node) + RETURN + e.subject_id, + e.predicate, + e.object_id, + '' as object_value, + COALESCE(e.provenance, '') AS provenance, + COALESCE(n.name, '') AS name, + COALESCE(n.types, []) AS types UNION ALL - SELECT - * - FROM - GRAPH_TABLE ( - DCGraph MATCH -[e:Edge - WHERE - e.subject_id IN UNNEST(@ids) - AND e.object_value IS NOT NULL%[1]s]-> - RETURN e.subject_id, - e.predicate, - '' as object_id, - e.object_value, - e.provenance, - '' AS name, - ARRAY[] AS types - ) - )result + MATCH -[e:Edge + WHERE + e.subject_id IN UNNEST(@ids) + AND e.object_value IS NOT NULL%[1]s]-> + RETURN + e.subject_id, + e.predicate, + '' as object_id, + e.object_value, + e.provenance, + '' AS name, + ARRAY[] AS types `, getChainedEdgesBySubjectID: fmt.Sprintf(` - SELECT - result.subject_id, - @result_predicate AS predicate, - COALESCE(result.object_id, '') AS object_id, - COALESCE(result.object_value, '') AS object_value, - '' AS provenance, - COALESCE(result.name, '') AS name, - ARRAY[] AS types - FROM ( - SELECT - * - FROM - GRAPH_TABLE ( - DCGRAPH MATCH (m:Node - WHERE - m.subject_id IN UNNEST(@ids))-[e:Edge - WHERE - e.predicate = @predicate]->{1,%d}(n:Node) - WHERE - m != n - RETURN DISTINCT m.subject_id, - n.subject_id as object_id, - '' as object_value, - n.name - ) + GRAPH DCGraph MATCH (m:Node + WHERE + m.subject_id IN UNNEST(@ids))-[e:Edge + WHERE + e.predicate = @predicate]->{1,%d}(n:Node) + WHERE + m != n + RETURN DISTINCT + m.subject_id, + n.subject_id as object_id, + '' as object_value, + COALESCE(n.name, '') AS name UNION ALL - SELECT - * - FROM - GRAPH_TABLE ( - DCGraph MATCH -[e:Edge - WHERE - e.subject_id IN UNNEST(@ids) - AND e.object_value IS NOT NULL - AND e.predicate = @predicate]-> - RETURN e.subject_id, - '' AS object_id, - e.object_value, - '' AS name - ) - )result + MATCH -[e:Edge + WHERE + e.subject_id IN UNNEST(@ids) + AND e.object_value IS NOT NULL + AND e.predicate = @predicate]-> + RETURN + e.subject_id, + '' AS object_id, + e.object_value, + '' AS name + NEXT + RETURN + subject_id, + @result_predicate AS predicate, + object_id, + object_value, + '' AS provenance, + name, + ARRAY[] AS types `, MAX_HOPS), getEdgesByObjectID: ` - SELECT - result.subject_id, - result.predicate, - result.object_id, - '' AS object_value, - COALESCE(result.provenance, '') AS provenance, - COALESCE(result.name, '') AS name, - COALESCE(result.types, []) AS types, - FROM - GRAPH_TABLE ( - DCGraph MATCH <-[e:Edge - WHERE - e.object_id IN UNNEST(@ids) - AND e.subject_id != e.object_id%s]-(n:Node) - RETURN e.object_id AS subject_id, - e.predicate, - e.subject_id AS object_id, - e.provenance, - n.name, - n.types - )result + GRAPH DCGraph MATCH <-[e:Edge + WHERE + e.object_id IN UNNEST(@ids) + AND e.subject_id != e.object_id%s]-(n:Node) + RETURN + e.object_id AS subject_id, + e.predicate, + e.subject_id AS object_id, + '' AS object_value, + COALESCE(e.provenance, '') AS provenance, + COALESCE(n.name, '') AS name, + COALESCE(n.types, []) AS types `, getChainedEdgesByObjectID: fmt.Sprintf(` - SELECT - result.subject_id, - @result_predicate AS predicate, - result.object_id, - '' AS object_value, - '' AS provenance, - COALESCE(result.name, '') AS name, - ARRAY[] AS types - FROM - GRAPH_TABLE ( - DCGraph MATCH (m:Node - WHERE m.subject_id IN UNNEST(@ids))<-[e:Edge + GRAPH DCGraph MATCH (m:Node + WHERE m.subject_id IN UNNEST(@ids))<-[e:Edge WHERE e.predicate = @predicate]-{1,%d}(n:Node) WHERE m!= n - RETURN DISTINCT m.subject_id, + RETURN DISTINCT + m.subject_id, n.subject_id AS object_id, - n.name - )result - `, MAX_HOPS), + COALESCE(n.name, '') AS name + NEXT + RETURN + subject_id, + @result_predicate AS predicate, + object_id, + '' AS object_value, + '' AS provenance, + name, + ARRAY[] AS types + `, MAX_HOPS), filterProps: ` - AND e.predicate IN UNNEST(@props) + AND e.predicate IN UNNEST(@props) `, filterObjects: ` - INNER JOIN ( - SELECT - * - FROM - GRAPH_TABLE ( - DCGraph MATCH -[e:Edge - WHERE - e.predicate = @prop%[1]d - AND e.object_id IN UNNEST(@val%[1]d)]-> - RETURN e.subject_id - ) - UNION DISTINCT - SELECT - * - FROM - GRAPH_TABLE ( - DCGraph MATCH -[e:Edge - WHERE - e.predicate = @prop%[1]d - AND e.object_value IN UNNEST(@val%[1]d)]-> - RETURN e.subject_id - ) - )filter%[1]d - ON - result.object_id = filter%[1]d.subject_id + NEXT + MATCH -[e:Edge + WHERE + e.predicate = @prop%[1]d + AND ( + e.object_id IN UNNEST(@val%[1]d) + OR e.object_value IN UNNEST(@val%[1]d) + )]-> + WHERE + e.subject_id = object_id + RETURN + subject_id, + predicate, + object_id, + object_value, + provenance, + name, + types `, getObsByVariableAndEntity: ` SELECT @@ -283,22 +234,14 @@ var statements = struct { AND e.predicate = 'linkedContainedInPlace']- RETURN e.subject_id as object_id - )result - - INNER JOIN ( - SELECT - * - FROM GRAPH_TABLE ( - DCGraph MATCH -[e:Edge + NEXT + MATCH -[e:Edge WHERE e.predicate = 'typeOf' AND e.object_id = @childPlaceType]-> - RETURN e.subject_id - ) - )filter1 - ON - result.object_id = filter1.subject_id - + WHERE e.subject_id = object_id + RETURN object_id + )result INNER JOIN ( SELECT *