From f31209819b6edd36418af625e18ef0a2b3c2e0b2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 28 Jun 2024 15:08:48 -0700 Subject: [PATCH 01/31] build(deps): bump express from 4.18.2 to 4.19.2 in /docs-website (#10128) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> From 1e331961aac082cc45af3738aa5ffe886026d8f5 Mon Sep 17 00:00:00 2001 From: "julien.jehannet" Date: Wed, 17 Jul 2024 13:41:50 +0200 Subject: [PATCH 02/31] refactor(ingest/glue): simplify database filtering --- .../src/datahub/ingestion/source/aws/glue.py | 50 +++++++++---------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 3b9b5dbf63e184..fcc8bb80ccb3ff 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -678,12 +678,19 @@ def get_all_databases(self) -> Iterable[Mapping[str, Any]]: else: paginator_response = paginator.paginate() - for page in paginator_response: - yield from page["DatabaseList"] + pattern = "DatabaseList" + if self.source_config.ignore_resource_links: + # exclude records that contain TargetDatabase struct key to ignore resource links + pattern += "[?!TargetDatabase]" + + for database in paginator_response.search(pattern): + if self.source_config.database_pattern.allowed(database["Name"]): + yield database - def get_tables_from_database(self, database_name: str) -> Iterable[Dict]: + def get_tables_from_database(self, database: Mapping[str, Any]) -> Iterable[Dict]: # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetTables.html paginator = self.glue_client.get_paginator("get_tables") + database_name = database["Name"] if self.source_config.catalog_id: paginator_response = paginator.paginate( @@ -692,34 +699,25 @@ def get_tables_from_database(self, database_name: str) -> Iterable[Dict]: else: paginator_response = paginator.paginate(DatabaseName=database_name) - for page in paginator_response: - yield from page["TableList"] + yield from paginator_response.search("TableList") + # for table in paginator_response.search("TableList"): + # if resource links are detected, re-use database name from the outermost catalog + # otherwise, you will use external names instead of new aliased ones when creating full table names + # Note: we use an explicit source_config check but it is useless actually (filtering has been done) + # if not self.source_config.ignore_resource_links and "TargetDatabase" in database: + # table["DatabaseName"] = database["Name"] + # yield table def get_all_databases_and_tables( self, ) -> Tuple[Dict, List[Dict]]: - all_databases = self.get_all_databases() - - if self.source_config.ignore_resource_links: - all_databases = [ - database - for database in all_databases - if "TargetDatabase" not in database - ] - - allowed_databases = { - database["Name"]: database - for database in all_databases - if self.source_config.database_pattern.allowed(database["Name"]) - } - + all_databases = [*self.get_all_databases()] all_tables = [ - table - for database_name in allowed_databases - for table in self.get_tables_from_database(database_name) + tables + for database in all_databases + for tables in self.get_tables_from_database(database) ] - - return allowed_databases, all_tables + return all_databases, 
all_tables def get_lineage_if_enabled( self, mce: MetadataChangeEventClass @@ -1039,7 +1037,7 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: databases, tables = self.get_all_databases_and_tables() - for database in databases.values(): + for database in databases: yield from self.gen_database_containers(database) for table in tables: From e1a8629a07da2f04e6f6e3f9aef3f33837b0010e Mon Sep 17 00:00:00 2001 From: "julien.jehannet" Date: Wed, 17 Jul 2024 15:18:28 +0200 Subject: [PATCH 03/31] fix(ingest/glue): use database names from the current catalog in returned tables list --- .../src/datahub/ingestion/source/aws/glue.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index fcc8bb80ccb3ff..2f618b39360b6b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -699,14 +699,16 @@ def get_tables_from_database(self, database: Mapping[str, Any]) -> Iterable[Dict else: paginator_response = paginator.paginate(DatabaseName=database_name) - yield from paginator_response.search("TableList") - # for table in paginator_response.search("TableList"): - # if resource links are detected, re-use database name from the outermost catalog - # otherwise, you will use external names instead of new aliased ones when creating full table names - # Note: we use an explicit source_config check but it is useless actually (filtering has been done) - # if not self.source_config.ignore_resource_links and "TargetDatabase" in database: - # table["DatabaseName"] = database["Name"] - # yield table + for table in paginator_response.search("TableList"): + # if resource links are detected, re-use database names from the current catalog + # otherwise, external names are used instead of aliased ones when creating full table names later + # Note: use an explicit source_config check but it is useless actually (filtering has been done) + if ( + not self.source_config.ignore_resource_links + and "TargetDatabase" in database + ): + table["DatabaseName"] = database["Name"] + yield table def get_all_databases_and_tables( self, From a227a987f8f74864dc63a3ddd6f4a04a80e9fba6 Mon Sep 17 00:00:00 2001 From: Julien Jehannet Date: Wed, 24 Jul 2024 16:01:15 +0200 Subject: [PATCH 06/31] fix(tests): change signature for `get_all_databases_and_tables()` --- metadata-ingestion/src/datahub/ingestion/source/aws/glue.py | 2 +- metadata-ingestion/tests/unit/test_glue_source.py | 4 ++--
metadata-ingestion/tests/unit/test_glue_source_stubs.py | 6 ++---- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 2f618b39360b6b..1dc6d16a0a81fa 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -712,7 +712,7 @@ def get_tables_from_database(self, database: Mapping[str, Any]) -> Iterable[Dict def get_all_databases_and_tables( self, - ) -> Tuple[Dict, List[Dict]]: + ) -> Tuple[List[Mapping[str, Any]], List[Dict]]: all_databases = [*self.get_all_databases()] all_tables = [ tables diff --git a/metadata-ingestion/tests/unit/test_glue_source.py b/metadata-ingestion/tests/unit/test_glue_source.py index 45b9899eacaa77..e9ad9fe8e28d01 100644 --- a/metadata-ingestion/tests/unit/test_glue_source.py +++ b/metadata-ingestion/tests/unit/test_glue_source.py @@ -266,8 +266,8 @@ def test_platform_config(): @pytest.mark.parametrize( "ignore_resource_links, all_databases_and_tables_result", [ - (True, ({}, [])), - (False, ({"test-database": resource_link_database}, target_database_tables)), + (True, ([], [])), + (False, ([resource_link_database], target_database_tables)), ], ) def test_ignore_resource_links(ignore_resource_links, all_databases_and_tables_result): diff --git a/metadata-ingestion/tests/unit/test_glue_source_stubs.py b/metadata-ingestion/tests/unit/test_glue_source_stubs.py index 46ab65234c22df..c4f228646195cc 100644 --- a/metadata-ingestion/tests/unit/test_glue_source_stubs.py +++ b/metadata-ingestion/tests/unit/test_glue_source_stubs.py @@ -92,10 +92,8 @@ }, ] } -databases_1 = { - "flights-database": {"Name": "flights-database", "CatalogId": "123412341234"} -} -databases_2 = {"test-database": {"Name": "test-database", "CatalogId": "123412341234"}} +databases_1 = [{"Name": "flights-database", "CatalogId": "123412341234"}] +databases_2 = [{"Name": "test-database", "CatalogId": "123412341234"}] tables_1 = [ { "Name": "avro", From 6d156c74b35617cd163d2e22100b3cbf93ba930d Mon Sep 17 00:00:00 2001 From: Julien Jehannet Date: Thu, 25 Jul 2024 17:31:27 +0200 Subject: [PATCH 07/31] docs: rephrase some comments --- .../src/datahub/ingestion/source/aws/glue.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 1dc6d16a0a81fa..c8358521fd091d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -680,7 +680,7 @@ def get_all_databases(self) -> Iterable[Mapping[str, Any]]: pattern = "DatabaseList" if self.source_config.ignore_resource_links: - # exclude records that contain TargetDatabase struct key to ignore resource links + # exclude resource links by using a JMESPath conditional query against the TargetDatabase struct key pattern += "[?!TargetDatabase]" for database in paginator_response.search(pattern): @@ -701,8 +701,9 @@ def get_tables_from_database(self, database: Mapping[str, Any]) -> Iterable[Dict for table in paginator_response.search("TableList"): # if resource links are detected, re-use database names from the current catalog - # otherwise, external names are used instead of aliased ones when creating full table names later - # Note: use an explicit source_config check but it is useless actually (filtering has been done) + # 
otherwise external resource names are picked out instead of aliased ones + # This will cause an incoherent situation when creating full table names later + # Note: use an explicit source_config check but it is useless actually (filtering has already been done) if ( not self.source_config.ignore_resource_links and "TargetDatabase" in database From 28f1a937c515d1e7ca495b4e3d1214de2c0411f7 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 16 Oct 2024 19:18:32 -0700 Subject: [PATCH 08/31] chore(ingest): reorganize unit tests (#11636) --- .../tests/unit/{ => api}/test_apis.py | 0 .../test_entity_filter_report.py} | 0 .../tests/unit/{ => api}/test_pipeline.py | 10 +++---- .../unit/{ => api}/test_plugin_system.py | 0 .../{test_report.py => test_source_report.py} | 0 .../tests/unit/{ => api}/test_workunit.py | 0 .../{ => bigquery}/test_bigquery_lineage.py | 0 .../{ => bigquery}/test_bigquery_profiler.py | 0 .../{ => bigquery}/test_bigquery_source.py | 0 .../test_bigquery_sql_lineage.py | 0 .../{ => bigquery}/test_bigquery_usage.py | 0 .../test_bigqueryv2_usage_source.py | 0 .../test_bq_get_partition_range.py | 0 .../tests/unit/{ => cli}/test_check.py | 0 .../unit/{ => cli}/test_check_upgrade.py | 0 .../tests/unit/{ => cli}/test_cli_utils.py | 0 .../{ => config}/test_key_value_pattern.py | 0 .../tests/unit/glue/__init__.py | 0 .../tests/unit/{ => glue}/test_glue_source.py | 12 ++------ .../unit/{ => glue}/test_glue_source_stubs.py | 0 .../tests/unit/redshift/__init__.py | 0 .../{ => redshift}/redshift_query_mocker.py | 0 .../{ => redshift}/test_redshift_config.py | 0 .../{ => redshift}/test_redshift_lineage.py | 2 +- .../{ => redshift}/test_redshift_source.py | 0 .../tests/unit/sagemaker/__init__.py | 0 .../unit/sagemaker/test_sagemaker_source.py | 2 +- .../test_sagemaker_source_stubs.py | 0 .../tests/unit/{graph => sdk}/test_client.py | 0 .../unit/{ => sdk}/test_kafka_emitter.py | 0 .../tests/unit/{ => sdk}/test_mce_builder.py | 0 .../tests/unit/{ => sdk}/test_mcp_builder.py | 0 .../tests/unit/{ => sdk}/test_mcp_wrapper.py | 0 .../tests/unit/{ => sdk}/test_rest_emitter.py | 0 .../tests/unit/{ => serde}/test_codegen.py | 0 .../state}/test_ldap_state.py | 0 .../unit/{ => utilities}/test_cli_logging.py | 0 .../unit/{ => utilities}/test_ordered_set.py | 0 .../tests/unit/utilities/test_perf_timer.py | 28 ++++++++++--------- .../test_serialized_lru_cache.py | 0 .../{ => utilities}/test_topological_sort.py | 0 .../unit/{ => utilities}/test_utilities.py | 0 42 files changed, 25 insertions(+), 29 deletions(-) rename metadata-ingestion/tests/unit/{ => api}/test_apis.py (100%) rename metadata-ingestion/tests/unit/{test_report.py => api/test_entity_filter_report.py} (100%) rename metadata-ingestion/tests/unit/{ => api}/test_pipeline.py (97%) rename metadata-ingestion/tests/unit/{ => api}/test_plugin_system.py (100%) rename metadata-ingestion/tests/unit/api/{test_report.py => test_source_report.py} (100%) rename metadata-ingestion/tests/unit/{ => api}/test_workunit.py (100%) rename metadata-ingestion/tests/unit/{ => bigquery}/test_bigquery_lineage.py (100%) rename metadata-ingestion/tests/unit/{ => bigquery}/test_bigquery_profiler.py (100%) rename metadata-ingestion/tests/unit/{ => bigquery}/test_bigquery_source.py (100%) rename metadata-ingestion/tests/unit/{ => bigquery}/test_bigquery_sql_lineage.py (100%) rename metadata-ingestion/tests/unit/{ => bigquery}/test_bigquery_usage.py (100%) rename metadata-ingestion/tests/unit/{ => bigquery}/test_bigqueryv2_usage_source.py (100%) rename 
metadata-ingestion/tests/unit/{ => bigquery}/test_bq_get_partition_range.py (100%) rename metadata-ingestion/tests/unit/{ => cli}/test_check.py (100%) rename metadata-ingestion/tests/unit/{ => cli}/test_check_upgrade.py (100%) rename metadata-ingestion/tests/unit/{ => cli}/test_cli_utils.py (100%) rename metadata-ingestion/tests/unit/{ => config}/test_key_value_pattern.py (100%) create mode 100644 metadata-ingestion/tests/unit/glue/__init__.py rename metadata-ingestion/tests/unit/{ => glue}/test_glue_source.py (97%) rename metadata-ingestion/tests/unit/{ => glue}/test_glue_source_stubs.py (100%) create mode 100644 metadata-ingestion/tests/unit/redshift/__init__.py rename metadata-ingestion/tests/unit/{ => redshift}/redshift_query_mocker.py (100%) rename metadata-ingestion/tests/unit/{ => redshift}/test_redshift_config.py (100%) rename metadata-ingestion/tests/unit/{ => redshift}/test_redshift_lineage.py (99%) rename metadata-ingestion/tests/unit/{ => redshift}/test_redshift_source.py (100%) create mode 100644 metadata-ingestion/tests/unit/sagemaker/__init__.py rename metadata-ingestion/tests/unit/{ => sagemaker}/test_sagemaker_source_stubs.py (100%) rename metadata-ingestion/tests/unit/{graph => sdk}/test_client.py (100%) rename metadata-ingestion/tests/unit/{ => sdk}/test_kafka_emitter.py (100%) rename metadata-ingestion/tests/unit/{ => sdk}/test_mce_builder.py (100%) rename metadata-ingestion/tests/unit/{ => sdk}/test_mcp_builder.py (100%) rename metadata-ingestion/tests/unit/{ => sdk}/test_mcp_wrapper.py (100%) rename metadata-ingestion/tests/unit/{ => sdk}/test_rest_emitter.py (100%) rename metadata-ingestion/tests/unit/{ => serde}/test_codegen.py (100%) rename metadata-ingestion/tests/unit/{ => stateful_ingestion/state}/test_ldap_state.py (100%) rename metadata-ingestion/tests/unit/{ => utilities}/test_cli_logging.py (100%) rename metadata-ingestion/tests/unit/{ => utilities}/test_ordered_set.py (100%) rename metadata-ingestion/tests/unit/{ => utilities}/test_serialized_lru_cache.py (100%) rename metadata-ingestion/tests/unit/{ => utilities}/test_topological_sort.py (100%) rename metadata-ingestion/tests/unit/{ => utilities}/test_utilities.py (100%) diff --git a/metadata-ingestion/tests/unit/test_apis.py b/metadata-ingestion/tests/unit/api/test_apis.py similarity index 100% rename from metadata-ingestion/tests/unit/test_apis.py rename to metadata-ingestion/tests/unit/api/test_apis.py diff --git a/metadata-ingestion/tests/unit/test_report.py b/metadata-ingestion/tests/unit/api/test_entity_filter_report.py similarity index 100% rename from metadata-ingestion/tests/unit/test_report.py rename to metadata-ingestion/tests/unit/api/test_entity_filter_report.py diff --git a/metadata-ingestion/tests/unit/test_pipeline.py b/metadata-ingestion/tests/unit/api/test_pipeline.py similarity index 97% rename from metadata-ingestion/tests/unit/test_pipeline.py rename to metadata-ingestion/tests/unit/api/test_pipeline.py index a462f281367973..432d8e11c1c0b4 100644 --- a/metadata-ingestion/tests/unit/test_pipeline.py +++ b/metadata-ingestion/tests/unit/api/test_pipeline.py @@ -224,9 +224,9 @@ def test_configure_with_file_sink_does_not_init_graph(self, mock_source, tmp_pat def test_run_including_fake_transformation(self): pipeline = Pipeline.create( { - "source": {"type": "tests.unit.test_pipeline.FakeSource"}, + "source": {"type": "tests.unit.api.test_pipeline.FakeSource"}, "transformers": [ - {"type": "tests.unit.test_pipeline.AddStatusRemovedTransformer"} + {"type": 
"tests.unit.api.test_pipeline.AddStatusRemovedTransformer"} ], "sink": {"type": "tests.test_helpers.sink_helpers.RecordingSink"}, "run_id": "pipeline_test", @@ -253,7 +253,7 @@ def test_run_including_registered_transformation(self): pipeline = Pipeline.create( { - "source": {"type": "tests.unit.test_pipeline.FakeSource"}, + "source": {"type": "tests.unit.api.test_pipeline.FakeSource"}, "transformers": [ { "type": "simple_add_dataset_ownership", @@ -297,7 +297,7 @@ def test_pipeline_return_code(self, tmp_path, source, strict_warnings, exit_code --- run_id: pipeline_test source: - type: tests.unit.test_pipeline.{source} + type: tests.unit.api.test_pipeline.{source} config: {{}} sink: type: console @@ -379,7 +379,7 @@ def test_pipeline_return_code(self, tmp_path, source, strict_warnings, exit_code def test_pipeline_process_commits(self, commit_policy, source, should_commit): pipeline = Pipeline.create( { - "source": {"type": f"tests.unit.test_pipeline.{source}"}, + "source": {"type": f"tests.unit.api.test_pipeline.{source}"}, "sink": {"type": "console"}, "run_id": "pipeline_test", } diff --git a/metadata-ingestion/tests/unit/test_plugin_system.py b/metadata-ingestion/tests/unit/api/test_plugin_system.py similarity index 100% rename from metadata-ingestion/tests/unit/test_plugin_system.py rename to metadata-ingestion/tests/unit/api/test_plugin_system.py diff --git a/metadata-ingestion/tests/unit/api/test_report.py b/metadata-ingestion/tests/unit/api/test_source_report.py similarity index 100% rename from metadata-ingestion/tests/unit/api/test_report.py rename to metadata-ingestion/tests/unit/api/test_source_report.py diff --git a/metadata-ingestion/tests/unit/test_workunit.py b/metadata-ingestion/tests/unit/api/test_workunit.py similarity index 100% rename from metadata-ingestion/tests/unit/test_workunit.py rename to metadata-ingestion/tests/unit/api/test_workunit.py diff --git a/metadata-ingestion/tests/unit/test_bigquery_lineage.py b/metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py similarity index 100% rename from metadata-ingestion/tests/unit/test_bigquery_lineage.py rename to metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py diff --git a/metadata-ingestion/tests/unit/test_bigquery_profiler.py b/metadata-ingestion/tests/unit/bigquery/test_bigquery_profiler.py similarity index 100% rename from metadata-ingestion/tests/unit/test_bigquery_profiler.py rename to metadata-ingestion/tests/unit/bigquery/test_bigquery_profiler.py diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/bigquery/test_bigquery_source.py similarity index 100% rename from metadata-ingestion/tests/unit/test_bigquery_source.py rename to metadata-ingestion/tests/unit/bigquery/test_bigquery_source.py diff --git a/metadata-ingestion/tests/unit/test_bigquery_sql_lineage.py b/metadata-ingestion/tests/unit/bigquery/test_bigquery_sql_lineage.py similarity index 100% rename from metadata-ingestion/tests/unit/test_bigquery_sql_lineage.py rename to metadata-ingestion/tests/unit/bigquery/test_bigquery_sql_lineage.py diff --git a/metadata-ingestion/tests/unit/test_bigquery_usage.py b/metadata-ingestion/tests/unit/bigquery/test_bigquery_usage.py similarity index 100% rename from metadata-ingestion/tests/unit/test_bigquery_usage.py rename to metadata-ingestion/tests/unit/bigquery/test_bigquery_usage.py diff --git a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py b/metadata-ingestion/tests/unit/bigquery/test_bigqueryv2_usage_source.py similarity index 
100% rename from metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py rename to metadata-ingestion/tests/unit/bigquery/test_bigqueryv2_usage_source.py diff --git a/metadata-ingestion/tests/unit/test_bq_get_partition_range.py b/metadata-ingestion/tests/unit/bigquery/test_bq_get_partition_range.py similarity index 100% rename from metadata-ingestion/tests/unit/test_bq_get_partition_range.py rename to metadata-ingestion/tests/unit/bigquery/test_bq_get_partition_range.py diff --git a/metadata-ingestion/tests/unit/test_check.py b/metadata-ingestion/tests/unit/cli/test_check.py similarity index 100% rename from metadata-ingestion/tests/unit/test_check.py rename to metadata-ingestion/tests/unit/cli/test_check.py diff --git a/metadata-ingestion/tests/unit/test_check_upgrade.py b/metadata-ingestion/tests/unit/cli/test_check_upgrade.py similarity index 100% rename from metadata-ingestion/tests/unit/test_check_upgrade.py rename to metadata-ingestion/tests/unit/cli/test_check_upgrade.py diff --git a/metadata-ingestion/tests/unit/test_cli_utils.py b/metadata-ingestion/tests/unit/cli/test_cli_utils.py similarity index 100% rename from metadata-ingestion/tests/unit/test_cli_utils.py rename to metadata-ingestion/tests/unit/cli/test_cli_utils.py diff --git a/metadata-ingestion/tests/unit/test_key_value_pattern.py b/metadata-ingestion/tests/unit/config/test_key_value_pattern.py similarity index 100% rename from metadata-ingestion/tests/unit/test_key_value_pattern.py rename to metadata-ingestion/tests/unit/config/test_key_value_pattern.py diff --git a/metadata-ingestion/tests/unit/glue/__init__.py b/metadata-ingestion/tests/unit/glue/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/metadata-ingestion/tests/unit/test_glue_source.py b/metadata-ingestion/tests/unit/glue/test_glue_source.py similarity index 97% rename from metadata-ingestion/tests/unit/test_glue_source.py rename to metadata-ingestion/tests/unit/glue/test_glue_source.py index bd8527eea33e41..bbef9c1510f00b 100644 --- a/metadata-ingestion/tests/unit/test_glue_source.py +++ b/metadata-ingestion/tests/unit/glue/test_glue_source.py @@ -35,7 +35,7 @@ validate_all_providers_have_committed_successfully, ) from tests.test_helpers.type_helpers import PytestConfig -from tests.unit.test_glue_source_stubs import ( +from tests.unit.glue.test_glue_source_stubs import ( databases_1, databases_2, get_bucket_tagging, @@ -71,6 +71,8 @@ GMS_PORT = 8080 GMS_SERVER = f"http://localhost:{GMS_PORT}" +test_resources_dir = Path(__file__).parent + def glue_source( platform_instance: Optional[str] = None, @@ -247,7 +249,6 @@ def test_glue_ingest( write_metadata_file(tmp_path / mce_file, mce_objects) # Verify the output. - test_resources_dir = pytestconfig.rootpath / "tests/unit/glue" mce_helpers.check_golden_file( pytestconfig, output_path=tmp_path / mce_file, @@ -312,8 +313,6 @@ def test_config_without_platform(): @freeze_time(FROZEN_TIME) def test_glue_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph): - test_resources_dir = pytestconfig.rootpath / "tests/unit/glue" - deleted_actor_golden_mcs = "{}/glue_deleted_actor_mces_golden.json".format( test_resources_dir ) @@ -438,7 +437,6 @@ def test_glue_with_delta_schema_ingest( write_metadata_file(tmp_path / "glue_delta_mces.json", mce_objects) # Verify the output. 
- test_resources_dir = pytestconfig.rootpath / "tests/unit/glue" mce_helpers.check_golden_file( pytestconfig, output_path=tmp_path / "glue_delta_mces.json", @@ -475,7 +473,6 @@ def test_glue_with_malformed_delta_schema_ingest( write_metadata_file(tmp_path / "glue_malformed_delta_mces.json", mce_objects) # Verify the output. - test_resources_dir = pytestconfig.rootpath / "tests/unit/glue" mce_helpers.check_golden_file( pytestconfig, output_path=tmp_path / "glue_malformed_delta_mces.json", @@ -571,7 +568,6 @@ def test_glue_ingest_include_table_lineage( write_metadata_file(tmp_path / mce_file, mce_objects) # Verify the output. - test_resources_dir = pytestconfig.rootpath / "tests/unit/glue" mce_helpers.check_golden_file( pytestconfig, output_path=tmp_path / mce_file, @@ -678,7 +674,6 @@ def fake_schema_metadata(entity_urn: str) -> models.SchemaMetadataClass: write_metadata_file(tmp_path / mce_file, mce_objects) # Verify the output. - test_resources_dir = pytestconfig.rootpath / "tests/unit/glue" mce_helpers.check_golden_file( pytestconfig, output_path=tmp_path / mce_file, @@ -716,7 +711,6 @@ def test_glue_ingest_with_profiling( write_metadata_file(tmp_path / mce_file, mce_objects) # Verify the output. - test_resources_dir = pytestconfig.rootpath / "tests/unit/glue" mce_helpers.check_golden_file( pytestconfig, output_path=tmp_path / mce_file, diff --git a/metadata-ingestion/tests/unit/test_glue_source_stubs.py b/metadata-ingestion/tests/unit/glue/test_glue_source_stubs.py similarity index 100% rename from metadata-ingestion/tests/unit/test_glue_source_stubs.py rename to metadata-ingestion/tests/unit/glue/test_glue_source_stubs.py diff --git a/metadata-ingestion/tests/unit/redshift/__init__.py b/metadata-ingestion/tests/unit/redshift/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/metadata-ingestion/tests/unit/redshift_query_mocker.py b/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py similarity index 100% rename from metadata-ingestion/tests/unit/redshift_query_mocker.py rename to metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py diff --git a/metadata-ingestion/tests/unit/test_redshift_config.py b/metadata-ingestion/tests/unit/redshift/test_redshift_config.py similarity index 100% rename from metadata-ingestion/tests/unit/test_redshift_config.py rename to metadata-ingestion/tests/unit/redshift/test_redshift_config.py diff --git a/metadata-ingestion/tests/unit/test_redshift_lineage.py b/metadata-ingestion/tests/unit/redshift/test_redshift_lineage.py similarity index 99% rename from metadata-ingestion/tests/unit/test_redshift_lineage.py rename to metadata-ingestion/tests/unit/redshift/test_redshift_lineage.py index 78b7169a93f3c8..2e3eb8fde1292b 100644 --- a/metadata-ingestion/tests/unit/test_redshift_lineage.py +++ b/metadata-ingestion/tests/unit/redshift/test_redshift_lineage.py @@ -26,7 +26,7 @@ SqlParsingDebugInfo, SqlParsingResult, ) -from tests.unit.redshift_query_mocker import mock_cursor +from tests.unit.redshift.redshift_query_mocker import mock_cursor def test_get_sources_from_query(): diff --git a/metadata-ingestion/tests/unit/test_redshift_source.py b/metadata-ingestion/tests/unit/redshift/test_redshift_source.py similarity index 100% rename from metadata-ingestion/tests/unit/test_redshift_source.py rename to metadata-ingestion/tests/unit/redshift/test_redshift_source.py diff --git a/metadata-ingestion/tests/unit/sagemaker/__init__.py b/metadata-ingestion/tests/unit/sagemaker/__init__.py new file mode 100644 index 
00000000000000..e69de29bb2d1d6 diff --git a/metadata-ingestion/tests/unit/sagemaker/test_sagemaker_source.py b/metadata-ingestion/tests/unit/sagemaker/test_sagemaker_source.py index 995d176c213b24..2450e6fa8fe564 100644 --- a/metadata-ingestion/tests/unit/sagemaker/test_sagemaker_source.py +++ b/metadata-ingestion/tests/unit/sagemaker/test_sagemaker_source.py @@ -14,7 +14,7 @@ job_types, ) from tests.test_helpers import mce_helpers -from tests.unit.test_sagemaker_source_stubs import ( +from tests.unit.sagemaker.test_sagemaker_source_stubs import ( describe_endpoint_response_1, describe_endpoint_response_2, describe_feature_group_response_1, diff --git a/metadata-ingestion/tests/unit/test_sagemaker_source_stubs.py b/metadata-ingestion/tests/unit/sagemaker/test_sagemaker_source_stubs.py similarity index 100% rename from metadata-ingestion/tests/unit/test_sagemaker_source_stubs.py rename to metadata-ingestion/tests/unit/sagemaker/test_sagemaker_source_stubs.py diff --git a/metadata-ingestion/tests/unit/graph/test_client.py b/metadata-ingestion/tests/unit/sdk/test_client.py similarity index 100% rename from metadata-ingestion/tests/unit/graph/test_client.py rename to metadata-ingestion/tests/unit/sdk/test_client.py diff --git a/metadata-ingestion/tests/unit/test_kafka_emitter.py b/metadata-ingestion/tests/unit/sdk/test_kafka_emitter.py similarity index 100% rename from metadata-ingestion/tests/unit/test_kafka_emitter.py rename to metadata-ingestion/tests/unit/sdk/test_kafka_emitter.py diff --git a/metadata-ingestion/tests/unit/test_mce_builder.py b/metadata-ingestion/tests/unit/sdk/test_mce_builder.py similarity index 100% rename from metadata-ingestion/tests/unit/test_mce_builder.py rename to metadata-ingestion/tests/unit/sdk/test_mce_builder.py diff --git a/metadata-ingestion/tests/unit/test_mcp_builder.py b/metadata-ingestion/tests/unit/sdk/test_mcp_builder.py similarity index 100% rename from metadata-ingestion/tests/unit/test_mcp_builder.py rename to metadata-ingestion/tests/unit/sdk/test_mcp_builder.py diff --git a/metadata-ingestion/tests/unit/test_mcp_wrapper.py b/metadata-ingestion/tests/unit/sdk/test_mcp_wrapper.py similarity index 100% rename from metadata-ingestion/tests/unit/test_mcp_wrapper.py rename to metadata-ingestion/tests/unit/sdk/test_mcp_wrapper.py diff --git a/metadata-ingestion/tests/unit/test_rest_emitter.py b/metadata-ingestion/tests/unit/sdk/test_rest_emitter.py similarity index 100% rename from metadata-ingestion/tests/unit/test_rest_emitter.py rename to metadata-ingestion/tests/unit/sdk/test_rest_emitter.py diff --git a/metadata-ingestion/tests/unit/test_codegen.py b/metadata-ingestion/tests/unit/serde/test_codegen.py similarity index 100% rename from metadata-ingestion/tests/unit/test_codegen.py rename to metadata-ingestion/tests/unit/serde/test_codegen.py diff --git a/metadata-ingestion/tests/unit/test_ldap_state.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_ldap_state.py similarity index 100% rename from metadata-ingestion/tests/unit/test_ldap_state.py rename to metadata-ingestion/tests/unit/stateful_ingestion/state/test_ldap_state.py diff --git a/metadata-ingestion/tests/unit/test_cli_logging.py b/metadata-ingestion/tests/unit/utilities/test_cli_logging.py similarity index 100% rename from metadata-ingestion/tests/unit/test_cli_logging.py rename to metadata-ingestion/tests/unit/utilities/test_cli_logging.py diff --git a/metadata-ingestion/tests/unit/test_ordered_set.py b/metadata-ingestion/tests/unit/utilities/test_ordered_set.py similarity 
index 100% rename from metadata-ingestion/tests/unit/test_ordered_set.py rename to metadata-ingestion/tests/unit/utilities/test_ordered_set.py diff --git a/metadata-ingestion/tests/unit/utilities/test_perf_timer.py b/metadata-ingestion/tests/unit/utilities/test_perf_timer.py index 6129b3e37d8bc3..1de76a32fb708a 100644 --- a/metadata-ingestion/tests/unit/utilities/test_perf_timer.py +++ b/metadata-ingestion/tests/unit/utilities/test_perf_timer.py @@ -10,37 +10,39 @@ def test_perf_timer_simple(): with PerfTimer() as timer: - time.sleep(1) - assert approx(timer.elapsed_seconds()) == 1 + time.sleep(0.4) + assert approx(timer.elapsed_seconds()) == 0.4 - assert approx(timer.elapsed_seconds()) == 1 + assert approx(timer.elapsed_seconds()) == 0.4 def test_perf_timer_paused_timer(): with PerfTimer() as current_timer: - time.sleep(1) - assert approx(current_timer.elapsed_seconds()) == 1 + time.sleep(0.5) + assert approx(current_timer.elapsed_seconds()) == 0.5 with current_timer.pause(): - time.sleep(2) - assert approx(current_timer.elapsed_seconds()) == 1 - assert approx(current_timer.elapsed_seconds()) == 1 - time.sleep(1) + time.sleep(0.3) + assert approx(current_timer.elapsed_seconds()) == 0.5 + assert approx(current_timer.elapsed_seconds()) == 0.5 + time.sleep(0.2) - assert approx(current_timer.elapsed_seconds()) == 2 + assert approx(current_timer.elapsed_seconds()) == 0.7 def test_generator_with_paused_timer(): + n = 4 + def generator_function(): with PerfTimer() as inner_timer: time.sleep(1) - for i in range(10): + for i in range(n): time.sleep(0.2) with inner_timer.pause(): time.sleep(0.2) yield i - assert approx(inner_timer.elapsed_seconds()) == 1 + 0.2 * 10 + assert approx(inner_timer.elapsed_seconds()) == 1 + 0.2 * n with PerfTimer() as outer_timer: seq = generator_function() list([i for i in seq]) - assert approx(outer_timer.elapsed_seconds()) == 1 + 0.2 * 10 + 0.2 * 10 + assert approx(outer_timer.elapsed_seconds()) == 1 + 0.2 * n + 0.2 * n diff --git a/metadata-ingestion/tests/unit/test_serialized_lru_cache.py b/metadata-ingestion/tests/unit/utilities/test_serialized_lru_cache.py similarity index 100% rename from metadata-ingestion/tests/unit/test_serialized_lru_cache.py rename to metadata-ingestion/tests/unit/utilities/test_serialized_lru_cache.py diff --git a/metadata-ingestion/tests/unit/test_topological_sort.py b/metadata-ingestion/tests/unit/utilities/test_topological_sort.py similarity index 100% rename from metadata-ingestion/tests/unit/test_topological_sort.py rename to metadata-ingestion/tests/unit/utilities/test_topological_sort.py diff --git a/metadata-ingestion/tests/unit/test_utilities.py b/metadata-ingestion/tests/unit/utilities/test_utilities.py similarity index 100% rename from metadata-ingestion/tests/unit/test_utilities.py rename to metadata-ingestion/tests/unit/utilities/test_utilities.py From 1980256ade433d2506539faa275fd2295cf47a8b Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 16 Oct 2024 20:47:48 -0700 Subject: [PATCH 09/31] fix(ingest): run sqllineage in process by default (#11650) --- .../ingestion/source/looker/lookml_config.py | 3 - .../src/datahub/ingestion/source/redash.py | 23 +- .../src/datahub/utilities/sql_parser.py | 2 +- .../lookml/lookml_mces_badsql_parser.json | 2820 ----------------- .../tests/integration/lookml/test_lookml.py | 48 - 5 files changed, 13 insertions(+), 2883 deletions(-) delete mode 100644 metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py index 0bcee14ec77a1a..da837da1613864 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py @@ -124,9 +124,6 @@ class LookMLSourceConfig( description="List of regex patterns for LookML views to include in the extraction.", ) parse_table_names_from_sql: bool = Field(True, description="See note below.") - sql_parser: str = Field( - "datahub.utilities.sql_parser.DefaultSQLParser", description="See note below." - ) api: Optional[LookerAPIConfig] project_name: Optional[str] = Field( None, diff --git a/metadata-ingestion/src/datahub/ingestion/source/redash.py b/metadata-ingestion/src/datahub/ingestion/source/redash.py index 38cf0bebcbc12f..5fd63e7f93f92a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redash.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redash.py @@ -2,7 +2,6 @@ import math import sys from dataclasses import dataclass, field -from multiprocessing.pool import ThreadPool from typing import Dict, Iterable, List, Optional, Set, Type import dateutil.parser as dp @@ -43,6 +42,7 @@ from datahub.utilities.lossy_collections import LossyDict, LossyList from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.sql_parser import SQLParser +from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -646,11 +646,11 @@ def _emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]: self.report.total_dashboards = total_dashboards self.report.max_page_dashboards = max_page - dash_exec_pool = ThreadPool(self.config.parallelism) - for response in dash_exec_pool.imap_unordered( - self._process_dashboard_response, range(1, max_page + 1) - ): - yield from response + yield from ThreadedIteratorExecutor.process( + self._process_dashboard_response, + [(page,) for page in range(1, max_page + 1)], + max_workers=self.config.parallelism, + ) def _get_chart_type_from_viz_data(self, viz_data: Dict) -> str: """ @@ -769,11 +769,12 @@ def _emit_chart_mces(self) -> Iterable[MetadataWorkUnit]: logger.info(f"/api/queries total count {total_queries} and max page {max_page}") self.report.total_queries = total_queries self.report.max_page_queries = max_page - chart_exec_pool = ThreadPool(self.config.parallelism) - for response in chart_exec_pool.imap_unordered( - self._process_query_response, range(1, max_page + 1) - ): - yield from response + + yield from ThreadedIteratorExecutor.process( + self._process_query_response, + [(page,) for page in range(1, max_page + 1)], + max_workers=self.config.parallelism, + ) def add_config_to_report(self) -> None: self.report.api_page_limit = self.config.api_page_limit diff --git a/metadata-ingestion/src/datahub/utilities/sql_parser.py b/metadata-ingestion/src/datahub/utilities/sql_parser.py index 61693b52b350fb..b88f8fd8c73029 100644 --- a/metadata-ingestion/src/datahub/utilities/sql_parser.py +++ b/metadata-ingestion/src/datahub/utilities/sql_parser.py @@ -46,7 +46,7 @@ class SqlLineageSQLParser(SQLParser): def __init__( self, sql_query: str, - use_external_process: bool = True, + use_external_process: bool = False, use_raw_names: bool = False, ) -> None: super().__init__(sql_query, use_external_process) diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json 
b/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json deleted file mode 100644 index 5b39e8dd96ac2a..00000000000000 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json +++ /dev/null @@ -1,2820 +0,0 @@ -[ -{ - "entityType": "container", - "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "changeType": "UPSERT", - "aspectName": "containerProperties", - "aspect": { - "json": { - "customProperties": { - "platform": "looker", - "env": "PROD", - "project_name": "lkml_samples" - }, - "name": "lkml_samples", - "env": "PROD" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", - "aspect": { - "json": { - "platform": "urn:li:dataPlatform:looker" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "LookML Project" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Folders" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "SELECT\n is_latest,\n country,\n city,\n timestamp,\n measurement\n FROM\n my_table", - "viewLanguage": "sql" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": 
"urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),country)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),country)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),city)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),city)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),is_latest)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),is_latest)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),timestamp)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),timestamp)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),measurement)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),average_measurement)" - ], - "confidenceScore": 1.0 - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "my_view", - "platform": "urn:li:dataPlatform:looker", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "" - } - }, - "fields": [ - { - "fieldPath": "country", - "nullable": false, - "description": "The country", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": false - }, - { - "fieldPath": "city", - "nullable": false, - "description": "City", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - 
"isPartOfKey": false - }, - { - "fieldPath": "is_latest", - "nullable": false, - "description": "Is latest data", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.BooleanType": {} - } - }, - "nativeDataType": "yesno", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": false - }, - { - "fieldPath": "timestamp", - "nullable": false, - "description": "Timestamp of measurement", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.TimeType": {} - } - }, - "nativeDataType": "time", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - }, - { - "tag": "urn:li:tag:Temporal" - } - ] - }, - "isPartOfKey": false - }, - { - "fieldPath": "average_measurement", - "nullable": false, - "description": "My measurement", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "average", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Measure" - } - ] - }, - "isPartOfKey": false - } - ], - "primaryKeys": [] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "foo.view.lkml", - "looker.model": "data" - }, - "name": "my_view", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "SELECT\n country,\n city,\n timestamp,\n measurement\n FROM\n ${my_view.SQL_TABLE_NAME} AS my_view", - "viewLanguage": "sql" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD)", - "aspects": [ - { - 
"com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),country)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),country)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),city)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),city)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),timestamp)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),timestamp)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),measurement)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),average_measurement)" - ], - "confidenceScore": 1.0 - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "my_derived_view", - "platform": "urn:li:dataPlatform:looker", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "" - } - }, - "fields": [ - { - "fieldPath": "country", - "nullable": false, - "description": "The country", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": false - }, - { - "fieldPath": "city", - "nullable": false, - "description": "City", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": false - }, - { - "fieldPath": "timestamp", - "nullable": false, - "description": "Timestamp of measurement", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.TimeType": {} - } - }, - "nativeDataType": "time", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - }, - { - "tag": "urn:li:tag:Temporal" - } - ] - }, - "isPartOfKey": false - }, - { - "fieldPath": "average_measurement", - "nullable": false, - "description": "My 
measurement", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "average", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Measure" - } - ] - }, - "isPartOfKey": false - } - ], - "primaryKeys": [] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "bar.view.lkml", - "looker.model": "data" - }, - "name": "my_derived_view", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.include_able_view,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.include_able_view,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "view: include_able_view {\n sql_table_name: looker_schema.include_able ;;\n}\n", - "viewLanguage": "lookml" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.include_able_view,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.include_able_view,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", - "type": "VIEW" - } - ] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "included_view_file.view.lkml", - "looker.model": "data" - }, - "name": "include_able_view", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": 
"dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.include_able_view,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.looker_events,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.looker_events,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "include: \"/included_view_file.view\"\n\nview: looker_events {\n sql_table_name: looker_schema.events ;;\n}\n\nview: extending_looker_events {\n extends: [looker_events]\n\n measure: additional_measure {\n type: count\n }\n}\n\nview: autodetect_sql_name_based_on_view_name {}\n\nview: test_include_external_view {\n extends: [include_able_view]\n}\n", - "viewLanguage": "lookml" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.looker_events,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.looker_events,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", - "type": "VIEW" - } - ] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "view_declarations.view.lkml", - "looker.model": "data" - }, - "name": "looker_events", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.looker_events,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 
1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.extending_looker_events,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.extending_looker_events,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "include: \"/included_view_file.view\"\n\nview: looker_events {\n sql_table_name: looker_schema.events ;;\n}\n\nview: extending_looker_events {\n extends: [looker_events]\n\n measure: additional_measure {\n type: count\n }\n}\n\nview: autodetect_sql_name_based_on_view_name {}\n\nview: test_include_external_view {\n extends: [include_able_view]\n}\n", - "viewLanguage": "lookml" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.extending_looker_events,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.extending_looker_events,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD),additional_measure)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.extending_looker_events,PROD),additional_measure)" - ], - "confidenceScore": 1.0 - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "extending_looker_events", - "platform": "urn:li:dataPlatform:looker", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "" - } - }, - "fields": [ - { - "fieldPath": "additional_measure", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "count", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Measure" - } - ] - }, - 
"isPartOfKey": false - } - ], - "primaryKeys": [] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "view_declarations.view.lkml", - "looker.model": "data" - }, - "name": "extending_looker_events", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.extending_looker_events,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.autodetect_sql_name_based_on_view_name,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.autodetect_sql_name_based_on_view_name,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "include: \"/included_view_file.view\"\n\nview: looker_events {\n sql_table_name: looker_schema.events ;;\n}\n\nview: extending_looker_events {\n extends: [looker_events]\n\n measure: additional_measure {\n type: count\n }\n}\n\nview: autodetect_sql_name_based_on_view_name {}\n\nview: test_include_external_view {\n extends: [include_able_view]\n}\n", - "viewLanguage": "lookml" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.autodetect_sql_name_based_on_view_name,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.autodetect_sql_name_based_on_view_name,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.autodetect_sql_name_based_on_view_name,PROD)", - "type": "VIEW" - } - ] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "view_declarations.view.lkml", - "looker.model": "data" - }, - "name": "autodetect_sql_name_based_on_view_name", 
- "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.autodetect_sql_name_based_on_view_name,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.test_include_external_view,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.test_include_external_view,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "include: \"/included_view_file.view\"\n\nview: looker_events {\n sql_table_name: looker_schema.events ;;\n}\n\nview: extending_looker_events {\n extends: [looker_events]\n\n measure: additional_measure {\n type: count\n }\n}\n\nview: autodetect_sql_name_based_on_view_name {}\n\nview: test_include_external_view {\n extends: [include_able_view]\n}\n", - "viewLanguage": "lookml" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.test_include_external_view,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.test_include_external_view,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", - "type": "VIEW" - } - ] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "view_declarations.view.lkml", - "looker.model": "data" - }, - "name": "test_include_external_view", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.test_include_external_view,PROD)", - "changeType": "UPSERT", - 
"aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "SELECT date AS DATE,\n platform AS aliased_platform,\n country", - "viewLanguage": "sql" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/nested" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD),date)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),date)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD),platform)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),aliased_platform)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD),country)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),country)" - ], - 
"confidenceScore": 1.0 - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "fragment_derived_view", - "platform": "urn:li:dataPlatform:looker", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "" - } - }, - "fields": [ - { - "fieldPath": "date", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NullType": {} - } - }, - "nativeDataType": "unknown", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "aliased_platform", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NullType": {} - } - }, - "nativeDataType": "unknown", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "country", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NullType": {} - } - }, - "nativeDataType": "unknown", - "recursive": false, - "isPartOfKey": false - } - ], - "primaryKeys": [] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "nested/fragment_derived.view.lkml", - "looker.model": "data" - }, - "name": "fragment_derived_view", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - }, - { - "id": "nested" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n {% condition order_region %} order.region {% endcondition %}\n GROUP BY 1", - "viewLanguage": "sql" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ 
- "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD),customer_id)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD),customer_id)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD),sale_price)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD),lifetime_spend)" - ], - "confidenceScore": 1.0 - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "customer_facts", - "platform": "urn:li:dataPlatform:looker", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "" - } - }, - "fields": [ - { - "fieldPath": "customer_id", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NullType": {} - } - }, - "nativeDataType": "unknown", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "lifetime_spend", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NullType": {} - } - }, - "nativeDataType": "unknown", - "recursive": false, - "isPartOfKey": false - } - ], - "primaryKeys": [] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "liquid.view.lkml", - "looker.model": "data" - }, - "name": "customer_facts", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.ability,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - 
] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.ability,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "view: ability {\n sql_table_name: \"ECOMMERCE\".\"ABILITY\"\n ;;\n\n dimension: pk {\n type: number\n sql: ${TABLE}.\"PK\" ;;\n }\n\n measure: count {\n type: count\n drill_fields: []\n }\n}\n", - "viewLanguage": "lookml" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.ability,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.ability,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.ecommerce.ability,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.ecommerce.ability,PROD),pk)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.ability,PROD),pk)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.ecommerce.ability,PROD),count)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.ability,PROD),count)" - ], - "confidenceScore": 1.0 - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "ability", - "platform": "urn:li:dataPlatform:looker", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "" - } - }, - "fields": [ - { - "fieldPath": "pk", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "number", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": false - }, - { - "fieldPath": "count", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "count", - "recursive": false, - "globalTags": { 
- "tags": [ - { - "tag": "urn:li:tag:Measure" - } - ] - }, - "isPartOfKey": false - } - ], - "primaryKeys": [] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "ability.view.lkml", - "looker.model": "data" - }, - "name": "ability", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.ability,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "view: owners {\n dimension: id {\n primary_key: yes\n sql: ${TABLE}.id ;;\n }\n dimension: owner_name {\n sql: ${TABLE}.owner_name ;;\n }\n}", - "viewLanguage": "lookml" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.owners,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.owners,PROD),id)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD),id)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.owners,PROD),owner_name)" - ], - "downstreamType": 
"FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD),owner_name)" - ], - "confidenceScore": 1.0 - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "owners", - "platform": "urn:li:dataPlatform:looker", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "" - } - }, - "fields": [ - { - "fieldPath": "id", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": true - }, - { - "fieldPath": "owner_name", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": false - } - ], - "primaryKeys": [ - "id" - ] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "owners.view.lkml", - "looker.model": "data" - }, - "name": "owners", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "explore_source: my_view_explore {\n bind_all_filters: yes\n\n column: country {\n field: my_view_explore.country\n }\n\n column: city {\n field: my_view_explore.city\n }\n\n column: is_latest {\n field: my_view_explore.is_latest\n }\n\n derived_column: derived_col {\n sql: coalesce(country, 'US') ;;\n }\n}", - "viewLanguage": "lookml" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": 
"urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD),my_view_explore.country)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD),country)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD),my_view_explore.city)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD),city)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD),my_view_explore.country)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD),unique_countries)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD),my_view_explore.is_latest)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD),derived_col)" - ], - "confidenceScore": 1.0 - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "view_derived_explore", - "platform": "urn:li:dataPlatform:looker", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "" - } - }, - "fields": [ - { - "fieldPath": "country", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": false - }, - { - "fieldPath": "city", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": 
false - }, - { - "fieldPath": "unique_countries", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "count_distinct", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Measure" - } - ] - }, - "isPartOfKey": false - }, - { - "fieldPath": "derived_col", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "sum", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Measure" - } - ] - }, - "isPartOfKey": false - } - ], - "primaryKeys": [] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "native_derived_table.view.lkml", - "looker.model": "data" - }, - "name": "view_derived_explore", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.flights,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.flights,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "view: flights {\n sql_table_name: flightstats.accidents ;;\n\n dimension: id {\n label: \"id\"\n primary_key: yes\n type: number\n sql: ${TABLE}.id ;;\n }\n}\n\n# override type of id parameter\nview: +flights {\n dimension: id {\n label: \"id\"\n primary_key: yes\n type: string\n sql: ${TABLE}.id ;;\n }\n}", - "viewLanguage": "lookml" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.flights,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.flights,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - 
"upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.flightstats.accidents,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.flightstats.accidents,PROD),id)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.flights,PROD),id)" - ], - "confidenceScore": 1.0 - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "flights", - "platform": "urn:li:dataPlatform:looker", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "" - } - }, - "fields": [ - { - "fieldPath": "id", - "nullable": false, - "description": "", - "label": "id", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "number", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": true - } - ], - "primaryKeys": [ - "id" - ] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "flights.view.lkml", - "looker.model": "data" - }, - "name": "flights", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.flights,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { - "urn": "urn:li:tag:Dimension", - "aspects": [ - { - "com.linkedin.pegasus2avro.tag.TagProperties": { - "name": "Dimension", - "description": "A tag that is applied to all dimension fields." - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { - "urn": "urn:li:tag:Temporal", - "aspects": [ - { - "com.linkedin.pegasus2avro.tag.TagProperties": { - "name": "Temporal", - "description": "A tag that is applied to all time-based (temporal) fields such as timestamps or durations." - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { - "urn": "urn:li:tag:Measure", - "aspects": [ - { - "com.linkedin.pegasus2avro.tag.TagProperties": { - "name": "Measure", - "description": "A tag that is applied to all measures (metrics). 
Measures are typically the columns that you aggregate on" - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "tag", - "entityUrn": "urn:li:tag:Dimension", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "tag", - "entityUrn": "urn:li:tag:Measure", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "tag", - "entityUrn": "urn:li:tag:Temporal", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -} -] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/lookml/test_lookml.py b/metadata-ingestion/tests/integration/lookml/test_lookml.py index e4eb564e3e86b7..94b3b103d0548c 100644 --- a/metadata-ingestion/tests/integration/lookml/test_lookml.py +++ b/metadata-ingestion/tests/integration/lookml/test_lookml.py @@ -10,7 +10,6 @@ from freezegun import freeze_time from looker_sdk.sdk.api40.models import DBConnection -from datahub.configuration.common import PipelineExecutionError from datahub.ingestion.run.pipeline import Pipeline from datahub.ingestion.source.file import read_metadata_file from datahub.ingestion.source.looker.looker_template_language import ( @@ -518,53 +517,6 @@ def ingestion_test( ) -@freeze_time(FROZEN_TIME) -def test_lookml_bad_sql_parser(pytestconfig, tmp_path, mock_time): - """Incorrect specification of sql parser should not fail ingestion""" - test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml" - mce_out = "lookml_mces_badsql_parser.json" - pipeline = Pipeline.create( - { - "run_id": "lookml-test", - "source": { - "type": "lookml", - "config": { - "base_folder": str(test_resources_dir / "lkml_samples"), - "connection_to_platform_map": { - "my_connection": { - "platform": "snowflake", - "default_db": "default_db", - "default_schema": "default_schema", - } - }, - "parse_table_names_from_sql": True, - "project_name": "lkml_samples", - "sql_parser": "bad.sql.Parser", - "emit_reachable_views_only": False, - "process_refinements": False, - }, - }, - "sink": { - "type": "file", - "config": { - "filename": f"{tmp_path}/{mce_out}", - }, - }, - } - ) - pipeline.run() - pipeline.pretty_print_summary() - pipeline.raise_from_status(raise_warnings=False) - with pytest.raises(PipelineExecutionError): # we expect the source to have warnings - pipeline.raise_from_status(raise_warnings=True) - - mce_helpers.check_golden_file( - pytestconfig, - output_path=tmp_path / mce_out, - golden_path=test_resources_dir / mce_out, - ) - - @freeze_time(FROZEN_TIME) def test_lookml_git_info(pytestconfig, tmp_path, mock_time): """Add github info to config""" From fc8465e18ea74a5185da1a49f92d30a99e5f1882 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 17 Oct 2024 08:14:06 -0700 Subject: [PATCH 10/31] feat(ingest): add offline flag to SQL parser CLI (#11635) --- .../src/datahub/cli/check_cli.py | 35 +++++++++++++++---- .../datahub/sql_parsing/sqlglot_lineage.py | 4 ++- 2 files 
changed, 32 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/src/datahub/cli/check_cli.py b/metadata-ingestion/src/datahub/cli/check_cli.py index 6e9bfddd350f92..39ed1b2bfea087 100644 --- a/metadata-ingestion/src/datahub/cli/check_cli.py +++ b/metadata-ingestion/src/datahub/cli/check_cli.py @@ -188,9 +188,13 @@ def sql_format(sql: str, platform: str) -> None: @click.option( "--sql", type=str, - required=True, help="The SQL query to parse", ) +@click.option( + "--sql-file", + type=click.Path(exists=True, dir_okay=False, readable=True), + help="The SQL file to parse", +) @click.option( "--platform", type=str, @@ -218,25 +222,44 @@ def sql_format(sql: str, platform: str) -> None: type=str, help="The default schema to use for unqualified table names", ) +@click.option( + "--online/--offline", + type=bool, + is_flag=True, + default=True, + help="Run in offline mode and disable schema-aware parsing.", +) @telemetry.with_telemetry() def sql_lineage( - sql: str, + sql: Optional[str], + sql_file: Optional[str], platform: str, default_db: Optional[str], default_schema: Optional[str], platform_instance: Optional[str], env: str, + online: bool, ) -> None: """Parse the lineage of a SQL query. - This performs schema-aware parsing in order to generate column-level lineage. - If the relevant tables are not in DataHub, this will be less accurate. + In online mode (the default), we perform schema-aware parsing in order to generate column-level lineage. + If offline mode is enabled or if the relevant tables are not in DataHub, this will be less accurate. """ - graph = get_default_graph() + from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result + + if sql is None: + if sql_file is None: + raise click.UsageError("Either --sql or --sql-file must be provided") + sql = pathlib.Path(sql_file).read_text() + + graph = None + if online: + graph = get_default_graph() - lineage = graph.parse_sql_lineage( + lineage = create_lineage_sql_parsed_result( sql, + graph=graph, platform=platform, platform_instance=platform_instance, env=env, diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py index 0806d0ec774fe7..273e9d0f9f0b1d 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py @@ -656,7 +656,9 @@ def _get_direct_raw_col_upstreams( # Parse the column name out of the node name. # Sqlglot calls .sql(), so we have to do the inverse. normalized_col = sqlglot.parse_one(node.name).this.name - if node.subfield: + if hasattr(node, "subfield") and node.subfield: + # The hasattr check is necessary, since it lets us be compatible with + # sqlglot versions that don't have the subfield attribute. 
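A minimal sketch of the offline path introduced in this patch, assuming the create_lineage_sql_parsed_result() call shown in the check_cli.py hunk also accepts default_db/default_schema keyword arguments (mirroring the CLI options) and that the returned result exposes in_tables, out_tables and column_lineage; the query text is only an illustrative placeholder:

from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

# Offline mode: graph=None skips get_default_graph(), so parsing is not
# schema-aware and column-level lineage may be less accurate.
result = create_lineage_sql_parsed_result(
    "SELECT id, owner_name FROM default_schema.owners",  # placeholder query
    platform="redshift",
    platform_instance=None,
    env="PROD",
    graph=None,
    default_db="default_db",          # assumed kwargs, matching the CLI's
    default_schema="default_schema",  # --default-db / --default-schema options
)
print(result.in_tables, result.out_tables)
print(result.column_lineage)
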
normalized_col = f"{normalized_col}.{node.subfield}" direct_raw_col_upstreams.add( From e9060b8afbe056e8404c4c65daff28c307ef0f9a Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 17 Oct 2024 10:08:37 -0700 Subject: [PATCH 11/31] fix(ingest/redshift): reduce sequence limit for LISTAGG (#11621) Co-authored-by: treff7es Co-authored-by: Aseem Bansal --- .../ingestion/source/redshift/query.py | 281 +++++++++--------- .../unit/redshift/redshift_query_mocker.py | 40 +-- 2 files changed, 143 insertions(+), 178 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py index 39370b93b561c5..f7fad574f7fbe7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py @@ -4,6 +4,12 @@ redshift_datetime_format = "%Y-%m-%d %H:%M:%S" +# See https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext +# for why we need to limit the size of the query text. +# We use 290 instead instead of the standard 320, because escape characters can add to the length. +_QUERY_SEQUENCE_LIMIT = 290 + + class RedshiftCommonQuery: CREATE_TEMP_TABLE_CLAUSE = "create temp table" CREATE_TEMPORARY_TABLE_CLAUSE = "create temporary table" @@ -487,71 +493,70 @@ def list_unload_commands_sql( def list_insert_create_queries_sql( db_name: str, start_time: datetime, end_time: datetime ) -> str: - return """ - with query_txt as - ( - select - query, - pid, - LISTAGG(case - when LEN(RTRIM(text)) = 0 then text - else RTRIM(text) - end) within group ( - order by - sequence) as ddl - from - ( - select - query, - pid, - text, - sequence - from - STL_QUERYTEXT - where - sequence < 320 - order by - sequence - ) - group by - query, - pid - ) - select - distinct tbl as target_table_id, - sti.schema as target_schema, - sti.table as target_table, - sti.database as cluster, - usename as username, - ddl, - sq.query as query_id, - min(si.starttime) as timestamp, - ANY_VALUE(pid) as session_id - from - stl_insert as si - left join SVV_TABLE_INFO sti on - sti.table_id = tbl - left join svl_user_info sui on - si.userid = sui.usesysid - left join query_txt sq on - si.query = sq.query - left join stl_load_commits slc on - slc.query = si.query - where - sui.usename <> 'rdsdb' - and cluster = '{db_name}' - and slc.query IS NULL - and si.starttime >= '{start_time}' - and si.starttime < '{end_time}' - group by - target_table_id, - target_schema, - target_table, - cluster, - username, - ddl, - sq.query + return """\ +with query_txt as ( + select + query, + pid, + LISTAGG(case + when LEN(RTRIM(text)) = 0 then text + else RTRIM(text) + end) within group ( + order by sequence + ) as ddl + from ( + select + query, + pid, + text, + sequence + from + STL_QUERYTEXT + where + sequence < {_QUERY_SEQUENCE_LIMIT} + order by + sequence + ) + group by + query, + pid +) +select + distinct tbl as target_table_id, + sti.schema as target_schema, + sti.table as target_table, + sti.database as cluster, + usename as username, + ddl, + sq.query as query_id, + min(si.starttime) as timestamp, + ANY_VALUE(pid) as session_id +from + stl_insert as si +left join SVV_TABLE_INFO sti on + sti.table_id = tbl +left join svl_user_info sui on + si.userid = sui.usesysid +left join query_txt sq on + si.query = sq.query +left join stl_load_commits slc on + slc.query = si.query +where + sui.usename <> 'rdsdb' + and cluster = '{db_name}' + and slc.query IS 
NULL + and si.starttime >= '{start_time}' + and si.starttime < '{end_time}' +group by + target_table_id, + target_schema, + target_table, + cluster, + username, + ddl, + sq.query """.format( + _QUERY_SEQUENCE_LIMIT=_QUERY_SEQUENCE_LIMIT, # We need the original database name for filtering db_name=db_name, start_time=start_time.strftime(redshift_datetime_format), @@ -564,84 +569,82 @@ def temp_table_ddl_query(start_time: datetime, end_time: datetime) -> str: end_time_str: str = end_time.strftime(redshift_datetime_format) - return rf"""-- DataHub Redshift Source temp table DDL query + return rf"""\ +-- DataHub Redshift Source temp table DDL query +select + * +from ( + select + session_id, + transaction_id, + start_time, + userid, + REGEXP_REPLACE(REGEXP_SUBSTR(REGEXP_REPLACE(query_text,'\\\\n','\\n'), '(CREATE(?:[\\n\\s\\t]+(?:temp|temporary))?(?:[\\n\\s\\t]+)table(?:[\\n\\s\\t]+)[^\\n\\s\\t()-]+)', 0, 1, 'ipe'),'[\\n\\s\\t]+',' ',1,'p') as create_command, + query_text, + row_number() over ( + partition by session_id, TRIM(query_text) + order by start_time desc + ) rn + from ( + select + pid as session_id, + xid as transaction_id, + starttime as start_time, + type, + query_text, + userid + from ( select - * + starttime, + pid, + xid, + type, + userid, + LISTAGG(case + when LEN(RTRIM(text)) = 0 then text + else RTRIM(text) + end, + '') within group ( + order by sequence + ) as query_text from - ( - select - session_id, - transaction_id, - start_time, - userid, - REGEXP_REPLACE(REGEXP_SUBSTR(REGEXP_REPLACE(query_text,'\\\\n','\\n'), '(CREATE(?:[\\n\\s\\t]+(?:temp|temporary))?(?:[\\n\\s\\t]+)table(?:[\\n\\s\\t]+)[^\\n\\s\\t()-]+)', 0, 1, 'ipe'),'[\\n\\s\\t]+',' ',1,'p') as create_command, - query_text, - row_number() over ( - partition by session_id, TRIM(query_text) - order by start_time desc - ) rn - from - ( - select - pid as session_id, - xid as transaction_id, - starttime as start_time, - type, - query_text, - userid - from - ( - select - starttime, - pid, - xid, - type, - userid, - LISTAGG(case - when LEN(RTRIM(text)) = 0 then text - else RTRIM(text) - end, - '') within group ( - order by sequence - ) as query_text - from - SVL_STATEMENTTEXT - where - type in ('DDL', 'QUERY') - AND starttime >= '{start_time_str}' - AND starttime < '{end_time_str}' - -- See https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext - AND sequence < 320 - group by - starttime, - pid, - xid, - type, - userid - order by - starttime, - pid, - xid, - type, - userid - asc) - where - type in ('DDL', 'QUERY') - ) - where - (create_command ilike 'create temp table %' - or create_command ilike 'create temporary table %' - -- we want to get all the create table statements and not just temp tables if non temp table is created and dropped in the same transaction - or create_command ilike 'create table %') - -- Redshift creates temp tables with the following names: volt_tt_%. We need to filter them out. 
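A quick sanity check on the lowered sequence cap, assuming the 200-character text chunks that SVL_STATEMENTTEXT/STL_QUERYTEXT store per sequence row and the 65535-byte LISTAGG result limit:

CHUNK_CHARS = 200      # max characters per SVL_STATEMENTTEXT/STL_QUERYTEXT row
LISTAGG_LIMIT = 65535  # max size of a LISTAGG result

for cap in (320, 290):
    reassembled = cap * CHUNK_CHARS
    print(f"sequence < {cap}: up to {reassembled} chars, "
          f"{LISTAGG_LIMIT - reassembled} bytes of headroom")

# sequence < 320: up to 64000 chars, 1535 bytes of headroom
# sequence < 290: up to 58000 chars, 7535 bytes of headroom
#
# Escape characters added while the text is reassembled can consume the old
# 1.5 KB margin, which is why the limit is lowered to 290 here.
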
- and query_text not ilike 'CREATE TEMP TABLE volt_tt_%' - and create_command not like 'CREATE TEMP TABLE volt_tt_' - -- We need to filter out our query and it was not possible earlier when we did not have any comment in the query - and query_text not ilike '%https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext%' - - ) + SVL_STATEMENTTEXT where - rn = 1 + type in ('DDL', 'QUERY') + AND starttime >= '{start_time_str}' + AND starttime < '{end_time_str}' + AND sequence < {_QUERY_SEQUENCE_LIMIT} + group by + starttime, + pid, + xid, + type, + userid + order by + starttime, + pid, + xid, + type, + userid + asc + ) + where + type in ('DDL', 'QUERY') + ) + where + (create_command ilike 'create temp table %' + or create_command ilike 'create temporary table %' + -- we want to get all the create table statements and not just temp tables if non temp table is created and dropped in the same transaction + or create_command ilike 'create table %') + -- Redshift creates temp tables with the following names: volt_tt_%. We need to filter them out. + and query_text not ilike 'CREATE TEMP TABLE volt_tt_%' + and create_command not like 'CREATE TEMP TABLE volt_tt_' + -- We need to filter out our query and it was not possible earlier when we did not have any comment in the query + and query_text not ilike '%https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext%' + +) +where + rn = 1 """ # Add this join to the sql query for more metrics on completed queries diff --git a/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py b/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py index ada76e624032ba..06b592d42914bf 100644 --- a/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py +++ b/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py @@ -56,45 +56,7 @@ def mock_stl_insert_table_cursor(cursor: MagicMock) -> None: query_vs_cursor_mocker = { ( - "-- DataHub Redshift Source temp table DDL query\n select\n *\n " - "from\n (\n select\n session_id,\n " - " transaction_id,\n start_time,\n userid,\n " - " REGEXP_REPLACE(REGEXP_SUBSTR(REGEXP_REPLACE(query_text,'\\\\\\\\n','\\\\n'), '(CREATE(?:[" - "\\\\n\\\\s\\\\t]+(?:temp|temporary))?(?:[\\\\n\\\\s\\\\t]+)table(?:[\\\\n\\\\s\\\\t]+)[" - "^\\\\n\\\\s\\\\t()-]+)', 0, 1, 'ipe'),'[\\\\n\\\\s\\\\t]+',' ',1,'p') as create_command,\n " - " query_text,\n row_number() over (\n partition " - "by session_id, TRIM(query_text)\n order by start_time desc\n ) rn\n " - " from\n (\n select\n pid " - "as session_id,\n xid as transaction_id,\n starttime " - "as start_time,\n type,\n query_text,\n " - " userid\n from\n (\n " - "select\n starttime,\n pid,\n " - " xid,\n type,\n userid,\n " - " LISTAGG(case\n when LEN(RTRIM(text)) = 0 then text\n " - " else RTRIM(text)\n end,\n " - " '') within group (\n order by sequence\n " - " ) as query_text\n from\n " - "SVL_STATEMENTTEXT\n where\n type in ('DDL', " - "'QUERY')\n AND starttime >= '2024-01-01 12:00:00'\n " - " AND starttime < '2024-01-10 12:00:00'\n -- See " - "https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl" - "-statementtext\n AND sequence < 320\n group by\n " - " starttime,\n pid,\n " - "xid,\n type,\n userid\n " - " order by\n starttime,\n pid,\n " - " xid,\n type,\n userid\n " - " asc)\n where\n type in ('DDL', " - "'QUERY')\n )\n where\n (create_command ilike " - "'create temp table %'\n or create_command ilike 'create temporary 
table %'\n " - " -- we want to get all the create table statements and not just temp tables " - "if non temp table is created and dropped in the same transaction\n or " - "create_command ilike 'create table %')\n -- Redshift creates temp tables with " - "the following names: volt_tt_%. We need to filter them out.\n and query_text not " - "ilike 'CREATE TEMP TABLE volt_tt_%'\n and create_command not like 'CREATE TEMP " - "TABLE volt_tt_'\n -- We need to filter out our query and it was not possible " - "earlier when we did not have any comment in the query\n and query_text not ilike " - "'%https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl" - "-statementtext%'\n\n )\n where\n rn = 1\n " + "\\\n-- DataHub Redshift Source temp table DDL query\nselect\n *\nfrom (\n select\n session_id,\n transaction_id,\n start_time,\n userid,\n REGEXP_REPLACE(REGEXP_SUBSTR(REGEXP_REPLACE(query_text,'\\\\\\\\n','\\\\n'), '(CREATE(?:[\\\\n\\\\s\\\\t]+(?:temp|temporary))?(?:[\\\\n\\\\s\\\\t]+)table(?:[\\\\n\\\\s\\\\t]+)[^\\\\n\\\\s\\\\t()-]+)', 0, 1, 'ipe'),'[\\\\n\\\\s\\\\t]+',' ',1,'p') as create_command,\n query_text,\n row_number() over (\n partition by session_id, TRIM(query_text)\n order by start_time desc\n ) rn\n from (\n select\n pid as session_id,\n xid as transaction_id,\n starttime as start_time,\n type,\n query_text,\n userid\n from (\n select\n starttime,\n pid,\n xid,\n type,\n userid,\n LISTAGG(case\n when LEN(RTRIM(text)) = 0 then text\n else RTRIM(text)\n end,\n '') within group (\n order by sequence\n ) as query_text\n from\n SVL_STATEMENTTEXT\n where\n type in ('DDL', 'QUERY')\n AND starttime >= '2024-01-01 12:00:00'\n AND starttime < '2024-01-10 12:00:00'\n AND sequence < 290\n group by\n starttime,\n pid,\n xid,\n type,\n userid\n order by\n starttime,\n pid,\n xid,\n type,\n userid\n asc\n )\n where\n type in ('DDL', 'QUERY')\n )\n where\n (create_command ilike 'create temp table %'\n or create_command ilike 'create temporary table %'\n -- we want to get all the create table statements and not just temp tables if non temp table is created and dropped in the same transaction\n or create_command ilike 'create table %')\n -- Redshift creates temp tables with the following names: volt_tt_%. 
We need to filter them out.\n and query_text not ilike 'CREATE TEMP TABLE volt_tt_%'\n and create_command not like 'CREATE TEMP TABLE volt_tt_'\n -- We need to filter out our query and it was not possible earlier when we did not have any comment in the query\n and query_text not ilike '%https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext%'\n\n)\nwhere\n rn = 1\n " ): mock_temp_table_cursor, "select * from test_collapse_temp_lineage": mock_stl_insert_table_cursor, } From 2101ff15f5a618b88c6efd30e06db3c268efb248 Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Thu, 17 Oct 2024 19:57:02 +0200 Subject: [PATCH 12/31] fix(ingest/bigquery): Not setting platform instance for bigquery platform resources (#11659) --- .../ingestion/source/bigquery_v2/bigquery.py | 3 - .../bigquery_platform_resource_helper.py | 2 +- .../bigquery_v2/bigquery_mcp_golden.json | 90 +++++++++---------- 3 files changed, 45 insertions(+), 50 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index c30dade921d257..3c6202cc7cbfaf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -65,9 +65,6 @@ # We can't use close as it is not called if the ingestion is not successful def cleanup(config: BigQueryV2Config) -> None: if config._credentials_path is not None: - logger.debug( - f"Deleting temporary credential file at {config._credentials_path}" - ) os.unlink(config._credentials_path) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py index d2da895be985dc..9da2aceb19220a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py @@ -42,7 +42,7 @@ def platform_resource_key(self) -> PlatformResourceKey: return PlatformResourceKey( platform="bigquery", resource_type="BigQueryLabelInfo", - platform_instance=self.project, + platform_instance=None, primary_key=self.label.primary_key(), ) diff --git a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json index b268926f155b74..640ee1bf436b03 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json +++ b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json @@ -201,7 +201,7 @@ }, { "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:79d443a7956814fdab2168e11392bbf2", + "entityUrn": "urn:li:platformResource:11f75a60f0dbd414676852e46a45e39b", "changeType": "UPSERT", "aspectName": "platformResourceInfo", "aspect": { @@ -221,30 +221,29 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", "lastRunId": "no-run-id-provided" } }, { "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:79d443a7956814fdab2168e11392bbf2", + "entityUrn": "urn:li:platformResource:11f75a60f0dbd414676852e46a45e39b", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:bigquery", - 
"instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)" + "platform": "urn:li:dataPlatform:bigquery" } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", "lastRunId": "no-run-id-provided" } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:11f75a60f0dbd414676852e46a45e39b", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -254,13 +253,13 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", "lastRunId": "no-run-id-provided" } }, { "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:0a8c87e84bd90486c4fd57bbae6557e3", + "entityUrn": "urn:li:platformResource:99b34051bd90d28d922b0e107277a916", "changeType": "UPSERT", "aspectName": "platformResourceInfo", "aspect": { @@ -280,19 +279,50 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", "lastRunId": "no-run-id-provided" } }, { "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:0a8c87e84bd90486c4fd57bbae6557e3", + "entityUrn": "urn:li:platformResource:99b34051bd90d28d922b0e107277a916", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:bigquery", - "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)" + "platform": "urn:li:dataPlatform:bigquery" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:99b34051bd90d28d922b0e107277a916", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false } }, "systemMetadata": { @@ -395,38 +425,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:79d443a7956814fdab2168e11392bbf2", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:0a8c87e84bd90486c4fd57bbae6557e3", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", From bb4a50a08aed04550402d384fe12379ca2deaf10 Mon Sep 17 00:00:00 2001 From: Harshal Sheth 
Date: Thu, 17 Oct 2024 11:50:42 -0700 Subject: [PATCH 13/31] fix(ingest/dbt): fix bug in CLL pruning (#11614) --- .../src/datahub/ingestion/source/dbt/dbt_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index c15f1deb43a3af..4cd3c934ce6348 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -1074,8 +1074,8 @@ def add_node_to_cll_list(dbt_name: str) -> None: for upstream in all_nodes_map[dbt_name].upstream_nodes: schema_nodes.add(upstream) - upstream_node = all_nodes_map[upstream] - if upstream_node.is_ephemeral_model(): + upstream_node = all_nodes_map.get(upstream) + if upstream_node and upstream_node.is_ephemeral_model(): add_node_to_cll_list(upstream) cll_nodes.add(dbt_name) From 760f997100d5ddf81b6cbd99caf3754941b99dc6 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 17 Oct 2024 12:33:04 -0700 Subject: [PATCH 14/31] fix(ingest/redshift): fix syntax error in temp sql (#11661) --- .../src/datahub/ingestion/source/redshift/exception.py | 8 ++++---- .../src/datahub/ingestion/source/redshift/query.py | 5 ++--- .../tests/unit/redshift/redshift_query_mocker.py | 2 +- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/exception.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/exception.py index 43ad5bfcefdf1b..ed0856fc1e2924 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/exception.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/exception.py @@ -40,25 +40,25 @@ def report_redshift_failure( error_message = str(e).lower() if "permission denied" in error_message: if "svv_table_info" in error_message: - report.report_failure( + report.failure( title="Permission denied", message="Failed to extract metadata due to insufficient permission to access 'svv_table_info' table. Please ensure the provided database user has access.", exc=e, ) elif "svl_user_info" in error_message: - report.report_failure( + report.failure( title="Permission denied", message="Failed to extract metadata due to insufficient permission to access 'svl_user_info' table. 
Please ensure the provided database user has access.", exc=e, ) else: - report.report_failure( + report.failure( title="Permission denied", message="Failed to extract metadata due to insufficient permissions.", exc=e, ) else: - report.report_failure( + report.failure( title="Failed to extract some metadata", message="Failed to extract some metadata from Redshift.", exc=e, diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py index f7fad574f7fbe7..b18b526ef30fce 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py @@ -569,8 +569,7 @@ def temp_table_ddl_query(start_time: datetime, end_time: datetime) -> str: end_time_str: str = end_time.strftime(redshift_datetime_format) - return rf"""\ --- DataHub Redshift Source temp table DDL query + return rf"""-- DataHub Redshift Source temp table DDL query select * from ( @@ -645,7 +644,7 @@ def temp_table_ddl_query(start_time: datetime, end_time: datetime) -> str: ) where rn = 1 - """ +""" # Add this join to the sql query for more metrics on completed queries # LEFT JOIN svl_query_metrics_summary sqms ON ss.query = sqms.query diff --git a/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py b/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py index 06b592d42914bf..10c0250e37e37f 100644 --- a/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py +++ b/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py @@ -56,7 +56,7 @@ def mock_stl_insert_table_cursor(cursor: MagicMock) -> None: query_vs_cursor_mocker = { ( - "\\\n-- DataHub Redshift Source temp table DDL query\nselect\n *\nfrom (\n select\n session_id,\n transaction_id,\n start_time,\n userid,\n REGEXP_REPLACE(REGEXP_SUBSTR(REGEXP_REPLACE(query_text,'\\\\\\\\n','\\\\n'), '(CREATE(?:[\\\\n\\\\s\\\\t]+(?:temp|temporary))?(?:[\\\\n\\\\s\\\\t]+)table(?:[\\\\n\\\\s\\\\t]+)[^\\\\n\\\\s\\\\t()-]+)', 0, 1, 'ipe'),'[\\\\n\\\\s\\\\t]+',' ',1,'p') as create_command,\n query_text,\n row_number() over (\n partition by session_id, TRIM(query_text)\n order by start_time desc\n ) rn\n from (\n select\n pid as session_id,\n xid as transaction_id,\n starttime as start_time,\n type,\n query_text,\n userid\n from (\n select\n starttime,\n pid,\n xid,\n type,\n userid,\n LISTAGG(case\n when LEN(RTRIM(text)) = 0 then text\n else RTRIM(text)\n end,\n '') within group (\n order by sequence\n ) as query_text\n from\n SVL_STATEMENTTEXT\n where\n type in ('DDL', 'QUERY')\n AND starttime >= '2024-01-01 12:00:00'\n AND starttime < '2024-01-10 12:00:00'\n AND sequence < 290\n group by\n starttime,\n pid,\n xid,\n type,\n userid\n order by\n starttime,\n pid,\n xid,\n type,\n userid\n asc\n )\n where\n type in ('DDL', 'QUERY')\n )\n where\n (create_command ilike 'create temp table %'\n or create_command ilike 'create temporary table %'\n -- we want to get all the create table statements and not just temp tables if non temp table is created and dropped in the same transaction\n or create_command ilike 'create table %')\n -- Redshift creates temp tables with the following names: volt_tt_%. 
We need to filter them out.\n and query_text not ilike 'CREATE TEMP TABLE volt_tt_%'\n and create_command not like 'CREATE TEMP TABLE volt_tt_'\n -- We need to filter out our query and it was not possible earlier when we did not have any comment in the query\n and query_text not ilike '%https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext%'\n\n)\nwhere\n rn = 1\n " + "-- DataHub Redshift Source temp table DDL query\nselect\n *\nfrom (\n select\n session_id,\n transaction_id,\n start_time,\n userid,\n REGEXP_REPLACE(REGEXP_SUBSTR(REGEXP_REPLACE(query_text,'\\\\\\\\n','\\\\n'), '(CREATE(?:[\\\\n\\\\s\\\\t]+(?:temp|temporary))?(?:[\\\\n\\\\s\\\\t]+)table(?:[\\\\n\\\\s\\\\t]+)[^\\\\n\\\\s\\\\t()-]+)', 0, 1, 'ipe'),'[\\\\n\\\\s\\\\t]+',' ',1,'p') as create_command,\n query_text,\n row_number() over (\n partition by session_id, TRIM(query_text)\n order by start_time desc\n ) rn\n from (\n select\n pid as session_id,\n xid as transaction_id,\n starttime as start_time,\n type,\n query_text,\n userid\n from (\n select\n starttime,\n pid,\n xid,\n type,\n userid,\n LISTAGG(case\n when LEN(RTRIM(text)) = 0 then text\n else RTRIM(text)\n end,\n '') within group (\n order by sequence\n ) as query_text\n from\n SVL_STATEMENTTEXT\n where\n type in ('DDL', 'QUERY')\n AND starttime >= '2024-01-01 12:00:00'\n AND starttime < '2024-01-10 12:00:00'\n AND sequence < 290\n group by\n starttime,\n pid,\n xid,\n type,\n userid\n order by\n starttime,\n pid,\n xid,\n type,\n userid\n asc\n )\n where\n type in ('DDL', 'QUERY')\n )\n where\n (create_command ilike 'create temp table %'\n or create_command ilike 'create temporary table %'\n -- we want to get all the create table statements and not just temp tables if non temp table is created and dropped in the same transaction\n or create_command ilike 'create table %')\n -- Redshift creates temp tables with the following names: volt_tt_%. 
We need to filter them out.\n and query_text not ilike 'CREATE TEMP TABLE volt_tt_%'\n and create_command not like 'CREATE TEMP TABLE volt_tt_'\n -- We need to filter out our query and it was not possible earlier when we did not have any comment in the query\n and query_text not ilike '%https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext%'\n\n)\nwhere\n rn = 1\n" ): mock_temp_table_cursor, "select * from test_collapse_temp_lineage": mock_stl_insert_table_cursor, } From f111501d6111d94234bc90423c6921377d3b2a80 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 17 Oct 2024 16:36:18 -0700 Subject: [PATCH 15/31] docs(airflow): add known limitations for automatic lineage (#11652) --- docs/lineage/airflow.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/lineage/airflow.md b/docs/lineage/airflow.md index 35f2ff862e6958..829c048a8f8e24 100644 --- a/docs/lineage/airflow.md +++ b/docs/lineage/airflow.md @@ -132,7 +132,7 @@ conn_id = datahub_rest_default # or datahub_kafka_default ``` | Name | Default value | Description | -|----------------------------|----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| -------------------------- | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | enabled | true | If the plugin should be enabled. | | conn_id | datahub_rest_default | The name of the datahub connection you set in step 1. | | cluster | prod | name of the airflow cluster | @@ -191,6 +191,10 @@ These operators are supported by OpenLineage, but we haven't tested them yet: There's also a few operators (e.g. BashOperator, PythonOperator) that have custom extractors, but those extractors don't generate lineage. --> +Known limitations: + +- We do not fully support operators that run multiple SQL statements at once. In these cases, we'll only capture lineage from the first SQL statement. + ## Manual Lineage Annotation ### Using `inlets` and `outlets` From aed3aa228e2e6de7ef33c92b604425b66d8b55d9 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 17 Oct 2024 17:50:59 -0700 Subject: [PATCH 16/31] perf(ingest): streamline CLL generation (#11645) --- metadata-ingestion/setup.py | 2 +- .../ingestion/source/redshift/lineage.py | 2 +- .../ingestion/source/tableau/tableau.py | 2 +- .../datahub/sql_parsing/sqlglot_lineage.py | 51 ++++++++++------- .../tableau/test_tableau_ingest.py | 17 ++++-- ...st_bigquery_subquery_column_inference.json | 57 +++++++++++++++++++ .../unit/sql_parsing/test_sqlglot_lineage.py | 15 +++++ 7 files changed, 118 insertions(+), 28 deletions(-) create mode 100644 metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_subquery_column_inference.json diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index bfec2c00cb8647..365da21208ecce 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -101,7 +101,7 @@ sqlglot_lib = { # Using an Acryl fork of sqlglot. 
# https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:main?expand=1 - "acryl-sqlglot[rs]==25.20.2.dev6", + "acryl-sqlglot[rs]==25.25.2.dev9", } classification_lib = { diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py index 0e4cb6f1599e52..fe491ccb318505 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py @@ -133,7 +133,7 @@ def parse_alter_table_rename(default_schema: str, query: str) -> Tuple[str, str, assert isinstance(parsed_query, sqlglot.exp.Alter) prev_name = parsed_query.this.name rename_clause = parsed_query.args["actions"][0] - assert isinstance(rename_clause, sqlglot.exp.RenameTable) + assert isinstance(rename_clause, sqlglot.exp.AlterRename) new_name = rename_clause.this.name schema = parsed_query.this.db or default_schema diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 9f011790990ec2..2c17a5a322f052 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -2131,7 +2131,7 @@ def _create_lineage_from_unsupported_csql( fine_grained_lineages: List[FineGrainedLineage] = [] if self.config.extract_column_level_lineage: - logger.info("Extracting CLL from custom sql") + logger.debug("Extracting CLL from custom sql") fine_grained_lineages = make_fine_grained_lineage_class( parsed_result, csql_urn, out_columns ) diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py index 273e9d0f9f0b1d..6a7ff5be6d1ea8 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py @@ -1,6 +1,5 @@ import dataclasses import functools -import itertools import logging import traceback from collections import defaultdict @@ -14,6 +13,8 @@ import sqlglot.optimizer.annotate_types import sqlglot.optimizer.optimizer import sqlglot.optimizer.qualify +import sqlglot.optimizer.qualify_columns +import sqlglot.optimizer.unnest_subqueries from datahub.cli.env_utils import get_boolean_env_variable from datahub.ingestion.graph.client import DataHubGraph @@ -63,24 +64,30 @@ SQL_LINEAGE_TIMEOUT_SECONDS = 10 -RULES_BEFORE_TYPE_ANNOTATION: tuple = tuple( - filter( - lambda func: func.__name__ - not in { - # Skip pushdown_predicates because it sometimes throws exceptions, and we - # don't actually need it for anything. - "pushdown_predicates", - # Skip normalize because it can sometimes be expensive. - "normalize", - }, - itertools.takewhile( - lambda func: func != sqlglot.optimizer.annotate_types.annotate_types, - sqlglot.optimizer.optimizer.RULES, - ), - ) +# These rules are a subset of the rules in sqlglot.optimizer.optimizer.RULES. +# If there's a change in their rules, we probably need to re-evaluate our list as well. +assert len(sqlglot.optimizer.optimizer.RULES) == 14 + +_OPTIMIZE_RULES = ( + sqlglot.optimizer.optimizer.qualify, + # We need to enable this in order for annotate types to work. 
+ sqlglot.optimizer.optimizer.pushdown_projections, + # sqlglot.optimizer.optimizer.normalize, # causes perf issues + sqlglot.optimizer.optimizer.unnest_subqueries, + # sqlglot.optimizer.optimizer.pushdown_predicates, # causes perf issues + # sqlglot.optimizer.optimizer.optimize_joins, + # sqlglot.optimizer.optimizer.eliminate_subqueries, + # sqlglot.optimizer.optimizer.merge_subqueries, + # sqlglot.optimizer.optimizer.eliminate_joins, + # sqlglot.optimizer.optimizer.eliminate_ctes, + sqlglot.optimizer.optimizer.quote_identifiers, + # These three are run separately or not run at all. + # sqlglot.optimizer.optimizer.annotate_types, + # sqlglot.optimizer.canonicalize.canonicalize, + # sqlglot.optimizer.simplify.simplify, ) -# Quick check that the rules were loaded correctly. -assert 0 < len(RULES_BEFORE_TYPE_ANNOTATION) < len(sqlglot.optimizer.optimizer.RULES) + +_DEBUG_TYPE_ANNOTATIONS = False class _ColumnRef(_FrozenModel): @@ -385,11 +392,12 @@ def _sqlglot_force_column_normalizer( schema=sqlglot_db_schema, qualify_columns=True, validate_qualify_columns=False, + allow_partial_qualification=True, identify=True, # sqlglot calls the db -> schema -> table hierarchy "catalog", "db", "table". catalog=default_db, db=default_schema, - rules=RULES_BEFORE_TYPE_ANNOTATION, + rules=_OPTIMIZE_RULES, ) except (sqlglot.errors.OptimizeError, ValueError) as e: raise SqlUnderstandingError( @@ -408,6 +416,10 @@ def _sqlglot_force_column_normalizer( except (sqlglot.errors.OptimizeError, sqlglot.errors.ParseError) as e: # This is not a fatal error, so we can continue. logger.debug("sqlglot failed to annotate or parse types: %s", e) + if _DEBUG_TYPE_ANNOTATIONS and logger.isEnabledFor(logging.DEBUG): + logger.debug( + "Type annotated sql %s", statement.sql(pretty=True, dialect=dialect) + ) return statement, _ColumnResolver( sqlglot_db_schema=sqlglot_db_schema, @@ -907,6 +919,7 @@ def _sqlglot_lineage_inner( # At this stage we only want to qualify the table names. The columns will be dealt with later. qualify_columns=False, validate_qualify_columns=False, + allow_partial_qualification=True, # Only insert quotes where necessary. 
identify=False, ) diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index 5a5552a78c56fa..3798df07000c8a 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -19,6 +19,7 @@ from datahub.configuration.source_common import DEFAULT_ENV from datahub.emitter.mce_builder import make_schema_field_urn +from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.run.pipeline import Pipeline, PipelineContext from datahub.ingestion.source.tableau.tableau import ( TableauConfig, @@ -37,7 +38,7 @@ FineGrainedLineageUpstreamType, UpstreamLineage, ) -from datahub.metadata.schema_classes import MetadataChangeProposalClass, UpstreamClass +from datahub.metadata.schema_classes import UpstreamClass from tests.test_helpers import mce_helpers, test_connection_helpers from tests.test_helpers.state_helpers import ( get_current_checkpoint_from_pipeline, @@ -939,11 +940,12 @@ def test_tableau_unsupported_csql(): database_override_map={"production database": "prod"} ) - def test_lineage_metadata( + def check_lineage_metadata( lineage, expected_entity_urn, expected_upstream_table, expected_cll ): - mcp = cast(MetadataChangeProposalClass, next(iter(lineage)).metadata) - assert mcp.aspect == UpstreamLineage( + mcp = cast(MetadataChangeProposalWrapper, list(lineage)[0].metadata) + + expected = UpstreamLineage( upstreams=[ UpstreamClass( dataset=expected_upstream_table, @@ -966,6 +968,9 @@ def test_lineage_metadata( ) assert mcp.entityUrn == expected_entity_urn + actual_aspect = mcp.aspect + assert actual_aspect == expected + csql_urn = "urn:li:dataset:(urn:li:dataPlatform:tableau,09988088-05ad-173c-a2f1-f33ba3a13d1a,PROD)" expected_upstream_table = "urn:li:dataset:(urn:li:dataPlatform:bigquery,my_bigquery_project.invent_dw.UserDetail,PROD)" expected_cll = { @@ -996,7 +1001,7 @@ def test_lineage_metadata( }, out_columns=[], ) - test_lineage_metadata( + check_lineage_metadata( lineage=lineage, expected_entity_urn=csql_urn, expected_upstream_table=expected_upstream_table, @@ -1014,7 +1019,7 @@ def test_lineage_metadata( }, out_columns=[], ) - test_lineage_metadata( + check_lineage_metadata( lineage=lineage, expected_entity_urn=csql_urn, expected_upstream_table=expected_upstream_table, diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_subquery_column_inference.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_subquery_column_inference.json new file mode 100644 index 00000000000000..32b3830bf1412a --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_subquery_column_inference.json @@ -0,0 +1,57 @@ +{ + "query_type": "SELECT", + "query_type_props": {}, + "query_fingerprint": "4094ebd230c1d47c7e6879b05ab927e550923b1986eb58c5f3814396cf401d18", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:bigquery,invent_dw.UserDetail,PROD)" + ], + "out_tables": [], + "column_lineage": [ + { + "downstream": { + "table": null, + "column": "user_id", + "column_type": null, + "native_column_type": null + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,invent_dw.UserDetail,PROD)", + "column": "user_id" + } + ] + }, + { + "downstream": { + "table": null, + "column": "source", + "column_type": null, + "native_column_type": null + }, + "upstreams": [ + { + "table": 
"urn:li:dataset:(urn:li:dataPlatform:bigquery,invent_dw.UserDetail,PROD)", + "column": "source" + } + ] + }, + { + "downstream": { + "table": null, + "column": "user_source", + "column_type": null, + "native_column_type": null + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,invent_dw.UserDetail,PROD)", + "column": "user_source" + } + ] + } + ], + "debug_info": { + "confidence": 0.2, + "generalized_statement": "SELECT user_id, source, user_source FROM (SELECT *, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY __partition_day DESC) AS rank_ FROM invent_dw.UserDetail) AS source_user WHERE rank_ = ?" + } +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py index fb1d2a0bc50110..90cc863d6bd231 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py @@ -1253,3 +1253,18 @@ def test_snowflake_drop_schema() -> None: dialect="snowflake", expected_file=RESOURCE_DIR / "test_snowflake_drop_schema.json", ) + + +def test_bigquery_subquery_column_inference() -> None: + assert_sql_result( + """\ +SELECT user_id, source, user_source +FROM ( + SELECT *, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY __partition_day DESC) AS rank_ + FROM invent_dw.UserDetail +) source_user +WHERE rank_ = 1 +""", + dialect="bigquery", + expected_file=RESOURCE_DIR / "test_bigquery_subquery_column_inference.json", + ) From 59a2cee875551275facc659dd39d329ed5cbce3c Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Fri, 18 Oct 2024 10:33:28 +0530 Subject: [PATCH 17/31] feat(ingest): ensure sqlite file delete on clean exit (#11612) --- .../ingestion/source/bigquery_v2/bigquery.py | 9 +- .../source/bigquery_v2/bigquery_queries.py | 4 + .../source/bigquery_v2/queries_extractor.py | 18 +++- .../ingestion/source/redshift/lineage_v2.py | 7 +- .../ingestion/source/redshift/redshift.py | 23 +++-- .../source/snowflake/snowflake_connection.py | 3 +- .../source/snowflake/snowflake_queries.py | 64 ++++++++----- .../source/snowflake/snowflake_v2.py | 94 ++++++++++--------- .../sql_parsing/sql_parsing_aggregator.py | 7 ++ .../bigquery_v2/test_bigquery_queries.py | 16 ++++ .../snowflake/test_snowflake_queries.py | 24 +++++ .../unit/sql_parsing/test_sql_aggregator.py | 22 +++++ 12 files changed, 203 insertions(+), 88 deletions(-) create mode 100644 metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 3c6202cc7cbfaf..0e986acc81add8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -272,7 +272,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.report.set_ingestion_stage("*", QUERIES_EXTRACTION) - queries_extractor = BigQueryQueriesExtractor( + with BigQueryQueriesExtractor( connection=self.config.get_bigquery_client(), schema_api=self.bq_schema_extractor.schema_api, config=BigQueryQueriesExtractorConfig( @@ -288,9 +288,10 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: identifiers=self.identifiers, schema_resolver=self.sql_parser_schema_resolver, discovered_tables=self.bq_schema_extractor.table_refs, - ) - 
self.report.queries_extractor = queries_extractor.report
-        yield from queries_extractor.get_workunits_internal()
+        ) as queries_extractor:
+            self.report.queries_extractor = queries_extractor.report
+            yield from queries_extractor.get_workunits_internal()
+
         else:
             if self.config.include_usage_statistics:
                 yield from self.usage_extractor.get_usage_workunits(
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_queries.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_queries.py
index ed27aae19ce963..47f21c9f32353a 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_queries.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_queries.py
@@ -88,3 +88,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
 
     def get_report(self) -> BigQueryQueriesSourceReport:
         return self.report
+
+    def close(self) -> None:
+        self.queries_extractor.close()
+        self.connection.close()
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py
index b4a443673b9a97..afaaaf51964f8e 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py
@@ -13,6 +13,7 @@
     BaseTimeWindowConfig,
     get_time_bucket,
 )
+from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.api.source_helpers import auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
@@ -114,7 +115,7 @@ class BigQueryQueriesExtractorConfig(BigQueryBaseConfig):
     )
 
 
-class BigQueryQueriesExtractor:
+class BigQueryQueriesExtractor(Closeable):
     """
     Extracts query audit log and generates usage/lineage/operation workunits.
 
@@ -181,6 +182,7 @@ def __init__(
             is_allowed_table=self.is_allowed_table,
             format_queries=False,
         )
+
         self.report.sql_aggregator = self.aggregator.report
         self.report.num_discovered_tables = (
             len(self.discovered_tables) if self.discovered_tables else None
@@ -273,12 +275,14 @@ def get_workunits_internal(
         self.report.num_unique_queries = len(queries_deduped)
         logger.info(f"Found {self.report.num_unique_queries} unique queries")
 
-        with self.report.audit_log_load_timer:
+        with self.report.audit_log_load_timer, queries_deduped:
             i = 0
             for _, query_instances in queries_deduped.items():
                 for query in query_instances.values():
                     if i > 0 and i % 10000 == 0:
-                        logger.info(f"Added {i} query log entries to SQL aggregator")
+                        logger.info(
+                            f"Added {i} query log entries to SQL aggregator"
+                        )
                         if self.report.sql_aggregator:
                             logger.info(self.report.sql_aggregator.as_string())
 
@@ -287,6 +291,11 @@ def get_workunits_internal(
 
         yield from auto_workunit(self.aggregator.gen_metadata())
 
+        if not use_cached_audit_log:
+            queries.close()
+            shared_connection.close()
+            audit_log_file.unlink(missing_ok=True)
+
     def deduplicate_queries(
         self, queries: FileBackedList[ObservedQuery]
     ) -> FileBackedDict[Dict[int, ObservedQuery]]:
@@ -404,6 +413,9 @@ def _parse_audit_log_row(self, row: BigQueryJob) -> ObservedQuery:
 
         return entry
 
+    def close(self) -> None:
+        self.aggregator.close()
+
 
 def _extract_query_text(row: BigQueryJob) -> str:
     # We wrap select statements in a CTE to make them parseable as DML statement. 
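The hunks above (and the Redshift and Snowflake hunks that follow) all apply the same cleanup pattern from PATCH 17: every file-backed resource is either registered with a contextlib.ExitStack or exposed as a Closeable/context manager, so that a single close() call releases connections and removes temporary SQLite files on a clean exit. The sketch below illustrates that pattern in isolation; it is a minimal, hedged example, and the class and file names (AuditLogExtractor, the "log" table) are hypothetical rather than part of this repository.

    import contextlib
    import os
    import sqlite3
    import tempfile
    from pathlib import Path


    class AuditLogExtractor:
        """Minimal sketch: owns a temporary SQLite file and removes it on close()."""

        def __init__(self) -> None:
            # Everything that must be released on exit is registered here and
            # unwound in reverse order by a single close() call.
            self._exit_stack = contextlib.ExitStack()

            fd, path = tempfile.mkstemp(suffix=".sqlite")
            os.close(fd)  # sqlite3 reopens the file by path
            self._audit_log_file = Path(path)

            # Registered first, so it runs last: delete the file only after the
            # connection that uses it has been closed.
            self._exit_stack.callback(self._audit_log_file.unlink, missing_ok=True)

            self._connection = self._exit_stack.enter_context(
                contextlib.closing(sqlite3.connect(self._audit_log_file))
            )
            self._connection.execute("CREATE TABLE IF NOT EXISTS log (query TEXT)")

        def add(self, query: str) -> None:
            self._connection.execute("INSERT INTO log VALUES (?)", (query,))

        def close(self) -> None:
            # Idempotent: closing an already-closed ExitStack is a no-op.
            self._exit_stack.close()

        def __enter__(self) -> "AuditLogExtractor":
            return self

        def __exit__(self, *exc) -> bool:
            self.close()
            return False


    if __name__ == "__main__":
        with AuditLogExtractor() as extractor:
            extractor.add("select 1")
        # The temporary SQLite file has been deleted by the time we get here.

Registering the unlink callback before entering the connection context guarantees the connection is closed before the file is deleted, which also behaves correctly on platforms that cannot unlink open files.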
diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py index 4df64c80bad8a8..53f9383ec02a72 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py @@ -5,6 +5,7 @@ import redshift_connector from datahub.emitter import mce_builder +from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.redshift.config import LineageMode, RedshiftConfig @@ -39,7 +40,7 @@ logger = logging.getLogger(__name__) -class RedshiftSqlLineageV2: +class RedshiftSqlLineageV2(Closeable): # does lineage and usage based on SQL parsing. def __init__( @@ -56,6 +57,7 @@ def __init__( self.context = context self.database = database + self.aggregator = SqlParsingAggregator( platform=self.platform, platform_instance=self.config.platform_instance, @@ -436,3 +438,6 @@ def generate(self) -> Iterable[MetadataWorkUnit]: message="Unexpected error(s) while attempting to extract lineage from SQL queries. See the full logs for more details.", context=f"Query Parsing Failures: {self.aggregator.report.observed_query_parse_failures}", ) + + def close(self) -> None: + self.aggregator.close() diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py index a9fc9ab8f3e993..76030cea984946 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py @@ -451,24 +451,23 @@ def _extract_metadata( ) if self.config.use_lineage_v2: - lineage_extractor = RedshiftSqlLineageV2( + with RedshiftSqlLineageV2( config=self.config, report=self.report, context=self.ctx, database=database, redundant_run_skip_handler=self.redundant_lineage_run_skip_handler, - ) - - yield from lineage_extractor.aggregator.register_schemas_from_stream( - self.process_schemas(connection, database) - ) + ) as lineage_extractor: + yield from lineage_extractor.aggregator.register_schemas_from_stream( + self.process_schemas(connection, database) + ) - self.report.report_ingestion_stage_start(LINEAGE_EXTRACTION) - yield from self.extract_lineage_v2( - connection=connection, - database=database, - lineage_extractor=lineage_extractor, - ) + self.report.report_ingestion_stage_start(LINEAGE_EXTRACTION) + yield from self.extract_lineage_v2( + connection=connection, + database=database, + lineage_extractor=lineage_extractor, + ) all_tables = self.get_all_tables() else: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py index d39e95a884dbc2..a9f454cfd3cdb3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py @@ -18,6 +18,7 @@ from datahub.configuration.connection_resolver import auto_connection_resolver from datahub.configuration.oauth import OAuthConfiguration, OAuthIdentityProvider from datahub.configuration.validate_field_rename import pydantic_renamed_field +from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.source.snowflake.constants import ( CLIENT_PREFETCH_THREADS, 
CLIENT_SESSION_KEEP_ALIVE, @@ -364,7 +365,7 @@ def get_connection(self) -> "SnowflakeConnection": ) from e -class SnowflakeConnection: +class SnowflakeConnection(Closeable): _connection: NativeSnowflakeConnection def __init__(self, connection: NativeSnowflakeConnection): diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index 1445d02aa49dbd..e11073d77b46eb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -1,3 +1,4 @@ +import contextlib import dataclasses import functools import json @@ -17,6 +18,7 @@ BaseTimeWindowConfig, BucketDuration, ) +from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.report import Report from datahub.ingestion.api.source import Source, SourceReport @@ -121,7 +123,7 @@ class SnowflakeQueriesSourceReport(SourceReport): queries_extractor: Optional[SnowflakeQueriesExtractorReport] = None -class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin): +class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable): def __init__( self, connection: SnowflakeConnection, @@ -143,28 +145,33 @@ def __init__( self._structured_report = structured_report - self.aggregator = SqlParsingAggregator( - platform=self.identifiers.platform, - platform_instance=self.identifiers.identifier_config.platform_instance, - env=self.identifiers.identifier_config.env, - schema_resolver=schema_resolver, - graph=graph, - eager_graph_load=False, - generate_lineage=self.config.include_lineage, - generate_queries=self.config.include_queries, - generate_usage_statistics=self.config.include_usage_statistics, - generate_query_usage_statistics=self.config.include_query_usage_statistics, - usage_config=BaseUsageConfig( - bucket_duration=self.config.window.bucket_duration, - start_time=self.config.window.start_time, - end_time=self.config.window.end_time, - user_email_pattern=self.config.user_email_pattern, - # TODO make the rest of the fields configurable - ), - generate_operations=self.config.include_operations, - is_temp_table=self.is_temp_table, - is_allowed_table=self.is_allowed_table, - format_queries=False, + # The exit stack helps ensure that we close all the resources we open. 
+ self._exit_stack = contextlib.ExitStack() + + self.aggregator: SqlParsingAggregator = self._exit_stack.enter_context( + SqlParsingAggregator( + platform=self.identifiers.platform, + platform_instance=self.identifiers.identifier_config.platform_instance, + env=self.identifiers.identifier_config.env, + schema_resolver=schema_resolver, + graph=graph, + eager_graph_load=False, + generate_lineage=self.config.include_lineage, + generate_queries=self.config.include_queries, + generate_usage_statistics=self.config.include_usage_statistics, + generate_query_usage_statistics=self.config.include_query_usage_statistics, + usage_config=BaseUsageConfig( + bucket_duration=self.config.window.bucket_duration, + start_time=self.config.window.start_time, + end_time=self.config.window.end_time, + user_email_pattern=self.config.user_email_pattern, + # TODO make the rest of the fields configurable + ), + generate_operations=self.config.include_operations, + is_temp_table=self.is_temp_table, + is_allowed_table=self.is_allowed_table, + format_queries=False, + ) ) self.report.sql_aggregator = self.aggregator.report @@ -248,6 +255,10 @@ def get_workunits_internal( self.aggregator.add(query) yield from auto_workunit(self.aggregator.gen_metadata()) + if not use_cached_audit_log: + queries.close() + shared_connection.close() + audit_log_file.unlink(missing_ok=True) def fetch_copy_history(self) -> Iterable[KnownLineageMapping]: # Derived from _populate_external_lineage_from_copy_history. @@ -426,6 +437,9 @@ def _parse_audit_log_row(self, row: Dict[str, Any]) -> PreparsedQuery: ) return entry + def close(self) -> None: + self._exit_stack.close() + class SnowflakeQueriesSource(Source): def __init__(self, ctx: PipelineContext, config: SnowflakeQueriesSourceConfig): @@ -468,6 +482,10 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: def get_report(self) -> SnowflakeQueriesSourceReport: return self.report + def close(self) -> None: + self.connection.close() + self.queries_extractor.close() + # Make sure we don't try to generate too much info for a single query. _MAX_TABLES_PER_QUERY = 20 diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 0d7881f36554d1..dd7f73268fdc4f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -1,3 +1,4 @@ +import contextlib import functools import json import logging @@ -149,7 +150,12 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): cached_domains=[k for k in self.config.domain], graph=self.ctx.graph ) - self.connection = self.config.get_connection() + # The exit stack helps ensure that we close all the resources we open. 
+ self._exit_stack = contextlib.ExitStack() + + self.connection: SnowflakeConnection = self._exit_stack.enter_context( + self.config.get_connection() + ) # For database, schema, tables, views, etc self.data_dictionary = SnowflakeDataDictionary(connection=self.connection) @@ -157,25 +163,27 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): self.aggregator: Optional[SqlParsingAggregator] = None if self.config.use_queries_v2 or self.config.include_table_lineage: - self.aggregator = SqlParsingAggregator( - platform=self.identifiers.platform, - platform_instance=self.config.platform_instance, - env=self.config.env, - graph=self.ctx.graph, - eager_graph_load=( - # If we're ingestion schema metadata for tables/views, then we will populate - # schemas into the resolver as we go. We only need to do a bulk fetch - # if we're not ingesting schema metadata as part of ingestion. - not ( - self.config.include_technical_schema - and self.config.include_tables - and self.config.include_views - ) - and not self.config.lazy_schema_resolver - ), - generate_usage_statistics=False, - generate_operations=False, - format_queries=self.config.format_sql_queries, + self.aggregator = self._exit_stack.enter_context( + SqlParsingAggregator( + platform=self.identifiers.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + graph=self.ctx.graph, + eager_graph_load=( + # If we're ingestion schema metadata for tables/views, then we will populate + # schemas into the resolver as we go. We only need to do a bulk fetch + # if we're not ingesting schema metadata as part of ingestion. + not ( + self.config.include_technical_schema + and self.config.include_tables + and self.config.include_views + ) + and not self.config.lazy_schema_resolver + ), + generate_usage_statistics=False, + generate_operations=False, + format_queries=self.config.format_sql_queries, + ) ) self.report.sql_aggregator = self.aggregator.report @@ -191,14 +199,16 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): pipeline_name=self.ctx.pipeline_name, run_id=self.ctx.run_id, ) - self.lineage_extractor = SnowflakeLineageExtractor( - config, - self.report, - connection=self.connection, - filters=self.filters, - identifiers=self.identifiers, - redundant_run_skip_handler=redundant_lineage_run_skip_handler, - sql_aggregator=self.aggregator, + self.lineage_extractor = self._exit_stack.enter_context( + SnowflakeLineageExtractor( + config, + self.report, + connection=self.connection, + filters=self.filters, + identifiers=self.identifiers, + redundant_run_skip_handler=redundant_lineage_run_skip_handler, + sql_aggregator=self.aggregator, + ) ) self.usage_extractor: Optional[SnowflakeUsageExtractor] = None @@ -213,13 +223,15 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): pipeline_name=self.ctx.pipeline_name, run_id=self.ctx.run_id, ) - self.usage_extractor = SnowflakeUsageExtractor( - config, - self.report, - connection=self.connection, - filter=self.filters, - identifiers=self.identifiers, - redundant_run_skip_handler=redundant_usage_run_skip_handler, + self.usage_extractor = self._exit_stack.enter_context( + SnowflakeUsageExtractor( + config, + self.report, + connection=self.connection, + filter=self.filters, + identifiers=self.identifiers, + redundant_run_skip_handler=redundant_usage_run_skip_handler, + ) ) self.profiling_state_handler: Optional[ProfilingHandler] = None @@ -444,10 +456,6 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: 
def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self._snowflake_clear_ocsp_cache() - self.connection = self.config.get_connection() - if self.connection is None: - return - self.inspect_session_metadata(self.connection) snowsight_url_builder = None @@ -513,7 +521,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: schema_resolver = self.aggregator._schema_resolver - queries_extractor = SnowflakeQueriesExtractor( + queries_extractor: SnowflakeQueriesExtractor = SnowflakeQueriesExtractor( connection=self.connection, config=SnowflakeQueriesExtractorConfig( window=self.config, @@ -535,6 +543,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: # it should be pretty straightforward to refactor this and only initialize the aggregator once. self.report.queries_extractor = queries_extractor.report yield from queries_extractor.get_workunits_internal() + queries_extractor.close() else: if self.config.include_table_lineage and self.lineage_extractor: @@ -723,7 +732,4 @@ def _snowflake_clear_ocsp_cache(self) -> None: def close(self) -> None: super().close() StatefulIngestionSourceBase.close(self) - if self.lineage_extractor: - self.lineage_extractor.close() - if self.usage_extractor: - self.usage_extractor.close() + self._exit_stack.close() diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index 5f2709fe426605..0b7ad14a8c1b41 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -275,6 +275,8 @@ class SqlAggregatorReport(Report): tool_meta_report: Optional[ToolMetaExtractorReport] = None def compute_stats(self) -> None: + if self._aggregator._closed: + return self.schema_resolver_count = self._aggregator._schema_resolver.schema_count() self.num_unique_query_fingerprints = len(self._aggregator._query_map) @@ -345,6 +347,7 @@ def __init__( # The exit stack helps ensure that we close all the resources we open. self._exit_stack = contextlib.ExitStack() + self._closed: bool = False # Set up the schema resolver. 
self._schema_resolver: SchemaResolver @@ -456,12 +459,16 @@ def __init__( shared_connection=self._shared_connection, tablename="query_usage_counts", ) + self._exit_stack.push(self._query_usage_counts) # Tool Extractor self._tool_meta_extractor = ToolMetaExtractor() self.report.tool_meta_report = self._tool_meta_extractor.report def close(self) -> None: + # Compute stats once before closing connections + self.report.compute_stats() + self._closed = True self._exit_stack.close() @property diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py index 9290100b0c521c..ef846f698f156e 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py @@ -1,4 +1,5 @@ import json +import os from datetime import datetime from pathlib import Path from unittest.mock import patch @@ -6,7 +7,9 @@ import pytest from freezegun import freeze_time +from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.bigquery_v2.bigquery_queries import ( + BigQueryQueriesSource, BigQueryQueriesSourceReport, ) from datahub.metadata.urns import CorpUserUrn @@ -93,3 +96,16 @@ def test_queries_ingestion(project_client, client, pytestconfig, monkeypatch, tm output_path=mcp_output_path, golden_path=mcp_golden_path, ) + + +@patch("google.cloud.bigquery.Client") +@patch("google.cloud.resourcemanager_v3.ProjectsClient") +def test_source_close_cleans_tmp(projects_client, client, tmp_path): + with patch("tempfile.tempdir", str(tmp_path)): + source = BigQueryQueriesSource.create( + {"project_ids": ["project1"]}, PipelineContext("run-id") + ) + assert len(os.listdir(tmp_path)) > 0 + # This closes QueriesExtractor which in turn closes SqlParsingAggregator + source.close() + assert len(os.listdir(tmp_path)) == 0 diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py new file mode 100644 index 00000000000000..82f5691bcee3de --- /dev/null +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py @@ -0,0 +1,24 @@ +import os +from unittest.mock import patch + +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.snowflake.snowflake_queries import SnowflakeQueriesSource + + +@patch("snowflake.connector.connect") +def test_source_close_cleans_tmp(snowflake_connect, tmp_path): + with patch("tempfile.tempdir", str(tmp_path)): + source = SnowflakeQueriesSource.create( + { + "connection": { + "account_id": "ABC12345.ap-south-1.aws", + "username": "TST_USR", + "password": "TST_PWD", + } + }, + PipelineContext("run-id"), + ) + assert len(os.listdir(tmp_path)) > 0 + # This closes QueriesExtractor which in turn closes SqlParsingAggregator + source.close() + assert len(os.listdir(tmp_path)) == 0 diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py b/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py index 0d21936a74d072..849d550ef69c57 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py @@ -1,5 +1,7 @@ +import os import pathlib from datetime import datetime, timezone +from unittest.mock import patch import pytest from freezegun import freeze_time @@ -661,3 +663,23 @@ def test_basic_usage(pytestconfig: pytest.Config) -> None: 
outputs=mcps, golden_path=RESOURCE_DIR / "test_basic_usage.json", ) + + +def test_sql_aggreator_close_cleans_tmp(tmp_path): + frozen_timestamp = parse_user_datetime(FROZEN_TIME) + with patch("tempfile.tempdir", str(tmp_path)): + aggregator = SqlParsingAggregator( + platform="redshift", + generate_lineage=False, + generate_usage_statistics=True, + generate_operations=False, + usage_config=BaseUsageConfig( + start_time=get_time_bucket(frozen_timestamp, BucketDuration.DAY), + end_time=frozen_timestamp, + ), + generate_queries=True, + generate_query_usage_statistics=True, + ) + assert len(os.listdir(tmp_path)) > 0 + aggregator.close() + assert len(os.listdir(tmp_path)) == 0 From a85f7869ddebe2ea24d0d689f464abbb8e61a155 Mon Sep 17 00:00:00 2001 From: Shirshanka Das Date: Fri, 18 Oct 2024 01:12:05 -0700 Subject: [PATCH 18/31] =?UTF-8?q?fix(sdk):=20platform=20resource=20-=20sup?= =?UTF-8?q?port=20indexed=20queries=20when=20urns=20are=20i=E2=80=A6=20(#1?= =?UTF-8?q?1660)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../platformresource/platform_resource.py | 41 +++++++++- .../test_platform_resource.py | 74 +++++++++++++++++++ 2 files changed, 113 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py b/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py index 1556a67a9e5555..349b0ff11d84f7 100644 --- a/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py +++ b/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py @@ -85,6 +85,7 @@ def scroll_urns_by_filter( self, entity_type: str, extra_or_filters: List[Dict[str, str]], + extra_and_filters: List[Dict[str, str]] = [], ) -> Iterable[str]: """ Scroll through all urns that match the given filters @@ -92,10 +93,26 @@ def scroll_urns_by_filter( key_aspect = self.ENTITY_KEY_ASPECT_MAP.get(entity_type) assert key_aspect, f"No key aspect found for entity type {entity_type}" + if extra_or_filters and extra_and_filters: + raise ValueError( + "Only one of extra_or_filters and extra_and_filters should be provided" + ) count = 1000 - query = " OR ".join( - [f"{filter['field']}:{filter['value']}" for filter in extra_or_filters] + query = ( + " OR ".join( + [ + f"{filter['field']}:\"{filter['value']}\"" + for filter in extra_or_filters + ] + ) + if extra_or_filters + else " AND ".join( + [ + f"{filter['field']}:\"{filter['value']}\"" + for filter in extra_and_filters + ] + ) ) scroll_id = None while True: @@ -252,3 +269,23 @@ def search_by_key( def delete(self, graph_client: DataHubGraph, hard: bool = True) -> None: graph_client.delete_entity(str(PlatformResourceUrn(self.id)), hard=hard) + + @staticmethod + def search_by_filters( + graph_client: DataHubGraph, + and_filters: List[Dict[str, str]] = [], + or_filters: List[Dict[str, str]] = [], + ) -> Iterable["PlatformResource"]: + if and_filters and or_filters: + raise ValueError( + "Only one of and_filters and or_filters should be provided" + ) + openapi_client = OpenAPIGraphClient(graph_client) + for urn in openapi_client.scroll_urns_by_filter( + entity_type="platformResource", + extra_or_filters=or_filters if or_filters else [], + extra_and_filters=and_filters if and_filters else [], + ): + platform_resource = PlatformResource.from_datahub(graph_client, urn) + if platform_resource: + yield platform_resource diff --git a/smoke-test/tests/platform_resources/test_platform_resource.py 
b/smoke-test/tests/platform_resources/test_platform_resource.py index 7c53f72d843c93..7ebfd4d6ea15b4 100644 --- a/smoke-test/tests/platform_resources/test_platform_resource.py +++ b/smoke-test/tests/platform_resources/test_platform_resource.py @@ -112,3 +112,77 @@ def test_platform_resource_non_existent(graph_client, test_id): graph_client=graph_client, ) assert platform_resource is None + + +def test_platform_resource_urn_secondary_key(graph_client, test_id): + key = PlatformResourceKey( + platform=f"test_platform_{test_id}", + resource_type=f"test_resource_type_{test_id}", + primary_key=f"test_primary_key_{test_id}", + ) + dataset_urn = ( + f"urn:li:dataset:(urn:li:dataPlatform:test,test_secondary_key_{test_id},PROD)" + ) + platform_resource = PlatformResource.create( + key=key, + value={"test_key": f"test_value_{test_id}"}, + secondary_keys=[dataset_urn], + ) + platform_resource.to_datahub(graph_client) + wait_for_writes_to_sync() + + read_platform_resources = [ + r + for r in PlatformResource.search_by_key( + graph_client, dataset_urn, primary=False + ) + ] + assert len(read_platform_resources) == 1 + assert read_platform_resources[0] == platform_resource + + +def test_platform_resource_listing_by_resource_type(graph_client, test_id): + # Generate two resources with the same resource type + key1 = PlatformResourceKey( + platform=f"test_platform_{test_id}", + resource_type=f"test_resource_type_{test_id}", + primary_key=f"test_primary_key_1_{test_id}", + ) + platform_resource1 = PlatformResource.create( + key=key1, + value={"test_key": f"test_value_1_{test_id}"}, + ) + platform_resource1.to_datahub(graph_client) + + key2 = PlatformResourceKey( + platform=f"test_platform_{test_id}", + resource_type=f"test_resource_type_{test_id}", + primary_key=f"test_primary_key_2_{test_id}", + ) + platform_resource2 = PlatformResource.create( + key=key2, + value={"test_key": f"test_value_2_{test_id}"}, + ) + platform_resource2.to_datahub(graph_client) + + wait_for_writes_to_sync() + + search_results = [ + r + for r in PlatformResource.search_by_filters( + graph_client, + and_filters=[ + { + "field": "resourceType", + "condition": "EQUAL", + "value": key1.resource_type, + } + ], + ) + ] + assert len(search_results) == 2 + + read_platform_resource_1 = next(r for r in search_results if r.id == key1.id) + read_platform_resource_2 = next(r for r in search_results if r.id == key2.id) + assert read_platform_resource_1 == platform_resource1 + assert read_platform_resource_2 == platform_resource2 From c96a7d5b1cd2495ec3369167da7f6c841643e781 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Fri, 18 Oct 2024 03:13:18 -0700 Subject: [PATCH 19/31] fix(ingest/dbt): Prevent lineage cycles when parsing sql of dbt models (#11666) --- .../ingestion/source/dbt/dbt_common.py | 6 +++ .../tests/unit/test_dbt_source.py | 42 +++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index 4cd3c934ce6348..c95d0e545c5989 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -1989,6 +1989,11 @@ def _translate_dbt_name_to_upstream_urn(dbt_name: str) -> str: time=mce_builder.get_sys_time(), actor=_DEFAULT_ACTOR, ) + sibling_urn = node.get_urn( + self.config.target_platform, + self.config.env, + self.config.target_platform_instance, + ) return UpstreamLineageClass( upstreams=[ UpstreamClass( 
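The hunk above computes `sibling_urn`, the warehouse-platform URN of the dbt model itself; the hunk that follows uses it to drop self-references from the upstream list so that SQL-parsed lineage cannot create a cycle between a model and its own sibling dataset. A minimal sketch of that guard, using hypothetical URN strings rather than the patch's `node.get_urn(...)` call (the real condition additionally checks `node.node_type == "model"`):

```python
# Hypothetical URNs for illustration; the patch derives sibling_urn from
# node.get_urn(target_platform, env, target_platform_instance).
sibling_urn = "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.model1,PROD)"
upstream_urns = [
    "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.results,PROD)",
    sibling_urn,  # SQL parsing of "SELECT ... FROM model1" resolves the model itself
]

# Keep every upstream except the model's own warehouse sibling, which would
# otherwise produce a lineage edge from the model back to itself.
upstreams = [urn for urn in upstream_urns if urn != sibling_urn]
assert sibling_urn not in upstreams
```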
@@ -1997,6 +2002,7 @@ def _translate_dbt_name_to_upstream_urn(dbt_name: str) -> str: auditStamp=auditStamp, ) for upstream in upstream_urns + if not (node.node_type == "model" and upstream == sibling_urn) ], fineGrainedLineages=( (cll or None) if self.config.include_column_lineage else None diff --git a/metadata-ingestion/tests/unit/test_dbt_source.py b/metadata-ingestion/tests/unit/test_dbt_source.py index 7d01ecd034523d..f0d4c3408271f7 100644 --- a/metadata-ingestion/tests/unit/test_dbt_source.py +++ b/metadata-ingestion/tests/unit/test_dbt_source.py @@ -10,6 +10,7 @@ from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.dbt import dbt_cloud from datahub.ingestion.source.dbt.dbt_cloud import DBTCloudConfig +from datahub.ingestion.source.dbt.dbt_common import DBTNode from datahub.ingestion.source.dbt.dbt_core import ( DBTCoreConfig, DBTCoreSource, @@ -253,6 +254,47 @@ def test_dbt_config_prefer_sql_parser_lineage(): assert config.prefer_sql_parser_lineage is True +def test_dbt_prefer_sql_parser_lineage_no_self_reference(): + ctx = PipelineContext(run_id="test-run-id") + config = DBTCoreConfig.parse_obj( + { + **create_base_dbt_config(), + "skip_sources_in_lineage": True, + "prefer_sql_parser_lineage": True, + } + ) + source: DBTCoreSource = DBTCoreSource(config, ctx, "dbt") + all_nodes_map = { + "model1": DBTNode( + name="model1", + database=None, + schema=None, + alias=None, + comment="", + description="", + language=None, + raw_code=None, + dbt_adapter="postgres", + dbt_name="model1", + dbt_file_path=None, + dbt_package_name=None, + node_type="model", + materialization="table", + max_loaded_at=None, + catalog_type=None, + missing_from_catalog=False, + owner=None, + compiled_code="SELECT d FROM results WHERE d > (SELECT MAX(d) FROM model1)", + ), + } + source._infer_schemas_and_update_cll(all_nodes_map) + upstream_lineage = source._create_lineage_aspect_for_dbt_node( + all_nodes_map["model1"], all_nodes_map + ) + assert upstream_lineage is not None + assert len(upstream_lineage.upstreams) == 1 + + def test_dbt_s3_config(): # test missing aws config config_dict: dict = { From b7f7f6e1d3a5d6a13c0ae9eb8eade0cee5855e19 Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Fri, 18 Oct 2024 20:16:08 +0200 Subject: [PATCH 20/31] fix(ingest/dagster): Fix JobSnapshot import is broken (#11672) --- .../datahub_dagster_plugin/client/dagster_generator.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py index a2cf159dd12f6e..df123b127e0405 100644 --- a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py +++ b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py @@ -12,7 +12,13 @@ TableSchemaMetadataValue, ) from dagster._core.execution.stats import RunStepKeyStatsSnapshot, StepEventStatus -from dagster._core.snap import JobSnapshot + +try: + from dagster._core.snap import JobSnapshot # type: ignore[attr-defined] +except ImportError: + # Import changed since Dagster 1.8.12 to this -> https://github.com/dagster-io/dagster/commit/29a37d1f0260cfd112849633d1096ffc916d6c95 + from dagster._core.snap import JobSnap as JobSnapshot + from dagster._core.snap.node import OpDefSnap from dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatsSnapshot from 
datahub.api.entities.datajob import DataFlow, DataJob From 44e5c43612ebc08c73cb91c044eaad47b5e991fc Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Fri, 18 Oct 2024 12:01:39 -0700 Subject: [PATCH 21/31] feat(ingest/transformer/domain): Add support for on conflict do nothing to dataset domain transformers (#11649) --- .../docs/transformer/dataset_transformer.md | 28 +++---- .../src/datahub/ingestion/graph/client.py | 1 + .../ingestion/transformer/dataset_domain.py | 41 ++++++---- .../tests/unit/test_transform_dataset.py | 76 +++++++++++++++++++ 4 files changed, 119 insertions(+), 27 deletions(-) diff --git a/metadata-ingestion/docs/transformer/dataset_transformer.md b/metadata-ingestion/docs/transformer/dataset_transformer.md index d48c6d2c1ab5b4..66274ce64a8d29 100644 --- a/metadata-ingestion/docs/transformer/dataset_transformer.md +++ b/metadata-ingestion/docs/transformer/dataset_transformer.md @@ -122,12 +122,13 @@ transformers: ``` ## Simple Add Dataset ownership ### Config Details -| Field | Required | Type | Default | Description | -|--------------------|----------|--------------|-------------|---------------------------------------------------------------------| -| `owner_urns` | ✅ | list[string] | | List of owner urns. | -| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) | -| `replace_existing` | | boolean | `false` | Whether to remove ownership from entity sent by ingestion source. | -| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | +| Field | Required | Type | Default | Description | +|--------------------|----------|--------------|-------------|------------------------------------------------------------------------------------------------------------| +| `owner_urns` | ✅ | list[string] | | List of owner urns. | +| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) | +| `replace_existing` | | boolean | `false` | Whether to remove ownership from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | +| `on_conflict` | | enum | `DO_UPDATE` | Whether to make changes if domains already exist. If set to DO_NOTHING, `semantics` setting is irrelevant. | For transformer behaviour on `replace_existing` and `semantics`, please refer section [Relationship Between replace_existing And semantics](#relationship-between-replace_existing-and-semantics). @@ -191,13 +192,14 @@ transformers: ## Pattern Add Dataset ownership ### Config Details -| Field | Required | Type | Default | Description | -|--------------------|----------|----------------------|-------------|-----------------------------------------------------------------------------------------| -| `owner_pattern` | ✅ | map[regx, list[urn]] | | entity urn with regular expression and list of owners urn apply to matching entity urn. | -| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) | -| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | -| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | -| `is_container` | | bool | `false` | Whether to also consider a container or not. If true, then ownership will be attached to both the dataset and its container. 
| +| Field | Required | Type | Default | Description | +|--------------------|----------|----------------------|-------------|------------------------------------------------------------------------------------------------------------------------------| +| `owner_pattern` | ✅ | map[regx, list[urn]] | | entity urn with regular expression and list of owners urn apply to matching entity urn. | +| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) | +| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | +| `is_container` | | bool | `false` | Whether to also consider a container or not. If true, then ownership will be attached to both the dataset and its container. | +| `on_conflict` | | enum | `DO_UPDATE` | Whether to make changes if domains already exist. If set to DO_NOTHING, `semantics` setting is irrelevant. | let’s suppose we’d like to append a series of users who we know to own a different dataset from a data source but aren't detected during normal ingestion. To do so, we can use the `pattern_add_dataset_ownership` module that’s included in the ingestion framework. This will match the pattern to `urn` of the dataset and assign the respective owners. diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index e8fae6254ae885..1d2528a24c4e57 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -351,6 +351,7 @@ def get_tags(self, entity_urn: str) -> Optional[GlobalTagsClass]: def get_glossary_terms(self, entity_urn: str) -> Optional[GlossaryTermsClass]: return self.get_aspect(entity_urn=entity_urn, aspect_type=GlossaryTermsClass) + @functools.lru_cache(maxsize=1) def get_domain(self, entity_urn: str) -> Optional[DomainsClass]: return self.get_aspect(entity_urn=entity_urn, aspect_type=DomainsClass) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py b/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py index 6a838248152650..6b78b71eaa78e9 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py @@ -1,6 +1,8 @@ import logging +from enum import auto from typing import Callable, Dict, List, Optional, Sequence, Union, cast +from datahub.configuration._config_enum import ConfigEnum from datahub.configuration.common import ( ConfigurationError, KeyValuePattern, @@ -23,6 +25,13 @@ logger = logging.getLogger(__name__) +class TransformerOnConflict(ConfigEnum): + """Describes the behavior of the transformer when writing an aspect that already exists.""" + + DO_UPDATE = auto() # On conflict, apply the new aspect + DO_NOTHING = auto() # On conflict, do not apply the new aspect + + class AddDatasetDomainSemanticsConfig(TransformerSemanticsConfigModel): get_domains_to_add: Union[ Callable[[str], DomainsClass], @@ -32,10 +41,12 @@ class AddDatasetDomainSemanticsConfig(TransformerSemanticsConfigModel): _resolve_domain_fn = pydantic_resolve_key("get_domains_to_add") is_container: bool = False + on_conflict: TransformerOnConflict = TransformerOnConflict.DO_UPDATE class SimpleDatasetDomainSemanticsConfig(TransformerSemanticsConfigModel): domains: List[str] + on_conflict: 
TransformerOnConflict = TransformerOnConflict.DO_UPDATE class PatternDatasetDomainSemanticsConfig(TransformerSemanticsConfigModel): @@ -80,12 +91,13 @@ def get_domain_class( @staticmethod def _merge_with_server_domains( - graph: DataHubGraph, urn: str, mce_domain: Optional[DomainsClass] + graph: Optional[DataHubGraph], urn: str, mce_domain: Optional[DomainsClass] ) -> Optional[DomainsClass]: if not mce_domain or not mce_domain.domains: # nothing to add, no need to consult server return None + assert graph server_domain = graph.get_domain(entity_urn=urn) if server_domain: # compute patch @@ -155,7 +167,7 @@ def transform_aspect( self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect] ) -> Optional[Aspect]: in_domain_aspect: DomainsClass = cast(DomainsClass, aspect) - domain_aspect = DomainsClass(domains=[]) + domain_aspect: DomainsClass = DomainsClass(domains=[]) # Check if we have received existing aspect if in_domain_aspect is not None and self.config.replace_existing is False: domain_aspect.domains.extend(in_domain_aspect.domains) @@ -164,16 +176,18 @@ def transform_aspect( domain_aspect.domains.extend(domain_to_add.domains) - if self.config.semantics == TransformerSemantics.PATCH: - assert self.ctx.graph - patch_domain_aspect: Optional[ - DomainsClass - ] = AddDatasetDomain._merge_with_server_domains( - self.ctx.graph, entity_urn, domain_aspect - ) - return cast(Optional[Aspect], patch_domain_aspect) - - return cast(Optional[Aspect], domain_aspect) + final_aspect: Optional[DomainsClass] = domain_aspect + if domain_aspect.domains: + if self.config.on_conflict == TransformerOnConflict.DO_NOTHING: + assert self.ctx.graph + server_domain = self.ctx.graph.get_domain(entity_urn) + if server_domain and server_domain.domains: + return None + if self.config.semantics == TransformerSemantics.PATCH: + final_aspect = AddDatasetDomain._merge_with_server_domains( + self.ctx.graph, entity_urn, domain_aspect + ) + return cast(Optional[Aspect], final_aspect) class SimpleAddDatasetDomain(AddDatasetDomain): @@ -186,8 +200,7 @@ def __init__( domains = AddDatasetDomain.get_domain_class(ctx.graph, config.domains) generic_config = AddDatasetDomainSemanticsConfig( get_domains_to_add=lambda _: domains, - semantics=config.semantics, - replace_existing=config.replace_existing, + **config.dict(exclude={"domains"}), ) super().__init__(generic_config, ctx) diff --git a/metadata-ingestion/tests/unit/test_transform_dataset.py b/metadata-ingestion/tests/unit/test_transform_dataset.py index 2e2e85b5d18113..4e9a38cb37ae63 100644 --- a/metadata-ingestion/tests/unit/test_transform_dataset.py +++ b/metadata-ingestion/tests/unit/test_transform_dataset.py @@ -56,6 +56,7 @@ from datahub.ingestion.transformer.dataset_domain import ( PatternAddDatasetDomain, SimpleAddDatasetDomain, + TransformerOnConflict, ) from datahub.ingestion.transformer.dataset_domain_based_on_tags import ( DatasetTagDomainMapper, @@ -2498,6 +2499,81 @@ def fake_get_domain(entity_urn: str) -> models.DomainsClass: assert server_domain in transformed_aspect.domains +def test_simple_add_dataset_domain_on_conflict_do_nothing( + pytestconfig, tmp_path, mock_time, mock_datahub_graph_instance +): + acryl_domain = builder.make_domain_urn("acryl.io") + datahub_domain = builder.make_domain_urn("datahubproject.io") + server_domain = builder.make_domain_urn("test.io") + + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph_instance + + # Return fake aspect to simulate server behaviour + def 
fake_get_domain(entity_urn: str) -> models.DomainsClass: + return models.DomainsClass(domains=[server_domain]) + + pipeline_context.graph.get_domain = fake_get_domain # type: ignore + + output = run_dataset_transformer_pipeline( + transformer_type=SimpleAddDatasetDomain, + aspect=models.DomainsClass(domains=[datahub_domain]), + config={ + "replace_existing": False, + "semantics": TransformerSemantics.PATCH, + "domains": [acryl_domain], + "on_conflict": TransformerOnConflict.DO_NOTHING, + }, + pipeline_context=pipeline_context, + ) + + assert len(output) == 1 + assert output[0] is not None + assert output[0].record is not None + assert isinstance(output[0].record, EndOfStream) + + +def test_simple_add_dataset_domain_on_conflict_do_nothing_no_conflict( + pytestconfig, tmp_path, mock_time, mock_datahub_graph_instance +): + acryl_domain = builder.make_domain_urn("acryl.io") + datahub_domain = builder.make_domain_urn("datahubproject.io") + irrelevant_domain = builder.make_domain_urn("test.io") + + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph_instance + + # Return fake aspect to simulate server behaviour + def fake_get_domain(entity_urn: str) -> models.DomainsClass: + return models.DomainsClass(domains=[]) + + pipeline_context.graph.get_domain = fake_get_domain # type: ignore + + output = run_dataset_transformer_pipeline( + transformer_type=SimpleAddDatasetDomain, + aspect=models.DomainsClass(domains=[datahub_domain]), + config={ + "replace_existing": False, + "semantics": TransformerSemantics.PATCH, + "domains": [acryl_domain], + "on_conflict": TransformerOnConflict.DO_NOTHING, + }, + pipeline_context=pipeline_context, + ) + + assert len(output) == 2 + assert output[0] is not None + assert output[0].record is not None + assert isinstance(output[0].record, MetadataChangeProposalWrapper) + assert output[0].record.aspect is not None + assert isinstance(output[0].record.aspect, models.DomainsClass) + transformed_aspect = cast(models.DomainsClass, output[0].record.aspect) + assert len(transformed_aspect.domains) == 2 + assert datahub_domain in transformed_aspect.domains + assert acryl_domain in transformed_aspect.domains + assert irrelevant_domain not in transformed_aspect.domains + + def test_pattern_add_dataset_domain_aspect_name(mock_datahub_graph_instance): pipeline_context: PipelineContext = PipelineContext( run_id="test_simple_add_dataset_domain" From 3e8c666567e72571524c68fc0d853dabd856def7 Mon Sep 17 00:00:00 2001 From: Jay Feldman <8128360+feldjay@users.noreply.github.com> Date: Fri, 18 Oct 2024 15:58:52 -0400 Subject: [PATCH 22/31] fix(ingest/looker): Remove bad imports from looker_common (#11663) --- .../src/datahub/ingestion/source/looker/looker_common.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 3cbb13375229b9..317b212f7fa96d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -929,7 +929,6 @@ def from_api( # noqa: C901 reporter: SourceReport, source_config: LookerDashboardSourceConfig, ) -> Optional["LookerExplore"]: # noqa: C901 - from datahub.ingestion.source.looker.lookml_source import _BASE_PROJECT_NAME try: explore = client.lookml_model_explore(model, explore_name) @@ -1194,7 +1193,6 @@ def _to_metadata_events( # noqa: C901 ) -> 
Optional[List[Union[MetadataChangeEvent, MetadataChangeProposalWrapper]]]: # We only generate MCE-s for explores that contain from clauses and do NOT contain joins # All other explores (passthrough explores and joins) end in correct resolution of lineage, and don't need additional nodes in the graph. - from datahub.ingestion.source.looker.lookml_source import _BASE_PROJECT_NAME dataset_snapshot = DatasetSnapshot( urn=self.get_explore_urn(config), From f723f5131517be3c742f03122d56dc95bea7f31e Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 18 Oct 2024 13:05:06 -0700 Subject: [PATCH 23/31] feat(ingest/looker): include project name in model/explore properties (#11664) Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> --- .../ingestion/source/looker/looker_common.py | 16 ++-- .../ingestion/source/looker/looker_source.py | 41 ++++++---- .../looker/golden_looker_mces.json | 7 ++ .../looker/golden_test_allow_ingest.json | 4 + ...olden_test_external_project_view_mces.json | 4 + .../looker/golden_test_file_path_ingest.json | 4 + ...olden_test_folder_path_pattern_ingest.json | 4 + .../golden_test_independent_look_ingest.json | 82 +++++++++++-------- .../looker/golden_test_ingest.json | 4 + .../looker/golden_test_ingest_joins.json | 4 + .../golden_test_ingest_unaliased_joins.json | 4 + ...en_test_non_personal_independent_look.json | 7 ++ .../looker_mces_golden_deleted_stateful.json | 16 ++-- .../looker/looker_mces_usage_history.json | 4 + 14 files changed, 135 insertions(+), 66 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 317b212f7fa96d..3d1683100474e8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -1205,15 +1205,19 @@ def _to_metadata_events( # noqa: C901 dataset_snapshot.aspects.append(browse_paths) dataset_snapshot.aspects.append(StatusClass(removed=False)) - custom_properties = {} - if self.label is not None: - custom_properties["looker.explore.label"] = str(self.label) - if self.source_file is not None: - custom_properties["looker.explore.file"] = str(self.source_file) + custom_properties = { + "project": self.project_name, + "model": self.model_name, + "looker.explore.label": self.label, + "looker.explore.name": self.name, + "looker.explore.file": self.source_file, + } dataset_props = DatasetPropertiesClass( name=str(self.label) if self.label else LookerUtil._display_name(self.name), description=self.description, - customProperties=custom_properties, + customProperties={ + k: str(v) for k, v in custom_properties.items() if v is not None + }, ) dataset_props.externalUrl = self._get_url(base_url) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index f269ccf1cd98f8..e42ac7b61c1777 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -139,26 +139,21 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase): """ platform = "looker" - source_config: LookerDashboardSourceConfig - reporter: LookerDashboardSourceReport - user_registry: LookerUserRegistry - reachable_look_registry: Set[ - str - ] # Keep track of look-id which are reachable from Dashboard def __init__(self, config: 
LookerDashboardSourceConfig, ctx: PipelineContext): super().__init__(config, ctx) - self.source_config = config - self.reporter = LookerDashboardSourceReport() + self.source_config: LookerDashboardSourceConfig = config + self.reporter: LookerDashboardSourceReport = LookerDashboardSourceReport() self.looker_api: LookerAPI = LookerAPI(self.source_config) - self.user_registry = LookerUserRegistry(self.looker_api) - self.explore_registry = LookerExploreRegistry( + self.user_registry: LookerUserRegistry = LookerUserRegistry(self.looker_api) + self.explore_registry: LookerExploreRegistry = LookerExploreRegistry( self.looker_api, self.reporter, self.source_config ) self.reporter._looker_explore_registry = self.explore_registry self.reporter._looker_api = self.looker_api - self.reachable_look_registry = set() + # Keep track of look-id which are reachable from Dashboard + self.reachable_look_registry: Set[str] = set() # (model, explore) -> list of charts/looks/dashboards that reference this explore # The list values are used purely for debugging purposes. @@ -868,21 +863,31 @@ def _make_explore_metadata_events( ) -> Iterable[ Union[MetadataChangeEvent, MetadataChangeProposalWrapper, MetadataWorkUnit] ]: - if self.source_config.emit_used_explores_only: - explores_to_fetch = list(self.reachable_explores.keys()) - else: + if not self.source_config.emit_used_explores_only: explores_to_fetch = list(self.list_all_explores()) + else: + # We don't keep track of project names for each explore right now. + # Because project names are just used for a custom property, it's + # fine to set them to None. + # TODO: Track project names for each explore. + explores_to_fetch = [ + (None, model, explore) + for (model, explore) in self.reachable_explores.keys() + ] explores_to_fetch.sort() processed_models: List[str] = [] - for model, _ in explores_to_fetch: + for project_name, model, _ in explores_to_fetch: if model not in processed_models: model_key = gen_model_key(self.source_config, model) yield from gen_containers( container_key=model_key, name=model, sub_types=[BIContainerSubTypes.LOOKML_MODEL], + extra_properties=( + {"project": project_name} if project_name is not None else None + ), ) yield MetadataChangeProposalWrapper( entityUrn=model_key.as_urn(), @@ -896,7 +901,7 @@ def _make_explore_metadata_events( self.reporter.total_explores = len(explores_to_fetch) for future in BackpressureAwareExecutor.map( self.fetch_one_explore, - ((model, explore) for (model, explore) in explores_to_fetch), + ((model, explore) for (_project, model, explore) in explores_to_fetch), max_workers=self.source_config.max_threads, ): events, explore_id, start_time, end_time = future.result() @@ -907,7 +912,7 @@ def _make_explore_metadata_events( f"Running time of fetch_one_explore for {explore_id}: {(end_time - start_time).total_seconds()}" ) - def list_all_explores(self) -> Iterable[Tuple[str, str]]: + def list_all_explores(self) -> Iterable[Tuple[Optional[str], str, str]]: # returns a list of (model, explore) tuples for model in self.looker_api.all_lookml_models(): @@ -916,7 +921,7 @@ def list_all_explores(self) -> Iterable[Tuple[str, str]]: for explore in model.explores: if explore.name is None: continue - yield (model.name, explore.name) + yield (model.project_name, model.name, explore.name) def fetch_one_explore( self, model: str, explore: str diff --git a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json index 5cac7b1bb73b19..a9c445b5986efe 
100644 --- a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json @@ -11,6 +11,7 @@ "description": "lorem ipsum", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -440,7 +441,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "bogus data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/bogus data/my_view", @@ -616,7 +620,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json index 24a738a815cda8..af9c62a2a41803 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json @@ -11,6 +11,7 @@ "description": "lorem ipsum", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -282,7 +283,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json index b1460779da4f5f..b89bc356b48fdc 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json @@ -202,6 +202,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -520,7 +521,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "looker_hub", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json index 74400b9b5cc56b..810fefd8f6cb85 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json @@ -202,6 +202,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -520,7 +521,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "looker_hub", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": 
"my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json index 89241fb52fb634..3d78397f54a235 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json @@ -287,6 +287,7 @@ "description": "third", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -613,7 +614,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json index f178e97e78fa02..5a540e61e768d7 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json @@ -210,6 +210,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -1107,12 +1108,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/data" + "/Explore/sales_model" ] } }, @@ -1124,10 +1125,13 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "sales_model", "looker.explore.label": "My Explore View", + "looker.explore.name": "sales_explore", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/data/my_view", + "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1149,7 +1153,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "my_view", + "schemaName": "sales_explore", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1204,7 +1208,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1223,12 +1227,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/data/my_view" + "renderUrl": "https://looker.company.com/embed/explore/sales_model/sales_explore" } }, "systemMetadata": { @@ 
-1240,12 +1244,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" + "container": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" } }, "systemMetadata": { @@ -1257,7 +1261,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1267,8 +1271,8 @@ "id": "Explore" }, { - "id": "urn:li:container:59a5aa45397364e6882e793f1bc77b42", - "urn": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" + "id": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5", + "urn": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" } ] } @@ -1283,12 +1287,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/order_model" + "/Explore/data" ] } }, @@ -1300,10 +1304,13 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/order_model/order_explore", + "externalUrl": "https://looker.company.com/explore/data/my_view", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1325,7 +1332,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "order_explore", + "schemaName": "my_view", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1380,7 +1387,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1399,12 +1406,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/order_model/order_explore" + "renderUrl": "https://looker.company.com/embed/explore/data/my_view" } }, "systemMetadata": { @@ -1416,12 +1423,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" + "container": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" } }, "systemMetadata": { @@ -1433,7 +1440,7 @@ }, { "entityType": "dataset", - "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1443,8 +1450,8 @@ "id": "Explore" }, { - "id": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60", - "urn": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" + "id": "urn:li:container:59a5aa45397364e6882e793f1bc77b42", + "urn": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" } ] } @@ -1459,12 +1466,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/sales_model" + "/Explore/order_model" ] } }, @@ -1476,10 +1483,13 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "order_model", "looker.explore.label": "My Explore View", + "looker.explore.name": "order_explore", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", + "externalUrl": "https://looker.company.com/explore/order_model/order_explore", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1501,7 +1511,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "sales_explore", + "schemaName": "order_explore", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1556,7 +1566,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1575,12 +1585,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/sales_model/sales_explore" + "renderUrl": "https://looker.company.com/embed/explore/order_model/order_explore" } }, "systemMetadata": { @@ -1592,12 +1602,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" + "container": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" } }, "systemMetadata": { @@ -1609,7 +1619,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1619,8 +1629,8 @@ "id": "Explore" }, { - "id": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5", - "urn": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" + "id": 
"urn:li:container:df4ee66abd19b668c88bfe4408f87e60", + "urn": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" } ] } diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json index d969ef62a96e5f..9ac95b8482a475 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json @@ -229,6 +229,7 @@ "urn:li:chart:(looker,ap-south-1.dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -574,7 +575,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json index 153db363c78280..3a2c6359ea63c2 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json @@ -202,6 +202,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -520,7 +521,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json index 98adbdc5b829e4..007eee348aeaf8 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json @@ -11,6 +11,7 @@ "description": "lorem ipsum", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -282,7 +283,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json b/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json index 63ffdda8c5b6f5..859b9163d7aad6 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json @@ -210,6 +210,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -783,7 +784,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + 
"looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", @@ -959,7 +963,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "sales_model", "looker.explore.label": "My Explore View", + "looker.explore.name": "sales_explore", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json index 567ab78a14754b..8256c984afb274 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json @@ -210,6 +210,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -539,7 +540,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", @@ -810,8 +814,8 @@ } }, { - "entityType": "chart", - "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(looker,dashboards.11)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -827,8 +831,8 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:621eb6e00da9abece0f64522f81be0e7", + "entityType": "chart", + "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -844,8 +848,8 @@ } }, { - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(looker,dashboards.11)", + "entityType": "container", + "entityUrn": "urn:li:container:621eb6e00da9abece0f64522f81be0e7", "changeType": "UPSERT", "aspectName": "status", "aspect": { diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json index 3befb62a631de5..0b3530f9c24629 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json @@ -11,6 +11,7 @@ "description": "lorem ipsum", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -234,7 +235,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", From b41716b91c70407c2385bbad69660f078717a473 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 18 Oct 2024 14:29:03 -0700 Subject: [PATCH 24/31] feat(ingest/fivetran): protect against high sync volume (#11589) --- .../ingestion/source/fivetran/fivetran.py | 21 ++++--- .../source/fivetran/fivetran_log_api.py | 63 ++++++++++--------- .../source/fivetran/fivetran_query.py | 34 +++++++--- .../integration/fivetran/test_fivetran.py | 58 
++++------------- 4 files changed, 87 insertions(+), 89 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py index 704a6f20a5c19b..334bb58ea84f8e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py @@ -27,7 +27,10 @@ PlatformDetail, ) from datahub.ingestion.source.fivetran.data_classes import Connector, Job -from datahub.ingestion.source.fivetran.fivetran_log_api import FivetranLogAPI +from datahub.ingestion.source.fivetran.fivetran_log_api import ( + MAX_JOBS_PER_CONNECTOR, + FivetranLogAPI, +) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, ) @@ -72,11 +75,6 @@ def __init__(self, config: FivetranSourceConfig, ctx: PipelineContext): self.audit_log = FivetranLogAPI(self.config.fivetran_log_config) - # Create and register the stateful ingestion use-case handler. - self.stale_entity_removal_handler = StaleEntityRemovalHandler.create( - self, self.config, self.ctx - ) - def _extend_lineage(self, connector: Connector, datajob: DataJob) -> None: input_dataset_urn_list: List[DatasetUrn] = [] output_dataset_urn_list: List[DatasetUrn] = [] @@ -267,6 +265,13 @@ def _get_connector_workunits( ).as_workunit(is_primary_source=False) # Map Fivetran's job/sync history entity with Datahub's data process entity + if len(connector.jobs) >= MAX_JOBS_PER_CONNECTOR: + self.report.warning( + title="Not all sync history was captured", + message=f"The connector had more than {MAX_JOBS_PER_CONNECTOR} sync runs in the past {self.config.history_sync_lookback_period} days. " + f"Only the most recent {MAX_JOBS_PER_CONNECTOR} syncs were ingested.", + context=f"{connector.connector_name} (connector_id: {connector.connector_id})", + ) for job in connector.jobs: dpi = self._generate_dpi_from_job(job, datajob) yield from self._get_dpi_workunits(job, dpi) @@ -279,7 +284,9 @@ def create(cls, config_dict: dict, ctx: PipelineContext) -> Source: def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), - self.stale_entity_removal_handler.workunit_processor, + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py index 31c16139066e43..5908efe39e2b40 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py @@ -22,6 +22,10 @@ logger: logging.Logger = logging.getLogger(__name__) +# We don't want to generate a massive number of dataProcesses for a single connector. +# This is primarily used as a safeguard to prevent performance issues. 
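The safeguard described in the comment above is enforced by the `MAX_JOBS_PER_CONNECTOR` constant defined just below and by the `ROW_NUMBER() OVER (PARTITION BY connector_id ...)` filter added to the sync-log query later in this patch. A rough in-memory Python analogue of that capping logic, shown only to illustrate the idea (the patch itself does the ranking in SQL):

```python
from typing import Dict, List, Tuple

MAX_JOBS = 1000  # stands in for MAX_JOBS_PER_CONNECTOR

def cap_sync_history(
    syncs: List[Tuple[str, str, float]],  # (connector_id, sync_id, end_time)
    max_jobs: int = MAX_JOBS,
) -> Dict[str, List[Tuple[str, float]]]:
    by_connector: Dict[str, List[Tuple[str, float]]] = {}
    for connector_id, sync_id, end_time in syncs:
        by_connector.setdefault(connector_id, []).append((sync_id, end_time))
    # Keep only the most recent max_jobs syncs per connector, mirroring
    # ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY end_time DESC) <= max_jobs.
    return {
        cid: sorted(rows, key=lambda r: r[1], reverse=True)[:max_jobs]
        for cid, rows in by_connector.items()
    }
```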
+MAX_JOBS_PER_CONNECTOR = 1000 + class FivetranLogAPI: def __init__(self, fivetran_log_config: FivetranLogConfig) -> None: @@ -158,34 +162,32 @@ def _get_table_lineage( return table_lineage_list - def _get_all_connector_sync_logs(self, syncs_interval: int) -> Dict[str, Dict]: - sync_logs = {} - for row in self._query( - self.fivetran_log_query.get_sync_logs_query().format( - db_clause=self.fivetran_log_query.db_clause, - syncs_interval=syncs_interval, - ) - ): - if row[Constant.CONNECTOR_ID] not in sync_logs: - sync_logs[row[Constant.CONNECTOR_ID]] = { - row[Constant.SYNC_ID]: { - row["message_event"]: ( - row[Constant.TIME_STAMP].timestamp(), - row[Constant.MESSAGE_DATA], - ) - } - } - elif row[Constant.SYNC_ID] not in sync_logs[row[Constant.CONNECTOR_ID]]: - sync_logs[row[Constant.CONNECTOR_ID]][row[Constant.SYNC_ID]] = { - row["message_event"]: ( - row[Constant.TIME_STAMP].timestamp(), - row[Constant.MESSAGE_DATA], - ) - } - else: - sync_logs[row[Constant.CONNECTOR_ID]][row[Constant.SYNC_ID]][ - row["message_event"] - ] = (row[Constant.TIME_STAMP].timestamp(), row[Constant.MESSAGE_DATA]) + def _get_all_connector_sync_logs( + self, syncs_interval: int, connector_ids: List[str] + ) -> Dict[str, Dict[str, Dict[str, Tuple[float, Optional[str]]]]]: + sync_logs: Dict[str, Dict[str, Dict[str, Tuple[float, Optional[str]]]]] = {} + + # Format connector_ids as a comma-separated string of quoted IDs + formatted_connector_ids = ", ".join(f"'{id}'" for id in connector_ids) + + query = self.fivetran_log_query.get_sync_logs_query().format( + db_clause=self.fivetran_log_query.db_clause, + syncs_interval=syncs_interval, + max_jobs_per_connector=MAX_JOBS_PER_CONNECTOR, + connector_ids=formatted_connector_ids, + ) + + for row in self._query(query): + connector_id = row[Constant.CONNECTOR_ID] + sync_id = row[Constant.SYNC_ID] + + if connector_id not in sync_logs: + sync_logs[connector_id] = {} + + sync_logs[connector_id][sync_id] = { + "sync_start": (row["start_time"].timestamp(), None), + "sync_end": (row["end_time"].timestamp(), row["end_message_data"]), + } return sync_logs @@ -244,7 +246,10 @@ def _fill_connectors_table_lineage(self, connectors: List[Connector]) -> None: def _fill_connectors_jobs( self, connectors: List[Connector], syncs_interval: int ) -> None: - sync_logs = self._get_all_connector_sync_logs(syncs_interval) + connector_ids = [connector.connector_id for connector in connectors] + sync_logs = self._get_all_connector_sync_logs( + syncs_interval, connector_ids=connector_ids + ) for connector in connectors: connector.jobs = self._get_jobs_list(sync_logs.get(connector.connector_id)) diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py index d965f53ff554b3..c4680b4b1037a2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py @@ -37,14 +37,32 @@ def get_users_query(self) -> str: def get_sync_logs_query(self) -> str: return """ - SELECT connector_id, - sync_id, - message_event, - message_data, - time_stamp - FROM {db_clause}log - WHERE message_event in ('sync_start', 'sync_end') - and time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days'""" + WITH ranked_syncs AS ( + SELECT + connector_id, + sync_id, + MAX(CASE WHEN message_event = 'sync_start' THEN time_stamp END) as start_time, + MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time, + 
MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data, + ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY MAX(time_stamp) DESC) as rn + FROM {db_clause}log + WHERE message_event in ('sync_start', 'sync_end') + AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days' + AND connector_id IN ({connector_ids}) + GROUP BY connector_id, sync_id + ) + SELECT + connector_id, + sync_id, + start_time, + end_time, + end_message_data + FROM ranked_syncs + WHERE rn <= {max_jobs_per_connector} + AND start_time IS NOT NULL + AND end_time IS NOT NULL + ORDER BY connector_id, end_time DESC + """ def get_table_lineage_query(self) -> str: return f""" diff --git a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py index 0f5d098ee39c4a..33ac09e69a3c0a 100644 --- a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py +++ b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py @@ -101,64 +101,32 @@ def default_query_results( } ] elif query == fivetran_log_query.get_sync_logs_query().format( - db_clause=fivetran_log_query.db_clause, syncs_interval=7 + db_clause=fivetran_log_query.db_clause, + syncs_interval=7, + max_jobs_per_connector=1000, + connector_ids="'calendar_elected'", ): return [ { "connector_id": "calendar_elected", "sync_id": "4c9a03d6-eded-4422-a46a-163266e58243", - "message_event": "sync_start", - "message_data": None, - "time_stamp": datetime.datetime(2023, 9, 20, 6, 37, 32, 606000), + "start_time": datetime.datetime(2023, 9, 20, 6, 37, 32, 606000), + "end_time": datetime.datetime(2023, 9, 20, 6, 38, 5, 56000), + "end_message_data": '"{\\"status\\":\\"SUCCESSFUL\\"}"', }, { "connector_id": "calendar_elected", "sync_id": "f773d1e9-c791-48f4-894f-8cf9b3dfc834", - "message_event": "sync_start", - "message_data": None, - "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 30, 345000), + "start_time": datetime.datetime(2023, 10, 3, 14, 35, 30, 345000), + "end_time": datetime.datetime(2023, 10, 3, 14, 35, 31, 512000), + "end_message_data": '"{\\"reason\\":\\"Sync has been cancelled because of a user action in the dashboard.Standard Config updated.\\",\\"status\\":\\"CANCELED\\"}"', }, { "connector_id": "calendar_elected", "sync_id": "63c2fc85-600b-455f-9ba0-f576522465be", - "message_event": "sync_start", - "message_data": None, - "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 55, 401000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "e773e1e9-c791-46f4-894f-8ch9b3dfc832", - "message_event": "sync_start", - "message_data": None, - "time_stamp": datetime.datetime(2023, 10, 3, 14, 37, 5, 403000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "4c9a03d6-eded-4422-a46a-163266e58243", - "message_event": "sync_end", - "message_data": '"{\\"status\\":\\"SUCCESSFUL\\"}"', - "time_stamp": datetime.datetime(2023, 9, 20, 6, 38, 5, 56000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "f773d1e9-c791-48f4-894f-8cf9b3dfc834", - "message_event": "sync_end", - "message_data": '"{\\"reason\\":\\"Sync has been cancelled because of a user action in the dashboard.Standard Config updated.\\",\\"status\\":\\"CANCELED\\"}"', - "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 31, 512000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "63c2fc85-600b-455f-9ba0-f576522465be", - "message_event": "sync_end", - "message_data": '"{\\"reason\\":\\"java.lang.RuntimeException: FATAL: too many connections for role 
\\\\\\"hxwraqld\\\\\\"\\",\\"taskType\\":\\"reconnect\\",\\"status\\":\\"FAILURE_WITH_TASK\\"}"', - "time_stamp": datetime.datetime(2023, 10, 3, 14, 36, 29, 678000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "e773e1e9-c791-46f4-894f-8ch9b3dfc832", - "message_event": "sync_end", - "message_data": None, - "time_stamp": datetime.datetime(2023, 10, 3, 14, 37, 35, 478000), + "start_time": datetime.datetime(2023, 10, 3, 14, 35, 55, 401000), + "end_time": datetime.datetime(2023, 10, 3, 14, 36, 29, 678000), + "end_message_data": '"{\\"reason\\":\\"java.lang.RuntimeException: FATAL: too many connections for role \\\\\\"hxwraqld\\\\\\"\\",\\"taskType\\":\\"reconnect\\",\\"status\\":\\"FAILURE_WITH_TASK\\"}"', }, ] # Unreachable code From df7456564810eea07fff0b74112c5cba0951fc0a Mon Sep 17 00:00:00 2001 From: Shirshanka Das Date: Sat, 19 Oct 2024 14:53:28 -0700 Subject: [PATCH 25/31] feat(sdk):platform-resource - complex queries (#11675) --- .../platformresource/platform_resource.py | 193 ++++++------ .../src/datahub/utilities/openapi_utils.py | 69 +++++ .../src/datahub/utilities/search_utils.py | 285 ++++++++++++++++++ .../test_platform_resource.py | 15 + .../tests/unit/utilities/test_search_utils.py | 71 +++++ .../test_platform_resource.py | 78 ++++- 6 files changed, 617 insertions(+), 94 deletions(-) create mode 100644 metadata-ingestion/src/datahub/utilities/openapi_utils.py create mode 100644 metadata-ingestion/src/datahub/utilities/search_utils.py create mode 100644 metadata-ingestion/tests/unit/utilities/test_search_utils.py diff --git a/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py b/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py index 349b0ff11d84f7..0f7b10a067053a 100644 --- a/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py +++ b/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py @@ -1,5 +1,5 @@ import logging -from typing import Dict, Iterable, List, Optional, Union +from typing import Callable, Dict, Iterable, List, Optional, Tuple, Type, Union, cast from avrogen.dict_wrapper import DictWrapper from pydantic import BaseModel @@ -14,7 +14,14 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import DatahubKey from datahub.ingestion.graph.client import DataHubGraph -from datahub.metadata.urns import PlatformResourceUrn +from datahub.metadata.urns import DataPlatformUrn, PlatformResourceUrn, Urn +from datahub.utilities.openapi_utils import OpenAPIGraphClient +from datahub.utilities.search_utils import ( + ElasticDocumentQuery, + ElasticsearchQueryBuilder, + LogicalOperator, + SearchField, +) logger = logging.getLogger(__name__) @@ -69,71 +76,75 @@ def to_resource_info(self) -> models.PlatformResourceInfoClass: ) -class OpenAPIGraphClient: +class DataPlatformInstanceUrn: + """ + A simple implementation of a URN class for DataPlatformInstance. + Since this is not present in the URN registry, we need to implement it here. 
+ """ - ENTITY_KEY_ASPECT_MAP = { - aspect_type.ASPECT_INFO.get("keyForEntity"): name - for name, aspect_type in models.ASPECT_NAME_MAP.items() - if aspect_type.ASPECT_INFO.get("keyForEntity") - } + @staticmethod + def create_from_id(platform_instance_urn: str) -> Urn: + if platform_instance_urn.startswith("urn:li:platformInstance:"): + string_urn = platform_instance_urn + else: + string_urn = f"urn:li:platformInstance:{platform_instance_urn}" + return Urn.from_string(string_urn) - def __init__(self, graph: DataHubGraph): - self.graph = graph - self.openapi_base = graph._gms_server.rstrip("/") + "/openapi/v3" - def scroll_urns_by_filter( - self, - entity_type: str, - extra_or_filters: List[Dict[str, str]], - extra_and_filters: List[Dict[str, str]] = [], - ) -> Iterable[str]: - """ - Scroll through all urns that match the given filters - """ +class UrnSearchField(SearchField): + """ + A search field that supports URN values. + TODO: Move this to search_utils after we make this more generic. + """ - key_aspect = self.ENTITY_KEY_ASPECT_MAP.get(entity_type) - assert key_aspect, f"No key aspect found for entity type {entity_type}" - if extra_or_filters and extra_and_filters: - raise ValueError( - "Only one of extra_or_filters and extra_and_filters should be provided" - ) + def __init__(self, field_name: str, urn_value_extractor: Callable[[str], Urn]): + self.urn_value_extractor = urn_value_extractor + super().__init__(field_name) - count = 1000 - query = ( - " OR ".join( - [ - f"{filter['field']}:\"{filter['value']}\"" - for filter in extra_or_filters - ] - ) - if extra_or_filters - else " AND ".join( - [ - f"{filter['field']}:\"{filter['value']}\"" - for filter in extra_and_filters - ] - ) + def get_search_value(self, value: str) -> str: + return str(self.urn_value_extractor(value)) + + +class PlatformResourceSearchField(SearchField): + def __init__(self, field_name: str): + super().__init__(field_name) + + @classmethod + def from_search_field( + cls, search_field: SearchField + ) -> "PlatformResourceSearchField": + # pretends to be a class method, but just returns the input + return search_field # type: ignore + + +class PlatformResourceSearchFields: + PRIMARY_KEY = PlatformResourceSearchField("primaryKey") + RESOURCE_TYPE = PlatformResourceSearchField("resourceType") + SECONDARY_KEYS = PlatformResourceSearchField("secondaryKeys") + PLATFORM = PlatformResourceSearchField.from_search_field( + UrnSearchField( + field_name="platform.keyword", + urn_value_extractor=DataPlatformUrn.create_from_id, ) - scroll_id = None - while True: - response = self.graph._get_generic( - self.openapi_base + f"/entity/{entity_type.lower()}", - params={ - "systemMetadata": "false", - "includeSoftDelete": "false", - "skipCache": "false", - "aspects": [key_aspect], - "scrollId": scroll_id, - "count": count, - "query": query, - }, - ) - entities = response.get("entities", []) - scroll_id = response.get("scrollId") - for entity in entities: - yield entity["urn"] - if not scroll_id: - break + ) + PLATFORM_INSTANCE = PlatformResourceSearchField.from_search_field( + UrnSearchField( + field_name="platformInstance.keyword", + urn_value_extractor=DataPlatformInstanceUrn.create_from_id, + ) + ) + + +class ElasticPlatformResourceQuery(ElasticDocumentQuery[PlatformResourceSearchField]): + def __init__(self): + super().__init__() + + @classmethod + def create_from( + cls: Type["ElasticPlatformResourceQuery"], + *args: Tuple[Union[str, PlatformResourceSearchField], str], + ) -> "ElasticPlatformResourceQuery": + return 
cast(ElasticPlatformResourceQuery, super().create_from(*args)) class PlatformResource(BaseModel): @@ -147,6 +158,12 @@ def remove( cls, key: PlatformResourceKey, ) -> "PlatformResource": + """ + Creates a PlatformResource object with the removed status set to True. + Removed PlatformResource objects are used to soft-delete resources from + the graph. + To hard-delete a resource, use the delete method. + """ return cls( id=key.id, removed=True, @@ -240,28 +257,38 @@ def from_datahub( @staticmethod def search_by_key( - graph_client: DataHubGraph, key: str, primary: bool = True + graph_client: DataHubGraph, + key: str, + primary: bool = True, + is_exact: bool = True, ) -> Iterable["PlatformResource"]: - extra_or_filters = [] - extra_or_filters.append( - { - "field": "primaryKey", - "condition": "EQUAL", - "value": key, - } + """ + Searches for PlatformResource entities by primary or secondary key. + + :param graph_client: DataHubGraph client + :param key: The key to search for + :param primary: Whether to search for primary only or expand the search + to secondary keys (default: True) + :param is_exact: Whether to search for an exact match (default: True) + :return: An iterable of PlatformResource objects + """ + + elastic_platform_resource_group = ( + ElasticPlatformResourceQuery.create_from() + .group(LogicalOperator.OR) + .add_field_match( + PlatformResourceSearchFields.PRIMARY_KEY, key, is_exact=is_exact + ) ) if not primary: # we expand the search to secondary keys - extra_or_filters.append( - { - "field": "secondaryKeys", - "condition": "EQUAL", - "value": key, - } + elastic_platform_resource_group.add_field_match( + PlatformResourceSearchFields.SECONDARY_KEYS, key, is_exact=is_exact ) + query = elastic_platform_resource_group.end() openapi_client = OpenAPIGraphClient(graph_client) for urn in openapi_client.scroll_urns_by_filter( entity_type="platformResource", - extra_or_filters=extra_or_filters, + query=query, ): platform_resource = PlatformResource.from_datahub(graph_client, urn) if platform_resource: @@ -273,18 +300,16 @@ def delete(self, graph_client: DataHubGraph, hard: bool = True) -> None: @staticmethod def search_by_filters( graph_client: DataHubGraph, - and_filters: List[Dict[str, str]] = [], - or_filters: List[Dict[str, str]] = [], + query: Union[ + ElasticPlatformResourceQuery, + ElasticDocumentQuery, + ElasticsearchQueryBuilder, + ], ) -> Iterable["PlatformResource"]: - if and_filters and or_filters: - raise ValueError( - "Only one of and_filters and or_filters should be provided" - ) openapi_client = OpenAPIGraphClient(graph_client) for urn in openapi_client.scroll_urns_by_filter( entity_type="platformResource", - extra_or_filters=or_filters if or_filters else [], - extra_and_filters=and_filters if and_filters else [], + query=query, ): platform_resource = PlatformResource.from_datahub(graph_client, urn) if platform_resource: diff --git a/metadata-ingestion/src/datahub/utilities/openapi_utils.py b/metadata-ingestion/src/datahub/utilities/openapi_utils.py new file mode 100644 index 00000000000000..e704ff7f84cbbc --- /dev/null +++ b/metadata-ingestion/src/datahub/utilities/openapi_utils.py @@ -0,0 +1,69 @@ +import logging +from typing import Iterable, Union + +import datahub.metadata.schema_classes as models +from datahub.ingestion.graph.client import DataHubGraph +from datahub.utilities.search_utils import ( + ElasticDocumentQuery, + ElasticsearchQueryBuilder, +) + +logger = logging.getLogger(__name__) + + +class OpenAPIGraphClient: + """ + An experimental client for the 
DataHubGraph that uses the OpenAPI endpoints + to query entities and aspects. + Does not support all features of the DataHubGraph. + API is subject to change. + + DO NOT USE THIS UNLESS YOU KNOW WHAT YOU ARE DOING. + """ + + ENTITY_KEY_ASPECT_MAP = { + aspect_type.ASPECT_INFO.get("keyForEntity"): name + for name, aspect_type in models.ASPECT_NAME_MAP.items() + if aspect_type.ASPECT_INFO.get("keyForEntity") + } + + def __init__(self, graph: DataHubGraph): + self.graph = graph + self.openapi_base = graph._gms_server.rstrip("/") + "/openapi/v3" + + def scroll_urns_by_filter( + self, + entity_type: str, + query: Union[ElasticDocumentQuery, ElasticsearchQueryBuilder], + ) -> Iterable[str]: + """ + Scroll through all urns that match the given filters. + + """ + + key_aspect = self.ENTITY_KEY_ASPECT_MAP.get(entity_type) + assert key_aspect, f"No key aspect found for entity type {entity_type}" + + count = 1000 + string_query = query.build() + scroll_id = None + logger.debug(f"Scrolling with query: {string_query}") + while True: + response = self.graph._get_generic( + self.openapi_base + f"/entity/{entity_type.lower()}", + params={ + "systemMetadata": "false", + "includeSoftDelete": "false", + "skipCache": "false", + "aspects": [key_aspect], + "scrollId": scroll_id, + "count": count, + "query": string_query, + }, + ) + entities = response.get("entities", []) + scroll_id = response.get("scrollId") + for entity in entities: + yield entity["urn"] + if not scroll_id: + break diff --git a/metadata-ingestion/src/datahub/utilities/search_utils.py b/metadata-ingestion/src/datahub/utilities/search_utils.py new file mode 100644 index 00000000000000..0bd88addd86600 --- /dev/null +++ b/metadata-ingestion/src/datahub/utilities/search_utils.py @@ -0,0 +1,285 @@ +import logging +import re +from enum import Enum +from typing import Generic, List, Optional, Tuple, Type, TypeVar, Union + +logger = logging.getLogger(__name__) + + +class LogicalOperator(Enum): + AND = "AND" + OR = "OR" + + +class SearchField: + def __init__(self, field_name: str): + self.field_name = field_name + + def get_search_value(self, value: str) -> str: + return value + + def __str__(self) -> str: + return self.field_name + + def __repr__(self) -> str: + return self.__str__() + + @classmethod + def from_string_field(cls, field_name: str) -> "SearchField": + return cls(field_name) + + +class QueryNode: + def __init__(self, operator: Optional[LogicalOperator] = None): + self.operator = operator + self.children: List[Union[QueryNode, str]] = [] + + def add_child(self, child: Union["QueryNode", str]) -> None: + self.children.append(child) + + def build(self) -> str: + if not self.children: + return "" + + if self.operator is None: + return ( + self.children[0] + if isinstance(self.children[0], str) + else self.children[0].build() + ) + + child_queries = [] + for child in self.children: + if isinstance(child, str): + child_queries.append(child) + else: + child_queries.append(child.build()) + + joined_queries = f" {self.operator.value} ".join(child_queries) + return f"({joined_queries})" if len(child_queries) > 1 else joined_queries + + +class ElasticsearchQueryBuilder: + SPECIAL_CHARACTERS = r'+-=&|> None: + self.root = QueryNode(operator=operator) + + @classmethod + def escape_special_characters(cls, value: str) -> str: + """ + Escape special characters in the search term. 
+ """ + return re.sub(f"([{re.escape(cls.SPECIAL_CHARACTERS)}])", r"\\\1", value) + + def _create_term( + self, field: SearchField, value: str, is_exact: bool = False + ) -> str: + escaped_value = self.escape_special_characters(field.get_search_value(value)) + field_name: str = field.field_name + if is_exact: + return f'{field_name}:"{escaped_value}"' + return f"{field_name}:{escaped_value}" + + def add_field_match( + self, field: SearchField, value: str, is_exact: bool = True + ) -> "ElasticsearchQueryBuilder": + term = self._create_term(field, value, is_exact) + self.root.add_child(term) + return self + + def add_field_not_match( + self, field: SearchField, value: str, is_exact: bool = True + ) -> "ElasticsearchQueryBuilder": + term = f"-{self._create_term(field, value, is_exact)}" + self.root.add_child(term) + return self + + def add_range( + self, + field: str, + min_value: Optional[str] = None, + max_value: Optional[str] = None, + include_min: bool = True, + include_max: bool = True, + ) -> "ElasticsearchQueryBuilder": + min_bracket = "[" if include_min else "{" + max_bracket = "]" if include_max else "}" + min_val = min_value if min_value is not None else "*" + max_val = max_value if max_value is not None else "*" + range_query = f"{field}:{min_bracket}{min_val} TO {max_val}{max_bracket}" + self.root.add_child(range_query) + return self + + def add_wildcard(self, field: str, pattern: str) -> "ElasticsearchQueryBuilder": + wildcard_query = f"{field}:{pattern}" + self.root.add_child(wildcard_query) + return self + + def add_fuzzy( + self, field: str, value: str, fuzziness: int = 2 + ) -> "ElasticsearchQueryBuilder": + fuzzy_query = f"{field}:{value}~{fuzziness}" + self.root.add_child(fuzzy_query) + return self + + def add_boost( + self, field: str, value: str, boost: float + ) -> "ElasticsearchQueryBuilder": + boosted_query = f"{field}:{value}^{boost}" + self.root.add_child(boosted_query) + return self + + def group(self, operator: LogicalOperator) -> "QueryGroup": + return QueryGroup(self, operator) + + def build(self) -> str: + return self.root.build() + + +class QueryGroup: + def __init__(self, parent: ElasticsearchQueryBuilder, operator: LogicalOperator): + self.parent = parent + self.node = QueryNode(operator) + self.parent.root.add_child(self.node) + + def add_field_match( + self, field: Union[str, SearchField], value: str, is_exact: bool = True + ) -> "QueryGroup": + if isinstance(field, str): + field = SearchField.from_string_field(field) + term = self.parent._create_term(field, value, is_exact) + self.node.add_child(term) + return self + + def add_field_not_match( + self, field: Union[str, SearchField], value: str, is_exact: bool = True + ) -> "QueryGroup": + if isinstance(field, str): + field = SearchField.from_string_field(field) + term = f"-{self.parent._create_term(field, value, is_exact)}" + self.node.add_child(term) + return self + + def add_range( + self, + field: str, + min_value: Optional[str] = None, + max_value: Optional[str] = None, + include_min: bool = True, + include_max: bool = True, + ) -> "QueryGroup": + min_bracket = "[" if include_min else "{" + max_bracket = "]" if include_max else "}" + min_val = min_value if min_value is not None else "*" + max_val = max_value if max_value is not None else "*" + range_query = f"{field}:{min_bracket}{min_val} TO {max_val}{max_bracket}" + self.node.add_child(range_query) + return self + + def add_wildcard(self, field: str, pattern: str) -> "QueryGroup": + wildcard_query = f"{field}:{pattern}" + 
self.node.add_child(wildcard_query) + return self + + def add_fuzzy(self, field: str, value: str, fuzziness: int = 2) -> "QueryGroup": + fuzzy_query = f"{field}:{value}~{fuzziness}" + self.node.add_child(fuzzy_query) + return self + + def add_boost(self, field: str, value: str, boost: float) -> "QueryGroup": + boosted_query = f"{field}:{value}^{boost}" + self.node.add_child(boosted_query) + return self + + def group(self, operator: LogicalOperator) -> "QueryGroup": + new_group = QueryGroup(self.parent, operator) + self.node.add_child(new_group.node) + return new_group + + def end(self) -> ElasticsearchQueryBuilder: + return self.parent + + +SF = TypeVar("SF", bound=SearchField) + + +class ElasticDocumentQuery(Generic[SF]): + def __init__(self) -> None: + self.query_builder = ElasticsearchQueryBuilder() + + @classmethod + def create_from( + cls: Type["ElasticDocumentQuery[SF]"], + *args: Tuple[Union[str, SF], str], + ) -> "ElasticDocumentQuery[SF]": + instance = cls() + for arg in args: + if isinstance(arg, SearchField): + # If the value is empty, we treat it as a wildcard search + logger.info(f"Adding wildcard search for field {arg}") + instance.add_wildcard(arg, "*") + elif isinstance(arg, tuple) and len(arg) == 2: + field, value = arg + assert isinstance(value, str) + if isinstance(field, SearchField): + instance.add_field_match(field, value) + elif isinstance(field, str): + instance.add_field_match( + SearchField.from_string_field(field), value + ) + else: + raise ValueError("Invalid field type {}".format(type(field))) + return instance + + def add_field_match( + self, field: Union[str, SearchField], value: str, is_exact: bool = True + ) -> "ElasticDocumentQuery": + if isinstance(field, str): + field = SearchField.from_string_field(field) + self.query_builder.add_field_match(field, value, is_exact) + return self + + def add_field_not_match( + self, field: SearchField, value: str, is_exact: bool = True + ) -> "ElasticDocumentQuery": + self.query_builder.add_field_not_match(field, value, is_exact) + return self + + def add_range( + self, + field: SearchField, + min_value: Optional[str] = None, + max_value: Optional[str] = None, + include_min: bool = True, + include_max: bool = True, + ) -> "ElasticDocumentQuery": + field_name: str = field.field_name # type: ignore + self.query_builder.add_range( + field_name, min_value, max_value, include_min, include_max + ) + return self + + def add_wildcard(self, field: SearchField, pattern: str) -> "ElasticDocumentQuery": + field_name: str = field.field_name # type: ignore + self.query_builder.add_wildcard(field_name, pattern) + return self + + def add_fuzzy( + self, field: SearchField, value: str, fuzziness: int = 2 + ) -> "ElasticDocumentQuery": + field_name: str = field.field_name # type: ignore + self.query_builder.add_fuzzy(field_name, value, fuzziness) + return self + + def add_boost( + self, field: SearchField, value: str, boost: float + ) -> "ElasticDocumentQuery": + self.query_builder.add_boost(field.field_name, value, boost) + return self + + def group(self, operator: LogicalOperator) -> QueryGroup: + return self.query_builder.group(operator) + + def build(self) -> str: + return self.query_builder.build() diff --git a/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py b/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py index e6c9a9466d62b4..a84e373dbe72c2 100644 --- a/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py +++ 
b/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py @@ -4,9 +4,12 @@ import datahub.metadata.schema_classes as models from datahub.api.entities.platformresource.platform_resource import ( + ElasticPlatformResourceQuery, PlatformResource, PlatformResourceKey, + PlatformResourceSearchFields, ) +from datahub.utilities.search_utils import LogicalOperator def test_platform_resource_dict(): @@ -179,3 +182,15 @@ class TestModel(BaseModel): ).encode("utf-8") assert platform_resource_info_mcp.aspect.value.schemaType == "JSON" assert platform_resource_info_mcp.aspect.value.schemaRef == TestModel.__name__ + + +def test_platform_resource_filters(): + + query = ( + ElasticPlatformResourceQuery.create_from() + .group(LogicalOperator.AND) + .add_field_match(PlatformResourceSearchFields.PRIMARY_KEY, "test_1") + .add_field_match(PlatformResourceSearchFields.RESOURCE_TYPE, "server") + .end() + ) + assert query.build() == '(primaryKey:"test_1" AND resourceType:"server")' diff --git a/metadata-ingestion/tests/unit/utilities/test_search_utils.py b/metadata-ingestion/tests/unit/utilities/test_search_utils.py new file mode 100644 index 00000000000000..6fa2e46c7f20e8 --- /dev/null +++ b/metadata-ingestion/tests/unit/utilities/test_search_utils.py @@ -0,0 +1,71 @@ +from datahub.utilities.search_utils import ( + ElasticDocumentQuery, + LogicalOperator, + SearchField, +) + + +def test_simple_and_filters(): + query = ( + ElasticDocumentQuery.create_from() + .group(LogicalOperator.AND) + .add_field_match("field1", "value1") + .add_field_match("field2", "value2") + .end() + ) + + assert query.build() == '(field1:"value1" AND field2:"value2")' + + +def test_simple_or_filters(): + query = ( + ElasticDocumentQuery.create_from() + .group(LogicalOperator.OR) + .add_field_match("field1", "value1") + .add_field_match("field2", "value2") + .end() + ) + + assert query.build() == '(field1:"value1" OR field2:"value2")' + + # Use SearchFilter to create this query + query = ( + ElasticDocumentQuery.create_from() + .group(LogicalOperator.OR) + .add_field_match(SearchField.from_string_field("field1"), "value1") + .add_field_match(SearchField.from_string_field("field2"), "value2") + .end() + ) + assert query.build() == '(field1:"value1" OR field2:"value2")' + + +def test_simple_field_match(): + query: ElasticDocumentQuery = ElasticDocumentQuery.create_from( + ("field1", "value1:1") + ) + assert query.build() == 'field1:"value1\\:1"' + + # Another way to create the same query + query = ElasticDocumentQuery.create_from() + query.add_field_match("field1", "value1:1") + assert query.build() == 'field1:"value1\\:1"' + + +def test_negation(): + query = ( + ElasticDocumentQuery.create_from() + .group(LogicalOperator.AND) + .add_field_match("field1", "value1") + .add_field_not_match("field2", "value2") + .end() + ) + + assert query.build() == '(field1:"value1" AND -field2:"value2")' + + +def test_multi_arg_create_from(): + query: ElasticDocumentQuery = ElasticDocumentQuery.create_from( + ("field1", "value1"), + ("field2", "value2"), + ) + assert query.build() == '(field1:"value1" AND field2:"value2")' diff --git a/smoke-test/tests/platform_resources/test_platform_resource.py b/smoke-test/tests/platform_resources/test_platform_resource.py index 7ebfd4d6ea15b4..39d15f2e8dea6d 100644 --- a/smoke-test/tests/platform_resources/test_platform_resource.py +++ b/smoke-test/tests/platform_resources/test_platform_resource.py @@ -5,8 +5,10 @@ import pytest from datahub.api.entities.platformresource.platform_resource 
import ( + ElasticPlatformResourceQuery, PlatformResource, PlatformResourceKey, + PlatformResourceSearchFields, ) from tests.utils import wait_for_healthcheck_util, wait_for_writes_to_sync @@ -42,7 +44,12 @@ def cleanup_resources(graph_client): logger.warning(f"Failed to delete resource: {e}") # Additional cleanup for any resources that might have been missed - for resource in PlatformResource.search_by_key(graph_client, "test_"): + for resource in PlatformResource.search_by_filters( + graph_client, + ElasticPlatformResourceQuery.create_from().add_wildcard( + PlatformResourceSearchFields.PRIMARY_KEY, "test_*" + ), + ): try: resource.delete(graph_client) except Exception as e: @@ -114,7 +121,7 @@ def test_platform_resource_non_existent(graph_client, test_id): assert platform_resource is None -def test_platform_resource_urn_secondary_key(graph_client, test_id): +def test_platform_resource_urn_secondary_key(graph_client, test_id, cleanup_resources): key = PlatformResourceKey( platform=f"test_platform_{test_id}", resource_type=f"test_resource_type_{test_id}", @@ -129,6 +136,7 @@ def test_platform_resource_urn_secondary_key(graph_client, test_id): secondary_keys=[dataset_urn], ) platform_resource.to_datahub(graph_client) + cleanup_resources.append(platform_resource) wait_for_writes_to_sync() read_platform_resources = [ @@ -141,7 +149,9 @@ def test_platform_resource_urn_secondary_key(graph_client, test_id): assert read_platform_resources[0] == platform_resource -def test_platform_resource_listing_by_resource_type(graph_client, test_id): +def test_platform_resource_listing_by_resource_type( + graph_client, test_id, cleanup_resources +): # Generate two resources with the same resource type key1 = PlatformResourceKey( platform=f"test_platform_{test_id}", @@ -171,13 +181,9 @@ def test_platform_resource_listing_by_resource_type(graph_client, test_id): r for r in PlatformResource.search_by_filters( graph_client, - and_filters=[ - { - "field": "resourceType", - "condition": "EQUAL", - "value": key1.resource_type, - } - ], + query=ElasticPlatformResourceQuery.create_from( + (PlatformResourceSearchFields.RESOURCE_TYPE, key1.resource_type) + ), ) ] assert len(search_results) == 2 @@ -186,3 +192,55 @@ def test_platform_resource_listing_by_resource_type(graph_client, test_id): read_platform_resource_2 = next(r for r in search_results if r.id == key2.id) assert read_platform_resource_1 == platform_resource1 assert read_platform_resource_2 == platform_resource2 + + +def test_platform_resource_listing_complex_queries(graph_client, test_id): + # Generate two resources with the same resource type + key1 = PlatformResourceKey( + platform=f"test_platform1_{test_id}", + resource_type=f"test_resource_type_{test_id}", + primary_key=f"test_primary_key_1_{test_id}", + ) + platform_resource1 = PlatformResource.create( + key=key1, + value={"test_key": f"test_value_1_{test_id}"}, + ) + platform_resource1.to_datahub(graph_client) + + key2 = PlatformResourceKey( + platform=f"test_platform2_{test_id}", + resource_type=f"test_resource_type_{test_id}", + primary_key=f"test_primary_key_2_{test_id}", + ) + platform_resource2 = PlatformResource.create( + key=key2, + value={"test_key": f"test_value_2_{test_id}"}, + ) + platform_resource2.to_datahub(graph_client) + + wait_for_writes_to_sync() + from datahub.api.entities.platformresource.platform_resource import ( + ElasticPlatformResourceQuery, + LogicalOperator, + PlatformResourceSearchFields, + ) + + query = ( + ElasticPlatformResourceQuery.create_from() + 
.group(LogicalOperator.AND) + .add_field_match(PlatformResourceSearchFields.RESOURCE_TYPE, key1.resource_type) + .add_field_not_match(PlatformResourceSearchFields.PLATFORM, key1.platform) + .end() + ) + + search_results = [ + r + for r in PlatformResource.search_by_filters( + graph_client, + query=query, + ) + ] + assert len(search_results) == 1 + + read_platform_resource = search_results[0] + assert read_platform_resource == platform_resource2 From 1957753fa287ec17f5e7ab3e6e6d1d84440ed93d Mon Sep 17 00:00:00 2001 From: deepgarg-visa <149145061+deepgarg-visa@users.noreply.github.com> Date: Sun, 20 Oct 2024 06:58:32 +0530 Subject: [PATCH 26/31] fix(docs): fix businessattributes doc (#11653) --- docs/businessattributes.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/businessattributes.md b/docs/businessattributes.md index 1744f48f879e82..3e912e7e609805 100644 --- a/docs/businessattributes.md +++ b/docs/businessattributes.md @@ -28,7 +28,6 @@ Taking the example of "United States- Social Security Number", if an application What you need to create/update and associate business attributes to dataset schema field * **Manage Business Attributes** platform privilege to create/update/delete business attributes. -* **Edit Dataset Column Business Attribute** metadata privilege to associate business attributes to dataset schema field. ## Using Business Attributes As of now Business Attributes can only be created through UI From 9a98e495434a6a73cb2459f2436a2855a987f5b0 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Sun, 20 Oct 2024 23:59:45 -0700 Subject: [PATCH 27/31] feat(ingest/fivetran): add safeguards on table/column lineage (#11674) --- .../ingestion/source/fivetran/config.py | 19 +-- .../ingestion/source/fivetran/data_classes.py | 2 +- .../ingestion/source/fivetran/fivetran.py | 23 ++- .../source/fivetran/fivetran_log_api.py | 86 +++++------ .../source/fivetran/fivetran_query.py | 143 +++++++++++------- .../integration/fivetran/test_fivetran.py | 6 +- 6 files changed, 156 insertions(+), 123 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py index 02eb096b240f52..2fb5ffd16ea34c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py @@ -1,6 +1,6 @@ +import dataclasses import logging -from dataclasses import dataclass, field as dataclass_field -from typing import Dict, List, Optional +from typing import Dict, Optional import pydantic from pydantic import Field, root_validator @@ -23,6 +23,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionConfigBase, ) +from datahub.utilities.lossy_collections import LossyList from datahub.utilities.perf_timer import PerfTimer logger = logging.getLogger(__name__) @@ -114,24 +115,24 @@ def validate_destination_platfrom_and_config(cls, values: Dict) -> Dict: return values -@dataclass +@dataclasses.dataclass class MetadataExtractionPerfReport(Report): - connectors_metadata_extraction_sec: PerfTimer = dataclass_field( + connectors_metadata_extraction_sec: PerfTimer = dataclasses.field( default_factory=PerfTimer ) - connectors_lineage_extraction_sec: PerfTimer = dataclass_field( + connectors_lineage_extraction_sec: PerfTimer = dataclasses.field( default_factory=PerfTimer ) - connectors_jobs_extraction_sec: PerfTimer = dataclass_field( + connectors_jobs_extraction_sec: PerfTimer = dataclasses.field( 
default_factory=PerfTimer ) -@dataclass +@dataclasses.dataclass class FivetranSourceReport(StaleEntityRemovalSourceReport): connectors_scanned: int = 0 - filtered_connectors: List[str] = dataclass_field(default_factory=list) - metadata_extraction_perf: MetadataExtractionPerfReport = dataclass_field( + filtered_connectors: LossyList[str] = dataclasses.field(default_factory=LossyList) + metadata_extraction_perf: MetadataExtractionPerfReport = dataclasses.field( default_factory=MetadataExtractionPerfReport ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py index 18de2b01edd3b7..046aa9efe3f59b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py @@ -24,7 +24,7 @@ class Connector: sync_frequency: int destination_id: str user_id: str - table_lineage: List[TableLineage] + lineage: List[TableLineage] jobs: List["Job"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py index 334bb58ea84f8e..c27ec57c2e99ec 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py @@ -27,9 +27,10 @@ PlatformDetail, ) from datahub.ingestion.source.fivetran.data_classes import Connector, Job -from datahub.ingestion.source.fivetran.fivetran_log_api import ( +from datahub.ingestion.source.fivetran.fivetran_log_api import FivetranLogAPI +from datahub.ingestion.source.fivetran.fivetran_query import ( MAX_JOBS_PER_CONNECTOR, - FivetranLogAPI, + MAX_TABLE_LINEAGE_PER_CONNECTOR, ) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, @@ -106,13 +107,21 @@ def _extend_lineage(self, connector: Connector, datajob: DataJob) -> None: f"Fivetran connector source type: {connector.connector_type} is not supported to mapped with Datahub dataset entity." ) - for table_lineage in connector.table_lineage: + if len(connector.lineage) >= MAX_TABLE_LINEAGE_PER_CONNECTOR: + self.report.warning( + title="Table lineage truncated", + message=f"The connector had more than {MAX_TABLE_LINEAGE_PER_CONNECTOR} table lineage entries. 
" + f"Only the most recent {MAX_TABLE_LINEAGE_PER_CONNECTOR} entries were ingested.", + context=f"{connector.connector_name} (connector_id: {connector.connector_id})", + ) + + for lineage in connector.lineage: input_dataset_urn = DatasetUrn.create_from_ids( platform_id=source_platform, table_name=( - f"{source_database.lower()}.{table_lineage.source_table}" + f"{source_database.lower()}.{lineage.source_table}" if source_database - else table_lineage.source_table + else lineage.source_table ), env=source_platform_detail.env, platform_instance=source_platform_detail.platform_instance, @@ -121,14 +130,14 @@ def _extend_lineage(self, connector: Connector, datajob: DataJob) -> None: output_dataset_urn = DatasetUrn.create_from_ids( platform_id=self.config.fivetran_log_config.destination_platform, - table_name=f"{self.audit_log.fivetran_log_database.lower()}.{table_lineage.destination_table}", + table_name=f"{self.audit_log.fivetran_log_database.lower()}.{lineage.destination_table}", env=destination_platform_detail.env, platform_instance=destination_platform_detail.platform_instance, ) output_dataset_urn_list.append(output_dataset_urn) if self.config.include_column_lineage: - for column_lineage in table_lineage.column_lineage: + for column_lineage in lineage.column_lineage: fine_grained_lineage.append( FineGrainedLineage( upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py index 5908efe39e2b40..b55c8bbbd607fa 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py @@ -1,6 +1,7 @@ import functools import json import logging +from collections import defaultdict from typing import Any, Dict, List, Optional, Tuple import sqlglot @@ -22,10 +23,6 @@ logger: logging.Logger = logging.getLogger(__name__) -# We don't want to generate a massive number of dataProcesses for a single connector. -# This is primarily used as a safeguard to prevent performance issues. 
-MAX_JOBS_PER_CONNECTOR = 1000 - class FivetranLogAPI: def __init__(self, fivetran_log_config: FivetranLogConfig) -> None: @@ -91,55 +88,51 @@ def _query(self, query: str) -> List[Dict]: resp = self.engine.execute(query) return [row for row in resp] - def _get_column_lineage_metadata(self) -> Dict[str, List]: + def _get_column_lineage_metadata(self) -> Dict[Tuple[str, str], List]: """ - Return's dict of column lineage metadata with key as '-' + Returns dict of column lineage metadata with key as (, ) """ - all_column_lineage: Dict[str, List] = {} + all_column_lineage = defaultdict(list) column_lineage_result = self._query( self.fivetran_log_query.get_column_lineage_query() ) for column_lineage in column_lineage_result: - key = f"{column_lineage[Constant.SOURCE_TABLE_ID]}-{column_lineage[Constant.DESTINATION_TABLE_ID]}" - if key not in all_column_lineage: - all_column_lineage[key] = [column_lineage] - else: - all_column_lineage[key].append(column_lineage) - return all_column_lineage + key = ( + column_lineage[Constant.SOURCE_TABLE_ID], + column_lineage[Constant.DESTINATION_TABLE_ID], + ) + all_column_lineage[key].append(column_lineage) + return dict(all_column_lineage) - def _get_connectors_table_lineage_metadata(self) -> Dict[str, List]: + def _get_table_lineage_metadata(self) -> Dict[str, List]: """ - Return's dict of table lineage metadata with key as 'CONNECTOR_ID' + Returns dict of table lineage metadata with key as 'CONNECTOR_ID' """ - connectors_table_lineage_metadata: Dict[str, List] = {} + connectors_table_lineage_metadata = defaultdict(list) table_lineage_result = self._query( self.fivetran_log_query.get_table_lineage_query() ) for table_lineage in table_lineage_result: - if ( + connectors_table_lineage_metadata[ table_lineage[Constant.CONNECTOR_ID] - not in connectors_table_lineage_metadata - ): - connectors_table_lineage_metadata[ - table_lineage[Constant.CONNECTOR_ID] - ] = [table_lineage] - else: - connectors_table_lineage_metadata[ - table_lineage[Constant.CONNECTOR_ID] - ].append(table_lineage) - return connectors_table_lineage_metadata + ].append(table_lineage) + return dict(connectors_table_lineage_metadata) - def _get_table_lineage( + def _extract_connector_lineage( self, - column_lineage_metadata: Dict[str, List], table_lineage_result: Optional[List], + column_lineage_metadata: Dict[Tuple[str, str], List], ) -> List[TableLineage]: table_lineage_list: List[TableLineage] = [] if table_lineage_result is None: return table_lineage_list for table_lineage in table_lineage_result: + # Join the column lineage into the table lineage. 
column_lineage_result = column_lineage_metadata.get( - f"{table_lineage[Constant.SOURCE_TABLE_ID]}-{table_lineage[Constant.DESTINATION_TABLE_ID]}" + ( + table_lineage[Constant.SOURCE_TABLE_ID], + table_lineage[Constant.DESTINATION_TABLE_ID], + ) ) column_lineage_list: List[ColumnLineage] = [] if column_lineage_result: @@ -152,6 +145,7 @@ def _get_table_lineage( ) for column_lineage in column_lineage_result ] + table_lineage_list.append( TableLineage( source_table=f"{table_lineage[Constant.SOURCE_SCHEMA_NAME]}.{table_lineage[Constant.SOURCE_TABLE_NAME]}", @@ -167,14 +161,9 @@ def _get_all_connector_sync_logs( ) -> Dict[str, Dict[str, Dict[str, Tuple[float, Optional[str]]]]]: sync_logs: Dict[str, Dict[str, Dict[str, Tuple[float, Optional[str]]]]] = {} - # Format connector_ids as a comma-separated string of quoted IDs - formatted_connector_ids = ", ".join(f"'{id}'" for id in connector_ids) - - query = self.fivetran_log_query.get_sync_logs_query().format( - db_clause=self.fivetran_log_query.db_clause, + query = self.fivetran_log_query.get_sync_logs_query( syncs_interval=syncs_interval, - max_jobs_per_connector=MAX_JOBS_PER_CONNECTOR, - connector_ids=formatted_connector_ids, + connector_ids=connector_ids, ) for row in self._query(query): @@ -234,13 +223,13 @@ def get_user_email(self, user_id: str) -> Optional[str]: return None return self._get_users().get(user_id) - def _fill_connectors_table_lineage(self, connectors: List[Connector]) -> None: - table_lineage_metadata = self._get_connectors_table_lineage_metadata() + def _fill_connectors_lineage(self, connectors: List[Connector]) -> None: + table_lineage_metadata = self._get_table_lineage_metadata() column_lineage_metadata = self._get_column_lineage_metadata() for connector in connectors: - connector.table_lineage = self._get_table_lineage( - column_lineage_metadata=column_lineage_metadata, + connector.lineage = self._extract_connector_lineage( table_lineage_result=table_lineage_metadata.get(connector.connector_id), + column_lineage_metadata=column_lineage_metadata, ) def _fill_connectors_jobs( @@ -262,6 +251,7 @@ def get_allowed_connectors_list( ) -> List[Connector]: connectors: List[Connector] = [] with report.metadata_extraction_perf.connectors_metadata_extraction_sec: + logger.info("Fetching connector list") connector_list = self._query(self.fivetran_log_query.get_connectors_query()) for connector in connector_list: if not connector_patterns.allowed(connector[Constant.CONNECTOR_NAME]): @@ -279,12 +269,20 @@ def get_allowed_connectors_list( sync_frequency=connector[Constant.SYNC_FREQUENCY], destination_id=connector[Constant.DESTINATION_ID], user_id=connector[Constant.CONNECTING_USER_ID], - table_lineage=[], - jobs=[], + lineage=[], # filled later + jobs=[], # filled later ) ) + + if not connectors: + # Some of our queries don't work well when there's no connectors, since + # we push down connector id filters. 
+ return [] + with report.metadata_extraction_perf.connectors_lineage_extraction_sec: - self._fill_connectors_table_lineage(connectors) + logger.info("Fetching connector lineage") + self._fill_connectors_lineage(connectors) with report.metadata_extraction_perf.connectors_jobs_extraction_sec: + logger.info("Fetching connector job run history") self._fill_connectors_jobs(connectors, syncs_interval) return connectors diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py index c4680b4b1037a2..c9e329b706768f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py @@ -1,3 +1,11 @@ +from typing import List + +# Safeguards to prevent fetching massive amounts of data. +MAX_TABLE_LINEAGE_PER_CONNECTOR = 100 +MAX_COLUMN_LINEAGE_PER_CONNECTOR = 3000 +MAX_JOBS_PER_CONNECTOR = 1000 + + class FivetranLogQuery: # Note: All queries are written in Snowflake SQL. # They will be transpiled to the target database's SQL dialect at runtime. @@ -24,69 +32,88 @@ def get_connectors_query(self) -> str: destination_id FROM {self.db_clause}connector WHERE - _fivetran_deleted = FALSE\ + _fivetran_deleted = FALSE """ def get_users_query(self) -> str: - return f""" - SELECT id as user_id, - given_name, - family_name, - email - FROM {self.db_clause}user""" + return f"""\ +SELECT id as user_id, +given_name, +family_name, +email +FROM {self.db_clause}user +""" - def get_sync_logs_query(self) -> str: - return """ - WITH ranked_syncs AS ( - SELECT - connector_id, - sync_id, - MAX(CASE WHEN message_event = 'sync_start' THEN time_stamp END) as start_time, - MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time, - MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data, - ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY MAX(time_stamp) DESC) as rn - FROM {db_clause}log - WHERE message_event in ('sync_start', 'sync_end') - AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days' - AND connector_id IN ({connector_ids}) - GROUP BY connector_id, sync_id - ) - SELECT - connector_id, - sync_id, - start_time, - end_time, - end_message_data - FROM ranked_syncs - WHERE rn <= {max_jobs_per_connector} - AND start_time IS NOT NULL - AND end_time IS NOT NULL - ORDER BY connector_id, end_time DESC - """ + def get_sync_logs_query( + self, + syncs_interval: int, + connector_ids: List[str], + ) -> str: + # Format connector_ids as a comma-separated string of quoted IDs + formatted_connector_ids = ", ".join(f"'{id}'" for id in connector_ids) + + return f"""\ +WITH ranked_syncs AS ( + SELECT + connector_id, + sync_id, + MAX(CASE WHEN message_event = 'sync_start' THEN time_stamp END) as start_time, + MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time, + MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data, + ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY MAX(time_stamp) DESC) as rn + FROM {self.db_clause}log + WHERE message_event in ('sync_start', 'sync_end') + AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days' + AND connector_id IN ({formatted_connector_ids}) + GROUP BY connector_id, sync_id +) +SELECT + connector_id, + sync_id, + start_time, + end_time, + end_message_data +FROM ranked_syncs +WHERE rn <= {MAX_JOBS_PER_CONNECTOR} + AND start_time IS NOT NULL + AND end_time IS NOT NULL 
+ORDER BY connector_id, end_time DESC +""" def get_table_lineage_query(self) -> str: - return f""" - SELECT stm.connector_id as connector_id, - stm.id as source_table_id, - stm.name as source_table_name, - ssm.name as source_schema_name, - dtm.id as destination_table_id, - dtm.name as destination_table_name, - dsm.name as destination_schema_name - FROM {self.db_clause}table_lineage as tl - JOIN {self.db_clause}source_table_metadata as stm on tl.source_table_id = stm.id - JOIN {self.db_clause}destination_table_metadata as dtm on tl.destination_table_id = dtm.id - JOIN {self.db_clause}source_schema_metadata as ssm on stm.schema_id = ssm.id - JOIN {self.db_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id""" + return f"""\ +SELECT + stm.connector_id as connector_id, + stm.id as source_table_id, + stm.name as source_table_name, + ssm.name as source_schema_name, + dtm.id as destination_table_id, + dtm.name as destination_table_name, + dsm.name as destination_schema_name +FROM {self.db_clause}table_lineage as tl +JOIN {self.db_clause}source_table_metadata as stm on tl.source_table_id = stm.id +JOIN {self.db_clause}destination_table_metadata as dtm on tl.destination_table_id = dtm.id +JOIN {self.db_clause}source_schema_metadata as ssm on stm.schema_id = ssm.id +JOIN {self.db_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id +QUALIFY ROW_NUMBER() OVER (PARTITION BY stm.connector_id ORDER BY tl.created_at DESC) <= {MAX_TABLE_LINEAGE_PER_CONNECTOR} +ORDER BY stm.connector_id, tl.created_at DESC +""" def get_column_lineage_query(self) -> str: - return f""" - SELECT scm.table_id as source_table_id, - dcm.table_id as destination_table_id, - scm.name as source_column_name, - dcm.name as destination_column_name - FROM {self.db_clause}column_lineage as cl - JOIN {self.db_clause}source_column_metadata as scm - on cl.source_column_id = scm.id - JOIN {self.db_clause}destination_column_metadata as dcm - on cl.destination_column_id = dcm.id""" + return f"""\ +SELECT + scm.table_id as source_table_id, + dcm.table_id as destination_table_id, + scm.name as source_column_name, + dcm.name as destination_column_name +FROM {self.db_clause}column_lineage as cl +JOIN {self.db_clause}source_column_metadata as scm + ON cl.source_column_id = scm.id +JOIN {self.db_clause}destination_column_metadata as dcm + ON cl.destination_column_id = dcm.id +-- Only joining source_table_metadata to get the connector_id. 
+JOIN {self.db_clause}source_table_metadata as stm + ON scm.table_id = stm.id +QUALIFY ROW_NUMBER() OVER (PARTITION BY stm.connector_id ORDER BY cl.created_at DESC) <= {MAX_COLUMN_LINEAGE_PER_CONNECTOR} +ORDER BY stm.connector_id, cl.created_at DESC +""" diff --git a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py index 33ac09e69a3c0a..e72162b12e48fd 100644 --- a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py +++ b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py @@ -100,11 +100,9 @@ def default_query_results( "email": "abc.xyz@email.com", } ] - elif query == fivetran_log_query.get_sync_logs_query().format( - db_clause=fivetran_log_query.db_clause, + elif query == fivetran_log_query.get_sync_logs_query( syncs_interval=7, - max_jobs_per_connector=1000, - connector_ids="'calendar_elected'", + connector_ids=["calendar_elected"], ): return [ { From 725d76a8ff8084865d6407c63bcf40ef813a72b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20L=C3=BCdin?= <13187726+Masterchen09@users.noreply.github.com> Date: Mon, 21 Oct 2024 09:00:09 +0200 Subject: [PATCH 28/31] fix(ui): show DataHub logo for DataHub sources in ingestion souces list (#11658) Co-authored-by: Shirshanka Das --- .../src/app/ingest/source/builder/constants.ts | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/datahub-web-react/src/app/ingest/source/builder/constants.ts b/datahub-web-react/src/app/ingest/source/builder/constants.ts index b67ca388c10546..0e0ba8b22e37ef 100644 --- a/datahub-web-react/src/app/ingest/source/builder/constants.ts +++ b/datahub-web-react/src/app/ingest/source/builder/constants.ts @@ -35,6 +35,7 @@ import csvLogo from '../../../../images/csv-logo.png'; import qlikLogo from '../../../../images/qliklogo.png'; import sigmaLogo from '../../../../images/sigmalogo.png'; import sacLogo from '../../../../images/saclogo.svg'; +import datahubLogo from '../../../../images/datahublogo.png'; export const ATHENA = 'athena'; export const ATHENA_URN = `urn:li:dataPlatform:${ATHENA}`; @@ -125,6 +126,11 @@ export const SIGMA = 'sigma'; export const SIGMA_URN = `urn:li:dataPlatform:${SIGMA}`; export const SAC = 'sac'; export const SAC_URN = `urn:li:dataPlatform:${SAC}`; +export const DATAHUB = 'datahub'; +export const DATAHUB_GC = 'datahub-gc'; +export const DATAHUB_LINEAGE_FILE = 'datahub-lineage-file'; +export const DATAHUB_BUSINESS_GLOSSARY = 'datahub-business-glossary'; +export const DATAHUB_URN = `urn:li:dataPlatform:${DATAHUB}`; export const PLATFORM_URN_TO_LOGO = { [ATHENA_URN]: athenaLogo, @@ -165,6 +171,7 @@ export const PLATFORM_URN_TO_LOGO = { [QLIK_SENSE_URN]: qlikLogo, [SIGMA_URN]: sigmaLogo, [SAC_URN]: sacLogo, + [DATAHUB_URN]: datahubLogo, }; export const SOURCE_TO_PLATFORM_URN = { @@ -178,5 +185,7 @@ export const SOURCE_TO_PLATFORM_URN = { [SNOWFLAKE_USAGE]: SNOWFLAKE_URN, [STARBURST_TRINO_USAGE]: TRINO_URN, [DBT_CLOUD]: DBT_URN, - [VERTICA]: VERTICA_URN, + [DATAHUB_GC]: DATAHUB_URN, + [DATAHUB_LINEAGE_FILE]: DATAHUB_URN, + [DATAHUB_BUSINESS_GLOSSARY]: DATAHUB_URN, }; From 4adfba1a45ac91b48afd9da2bbfa705417a074ef Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 28 Jun 2024 15:08:48 -0700 Subject: [PATCH 29/31] build(deps): bump express from 4.18.2 to 4.19.2 in /docs-website (#10128) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> From 
dd487de00bf84405bae5c73a96690efc86a765eb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 28 Jun 2024 15:08:48 -0700 Subject: [PATCH 30/31] build(deps): bump express from 4.18.2 to 4.19.2 in /docs-website (#10128) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> From 76386ae85885d0c3d08e6ed99537bead01508a41 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 28 Jun 2024 15:08:48 -0700 Subject: [PATCH 31/31] build(deps): bump express from 4.18.2 to 4.19.2 in /docs-website (#10128) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
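
A minimal usage sketch of the complex-query API introduced in [PATCH 25/31], mirroring the patterns exercised by its unit and smoke tests above. The GMS server URL and the DataHubGraph/DatahubClientConfig connection setup are illustrative assumptions and are not part of the patches; the ElasticPlatformResourceQuery builder and PlatformResource.search_by_filters calls follow the code added in that patch.

    from datahub.api.entities.platformresource.platform_resource import (
        ElasticPlatformResourceQuery,
        PlatformResource,
        PlatformResourceSearchFields,
    )
    from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig
    from datahub.utilities.search_utils import LogicalOperator

    # Connect to a DataHub GMS instance (URL is a placeholder for illustration).
    graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

    # Build a structured query instead of hand-assembled and_filters/or_filters dicts:
    # resourceType == "server" AND primaryKey == "my_key".
    query = (
        ElasticPlatformResourceQuery.create_from()
        .group(LogicalOperator.AND)
        .add_field_match(PlatformResourceSearchFields.RESOURCE_TYPE, "server")
        .add_field_match(PlatformResourceSearchFields.PRIMARY_KEY, "my_key")
        .end()
    )

    # The builder renders to an Elasticsearch query string under the hood.
    print(query.build())  # (resourceType:"server" AND primaryKey:"my_key")

    # Scroll through matching platformResource entities via the OpenAPI-backed search.
    for resource in PlatformResource.search_by_filters(graph, query=query):
        print(resource.id)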