From f31209819b6edd36418af625e18ef0a2b3c2e0b2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 28 Jun 2024 15:08:48 -0700 Subject: [PATCH 01/31] build(deps): bump express from 4.18.2 to 4.19.2 in /docs-website (#10128) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> From 1e331961aac082cc45af3738aa5ffe886026d8f5 Mon Sep 17 00:00:00 2001 From: "julien.jehannet" Date: Wed, 17 Jul 2024 13:41:50 +0200 Subject: [PATCH 02/31] refactor(ingest/glue): simplify database filtering --- .../src/datahub/ingestion/source/aws/glue.py | 50 +++++++++---------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 3b9b5dbf63e184..fcc8bb80ccb3ff 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -678,12 +678,19 @@ def get_all_databases(self) -> Iterable[Mapping[str, Any]]: else: paginator_response = paginator.paginate() - for page in paginator_response: - yield from page["DatabaseList"] + pattern = "DatabaseList" + if self.source_config.ignore_resource_links: + # exclude records that contain TargetDatabase struct key to ignore resource links + pattern += "[?!TargetDatabase]" + + for database in paginator_response.search(pattern): + if self.source_config.database_pattern.allowed(database["Name"]): + yield database - def get_tables_from_database(self, database_name: str) -> Iterable[Dict]: + def get_tables_from_database(self, database: Mapping[str, Any]) -> Iterable[Dict]: # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/paginator/GetTables.html paginator = self.glue_client.get_paginator("get_tables") + database_name = database["Name"] if self.source_config.catalog_id: paginator_response = paginator.paginate( @@ -692,34 +699,25 @@ def get_tables_from_database(self, database_name: str) -> Iterable[Dict]: else: paginator_response = paginator.paginate(DatabaseName=database_name) - for page in paginator_response: - yield from page["TableList"] + yield from paginator_response.search("TableList") + # for table in paginator_response.search("TableList"): + # if resource links are detected, re-use database name from the outermost catalog + # otherwise, you will use external names instead of new aliased ones when creating full table names + # Note: we use an explicit source_config check but it is useless actually (filtering has been done) + # if not self.source_config.ignore_resource_links and "TargetDatabase" in database: + # table["DatabaseName"] = database["Name"] + # yield table def get_all_databases_and_tables( self, ) -> Tuple[Dict, List[Dict]]: - all_databases = self.get_all_databases() - - if self.source_config.ignore_resource_links: - all_databases = [ - database - for database in all_databases - if "TargetDatabase" not in database - ] - - allowed_databases = { - database["Name"]: database - for database in all_databases - if self.source_config.database_pattern.allowed(database["Name"]) - } - + all_databases = [*self.get_all_databases()] all_tables = [ - table - for database_name in allowed_databases - for table in self.get_tables_from_database(database_name) + tables + for database in all_databases + for tables in self.get_tables_from_database(database) ] - - return allowed_databases, all_tables + return all_databases, 
all_tables def get_lineage_if_enabled( self, mce: MetadataChangeEventClass @@ -1039,7 +1037,7 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: databases, tables = self.get_all_databases_and_tables() - for database in databases.values(): + for database in databases: yield from self.gen_database_containers(database) for table in tables: From e1a8629a07da2f04e6f6e3f9aef3f33837b0010e Mon Sep 17 00:00:00 2001 From: "julien.jehannet" Date: Wed, 17 Jul 2024 15:18:28 +0200 Subject: [PATCH 03/31] fix(ingest/glue): use database names from the current catalog in returned tables list --- .../src/datahub/ingestion/source/aws/glue.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index fcc8bb80ccb3ff..2f618b39360b6b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -699,14 +699,16 @@ def get_tables_from_database(self, database: Mapping[str, Any]) -> Iterable[Dict else: paginator_response = paginator.paginate(DatabaseName=database_name) - yield from paginator_response.search("TableList") - # for table in paginator_response.search("TableList"): - # if resource links are detected, re-use database name from the outermost catalog - # otherwise, you will use external names instead of new aliased ones when creating full table names - # Note: we use an explicit source_config check but it is useless actually (filtering has been done) - # if not self.source_config.ignore_resource_links and "TargetDatabase" in database: - # table["DatabaseName"] = database["Name"] - # yield table + for table in paginator_response.search("TableList"): + # if resource links are detected, re-use database names from the current catalog + # otherwise, external names are used instead of aliased ones when creating full table names later + # Note: use an explicit source_config check but it is useless actually (filtering has been done) + if ( + not self.source_config.ignore_resource_links + and "TargetDatabase" in database + ): + table["DatabaseName"] = database["Name"] + yield table def get_all_databases_and_tables( self, From a227a987f8f74864dc63a3ddd6f4a04a80e9fba6 Mon Sep 17 00:00:00 2001 From: Julien Jehannet Date: Wed, 24 Jul 2024 16:01:15 +0200 Subject: [PATCH 06/31] fix(tests): change signature for `get_all_databases_and_tables()` --- metadata-ingestion/src/datahub/ingestion/source/aws/glue.py | 2 +- metadata-ingestion/tests/unit/test_glue_source.py | 4 ++--
metadata-ingestion/tests/unit/test_glue_source_stubs.py | 6 ++---- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 2f618b39360b6b..1dc6d16a0a81fa 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -712,7 +712,7 @@ def get_tables_from_database(self, database: Mapping[str, Any]) -> Iterable[Dict def get_all_databases_and_tables( self, - ) -> Tuple[Dict, List[Dict]]: + ) -> Tuple[List[Mapping[str, Any]], List[Dict]]: all_databases = [*self.get_all_databases()] all_tables = [ tables diff --git a/metadata-ingestion/tests/unit/test_glue_source.py b/metadata-ingestion/tests/unit/test_glue_source.py index 45b9899eacaa77..e9ad9fe8e28d01 100644 --- a/metadata-ingestion/tests/unit/test_glue_source.py +++ b/metadata-ingestion/tests/unit/test_glue_source.py @@ -266,8 +266,8 @@ def test_platform_config(): @pytest.mark.parametrize( "ignore_resource_links, all_databases_and_tables_result", [ - (True, ({}, [])), - (False, ({"test-database": resource_link_database}, target_database_tables)), + (True, ([], [])), + (False, ([resource_link_database], target_database_tables)), ], ) def test_ignore_resource_links(ignore_resource_links, all_databases_and_tables_result): diff --git a/metadata-ingestion/tests/unit/test_glue_source_stubs.py b/metadata-ingestion/tests/unit/test_glue_source_stubs.py index 46ab65234c22df..c4f228646195cc 100644 --- a/metadata-ingestion/tests/unit/test_glue_source_stubs.py +++ b/metadata-ingestion/tests/unit/test_glue_source_stubs.py @@ -92,10 +92,8 @@ }, ] } -databases_1 = { - "flights-database": {"Name": "flights-database", "CatalogId": "123412341234"} -} -databases_2 = {"test-database": {"Name": "test-database", "CatalogId": "123412341234"}} +databases_1 = [{"Name": "flights-database", "CatalogId": "123412341234"}] +databases_2 = [{"Name": "test-database", "CatalogId": "123412341234"}] tables_1 = [ { "Name": "avro", From 6d156c74b35617cd163d2e22100b3cbf93ba930d Mon Sep 17 00:00:00 2001 From: Julien Jehannet Date: Thu, 25 Jul 2024 17:31:27 +0200 Subject: [PATCH 07/31] docs: rephrase some comments --- .../src/datahub/ingestion/source/aws/glue.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 1dc6d16a0a81fa..c8358521fd091d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -680,7 +680,7 @@ def get_all_databases(self) -> Iterable[Mapping[str, Any]]: pattern = "DatabaseList" if self.source_config.ignore_resource_links: - # exclude records that contain TargetDatabase struct key to ignore resource links + # exclude resource links by using a JMESPath conditional query against the TargetDatabase struct key pattern += "[?!TargetDatabase]" for database in paginator_response.search(pattern): @@ -701,8 +701,9 @@ def get_tables_from_database(self, database: Mapping[str, Any]) -> Iterable[Dict for table in paginator_response.search("TableList"): # if resource links are detected, re-use database names from the current catalog - # otherwise, external names are used instead of aliased ones when creating full table names later - # Note: use an explicit source_config check but it is useless actually (filtering has been done) + # 
otherwise external resource names are picked out instead of aliased ones + # This will cause an incoherent situation when creating full table names later + # Note: use an explicit source_config check but it is useless actually (filtering has already been done) if ( not self.source_config.ignore_resource_links and "TargetDatabase" in database From 28f1a937c515d1e7ca495b4e3d1214de2c0411f7 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 16 Oct 2024 19:18:32 -0700 Subject: [PATCH 08/31] chore(ingest): reorganize unit tests (#11636) --- .../tests/unit/{ => api}/test_apis.py | 0 .../test_entity_filter_report.py} | 0 .../tests/unit/{ => api}/test_pipeline.py | 10 +++---- .../unit/{ => api}/test_plugin_system.py | 0 .../{test_report.py => test_source_report.py} | 0 .../tests/unit/{ => api}/test_workunit.py | 0 .../{ => bigquery}/test_bigquery_lineage.py | 0 .../{ => bigquery}/test_bigquery_profiler.py | 0 .../{ => bigquery}/test_bigquery_source.py | 0 .../test_bigquery_sql_lineage.py | 0 .../{ => bigquery}/test_bigquery_usage.py | 0 .../test_bigqueryv2_usage_source.py | 0 .../test_bq_get_partition_range.py | 0 .../tests/unit/{ => cli}/test_check.py | 0 .../unit/{ => cli}/test_check_upgrade.py | 0 .../tests/unit/{ => cli}/test_cli_utils.py | 0 .../{ => config}/test_key_value_pattern.py | 0 .../tests/unit/glue/__init__.py | 0 .../tests/unit/{ => glue}/test_glue_source.py | 12 ++------ .../unit/{ => glue}/test_glue_source_stubs.py | 0 .../tests/unit/redshift/__init__.py | 0 .../{ => redshift}/redshift_query_mocker.py | 0 .../{ => redshift}/test_redshift_config.py | 0 .../{ => redshift}/test_redshift_lineage.py | 2 +- .../{ => redshift}/test_redshift_source.py | 0 .../tests/unit/sagemaker/__init__.py | 0 .../unit/sagemaker/test_sagemaker_source.py | 2 +- .../test_sagemaker_source_stubs.py | 0 .../tests/unit/{graph => sdk}/test_client.py | 0 .../unit/{ => sdk}/test_kafka_emitter.py | 0 .../tests/unit/{ => sdk}/test_mce_builder.py | 0 .../tests/unit/{ => sdk}/test_mcp_builder.py | 0 .../tests/unit/{ => sdk}/test_mcp_wrapper.py | 0 .../tests/unit/{ => sdk}/test_rest_emitter.py | 0 .../tests/unit/{ => serde}/test_codegen.py | 0 .../state}/test_ldap_state.py | 0 .../unit/{ => utilities}/test_cli_logging.py | 0 .../unit/{ => utilities}/test_ordered_set.py | 0 .../tests/unit/utilities/test_perf_timer.py | 28 ++++++++++--------- .../test_serialized_lru_cache.py | 0 .../{ => utilities}/test_topological_sort.py | 0 .../unit/{ => utilities}/test_utilities.py | 0 42 files changed, 25 insertions(+), 29 deletions(-) rename metadata-ingestion/tests/unit/{ => api}/test_apis.py (100%) rename metadata-ingestion/tests/unit/{test_report.py => api/test_entity_filter_report.py} (100%) rename metadata-ingestion/tests/unit/{ => api}/test_pipeline.py (97%) rename metadata-ingestion/tests/unit/{ => api}/test_plugin_system.py (100%) rename metadata-ingestion/tests/unit/api/{test_report.py => test_source_report.py} (100%) rename metadata-ingestion/tests/unit/{ => api}/test_workunit.py (100%) rename metadata-ingestion/tests/unit/{ => bigquery}/test_bigquery_lineage.py (100%) rename metadata-ingestion/tests/unit/{ => bigquery}/test_bigquery_profiler.py (100%) rename metadata-ingestion/tests/unit/{ => bigquery}/test_bigquery_source.py (100%) rename metadata-ingestion/tests/unit/{ => bigquery}/test_bigquery_sql_lineage.py (100%) rename metadata-ingestion/tests/unit/{ => bigquery}/test_bigquery_usage.py (100%) rename metadata-ingestion/tests/unit/{ => bigquery}/test_bigqueryv2_usage_source.py (100%) rename 
metadata-ingestion/tests/unit/{ => bigquery}/test_bq_get_partition_range.py (100%) rename metadata-ingestion/tests/unit/{ => cli}/test_check.py (100%) rename metadata-ingestion/tests/unit/{ => cli}/test_check_upgrade.py (100%) rename metadata-ingestion/tests/unit/{ => cli}/test_cli_utils.py (100%) rename metadata-ingestion/tests/unit/{ => config}/test_key_value_pattern.py (100%) create mode 100644 metadata-ingestion/tests/unit/glue/__init__.py rename metadata-ingestion/tests/unit/{ => glue}/test_glue_source.py (97%) rename metadata-ingestion/tests/unit/{ => glue}/test_glue_source_stubs.py (100%) create mode 100644 metadata-ingestion/tests/unit/redshift/__init__.py rename metadata-ingestion/tests/unit/{ => redshift}/redshift_query_mocker.py (100%) rename metadata-ingestion/tests/unit/{ => redshift}/test_redshift_config.py (100%) rename metadata-ingestion/tests/unit/{ => redshift}/test_redshift_lineage.py (99%) rename metadata-ingestion/tests/unit/{ => redshift}/test_redshift_source.py (100%) create mode 100644 metadata-ingestion/tests/unit/sagemaker/__init__.py rename metadata-ingestion/tests/unit/{ => sagemaker}/test_sagemaker_source_stubs.py (100%) rename metadata-ingestion/tests/unit/{graph => sdk}/test_client.py (100%) rename metadata-ingestion/tests/unit/{ => sdk}/test_kafka_emitter.py (100%) rename metadata-ingestion/tests/unit/{ => sdk}/test_mce_builder.py (100%) rename metadata-ingestion/tests/unit/{ => sdk}/test_mcp_builder.py (100%) rename metadata-ingestion/tests/unit/{ => sdk}/test_mcp_wrapper.py (100%) rename metadata-ingestion/tests/unit/{ => sdk}/test_rest_emitter.py (100%) rename metadata-ingestion/tests/unit/{ => serde}/test_codegen.py (100%) rename metadata-ingestion/tests/unit/{ => stateful_ingestion/state}/test_ldap_state.py (100%) rename metadata-ingestion/tests/unit/{ => utilities}/test_cli_logging.py (100%) rename metadata-ingestion/tests/unit/{ => utilities}/test_ordered_set.py (100%) rename metadata-ingestion/tests/unit/{ => utilities}/test_serialized_lru_cache.py (100%) rename metadata-ingestion/tests/unit/{ => utilities}/test_topological_sort.py (100%) rename metadata-ingestion/tests/unit/{ => utilities}/test_utilities.py (100%) diff --git a/metadata-ingestion/tests/unit/test_apis.py b/metadata-ingestion/tests/unit/api/test_apis.py similarity index 100% rename from metadata-ingestion/tests/unit/test_apis.py rename to metadata-ingestion/tests/unit/api/test_apis.py diff --git a/metadata-ingestion/tests/unit/test_report.py b/metadata-ingestion/tests/unit/api/test_entity_filter_report.py similarity index 100% rename from metadata-ingestion/tests/unit/test_report.py rename to metadata-ingestion/tests/unit/api/test_entity_filter_report.py diff --git a/metadata-ingestion/tests/unit/test_pipeline.py b/metadata-ingestion/tests/unit/api/test_pipeline.py similarity index 97% rename from metadata-ingestion/tests/unit/test_pipeline.py rename to metadata-ingestion/tests/unit/api/test_pipeline.py index a462f281367973..432d8e11c1c0b4 100644 --- a/metadata-ingestion/tests/unit/test_pipeline.py +++ b/metadata-ingestion/tests/unit/api/test_pipeline.py @@ -224,9 +224,9 @@ def test_configure_with_file_sink_does_not_init_graph(self, mock_source, tmp_pat def test_run_including_fake_transformation(self): pipeline = Pipeline.create( { - "source": {"type": "tests.unit.test_pipeline.FakeSource"}, + "source": {"type": "tests.unit.api.test_pipeline.FakeSource"}, "transformers": [ - {"type": "tests.unit.test_pipeline.AddStatusRemovedTransformer"} + {"type": 
"tests.unit.api.test_pipeline.AddStatusRemovedTransformer"} ], "sink": {"type": "tests.test_helpers.sink_helpers.RecordingSink"}, "run_id": "pipeline_test", @@ -253,7 +253,7 @@ def test_run_including_registered_transformation(self): pipeline = Pipeline.create( { - "source": {"type": "tests.unit.test_pipeline.FakeSource"}, + "source": {"type": "tests.unit.api.test_pipeline.FakeSource"}, "transformers": [ { "type": "simple_add_dataset_ownership", @@ -297,7 +297,7 @@ def test_pipeline_return_code(self, tmp_path, source, strict_warnings, exit_code --- run_id: pipeline_test source: - type: tests.unit.test_pipeline.{source} + type: tests.unit.api.test_pipeline.{source} config: {{}} sink: type: console @@ -379,7 +379,7 @@ def test_pipeline_return_code(self, tmp_path, source, strict_warnings, exit_code def test_pipeline_process_commits(self, commit_policy, source, should_commit): pipeline = Pipeline.create( { - "source": {"type": f"tests.unit.test_pipeline.{source}"}, + "source": {"type": f"tests.unit.api.test_pipeline.{source}"}, "sink": {"type": "console"}, "run_id": "pipeline_test", } diff --git a/metadata-ingestion/tests/unit/test_plugin_system.py b/metadata-ingestion/tests/unit/api/test_plugin_system.py similarity index 100% rename from metadata-ingestion/tests/unit/test_plugin_system.py rename to metadata-ingestion/tests/unit/api/test_plugin_system.py diff --git a/metadata-ingestion/tests/unit/api/test_report.py b/metadata-ingestion/tests/unit/api/test_source_report.py similarity index 100% rename from metadata-ingestion/tests/unit/api/test_report.py rename to metadata-ingestion/tests/unit/api/test_source_report.py diff --git a/metadata-ingestion/tests/unit/test_workunit.py b/metadata-ingestion/tests/unit/api/test_workunit.py similarity index 100% rename from metadata-ingestion/tests/unit/test_workunit.py rename to metadata-ingestion/tests/unit/api/test_workunit.py diff --git a/metadata-ingestion/tests/unit/test_bigquery_lineage.py b/metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py similarity index 100% rename from metadata-ingestion/tests/unit/test_bigquery_lineage.py rename to metadata-ingestion/tests/unit/bigquery/test_bigquery_lineage.py diff --git a/metadata-ingestion/tests/unit/test_bigquery_profiler.py b/metadata-ingestion/tests/unit/bigquery/test_bigquery_profiler.py similarity index 100% rename from metadata-ingestion/tests/unit/test_bigquery_profiler.py rename to metadata-ingestion/tests/unit/bigquery/test_bigquery_profiler.py diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/bigquery/test_bigquery_source.py similarity index 100% rename from metadata-ingestion/tests/unit/test_bigquery_source.py rename to metadata-ingestion/tests/unit/bigquery/test_bigquery_source.py diff --git a/metadata-ingestion/tests/unit/test_bigquery_sql_lineage.py b/metadata-ingestion/tests/unit/bigquery/test_bigquery_sql_lineage.py similarity index 100% rename from metadata-ingestion/tests/unit/test_bigquery_sql_lineage.py rename to metadata-ingestion/tests/unit/bigquery/test_bigquery_sql_lineage.py diff --git a/metadata-ingestion/tests/unit/test_bigquery_usage.py b/metadata-ingestion/tests/unit/bigquery/test_bigquery_usage.py similarity index 100% rename from metadata-ingestion/tests/unit/test_bigquery_usage.py rename to metadata-ingestion/tests/unit/bigquery/test_bigquery_usage.py diff --git a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py b/metadata-ingestion/tests/unit/bigquery/test_bigqueryv2_usage_source.py similarity index 
100% rename from metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py rename to metadata-ingestion/tests/unit/bigquery/test_bigqueryv2_usage_source.py diff --git a/metadata-ingestion/tests/unit/test_bq_get_partition_range.py b/metadata-ingestion/tests/unit/bigquery/test_bq_get_partition_range.py similarity index 100% rename from metadata-ingestion/tests/unit/test_bq_get_partition_range.py rename to metadata-ingestion/tests/unit/bigquery/test_bq_get_partition_range.py diff --git a/metadata-ingestion/tests/unit/test_check.py b/metadata-ingestion/tests/unit/cli/test_check.py similarity index 100% rename from metadata-ingestion/tests/unit/test_check.py rename to metadata-ingestion/tests/unit/cli/test_check.py diff --git a/metadata-ingestion/tests/unit/test_check_upgrade.py b/metadata-ingestion/tests/unit/cli/test_check_upgrade.py similarity index 100% rename from metadata-ingestion/tests/unit/test_check_upgrade.py rename to metadata-ingestion/tests/unit/cli/test_check_upgrade.py diff --git a/metadata-ingestion/tests/unit/test_cli_utils.py b/metadata-ingestion/tests/unit/cli/test_cli_utils.py similarity index 100% rename from metadata-ingestion/tests/unit/test_cli_utils.py rename to metadata-ingestion/tests/unit/cli/test_cli_utils.py diff --git a/metadata-ingestion/tests/unit/test_key_value_pattern.py b/metadata-ingestion/tests/unit/config/test_key_value_pattern.py similarity index 100% rename from metadata-ingestion/tests/unit/test_key_value_pattern.py rename to metadata-ingestion/tests/unit/config/test_key_value_pattern.py diff --git a/metadata-ingestion/tests/unit/glue/__init__.py b/metadata-ingestion/tests/unit/glue/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/metadata-ingestion/tests/unit/test_glue_source.py b/metadata-ingestion/tests/unit/glue/test_glue_source.py similarity index 97% rename from metadata-ingestion/tests/unit/test_glue_source.py rename to metadata-ingestion/tests/unit/glue/test_glue_source.py index bd8527eea33e41..bbef9c1510f00b 100644 --- a/metadata-ingestion/tests/unit/test_glue_source.py +++ b/metadata-ingestion/tests/unit/glue/test_glue_source.py @@ -35,7 +35,7 @@ validate_all_providers_have_committed_successfully, ) from tests.test_helpers.type_helpers import PytestConfig -from tests.unit.test_glue_source_stubs import ( +from tests.unit.glue.test_glue_source_stubs import ( databases_1, databases_2, get_bucket_tagging, @@ -71,6 +71,8 @@ GMS_PORT = 8080 GMS_SERVER = f"http://localhost:{GMS_PORT}" +test_resources_dir = Path(__file__).parent + def glue_source( platform_instance: Optional[str] = None, @@ -247,7 +249,6 @@ def test_glue_ingest( write_metadata_file(tmp_path / mce_file, mce_objects) # Verify the output. - test_resources_dir = pytestconfig.rootpath / "tests/unit/glue" mce_helpers.check_golden_file( pytestconfig, output_path=tmp_path / mce_file, @@ -312,8 +313,6 @@ def test_config_without_platform(): @freeze_time(FROZEN_TIME) def test_glue_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph): - test_resources_dir = pytestconfig.rootpath / "tests/unit/glue" - deleted_actor_golden_mcs = "{}/glue_deleted_actor_mces_golden.json".format( test_resources_dir ) @@ -438,7 +437,6 @@ def test_glue_with_delta_schema_ingest( write_metadata_file(tmp_path / "glue_delta_mces.json", mce_objects) # Verify the output. 
- test_resources_dir = pytestconfig.rootpath / "tests/unit/glue" mce_helpers.check_golden_file( pytestconfig, output_path=tmp_path / "glue_delta_mces.json", @@ -475,7 +473,6 @@ def test_glue_with_malformed_delta_schema_ingest( write_metadata_file(tmp_path / "glue_malformed_delta_mces.json", mce_objects) # Verify the output. - test_resources_dir = pytestconfig.rootpath / "tests/unit/glue" mce_helpers.check_golden_file( pytestconfig, output_path=tmp_path / "glue_malformed_delta_mces.json", @@ -571,7 +568,6 @@ def test_glue_ingest_include_table_lineage( write_metadata_file(tmp_path / mce_file, mce_objects) # Verify the output. - test_resources_dir = pytestconfig.rootpath / "tests/unit/glue" mce_helpers.check_golden_file( pytestconfig, output_path=tmp_path / mce_file, @@ -678,7 +674,6 @@ def fake_schema_metadata(entity_urn: str) -> models.SchemaMetadataClass: write_metadata_file(tmp_path / mce_file, mce_objects) # Verify the output. - test_resources_dir = pytestconfig.rootpath / "tests/unit/glue" mce_helpers.check_golden_file( pytestconfig, output_path=tmp_path / mce_file, @@ -716,7 +711,6 @@ def test_glue_ingest_with_profiling( write_metadata_file(tmp_path / mce_file, mce_objects) # Verify the output. - test_resources_dir = pytestconfig.rootpath / "tests/unit/glue" mce_helpers.check_golden_file( pytestconfig, output_path=tmp_path / mce_file, diff --git a/metadata-ingestion/tests/unit/test_glue_source_stubs.py b/metadata-ingestion/tests/unit/glue/test_glue_source_stubs.py similarity index 100% rename from metadata-ingestion/tests/unit/test_glue_source_stubs.py rename to metadata-ingestion/tests/unit/glue/test_glue_source_stubs.py diff --git a/metadata-ingestion/tests/unit/redshift/__init__.py b/metadata-ingestion/tests/unit/redshift/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/metadata-ingestion/tests/unit/redshift_query_mocker.py b/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py similarity index 100% rename from metadata-ingestion/tests/unit/redshift_query_mocker.py rename to metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py diff --git a/metadata-ingestion/tests/unit/test_redshift_config.py b/metadata-ingestion/tests/unit/redshift/test_redshift_config.py similarity index 100% rename from metadata-ingestion/tests/unit/test_redshift_config.py rename to metadata-ingestion/tests/unit/redshift/test_redshift_config.py diff --git a/metadata-ingestion/tests/unit/test_redshift_lineage.py b/metadata-ingestion/tests/unit/redshift/test_redshift_lineage.py similarity index 99% rename from metadata-ingestion/tests/unit/test_redshift_lineage.py rename to metadata-ingestion/tests/unit/redshift/test_redshift_lineage.py index 78b7169a93f3c8..2e3eb8fde1292b 100644 --- a/metadata-ingestion/tests/unit/test_redshift_lineage.py +++ b/metadata-ingestion/tests/unit/redshift/test_redshift_lineage.py @@ -26,7 +26,7 @@ SqlParsingDebugInfo, SqlParsingResult, ) -from tests.unit.redshift_query_mocker import mock_cursor +from tests.unit.redshift.redshift_query_mocker import mock_cursor def test_get_sources_from_query(): diff --git a/metadata-ingestion/tests/unit/test_redshift_source.py b/metadata-ingestion/tests/unit/redshift/test_redshift_source.py similarity index 100% rename from metadata-ingestion/tests/unit/test_redshift_source.py rename to metadata-ingestion/tests/unit/redshift/test_redshift_source.py diff --git a/metadata-ingestion/tests/unit/sagemaker/__init__.py b/metadata-ingestion/tests/unit/sagemaker/__init__.py new file mode 100644 index 
00000000000000..e69de29bb2d1d6 diff --git a/metadata-ingestion/tests/unit/sagemaker/test_sagemaker_source.py b/metadata-ingestion/tests/unit/sagemaker/test_sagemaker_source.py index 995d176c213b24..2450e6fa8fe564 100644 --- a/metadata-ingestion/tests/unit/sagemaker/test_sagemaker_source.py +++ b/metadata-ingestion/tests/unit/sagemaker/test_sagemaker_source.py @@ -14,7 +14,7 @@ job_types, ) from tests.test_helpers import mce_helpers -from tests.unit.test_sagemaker_source_stubs import ( +from tests.unit.sagemaker.test_sagemaker_source_stubs import ( describe_endpoint_response_1, describe_endpoint_response_2, describe_feature_group_response_1, diff --git a/metadata-ingestion/tests/unit/test_sagemaker_source_stubs.py b/metadata-ingestion/tests/unit/sagemaker/test_sagemaker_source_stubs.py similarity index 100% rename from metadata-ingestion/tests/unit/test_sagemaker_source_stubs.py rename to metadata-ingestion/tests/unit/sagemaker/test_sagemaker_source_stubs.py diff --git a/metadata-ingestion/tests/unit/graph/test_client.py b/metadata-ingestion/tests/unit/sdk/test_client.py similarity index 100% rename from metadata-ingestion/tests/unit/graph/test_client.py rename to metadata-ingestion/tests/unit/sdk/test_client.py diff --git a/metadata-ingestion/tests/unit/test_kafka_emitter.py b/metadata-ingestion/tests/unit/sdk/test_kafka_emitter.py similarity index 100% rename from metadata-ingestion/tests/unit/test_kafka_emitter.py rename to metadata-ingestion/tests/unit/sdk/test_kafka_emitter.py diff --git a/metadata-ingestion/tests/unit/test_mce_builder.py b/metadata-ingestion/tests/unit/sdk/test_mce_builder.py similarity index 100% rename from metadata-ingestion/tests/unit/test_mce_builder.py rename to metadata-ingestion/tests/unit/sdk/test_mce_builder.py diff --git a/metadata-ingestion/tests/unit/test_mcp_builder.py b/metadata-ingestion/tests/unit/sdk/test_mcp_builder.py similarity index 100% rename from metadata-ingestion/tests/unit/test_mcp_builder.py rename to metadata-ingestion/tests/unit/sdk/test_mcp_builder.py diff --git a/metadata-ingestion/tests/unit/test_mcp_wrapper.py b/metadata-ingestion/tests/unit/sdk/test_mcp_wrapper.py similarity index 100% rename from metadata-ingestion/tests/unit/test_mcp_wrapper.py rename to metadata-ingestion/tests/unit/sdk/test_mcp_wrapper.py diff --git a/metadata-ingestion/tests/unit/test_rest_emitter.py b/metadata-ingestion/tests/unit/sdk/test_rest_emitter.py similarity index 100% rename from metadata-ingestion/tests/unit/test_rest_emitter.py rename to metadata-ingestion/tests/unit/sdk/test_rest_emitter.py diff --git a/metadata-ingestion/tests/unit/test_codegen.py b/metadata-ingestion/tests/unit/serde/test_codegen.py similarity index 100% rename from metadata-ingestion/tests/unit/test_codegen.py rename to metadata-ingestion/tests/unit/serde/test_codegen.py diff --git a/metadata-ingestion/tests/unit/test_ldap_state.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_ldap_state.py similarity index 100% rename from metadata-ingestion/tests/unit/test_ldap_state.py rename to metadata-ingestion/tests/unit/stateful_ingestion/state/test_ldap_state.py diff --git a/metadata-ingestion/tests/unit/test_cli_logging.py b/metadata-ingestion/tests/unit/utilities/test_cli_logging.py similarity index 100% rename from metadata-ingestion/tests/unit/test_cli_logging.py rename to metadata-ingestion/tests/unit/utilities/test_cli_logging.py diff --git a/metadata-ingestion/tests/unit/test_ordered_set.py b/metadata-ingestion/tests/unit/utilities/test_ordered_set.py similarity 
index 100% rename from metadata-ingestion/tests/unit/test_ordered_set.py rename to metadata-ingestion/tests/unit/utilities/test_ordered_set.py diff --git a/metadata-ingestion/tests/unit/utilities/test_perf_timer.py b/metadata-ingestion/tests/unit/utilities/test_perf_timer.py index 6129b3e37d8bc3..1de76a32fb708a 100644 --- a/metadata-ingestion/tests/unit/utilities/test_perf_timer.py +++ b/metadata-ingestion/tests/unit/utilities/test_perf_timer.py @@ -10,37 +10,39 @@ def test_perf_timer_simple(): with PerfTimer() as timer: - time.sleep(1) - assert approx(timer.elapsed_seconds()) == 1 + time.sleep(0.4) + assert approx(timer.elapsed_seconds()) == 0.4 - assert approx(timer.elapsed_seconds()) == 1 + assert approx(timer.elapsed_seconds()) == 0.4 def test_perf_timer_paused_timer(): with PerfTimer() as current_timer: - time.sleep(1) - assert approx(current_timer.elapsed_seconds()) == 1 + time.sleep(0.5) + assert approx(current_timer.elapsed_seconds()) == 0.5 with current_timer.pause(): - time.sleep(2) - assert approx(current_timer.elapsed_seconds()) == 1 - assert approx(current_timer.elapsed_seconds()) == 1 - time.sleep(1) + time.sleep(0.3) + assert approx(current_timer.elapsed_seconds()) == 0.5 + assert approx(current_timer.elapsed_seconds()) == 0.5 + time.sleep(0.2) - assert approx(current_timer.elapsed_seconds()) == 2 + assert approx(current_timer.elapsed_seconds()) == 0.7 def test_generator_with_paused_timer(): + n = 4 + def generator_function(): with PerfTimer() as inner_timer: time.sleep(1) - for i in range(10): + for i in range(n): time.sleep(0.2) with inner_timer.pause(): time.sleep(0.2) yield i - assert approx(inner_timer.elapsed_seconds()) == 1 + 0.2 * 10 + assert approx(inner_timer.elapsed_seconds()) == 1 + 0.2 * n with PerfTimer() as outer_timer: seq = generator_function() list([i for i in seq]) - assert approx(outer_timer.elapsed_seconds()) == 1 + 0.2 * 10 + 0.2 * 10 + assert approx(outer_timer.elapsed_seconds()) == 1 + 0.2 * n + 0.2 * n diff --git a/metadata-ingestion/tests/unit/test_serialized_lru_cache.py b/metadata-ingestion/tests/unit/utilities/test_serialized_lru_cache.py similarity index 100% rename from metadata-ingestion/tests/unit/test_serialized_lru_cache.py rename to metadata-ingestion/tests/unit/utilities/test_serialized_lru_cache.py diff --git a/metadata-ingestion/tests/unit/test_topological_sort.py b/metadata-ingestion/tests/unit/utilities/test_topological_sort.py similarity index 100% rename from metadata-ingestion/tests/unit/test_topological_sort.py rename to metadata-ingestion/tests/unit/utilities/test_topological_sort.py diff --git a/metadata-ingestion/tests/unit/test_utilities.py b/metadata-ingestion/tests/unit/utilities/test_utilities.py similarity index 100% rename from metadata-ingestion/tests/unit/test_utilities.py rename to metadata-ingestion/tests/unit/utilities/test_utilities.py From 1980256ade433d2506539faa275fd2295cf47a8b Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 16 Oct 2024 20:47:48 -0700 Subject: [PATCH 09/31] fix(ingest): run sqllineage in process by default (#11650) --- .../ingestion/source/looker/lookml_config.py | 3 - .../src/datahub/ingestion/source/redash.py | 23 +- .../src/datahub/utilities/sql_parser.py | 2 +- .../lookml/lookml_mces_badsql_parser.json | 2820 ----------------- .../tests/integration/lookml/test_lookml.py | 48 - 5 files changed, 13 insertions(+), 2883 deletions(-) delete mode 100644 metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py index 0bcee14ec77a1a..da837da1613864 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py @@ -124,9 +124,6 @@ class LookMLSourceConfig( description="List of regex patterns for LookML views to include in the extraction.", ) parse_table_names_from_sql: bool = Field(True, description="See note below.") - sql_parser: str = Field( - "datahub.utilities.sql_parser.DefaultSQLParser", description="See note below." - ) api: Optional[LookerAPIConfig] project_name: Optional[str] = Field( None, diff --git a/metadata-ingestion/src/datahub/ingestion/source/redash.py b/metadata-ingestion/src/datahub/ingestion/source/redash.py index 38cf0bebcbc12f..5fd63e7f93f92a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redash.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redash.py @@ -2,7 +2,6 @@ import math import sys from dataclasses import dataclass, field -from multiprocessing.pool import ThreadPool from typing import Dict, Iterable, List, Optional, Set, Type import dateutil.parser as dp @@ -43,6 +42,7 @@ from datahub.utilities.lossy_collections import LossyDict, LossyList from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.sql_parser import SQLParser +from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -646,11 +646,11 @@ def _emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]: self.report.total_dashboards = total_dashboards self.report.max_page_dashboards = max_page - dash_exec_pool = ThreadPool(self.config.parallelism) - for response in dash_exec_pool.imap_unordered( - self._process_dashboard_response, range(1, max_page + 1) - ): - yield from response + yield from ThreadedIteratorExecutor.process( + self._process_dashboard_response, + [(page,) for page in range(1, max_page + 1)], + max_workers=self.config.parallelism, + ) def _get_chart_type_from_viz_data(self, viz_data: Dict) -> str: """ @@ -769,11 +769,12 @@ def _emit_chart_mces(self) -> Iterable[MetadataWorkUnit]: logger.info(f"/api/queries total count {total_queries} and max page {max_page}") self.report.total_queries = total_queries self.report.max_page_queries = max_page - chart_exec_pool = ThreadPool(self.config.parallelism) - for response in chart_exec_pool.imap_unordered( - self._process_query_response, range(1, max_page + 1) - ): - yield from response + + yield from ThreadedIteratorExecutor.process( + self._process_query_response, + [(page,) for page in range(1, max_page + 1)], + max_workers=self.config.parallelism, + ) def add_config_to_report(self) -> None: self.report.api_page_limit = self.config.api_page_limit diff --git a/metadata-ingestion/src/datahub/utilities/sql_parser.py b/metadata-ingestion/src/datahub/utilities/sql_parser.py index 61693b52b350fb..b88f8fd8c73029 100644 --- a/metadata-ingestion/src/datahub/utilities/sql_parser.py +++ b/metadata-ingestion/src/datahub/utilities/sql_parser.py @@ -46,7 +46,7 @@ class SqlLineageSQLParser(SQLParser): def __init__( self, sql_query: str, - use_external_process: bool = True, + use_external_process: bool = False, use_raw_names: bool = False, ) -> None: super().__init__(sql_query, use_external_process) diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json 
b/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json deleted file mode 100644 index 5b39e8dd96ac2a..00000000000000 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json +++ /dev/null @@ -1,2820 +0,0 @@ -[ -{ - "entityType": "container", - "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "changeType": "UPSERT", - "aspectName": "containerProperties", - "aspect": { - "json": { - "customProperties": { - "platform": "looker", - "env": "PROD", - "project_name": "lkml_samples" - }, - "name": "lkml_samples", - "env": "PROD" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", - "aspect": { - "json": { - "platform": "urn:li:dataPlatform:looker" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "LookML Project" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Folders" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "SELECT\n is_latest,\n country,\n city,\n timestamp,\n measurement\n FROM\n my_table", - "viewLanguage": "sql" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": 
"urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),country)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),country)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),city)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),city)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),is_latest)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),is_latest)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),timestamp)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),timestamp)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),measurement)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),average_measurement)" - ], - "confidenceScore": 1.0 - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "my_view", - "platform": "urn:li:dataPlatform:looker", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "" - } - }, - "fields": [ - { - "fieldPath": "country", - "nullable": false, - "description": "The country", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": false - }, - { - "fieldPath": "city", - "nullable": false, - "description": "City", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - 
"isPartOfKey": false - }, - { - "fieldPath": "is_latest", - "nullable": false, - "description": "Is latest data", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.BooleanType": {} - } - }, - "nativeDataType": "yesno", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": false - }, - { - "fieldPath": "timestamp", - "nullable": false, - "description": "Timestamp of measurement", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.TimeType": {} - } - }, - "nativeDataType": "time", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - }, - { - "tag": "urn:li:tag:Temporal" - } - ] - }, - "isPartOfKey": false - }, - { - "fieldPath": "average_measurement", - "nullable": false, - "description": "My measurement", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "average", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Measure" - } - ] - }, - "isPartOfKey": false - } - ], - "primaryKeys": [] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "foo.view.lkml", - "looker.model": "data" - }, - "name": "my_view", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "SELECT\n country,\n city,\n timestamp,\n measurement\n FROM\n ${my_view.SQL_TABLE_NAME} AS my_view", - "viewLanguage": "sql" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD)", - "aspects": [ - { - 
"com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),country)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),country)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),city)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),city)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),timestamp)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),timestamp)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),measurement)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),average_measurement)" - ], - "confidenceScore": 1.0 - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "my_derived_view", - "platform": "urn:li:dataPlatform:looker", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "" - } - }, - "fields": [ - { - "fieldPath": "country", - "nullable": false, - "description": "The country", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": false - }, - { - "fieldPath": "city", - "nullable": false, - "description": "City", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": false - }, - { - "fieldPath": "timestamp", - "nullable": false, - "description": "Timestamp of measurement", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.TimeType": {} - } - }, - "nativeDataType": "time", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - }, - { - "tag": "urn:li:tag:Temporal" - } - ] - }, - "isPartOfKey": false - }, - { - "fieldPath": "average_measurement", - "nullable": false, - "description": "My 
measurement", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "average", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Measure" - } - ] - }, - "isPartOfKey": false - } - ], - "primaryKeys": [] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "bar.view.lkml", - "looker.model": "data" - }, - "name": "my_derived_view", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.include_able_view,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.include_able_view,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "view: include_able_view {\n sql_table_name: looker_schema.include_able ;;\n}\n", - "viewLanguage": "lookml" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.include_able_view,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.include_able_view,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", - "type": "VIEW" - } - ] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "included_view_file.view.lkml", - "looker.model": "data" - }, - "name": "include_able_view", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": 
"dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.include_able_view,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.looker_events,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.looker_events,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "include: \"/included_view_file.view\"\n\nview: looker_events {\n sql_table_name: looker_schema.events ;;\n}\n\nview: extending_looker_events {\n extends: [looker_events]\n\n measure: additional_measure {\n type: count\n }\n}\n\nview: autodetect_sql_name_based_on_view_name {}\n\nview: test_include_external_view {\n extends: [include_able_view]\n}\n", - "viewLanguage": "lookml" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.looker_events,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.looker_events,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", - "type": "VIEW" - } - ] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "view_declarations.view.lkml", - "looker.model": "data" - }, - "name": "looker_events", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.looker_events,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 
1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.extending_looker_events,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.extending_looker_events,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "include: \"/included_view_file.view\"\n\nview: looker_events {\n sql_table_name: looker_schema.events ;;\n}\n\nview: extending_looker_events {\n extends: [looker_events]\n\n measure: additional_measure {\n type: count\n }\n}\n\nview: autodetect_sql_name_based_on_view_name {}\n\nview: test_include_external_view {\n extends: [include_able_view]\n}\n", - "viewLanguage": "lookml" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.extending_looker_events,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.extending_looker_events,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD),additional_measure)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.extending_looker_events,PROD),additional_measure)" - ], - "confidenceScore": 1.0 - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "extending_looker_events", - "platform": "urn:li:dataPlatform:looker", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "" - } - }, - "fields": [ - { - "fieldPath": "additional_measure", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "count", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Measure" - } - ] - }, - 
"isPartOfKey": false - } - ], - "primaryKeys": [] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "view_declarations.view.lkml", - "looker.model": "data" - }, - "name": "extending_looker_events", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.extending_looker_events,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.autodetect_sql_name_based_on_view_name,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.autodetect_sql_name_based_on_view_name,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "include: \"/included_view_file.view\"\n\nview: looker_events {\n sql_table_name: looker_schema.events ;;\n}\n\nview: extending_looker_events {\n extends: [looker_events]\n\n measure: additional_measure {\n type: count\n }\n}\n\nview: autodetect_sql_name_based_on_view_name {}\n\nview: test_include_external_view {\n extends: [include_able_view]\n}\n", - "viewLanguage": "lookml" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.autodetect_sql_name_based_on_view_name,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.autodetect_sql_name_based_on_view_name,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.autodetect_sql_name_based_on_view_name,PROD)", - "type": "VIEW" - } - ] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "view_declarations.view.lkml", - "looker.model": "data" - }, - "name": "autodetect_sql_name_based_on_view_name", 
- "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.autodetect_sql_name_based_on_view_name,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.test_include_external_view,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.test_include_external_view,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "include: \"/included_view_file.view\"\n\nview: looker_events {\n sql_table_name: looker_schema.events ;;\n}\n\nview: extending_looker_events {\n extends: [looker_events]\n\n measure: additional_measure {\n type: count\n }\n}\n\nview: autodetect_sql_name_based_on_view_name {}\n\nview: test_include_external_view {\n extends: [include_able_view]\n}\n", - "viewLanguage": "lookml" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.test_include_external_view,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.test_include_external_view,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", - "type": "VIEW" - } - ] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "view_declarations.view.lkml", - "looker.model": "data" - }, - "name": "test_include_external_view", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.test_include_external_view,PROD)", - "changeType": "UPSERT", - 
"aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "SELECT date AS DATE,\n platform AS aliased_platform,\n country", - "viewLanguage": "sql" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/nested" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD),date)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),date)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD),platform)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),aliased_platform)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD),country)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),country)" - ], - 
"confidenceScore": 1.0 - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "fragment_derived_view", - "platform": "urn:li:dataPlatform:looker", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "" - } - }, - "fields": [ - { - "fieldPath": "date", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NullType": {} - } - }, - "nativeDataType": "unknown", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "aliased_platform", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NullType": {} - } - }, - "nativeDataType": "unknown", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "country", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NullType": {} - } - }, - "nativeDataType": "unknown", - "recursive": false, - "isPartOfKey": false - } - ], - "primaryKeys": [] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "nested/fragment_derived.view.lkml", - "looker.model": "data" - }, - "name": "fragment_derived_view", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - }, - { - "id": "nested" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n {% condition order_region %} order.region {% endcondition %}\n GROUP BY 1", - "viewLanguage": "sql" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ 
- "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD),customer_id)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD),customer_id)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD),sale_price)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD),lifetime_spend)" - ], - "confidenceScore": 1.0 - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "customer_facts", - "platform": "urn:li:dataPlatform:looker", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "" - } - }, - "fields": [ - { - "fieldPath": "customer_id", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NullType": {} - } - }, - "nativeDataType": "unknown", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "lifetime_spend", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NullType": {} - } - }, - "nativeDataType": "unknown", - "recursive": false, - "isPartOfKey": false - } - ], - "primaryKeys": [] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "liquid.view.lkml", - "looker.model": "data" - }, - "name": "customer_facts", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.ability,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - 
] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.ability,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "view: ability {\n sql_table_name: \"ECOMMERCE\".\"ABILITY\"\n ;;\n\n dimension: pk {\n type: number\n sql: ${TABLE}.\"PK\" ;;\n }\n\n measure: count {\n type: count\n drill_fields: []\n }\n}\n", - "viewLanguage": "lookml" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.ability,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.ability,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.ecommerce.ability,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.ecommerce.ability,PROD),pk)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.ability,PROD),pk)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.ecommerce.ability,PROD),count)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.ability,PROD),count)" - ], - "confidenceScore": 1.0 - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "ability", - "platform": "urn:li:dataPlatform:looker", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "" - } - }, - "fields": [ - { - "fieldPath": "pk", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "number", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": false - }, - { - "fieldPath": "count", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "count", - "recursive": false, - "globalTags": { 
- "tags": [ - { - "tag": "urn:li:tag:Measure" - } - ] - }, - "isPartOfKey": false - } - ], - "primaryKeys": [] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "ability.view.lkml", - "looker.model": "data" - }, - "name": "ability", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.ability,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "view: owners {\n dimension: id {\n primary_key: yes\n sql: ${TABLE}.id ;;\n }\n dimension: owner_name {\n sql: ${TABLE}.owner_name ;;\n }\n}", - "viewLanguage": "lookml" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.owners,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.owners,PROD),id)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD),id)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.owners,PROD),owner_name)" - ], - "downstreamType": 
"FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD),owner_name)" - ], - "confidenceScore": 1.0 - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "owners", - "platform": "urn:li:dataPlatform:looker", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "" - } - }, - "fields": [ - { - "fieldPath": "id", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": true - }, - { - "fieldPath": "owner_name", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": false - } - ], - "primaryKeys": [ - "id" - ] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "owners.view.lkml", - "looker.model": "data" - }, - "name": "owners", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "explore_source: my_view_explore {\n bind_all_filters: yes\n\n column: country {\n field: my_view_explore.country\n }\n\n column: city {\n field: my_view_explore.city\n }\n\n column: is_latest {\n field: my_view_explore.is_latest\n }\n\n derived_column: derived_col {\n sql: coalesce(country, 'US') ;;\n }\n}", - "viewLanguage": "lookml" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": 
"urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD),my_view_explore.country)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD),country)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD),my_view_explore.city)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD),city)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD),my_view_explore.country)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD),unique_countries)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD),my_view_explore.is_latest)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD),derived_col)" - ], - "confidenceScore": 1.0 - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "view_derived_explore", - "platform": "urn:li:dataPlatform:looker", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "" - } - }, - "fields": [ - { - "fieldPath": "country", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": false - }, - { - "fieldPath": "city", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": 
false - }, - { - "fieldPath": "unique_countries", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "count_distinct", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Measure" - } - ] - }, - "isPartOfKey": false - }, - { - "fieldPath": "derived_col", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "sum", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Measure" - } - ] - }, - "isPartOfKey": false - } - ], - "primaryKeys": [] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "native_derived_table.view.lkml", - "looker.model": "data" - }, - "name": "view_derived_explore", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.flights,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.flights,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "view: flights {\n sql_table_name: flightstats.accidents ;;\n\n dimension: id {\n label: \"id\"\n primary_key: yes\n type: number\n sql: ${TABLE}.id ;;\n }\n}\n\n# override type of id parameter\nview: +flights {\n dimension: id {\n label: \"id\"\n primary_key: yes\n type: string\n sql: ${TABLE}.id ;;\n }\n}", - "viewLanguage": "lookml" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.flights,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.flights,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/Develop/lkml_samples/" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - 
"upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.flightstats.accidents,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.flightstats.accidents,PROD),id)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.flights,PROD),id)" - ], - "confidenceScore": 1.0 - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "flights", - "platform": "urn:li:dataPlatform:looker", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "" - } - }, - "fields": [ - { - "fieldPath": "id", - "nullable": false, - "description": "", - "label": "id", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "number", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": true - } - ], - "primaryKeys": [ - "id" - ] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "flights.view.lkml", - "looker.model": "data" - }, - "name": "flights", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.flights,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { - "urn": "urn:li:tag:Dimension", - "aspects": [ - { - "com.linkedin.pegasus2avro.tag.TagProperties": { - "name": "Dimension", - "description": "A tag that is applied to all dimension fields." - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { - "urn": "urn:li:tag:Temporal", - "aspects": [ - { - "com.linkedin.pegasus2avro.tag.TagProperties": { - "name": "Temporal", - "description": "A tag that is applied to all time-based (temporal) fields such as timestamps or durations." - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { - "urn": "urn:li:tag:Measure", - "aspects": [ - { - "com.linkedin.pegasus2avro.tag.TagProperties": { - "name": "Measure", - "description": "A tag that is applied to all measures (metrics). 
Measures are typically the columns that you aggregate on" - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "tag", - "entityUrn": "urn:li:tag:Dimension", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "tag", - "entityUrn": "urn:li:tag:Measure", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "tag", - "entityUrn": "urn:li:tag:Temporal", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -} -] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/lookml/test_lookml.py b/metadata-ingestion/tests/integration/lookml/test_lookml.py index e4eb564e3e86b7..94b3b103d0548c 100644 --- a/metadata-ingestion/tests/integration/lookml/test_lookml.py +++ b/metadata-ingestion/tests/integration/lookml/test_lookml.py @@ -10,7 +10,6 @@ from freezegun import freeze_time from looker_sdk.sdk.api40.models import DBConnection -from datahub.configuration.common import PipelineExecutionError from datahub.ingestion.run.pipeline import Pipeline from datahub.ingestion.source.file import read_metadata_file from datahub.ingestion.source.looker.looker_template_language import ( @@ -518,53 +517,6 @@ def ingestion_test( ) -@freeze_time(FROZEN_TIME) -def test_lookml_bad_sql_parser(pytestconfig, tmp_path, mock_time): - """Incorrect specification of sql parser should not fail ingestion""" - test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml" - mce_out = "lookml_mces_badsql_parser.json" - pipeline = Pipeline.create( - { - "run_id": "lookml-test", - "source": { - "type": "lookml", - "config": { - "base_folder": str(test_resources_dir / "lkml_samples"), - "connection_to_platform_map": { - "my_connection": { - "platform": "snowflake", - "default_db": "default_db", - "default_schema": "default_schema", - } - }, - "parse_table_names_from_sql": True, - "project_name": "lkml_samples", - "sql_parser": "bad.sql.Parser", - "emit_reachable_views_only": False, - "process_refinements": False, - }, - }, - "sink": { - "type": "file", - "config": { - "filename": f"{tmp_path}/{mce_out}", - }, - }, - } - ) - pipeline.run() - pipeline.pretty_print_summary() - pipeline.raise_from_status(raise_warnings=False) - with pytest.raises(PipelineExecutionError): # we expect the source to have warnings - pipeline.raise_from_status(raise_warnings=True) - - mce_helpers.check_golden_file( - pytestconfig, - output_path=tmp_path / mce_out, - golden_path=test_resources_dir / mce_out, - ) - - @freeze_time(FROZEN_TIME) def test_lookml_git_info(pytestconfig, tmp_path, mock_time): """Add github info to config""" From fc8465e18ea74a5185da1a49f92d30a99e5f1882 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 17 Oct 2024 08:14:06 -0700 Subject: [PATCH 10/31] feat(ingest): add offline flag to SQL parser CLI (#11635) --- .../src/datahub/cli/check_cli.py | 35 +++++++++++++++---- .../datahub/sql_parsing/sqlglot_lineage.py | 4 ++- 2 files 
changed, 32 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/src/datahub/cli/check_cli.py b/metadata-ingestion/src/datahub/cli/check_cli.py index 6e9bfddd350f92..39ed1b2bfea087 100644 --- a/metadata-ingestion/src/datahub/cli/check_cli.py +++ b/metadata-ingestion/src/datahub/cli/check_cli.py @@ -188,9 +188,13 @@ def sql_format(sql: str, platform: str) -> None: @click.option( "--sql", type=str, - required=True, help="The SQL query to parse", ) +@click.option( + "--sql-file", + type=click.Path(exists=True, dir_okay=False, readable=True), + help="The SQL file to parse", +) @click.option( "--platform", type=str, @@ -218,25 +222,44 @@ def sql_format(sql: str, platform: str) -> None: type=str, help="The default schema to use for unqualified table names", ) +@click.option( + "--online/--offline", + type=bool, + is_flag=True, + default=True, + help="Run in offline mode and disable schema-aware parsing.", +) @telemetry.with_telemetry() def sql_lineage( - sql: str, + sql: Optional[str], + sql_file: Optional[str], platform: str, default_db: Optional[str], default_schema: Optional[str], platform_instance: Optional[str], env: str, + online: bool, ) -> None: """Parse the lineage of a SQL query. - This performs schema-aware parsing in order to generate column-level lineage. - If the relevant tables are not in DataHub, this will be less accurate. + In online mode (the default), we perform schema-aware parsing in order to generate column-level lineage. + If offline mode is enabled or if the relevant tables are not in DataHub, this will be less accurate. """ - graph = get_default_graph() + from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result + + if sql is None: + if sql_file is None: + raise click.UsageError("Either --sql or --sql-file must be provided") + sql = pathlib.Path(sql_file).read_text() + + graph = None + if online: + graph = get_default_graph() - lineage = graph.parse_sql_lineage( + lineage = create_lineage_sql_parsed_result( sql, + graph=graph, platform=platform, platform_instance=platform_instance, env=env, diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py index 0806d0ec774fe7..273e9d0f9f0b1d 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py @@ -656,7 +656,9 @@ def _get_direct_raw_col_upstreams( # Parse the column name out of the node name. # Sqlglot calls .sql(), so we have to do the inverse. normalized_col = sqlglot.parse_one(node.name).this.name - if node.subfield: + if hasattr(node, "subfield") and node.subfield: + # The hasattr check is necessary, since it lets us be compatible with + # sqlglot versions that don't have the subfield attribute. 
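A minimal sketch of the offline path introduced in this patch, assuming the create_lineage_sql_parsed_result() call shown in the check_cli.py hunk also accepts default_db/default_schema keyword arguments (mirroring the CLI options) and that the returned result exposes in_tables, out_tables and column_lineage; the query text is only an illustrative placeholder:

from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

# Offline mode: graph=None skips get_default_graph(), so parsing is not
# schema-aware and column-level lineage may be less accurate.
result = create_lineage_sql_parsed_result(
    "SELECT id, owner_name FROM default_schema.owners",  # placeholder query
    platform="redshift",
    platform_instance=None,
    env="PROD",
    graph=None,
    default_db="default_db",          # assumed kwargs, matching the CLI's
    default_schema="default_schema",  # --default-db / --default-schema options
)
print(result.in_tables, result.out_tables)
print(result.column_lineage)
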
normalized_col = f"{normalized_col}.{node.subfield}" direct_raw_col_upstreams.add( From e9060b8afbe056e8404c4c65daff28c307ef0f9a Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 17 Oct 2024 10:08:37 -0700 Subject: [PATCH 11/31] fix(ingest/redshift): reduce sequence limit for LISTAGG (#11621) Co-authored-by: treff7es Co-authored-by: Aseem Bansal --- .../ingestion/source/redshift/query.py | 281 +++++++++--------- .../unit/redshift/redshift_query_mocker.py | 40 +-- 2 files changed, 143 insertions(+), 178 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py index 39370b93b561c5..f7fad574f7fbe7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py @@ -4,6 +4,12 @@ redshift_datetime_format = "%Y-%m-%d %H:%M:%S" +# See https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext +# for why we need to limit the size of the query text. +# We use 290 instead instead of the standard 320, because escape characters can add to the length. +_QUERY_SEQUENCE_LIMIT = 290 + + class RedshiftCommonQuery: CREATE_TEMP_TABLE_CLAUSE = "create temp table" CREATE_TEMPORARY_TABLE_CLAUSE = "create temporary table" @@ -487,71 +493,70 @@ def list_unload_commands_sql( def list_insert_create_queries_sql( db_name: str, start_time: datetime, end_time: datetime ) -> str: - return """ - with query_txt as - ( - select - query, - pid, - LISTAGG(case - when LEN(RTRIM(text)) = 0 then text - else RTRIM(text) - end) within group ( - order by - sequence) as ddl - from - ( - select - query, - pid, - text, - sequence - from - STL_QUERYTEXT - where - sequence < 320 - order by - sequence - ) - group by - query, - pid - ) - select - distinct tbl as target_table_id, - sti.schema as target_schema, - sti.table as target_table, - sti.database as cluster, - usename as username, - ddl, - sq.query as query_id, - min(si.starttime) as timestamp, - ANY_VALUE(pid) as session_id - from - stl_insert as si - left join SVV_TABLE_INFO sti on - sti.table_id = tbl - left join svl_user_info sui on - si.userid = sui.usesysid - left join query_txt sq on - si.query = sq.query - left join stl_load_commits slc on - slc.query = si.query - where - sui.usename <> 'rdsdb' - and cluster = '{db_name}' - and slc.query IS NULL - and si.starttime >= '{start_time}' - and si.starttime < '{end_time}' - group by - target_table_id, - target_schema, - target_table, - cluster, - username, - ddl, - sq.query + return """\ +with query_txt as ( + select + query, + pid, + LISTAGG(case + when LEN(RTRIM(text)) = 0 then text + else RTRIM(text) + end) within group ( + order by sequence + ) as ddl + from ( + select + query, + pid, + text, + sequence + from + STL_QUERYTEXT + where + sequence < {_QUERY_SEQUENCE_LIMIT} + order by + sequence + ) + group by + query, + pid +) +select + distinct tbl as target_table_id, + sti.schema as target_schema, + sti.table as target_table, + sti.database as cluster, + usename as username, + ddl, + sq.query as query_id, + min(si.starttime) as timestamp, + ANY_VALUE(pid) as session_id +from + stl_insert as si +left join SVV_TABLE_INFO sti on + sti.table_id = tbl +left join svl_user_info sui on + si.userid = sui.usesysid +left join query_txt sq on + si.query = sq.query +left join stl_load_commits slc on + slc.query = si.query +where + sui.usename <> 'rdsdb' + and cluster = '{db_name}' + and slc.query IS 
NULL + and si.starttime >= '{start_time}' + and si.starttime < '{end_time}' +group by + target_table_id, + target_schema, + target_table, + cluster, + username, + ddl, + sq.query """.format( + _QUERY_SEQUENCE_LIMIT=_QUERY_SEQUENCE_LIMIT, # We need the original database name for filtering db_name=db_name, start_time=start_time.strftime(redshift_datetime_format), @@ -564,84 +569,82 @@ def temp_table_ddl_query(start_time: datetime, end_time: datetime) -> str: end_time_str: str = end_time.strftime(redshift_datetime_format) - return rf"""-- DataHub Redshift Source temp table DDL query + return rf"""\ +-- DataHub Redshift Source temp table DDL query +select + * +from ( + select + session_id, + transaction_id, + start_time, + userid, + REGEXP_REPLACE(REGEXP_SUBSTR(REGEXP_REPLACE(query_text,'\\\\n','\\n'), '(CREATE(?:[\\n\\s\\t]+(?:temp|temporary))?(?:[\\n\\s\\t]+)table(?:[\\n\\s\\t]+)[^\\n\\s\\t()-]+)', 0, 1, 'ipe'),'[\\n\\s\\t]+',' ',1,'p') as create_command, + query_text, + row_number() over ( + partition by session_id, TRIM(query_text) + order by start_time desc + ) rn + from ( + select + pid as session_id, + xid as transaction_id, + starttime as start_time, + type, + query_text, + userid + from ( select - * + starttime, + pid, + xid, + type, + userid, + LISTAGG(case + when LEN(RTRIM(text)) = 0 then text + else RTRIM(text) + end, + '') within group ( + order by sequence + ) as query_text from - ( - select - session_id, - transaction_id, - start_time, - userid, - REGEXP_REPLACE(REGEXP_SUBSTR(REGEXP_REPLACE(query_text,'\\\\n','\\n'), '(CREATE(?:[\\n\\s\\t]+(?:temp|temporary))?(?:[\\n\\s\\t]+)table(?:[\\n\\s\\t]+)[^\\n\\s\\t()-]+)', 0, 1, 'ipe'),'[\\n\\s\\t]+',' ',1,'p') as create_command, - query_text, - row_number() over ( - partition by session_id, TRIM(query_text) - order by start_time desc - ) rn - from - ( - select - pid as session_id, - xid as transaction_id, - starttime as start_time, - type, - query_text, - userid - from - ( - select - starttime, - pid, - xid, - type, - userid, - LISTAGG(case - when LEN(RTRIM(text)) = 0 then text - else RTRIM(text) - end, - '') within group ( - order by sequence - ) as query_text - from - SVL_STATEMENTTEXT - where - type in ('DDL', 'QUERY') - AND starttime >= '{start_time_str}' - AND starttime < '{end_time_str}' - -- See https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext - AND sequence < 320 - group by - starttime, - pid, - xid, - type, - userid - order by - starttime, - pid, - xid, - type, - userid - asc) - where - type in ('DDL', 'QUERY') - ) - where - (create_command ilike 'create temp table %' - or create_command ilike 'create temporary table %' - -- we want to get all the create table statements and not just temp tables if non temp table is created and dropped in the same transaction - or create_command ilike 'create table %') - -- Redshift creates temp tables with the following names: volt_tt_%. We need to filter them out. 
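A quick sanity check on the lowered sequence cap, assuming the 200-character text chunks that SVL_STATEMENTTEXT/STL_QUERYTEXT store per sequence row and the 65535-byte LISTAGG result limit:

CHUNK_CHARS = 200      # max characters per SVL_STATEMENTTEXT/STL_QUERYTEXT row
LISTAGG_LIMIT = 65535  # max size of a LISTAGG result

for cap in (320, 290):
    reassembled = cap * CHUNK_CHARS
    print(f"sequence < {cap}: up to {reassembled} chars, "
          f"{LISTAGG_LIMIT - reassembled} bytes of headroom")

# sequence < 320: up to 64000 chars, 1535 bytes of headroom
# sequence < 290: up to 58000 chars, 7535 bytes of headroom
#
# Escape characters added while the text is reassembled can consume the old
# 1.5 KB margin, which is why the limit is lowered to 290 here.
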
- and query_text not ilike 'CREATE TEMP TABLE volt_tt_%' - and create_command not like 'CREATE TEMP TABLE volt_tt_' - -- We need to filter out our query and it was not possible earlier when we did not have any comment in the query - and query_text not ilike '%https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext%' - - ) + SVL_STATEMENTTEXT where - rn = 1 + type in ('DDL', 'QUERY') + AND starttime >= '{start_time_str}' + AND starttime < '{end_time_str}' + AND sequence < {_QUERY_SEQUENCE_LIMIT} + group by + starttime, + pid, + xid, + type, + userid + order by + starttime, + pid, + xid, + type, + userid + asc + ) + where + type in ('DDL', 'QUERY') + ) + where + (create_command ilike 'create temp table %' + or create_command ilike 'create temporary table %' + -- we want to get all the create table statements and not just temp tables if non temp table is created and dropped in the same transaction + or create_command ilike 'create table %') + -- Redshift creates temp tables with the following names: volt_tt_%. We need to filter them out. + and query_text not ilike 'CREATE TEMP TABLE volt_tt_%' + and create_command not like 'CREATE TEMP TABLE volt_tt_' + -- We need to filter out our query and it was not possible earlier when we did not have any comment in the query + and query_text not ilike '%https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext%' + +) +where + rn = 1 """ # Add this join to the sql query for more metrics on completed queries diff --git a/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py b/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py index ada76e624032ba..06b592d42914bf 100644 --- a/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py +++ b/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py @@ -56,45 +56,7 @@ def mock_stl_insert_table_cursor(cursor: MagicMock) -> None: query_vs_cursor_mocker = { ( - "-- DataHub Redshift Source temp table DDL query\n select\n *\n " - "from\n (\n select\n session_id,\n " - " transaction_id,\n start_time,\n userid,\n " - " REGEXP_REPLACE(REGEXP_SUBSTR(REGEXP_REPLACE(query_text,'\\\\\\\\n','\\\\n'), '(CREATE(?:[" - "\\\\n\\\\s\\\\t]+(?:temp|temporary))?(?:[\\\\n\\\\s\\\\t]+)table(?:[\\\\n\\\\s\\\\t]+)[" - "^\\\\n\\\\s\\\\t()-]+)', 0, 1, 'ipe'),'[\\\\n\\\\s\\\\t]+',' ',1,'p') as create_command,\n " - " query_text,\n row_number() over (\n partition " - "by session_id, TRIM(query_text)\n order by start_time desc\n ) rn\n " - " from\n (\n select\n pid " - "as session_id,\n xid as transaction_id,\n starttime " - "as start_time,\n type,\n query_text,\n " - " userid\n from\n (\n " - "select\n starttime,\n pid,\n " - " xid,\n type,\n userid,\n " - " LISTAGG(case\n when LEN(RTRIM(text)) = 0 then text\n " - " else RTRIM(text)\n end,\n " - " '') within group (\n order by sequence\n " - " ) as query_text\n from\n " - "SVL_STATEMENTTEXT\n where\n type in ('DDL', " - "'QUERY')\n AND starttime >= '2024-01-01 12:00:00'\n " - " AND starttime < '2024-01-10 12:00:00'\n -- See " - "https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl" - "-statementtext\n AND sequence < 320\n group by\n " - " starttime,\n pid,\n " - "xid,\n type,\n userid\n " - " order by\n starttime,\n pid,\n " - " xid,\n type,\n userid\n " - " asc)\n where\n type in ('DDL', " - "'QUERY')\n )\n where\n (create_command ilike " - "'create temp table %'\n or create_command ilike 'create temporary 
table %'\n " - " -- we want to get all the create table statements and not just temp tables " - "if non temp table is created and dropped in the same transaction\n or " - "create_command ilike 'create table %')\n -- Redshift creates temp tables with " - "the following names: volt_tt_%. We need to filter them out.\n and query_text not " - "ilike 'CREATE TEMP TABLE volt_tt_%'\n and create_command not like 'CREATE TEMP " - "TABLE volt_tt_'\n -- We need to filter out our query and it was not possible " - "earlier when we did not have any comment in the query\n and query_text not ilike " - "'%https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl" - "-statementtext%'\n\n )\n where\n rn = 1\n " + "\\\n-- DataHub Redshift Source temp table DDL query\nselect\n *\nfrom (\n select\n session_id,\n transaction_id,\n start_time,\n userid,\n REGEXP_REPLACE(REGEXP_SUBSTR(REGEXP_REPLACE(query_text,'\\\\\\\\n','\\\\n'), '(CREATE(?:[\\\\n\\\\s\\\\t]+(?:temp|temporary))?(?:[\\\\n\\\\s\\\\t]+)table(?:[\\\\n\\\\s\\\\t]+)[^\\\\n\\\\s\\\\t()-]+)', 0, 1, 'ipe'),'[\\\\n\\\\s\\\\t]+',' ',1,'p') as create_command,\n query_text,\n row_number() over (\n partition by session_id, TRIM(query_text)\n order by start_time desc\n ) rn\n from (\n select\n pid as session_id,\n xid as transaction_id,\n starttime as start_time,\n type,\n query_text,\n userid\n from (\n select\n starttime,\n pid,\n xid,\n type,\n userid,\n LISTAGG(case\n when LEN(RTRIM(text)) = 0 then text\n else RTRIM(text)\n end,\n '') within group (\n order by sequence\n ) as query_text\n from\n SVL_STATEMENTTEXT\n where\n type in ('DDL', 'QUERY')\n AND starttime >= '2024-01-01 12:00:00'\n AND starttime < '2024-01-10 12:00:00'\n AND sequence < 290\n group by\n starttime,\n pid,\n xid,\n type,\n userid\n order by\n starttime,\n pid,\n xid,\n type,\n userid\n asc\n )\n where\n type in ('DDL', 'QUERY')\n )\n where\n (create_command ilike 'create temp table %'\n or create_command ilike 'create temporary table %'\n -- we want to get all the create table statements and not just temp tables if non temp table is created and dropped in the same transaction\n or create_command ilike 'create table %')\n -- Redshift creates temp tables with the following names: volt_tt_%. 
We need to filter them out.\n and query_text not ilike 'CREATE TEMP TABLE volt_tt_%'\n and create_command not like 'CREATE TEMP TABLE volt_tt_'\n -- We need to filter out our query and it was not possible earlier when we did not have any comment in the query\n and query_text not ilike '%https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext%'\n\n)\nwhere\n rn = 1\n " ): mock_temp_table_cursor, "select * from test_collapse_temp_lineage": mock_stl_insert_table_cursor, } From 2101ff15f5a618b88c6efd30e06db3c268efb248 Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Thu, 17 Oct 2024 19:57:02 +0200 Subject: [PATCH 12/31] fix(ingest/bigquery): Not setting platform instance for bigquery platform resources (#11659) --- .../ingestion/source/bigquery_v2/bigquery.py | 3 - .../bigquery_platform_resource_helper.py | 2 +- .../bigquery_v2/bigquery_mcp_golden.json | 90 +++++++++---------- 3 files changed, 45 insertions(+), 50 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index c30dade921d257..3c6202cc7cbfaf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -65,9 +65,6 @@ # We can't use close as it is not called if the ingestion is not successful def cleanup(config: BigQueryV2Config) -> None: if config._credentials_path is not None: - logger.debug( - f"Deleting temporary credential file at {config._credentials_path}" - ) os.unlink(config._credentials_path) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py index d2da895be985dc..9da2aceb19220a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py @@ -42,7 +42,7 @@ def platform_resource_key(self) -> PlatformResourceKey: return PlatformResourceKey( platform="bigquery", resource_type="BigQueryLabelInfo", - platform_instance=self.project, + platform_instance=None, primary_key=self.label.primary_key(), ) diff --git a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json index b268926f155b74..640ee1bf436b03 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json +++ b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json @@ -201,7 +201,7 @@ }, { "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:79d443a7956814fdab2168e11392bbf2", + "entityUrn": "urn:li:platformResource:11f75a60f0dbd414676852e46a45e39b", "changeType": "UPSERT", "aspectName": "platformResourceInfo", "aspect": { @@ -221,30 +221,29 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", "lastRunId": "no-run-id-provided" } }, { "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:79d443a7956814fdab2168e11392bbf2", + "entityUrn": "urn:li:platformResource:11f75a60f0dbd414676852e46a45e39b", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:bigquery", - 
"instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)" + "platform": "urn:li:dataPlatform:bigquery" } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", "lastRunId": "no-run-id-provided" } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:11f75a60f0dbd414676852e46a45e39b", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -254,13 +253,13 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", "lastRunId": "no-run-id-provided" } }, { "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:0a8c87e84bd90486c4fd57bbae6557e3", + "entityUrn": "urn:li:platformResource:99b34051bd90d28d922b0e107277a916", "changeType": "UPSERT", "aspectName": "platformResourceInfo", "aspect": { @@ -280,19 +279,50 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", "lastRunId": "no-run-id-provided" } }, { "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:0a8c87e84bd90486c4fd57bbae6557e3", + "entityUrn": "urn:li:platformResource:99b34051bd90d28d922b0e107277a916", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:bigquery", - "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)" + "platform": "urn:li:dataPlatform:bigquery" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:99b34051bd90d28d922b0e107277a916", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false } }, "systemMetadata": { @@ -395,38 +425,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:79d443a7956814fdab2168e11392bbf2", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:0a8c87e84bd90486c4fd57bbae6557e3", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", From bb4a50a08aed04550402d384fe12379ca2deaf10 Mon Sep 17 00:00:00 2001 From: Harshal Sheth 
Date: Thu, 17 Oct 2024 11:50:42 -0700 Subject: [PATCH 13/31] fix(ingest/dbt): fix bug in CLL pruning (#11614) --- .../src/datahub/ingestion/source/dbt/dbt_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index c15f1deb43a3af..4cd3c934ce6348 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -1074,8 +1074,8 @@ def add_node_to_cll_list(dbt_name: str) -> None: for upstream in all_nodes_map[dbt_name].upstream_nodes: schema_nodes.add(upstream) - upstream_node = all_nodes_map[upstream] - if upstream_node.is_ephemeral_model(): + upstream_node = all_nodes_map.get(upstream) + if upstream_node and upstream_node.is_ephemeral_model(): add_node_to_cll_list(upstream) cll_nodes.add(dbt_name) From 760f997100d5ddf81b6cbd99caf3754941b99dc6 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 17 Oct 2024 12:33:04 -0700 Subject: [PATCH 14/31] fix(ingest/redshift): fix syntax error in temp sql (#11661) --- .../src/datahub/ingestion/source/redshift/exception.py | 8 ++++---- .../src/datahub/ingestion/source/redshift/query.py | 5 ++--- .../tests/unit/redshift/redshift_query_mocker.py | 2 +- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/exception.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/exception.py index 43ad5bfcefdf1b..ed0856fc1e2924 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/exception.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/exception.py @@ -40,25 +40,25 @@ def report_redshift_failure( error_message = str(e).lower() if "permission denied" in error_message: if "svv_table_info" in error_message: - report.report_failure( + report.failure( title="Permission denied", message="Failed to extract metadata due to insufficient permission to access 'svv_table_info' table. Please ensure the provided database user has access.", exc=e, ) elif "svl_user_info" in error_message: - report.report_failure( + report.failure( title="Permission denied", message="Failed to extract metadata due to insufficient permission to access 'svl_user_info' table. 
Please ensure the provided database user has access.", exc=e, ) else: - report.report_failure( + report.failure( title="Permission denied", message="Failed to extract metadata due to insufficient permissions.", exc=e, ) else: - report.report_failure( + report.failure( title="Failed to extract some metadata", message="Failed to extract some metadata from Redshift.", exc=e, diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py index f7fad574f7fbe7..b18b526ef30fce 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py @@ -569,8 +569,7 @@ def temp_table_ddl_query(start_time: datetime, end_time: datetime) -> str: end_time_str: str = end_time.strftime(redshift_datetime_format) - return rf"""\ --- DataHub Redshift Source temp table DDL query + return rf"""-- DataHub Redshift Source temp table DDL query select * from ( @@ -645,7 +644,7 @@ def temp_table_ddl_query(start_time: datetime, end_time: datetime) -> str: ) where rn = 1 - """ +""" # Add this join to the sql query for more metrics on completed queries # LEFT JOIN svl_query_metrics_summary sqms ON ss.query = sqms.query diff --git a/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py b/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py index 06b592d42914bf..10c0250e37e37f 100644 --- a/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py +++ b/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py @@ -56,7 +56,7 @@ def mock_stl_insert_table_cursor(cursor: MagicMock) -> None: query_vs_cursor_mocker = { ( - "\\\n-- DataHub Redshift Source temp table DDL query\nselect\n *\nfrom (\n select\n session_id,\n transaction_id,\n start_time,\n userid,\n REGEXP_REPLACE(REGEXP_SUBSTR(REGEXP_REPLACE(query_text,'\\\\\\\\n','\\\\n'), '(CREATE(?:[\\\\n\\\\s\\\\t]+(?:temp|temporary))?(?:[\\\\n\\\\s\\\\t]+)table(?:[\\\\n\\\\s\\\\t]+)[^\\\\n\\\\s\\\\t()-]+)', 0, 1, 'ipe'),'[\\\\n\\\\s\\\\t]+',' ',1,'p') as create_command,\n query_text,\n row_number() over (\n partition by session_id, TRIM(query_text)\n order by start_time desc\n ) rn\n from (\n select\n pid as session_id,\n xid as transaction_id,\n starttime as start_time,\n type,\n query_text,\n userid\n from (\n select\n starttime,\n pid,\n xid,\n type,\n userid,\n LISTAGG(case\n when LEN(RTRIM(text)) = 0 then text\n else RTRIM(text)\n end,\n '') within group (\n order by sequence\n ) as query_text\n from\n SVL_STATEMENTTEXT\n where\n type in ('DDL', 'QUERY')\n AND starttime >= '2024-01-01 12:00:00'\n AND starttime < '2024-01-10 12:00:00'\n AND sequence < 290\n group by\n starttime,\n pid,\n xid,\n type,\n userid\n order by\n starttime,\n pid,\n xid,\n type,\n userid\n asc\n )\n where\n type in ('DDL', 'QUERY')\n )\n where\n (create_command ilike 'create temp table %'\n or create_command ilike 'create temporary table %'\n -- we want to get all the create table statements and not just temp tables if non temp table is created and dropped in the same transaction\n or create_command ilike 'create table %')\n -- Redshift creates temp tables with the following names: volt_tt_%. 
We need to filter them out.\n and query_text not ilike 'CREATE TEMP TABLE volt_tt_%'\n and create_command not like 'CREATE TEMP TABLE volt_tt_'\n -- We need to filter out our query and it was not possible earlier when we did not have any comment in the query\n and query_text not ilike '%https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext%'\n\n)\nwhere\n rn = 1\n " + "-- DataHub Redshift Source temp table DDL query\nselect\n *\nfrom (\n select\n session_id,\n transaction_id,\n start_time,\n userid,\n REGEXP_REPLACE(REGEXP_SUBSTR(REGEXP_REPLACE(query_text,'\\\\\\\\n','\\\\n'), '(CREATE(?:[\\\\n\\\\s\\\\t]+(?:temp|temporary))?(?:[\\\\n\\\\s\\\\t]+)table(?:[\\\\n\\\\s\\\\t]+)[^\\\\n\\\\s\\\\t()-]+)', 0, 1, 'ipe'),'[\\\\n\\\\s\\\\t]+',' ',1,'p') as create_command,\n query_text,\n row_number() over (\n partition by session_id, TRIM(query_text)\n order by start_time desc\n ) rn\n from (\n select\n pid as session_id,\n xid as transaction_id,\n starttime as start_time,\n type,\n query_text,\n userid\n from (\n select\n starttime,\n pid,\n xid,\n type,\n userid,\n LISTAGG(case\n when LEN(RTRIM(text)) = 0 then text\n else RTRIM(text)\n end,\n '') within group (\n order by sequence\n ) as query_text\n from\n SVL_STATEMENTTEXT\n where\n type in ('DDL', 'QUERY')\n AND starttime >= '2024-01-01 12:00:00'\n AND starttime < '2024-01-10 12:00:00'\n AND sequence < 290\n group by\n starttime,\n pid,\n xid,\n type,\n userid\n order by\n starttime,\n pid,\n xid,\n type,\n userid\n asc\n )\n where\n type in ('DDL', 'QUERY')\n )\n where\n (create_command ilike 'create temp table %'\n or create_command ilike 'create temporary table %'\n -- we want to get all the create table statements and not just temp tables if non temp table is created and dropped in the same transaction\n or create_command ilike 'create table %')\n -- Redshift creates temp tables with the following names: volt_tt_%. 
We need to filter them out.\n and query_text not ilike 'CREATE TEMP TABLE volt_tt_%'\n and create_command not like 'CREATE TEMP TABLE volt_tt_'\n -- We need to filter out our query and it was not possible earlier when we did not have any comment in the query\n and query_text not ilike '%https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext%'\n\n)\nwhere\n rn = 1\n" ): mock_temp_table_cursor, "select * from test_collapse_temp_lineage": mock_stl_insert_table_cursor, } From f111501d6111d94234bc90423c6921377d3b2a80 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 17 Oct 2024 16:36:18 -0700 Subject: [PATCH 15/31] docs(airflow): add known limitations for automatic lineage (#11652) --- docs/lineage/airflow.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/lineage/airflow.md b/docs/lineage/airflow.md index 35f2ff862e6958..829c048a8f8e24 100644 --- a/docs/lineage/airflow.md +++ b/docs/lineage/airflow.md @@ -132,7 +132,7 @@ conn_id = datahub_rest_default # or datahub_kafka_default ``` | Name | Default value | Description | -|----------------------------|----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| -------------------------- | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | enabled | true | If the plugin should be enabled. | | conn_id | datahub_rest_default | The name of the datahub connection you set in step 1. | | cluster | prod | name of the airflow cluster | @@ -191,6 +191,10 @@ These operators are supported by OpenLineage, but we haven't tested them yet: There's also a few operators (e.g. BashOperator, PythonOperator) that have custom extractors, but those extractors don't generate lineage. --> +Known limitations: + +- We do not fully support operators that run multiple SQL statements at once. In these cases, we'll only capture lineage from the first SQL statement. + ## Manual Lineage Annotation ### Using `inlets` and `outlets` From aed3aa228e2e6de7ef33c92b604425b66d8b55d9 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 17 Oct 2024 17:50:59 -0700 Subject: [PATCH 16/31] perf(ingest): streamline CLL generation (#11645) --- metadata-ingestion/setup.py | 2 +- .../ingestion/source/redshift/lineage.py | 2 +- .../ingestion/source/tableau/tableau.py | 2 +- .../datahub/sql_parsing/sqlglot_lineage.py | 51 ++++++++++------- .../tableau/test_tableau_ingest.py | 17 ++++-- ...st_bigquery_subquery_column_inference.json | 57 +++++++++++++++++++ .../unit/sql_parsing/test_sqlglot_lineage.py | 15 +++++ 7 files changed, 118 insertions(+), 28 deletions(-) create mode 100644 metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_subquery_column_inference.json diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index bfec2c00cb8647..365da21208ecce 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -101,7 +101,7 @@ sqlglot_lib = { # Using an Acryl fork of sqlglot. 
# https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:main?expand=1 - "acryl-sqlglot[rs]==25.20.2.dev6", + "acryl-sqlglot[rs]==25.25.2.dev9", } classification_lib = { diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py index 0e4cb6f1599e52..fe491ccb318505 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py @@ -133,7 +133,7 @@ def parse_alter_table_rename(default_schema: str, query: str) -> Tuple[str, str, assert isinstance(parsed_query, sqlglot.exp.Alter) prev_name = parsed_query.this.name rename_clause = parsed_query.args["actions"][0] - assert isinstance(rename_clause, sqlglot.exp.RenameTable) + assert isinstance(rename_clause, sqlglot.exp.AlterRename) new_name = rename_clause.this.name schema = parsed_query.this.db or default_schema diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 9f011790990ec2..2c17a5a322f052 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -2131,7 +2131,7 @@ def _create_lineage_from_unsupported_csql( fine_grained_lineages: List[FineGrainedLineage] = [] if self.config.extract_column_level_lineage: - logger.info("Extracting CLL from custom sql") + logger.debug("Extracting CLL from custom sql") fine_grained_lineages = make_fine_grained_lineage_class( parsed_result, csql_urn, out_columns ) diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py index 273e9d0f9f0b1d..6a7ff5be6d1ea8 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py @@ -1,6 +1,5 @@ import dataclasses import functools -import itertools import logging import traceback from collections import defaultdict @@ -14,6 +13,8 @@ import sqlglot.optimizer.annotate_types import sqlglot.optimizer.optimizer import sqlglot.optimizer.qualify +import sqlglot.optimizer.qualify_columns +import sqlglot.optimizer.unnest_subqueries from datahub.cli.env_utils import get_boolean_env_variable from datahub.ingestion.graph.client import DataHubGraph @@ -63,24 +64,30 @@ SQL_LINEAGE_TIMEOUT_SECONDS = 10 -RULES_BEFORE_TYPE_ANNOTATION: tuple = tuple( - filter( - lambda func: func.__name__ - not in { - # Skip pushdown_predicates because it sometimes throws exceptions, and we - # don't actually need it for anything. - "pushdown_predicates", - # Skip normalize because it can sometimes be expensive. - "normalize", - }, - itertools.takewhile( - lambda func: func != sqlglot.optimizer.annotate_types.annotate_types, - sqlglot.optimizer.optimizer.RULES, - ), - ) +# These rules are a subset of the rules in sqlglot.optimizer.optimizer.RULES. +# If there's a change in their rules, we probably need to re-evaluate our list as well. +assert len(sqlglot.optimizer.optimizer.RULES) == 14 + +_OPTIMIZE_RULES = ( + sqlglot.optimizer.optimizer.qualify, + # We need to enable this in order for annotate types to work. 
+ sqlglot.optimizer.optimizer.pushdown_projections, + # sqlglot.optimizer.optimizer.normalize, # causes perf issues + sqlglot.optimizer.optimizer.unnest_subqueries, + # sqlglot.optimizer.optimizer.pushdown_predicates, # causes perf issues + # sqlglot.optimizer.optimizer.optimize_joins, + # sqlglot.optimizer.optimizer.eliminate_subqueries, + # sqlglot.optimizer.optimizer.merge_subqueries, + # sqlglot.optimizer.optimizer.eliminate_joins, + # sqlglot.optimizer.optimizer.eliminate_ctes, + sqlglot.optimizer.optimizer.quote_identifiers, + # These three are run separately or not run at all. + # sqlglot.optimizer.optimizer.annotate_types, + # sqlglot.optimizer.canonicalize.canonicalize, + # sqlglot.optimizer.simplify.simplify, ) -# Quick check that the rules were loaded correctly. -assert 0 < len(RULES_BEFORE_TYPE_ANNOTATION) < len(sqlglot.optimizer.optimizer.RULES) + +_DEBUG_TYPE_ANNOTATIONS = False class _ColumnRef(_FrozenModel): @@ -385,11 +392,12 @@ def _sqlglot_force_column_normalizer( schema=sqlglot_db_schema, qualify_columns=True, validate_qualify_columns=False, + allow_partial_qualification=True, identify=True, # sqlglot calls the db -> schema -> table hierarchy "catalog", "db", "table". catalog=default_db, db=default_schema, - rules=RULES_BEFORE_TYPE_ANNOTATION, + rules=_OPTIMIZE_RULES, ) except (sqlglot.errors.OptimizeError, ValueError) as e: raise SqlUnderstandingError( @@ -408,6 +416,10 @@ def _sqlglot_force_column_normalizer( except (sqlglot.errors.OptimizeError, sqlglot.errors.ParseError) as e: # This is not a fatal error, so we can continue. logger.debug("sqlglot failed to annotate or parse types: %s", e) + if _DEBUG_TYPE_ANNOTATIONS and logger.isEnabledFor(logging.DEBUG): + logger.debug( + "Type annotated sql %s", statement.sql(pretty=True, dialect=dialect) + ) return statement, _ColumnResolver( sqlglot_db_schema=sqlglot_db_schema, @@ -907,6 +919,7 @@ def _sqlglot_lineage_inner( # At this stage we only want to qualify the table names. The columns will be dealt with later. qualify_columns=False, validate_qualify_columns=False, + allow_partial_qualification=True, # Only insert quotes where necessary. 
identify=False, ) diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index 5a5552a78c56fa..3798df07000c8a 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -19,6 +19,7 @@ from datahub.configuration.source_common import DEFAULT_ENV from datahub.emitter.mce_builder import make_schema_field_urn +from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.run.pipeline import Pipeline, PipelineContext from datahub.ingestion.source.tableau.tableau import ( TableauConfig, @@ -37,7 +38,7 @@ FineGrainedLineageUpstreamType, UpstreamLineage, ) -from datahub.metadata.schema_classes import MetadataChangeProposalClass, UpstreamClass +from datahub.metadata.schema_classes import UpstreamClass from tests.test_helpers import mce_helpers, test_connection_helpers from tests.test_helpers.state_helpers import ( get_current_checkpoint_from_pipeline, @@ -939,11 +940,12 @@ def test_tableau_unsupported_csql(): database_override_map={"production database": "prod"} ) - def test_lineage_metadata( + def check_lineage_metadata( lineage, expected_entity_urn, expected_upstream_table, expected_cll ): - mcp = cast(MetadataChangeProposalClass, next(iter(lineage)).metadata) - assert mcp.aspect == UpstreamLineage( + mcp = cast(MetadataChangeProposalWrapper, list(lineage)[0].metadata) + + expected = UpstreamLineage( upstreams=[ UpstreamClass( dataset=expected_upstream_table, @@ -966,6 +968,9 @@ def test_lineage_metadata( ) assert mcp.entityUrn == expected_entity_urn + actual_aspect = mcp.aspect + assert actual_aspect == expected + csql_urn = "urn:li:dataset:(urn:li:dataPlatform:tableau,09988088-05ad-173c-a2f1-f33ba3a13d1a,PROD)" expected_upstream_table = "urn:li:dataset:(urn:li:dataPlatform:bigquery,my_bigquery_project.invent_dw.UserDetail,PROD)" expected_cll = { @@ -996,7 +1001,7 @@ def test_lineage_metadata( }, out_columns=[], ) - test_lineage_metadata( + check_lineage_metadata( lineage=lineage, expected_entity_urn=csql_urn, expected_upstream_table=expected_upstream_table, @@ -1014,7 +1019,7 @@ def test_lineage_metadata( }, out_columns=[], ) - test_lineage_metadata( + check_lineage_metadata( lineage=lineage, expected_entity_urn=csql_urn, expected_upstream_table=expected_upstream_table, diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_subquery_column_inference.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_subquery_column_inference.json new file mode 100644 index 00000000000000..32b3830bf1412a --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_subquery_column_inference.json @@ -0,0 +1,57 @@ +{ + "query_type": "SELECT", + "query_type_props": {}, + "query_fingerprint": "4094ebd230c1d47c7e6879b05ab927e550923b1986eb58c5f3814396cf401d18", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:bigquery,invent_dw.UserDetail,PROD)" + ], + "out_tables": [], + "column_lineage": [ + { + "downstream": { + "table": null, + "column": "user_id", + "column_type": null, + "native_column_type": null + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,invent_dw.UserDetail,PROD)", + "column": "user_id" + } + ] + }, + { + "downstream": { + "table": null, + "column": "source", + "column_type": null, + "native_column_type": null + }, + "upstreams": [ + { + "table": 
"urn:li:dataset:(urn:li:dataPlatform:bigquery,invent_dw.UserDetail,PROD)", + "column": "source" + } + ] + }, + { + "downstream": { + "table": null, + "column": "user_source", + "column_type": null, + "native_column_type": null + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,invent_dw.UserDetail,PROD)", + "column": "user_source" + } + ] + } + ], + "debug_info": { + "confidence": 0.2, + "generalized_statement": "SELECT user_id, source, user_source FROM (SELECT *, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY __partition_day DESC) AS rank_ FROM invent_dw.UserDetail) AS source_user WHERE rank_ = ?" + } +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py index fb1d2a0bc50110..90cc863d6bd231 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py @@ -1253,3 +1253,18 @@ def test_snowflake_drop_schema() -> None: dialect="snowflake", expected_file=RESOURCE_DIR / "test_snowflake_drop_schema.json", ) + + +def test_bigquery_subquery_column_inference() -> None: + assert_sql_result( + """\ +SELECT user_id, source, user_source +FROM ( + SELECT *, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY __partition_day DESC) AS rank_ + FROM invent_dw.UserDetail +) source_user +WHERE rank_ = 1 +""", + dialect="bigquery", + expected_file=RESOURCE_DIR / "test_bigquery_subquery_column_inference.json", + ) From 59a2cee875551275facc659dd39d329ed5cbce3c Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Fri, 18 Oct 2024 10:33:28 +0530 Subject: [PATCH 17/31] feat(ingest): ensure sqlite file delete on clean exit (#11612) --- .../ingestion/source/bigquery_v2/bigquery.py | 9 +- .../source/bigquery_v2/bigquery_queries.py | 4 + .../source/bigquery_v2/queries_extractor.py | 18 +++- .../ingestion/source/redshift/lineage_v2.py | 7 +- .../ingestion/source/redshift/redshift.py | 23 +++-- .../source/snowflake/snowflake_connection.py | 3 +- .../source/snowflake/snowflake_queries.py | 64 ++++++++----- .../source/snowflake/snowflake_v2.py | 94 ++++++++++--------- .../sql_parsing/sql_parsing_aggregator.py | 7 ++ .../bigquery_v2/test_bigquery_queries.py | 16 ++++ .../snowflake/test_snowflake_queries.py | 24 +++++ .../unit/sql_parsing/test_sql_aggregator.py | 22 +++++ 12 files changed, 203 insertions(+), 88 deletions(-) create mode 100644 metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 3c6202cc7cbfaf..0e986acc81add8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -272,7 +272,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.report.set_ingestion_stage("*", QUERIES_EXTRACTION) - queries_extractor = BigQueryQueriesExtractor( + with BigQueryQueriesExtractor( connection=self.config.get_bigquery_client(), schema_api=self.bq_schema_extractor.schema_api, config=BigQueryQueriesExtractorConfig( @@ -288,9 +288,10 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: identifiers=self.identifiers, schema_resolver=self.sql_parser_schema_resolver, discovered_tables=self.bq_schema_extractor.table_refs, - ) - 
self.report.queries_extractor = queries_extractor.report
-        yield from queries_extractor.get_workunits_internal()
+        ) as queries_extractor:
+            self.report.queries_extractor = queries_extractor.report
+            yield from queries_extractor.get_workunits_internal()
+
         else:
             if self.config.include_usage_statistics:
                 yield from self.usage_extractor.get_usage_workunits(
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_queries.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_queries.py
index ed27aae19ce963..47f21c9f32353a 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_queries.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_queries.py
@@ -88,3 +88,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
 
     def get_report(self) -> BigQueryQueriesSourceReport:
         return self.report
+
+    def close(self) -> None:
+        self.queries_extractor.close()
+        self.connection.close()
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py
index b4a443673b9a97..afaaaf51964f8e 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py
@@ -13,6 +13,7 @@
     BaseTimeWindowConfig,
     get_time_bucket,
 )
+from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.api.source_helpers import auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
@@ -114,7 +115,7 @@ class BigQueryQueriesExtractorConfig(BigQueryBaseConfig):
     )
 
 
-class BigQueryQueriesExtractor:
+class BigQueryQueriesExtractor(Closeable):
     """
     Extracts query audit log and generates usage/lineage/operation workunits.
 
@@ -181,6 +182,7 @@ def __init__(
             is_allowed_table=self.is_allowed_table,
             format_queries=False,
         )
+
         self.report.sql_aggregator = self.aggregator.report
         self.report.num_discovered_tables = (
             len(self.discovered_tables) if self.discovered_tables else None
@@ -273,12 +275,14 @@ def get_workunits_internal(
         self.report.num_unique_queries = len(queries_deduped)
         logger.info(f"Found {self.report.num_unique_queries} unique queries")
 
-        with self.report.audit_log_load_timer:
+        with self.report.audit_log_load_timer, queries_deduped:
             i = 0
             for _, query_instances in queries_deduped.items():
                 for query in query_instances.values():
                     if i > 0 and i % 10000 == 0:
-                        logger.info(f"Added {i} query log entries to SQL aggregator")
+                        logger.info(
+                            f"Added {i} query log entries to SQL aggregator"
+                        )
                         if self.report.sql_aggregator:
                             logger.info(self.report.sql_aggregator.as_string())
 
@@ -287,6 +291,11 @@ def get_workunits_internal(
 
         yield from auto_workunit(self.aggregator.gen_metadata())
 
+        if not use_cached_audit_log:
+            queries.close()
+            shared_connection.close()
+            audit_log_file.unlink(missing_ok=True)
+
     def deduplicate_queries(
         self, queries: FileBackedList[ObservedQuery]
     ) -> FileBackedDict[Dict[int, ObservedQuery]]:
@@ -404,6 +413,9 @@ def _parse_audit_log_row(self, row: BigQueryJob) -> ObservedQuery:
 
         return entry
 
+    def close(self) -> None:
+        self.aggregator.close()
+
 
 def _extract_query_text(row: BigQueryJob) -> str:
     # We wrap select statements in a CTE to make them parseable as DML statement. 
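The hunks above (and the Redshift and Snowflake hunks that follow) all apply the same cleanup pattern from PATCH 17: every file-backed resource is either registered with a contextlib.ExitStack or exposed as a Closeable/context manager, so that a single close() call releases connections and removes temporary SQLite files on a clean exit. The sketch below illustrates that pattern in isolation; it is a minimal, hedged example, and the class and file names (AuditLogExtractor, the "log" table) are hypothetical rather than part of this repository.

    import contextlib
    import os
    import sqlite3
    import tempfile
    from pathlib import Path


    class AuditLogExtractor:
        """Minimal sketch: owns a temporary SQLite file and removes it on close()."""

        def __init__(self) -> None:
            # Everything that must be released on exit is registered here and
            # unwound in reverse order by a single close() call.
            self._exit_stack = contextlib.ExitStack()

            fd, path = tempfile.mkstemp(suffix=".sqlite")
            os.close(fd)  # sqlite3 reopens the file by path
            self._audit_log_file = Path(path)

            # Registered first, so it runs last: delete the file only after the
            # connection that uses it has been closed.
            self._exit_stack.callback(self._audit_log_file.unlink, missing_ok=True)

            self._connection = self._exit_stack.enter_context(
                contextlib.closing(sqlite3.connect(self._audit_log_file))
            )
            self._connection.execute("CREATE TABLE IF NOT EXISTS log (query TEXT)")

        def add(self, query: str) -> None:
            self._connection.execute("INSERT INTO log VALUES (?)", (query,))

        def close(self) -> None:
            # Idempotent: closing an already-closed ExitStack is a no-op.
            self._exit_stack.close()

        def __enter__(self) -> "AuditLogExtractor":
            return self

        def __exit__(self, *exc) -> bool:
            self.close()
            return False


    if __name__ == "__main__":
        with AuditLogExtractor() as extractor:
            extractor.add("select 1")
        # The temporary SQLite file has been deleted by the time we get here.

Registering the unlink callback before entering the connection context guarantees the connection is closed before the file is deleted, which also behaves correctly on platforms that cannot unlink open files.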
diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py index 4df64c80bad8a8..53f9383ec02a72 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py @@ -5,6 +5,7 @@ import redshift_connector from datahub.emitter import mce_builder +from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.redshift.config import LineageMode, RedshiftConfig @@ -39,7 +40,7 @@ logger = logging.getLogger(__name__) -class RedshiftSqlLineageV2: +class RedshiftSqlLineageV2(Closeable): # does lineage and usage based on SQL parsing. def __init__( @@ -56,6 +57,7 @@ def __init__( self.context = context self.database = database + self.aggregator = SqlParsingAggregator( platform=self.platform, platform_instance=self.config.platform_instance, @@ -436,3 +438,6 @@ def generate(self) -> Iterable[MetadataWorkUnit]: message="Unexpected error(s) while attempting to extract lineage from SQL queries. See the full logs for more details.", context=f"Query Parsing Failures: {self.aggregator.report.observed_query_parse_failures}", ) + + def close(self) -> None: + self.aggregator.close() diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py index a9fc9ab8f3e993..76030cea984946 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py @@ -451,24 +451,23 @@ def _extract_metadata( ) if self.config.use_lineage_v2: - lineage_extractor = RedshiftSqlLineageV2( + with RedshiftSqlLineageV2( config=self.config, report=self.report, context=self.ctx, database=database, redundant_run_skip_handler=self.redundant_lineage_run_skip_handler, - ) - - yield from lineage_extractor.aggregator.register_schemas_from_stream( - self.process_schemas(connection, database) - ) + ) as lineage_extractor: + yield from lineage_extractor.aggregator.register_schemas_from_stream( + self.process_schemas(connection, database) + ) - self.report.report_ingestion_stage_start(LINEAGE_EXTRACTION) - yield from self.extract_lineage_v2( - connection=connection, - database=database, - lineage_extractor=lineage_extractor, - ) + self.report.report_ingestion_stage_start(LINEAGE_EXTRACTION) + yield from self.extract_lineage_v2( + connection=connection, + database=database, + lineage_extractor=lineage_extractor, + ) all_tables = self.get_all_tables() else: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py index d39e95a884dbc2..a9f454cfd3cdb3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py @@ -18,6 +18,7 @@ from datahub.configuration.connection_resolver import auto_connection_resolver from datahub.configuration.oauth import OAuthConfiguration, OAuthIdentityProvider from datahub.configuration.validate_field_rename import pydantic_renamed_field +from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.source.snowflake.constants import ( CLIENT_PREFETCH_THREADS, 
CLIENT_SESSION_KEEP_ALIVE, @@ -364,7 +365,7 @@ def get_connection(self) -> "SnowflakeConnection": ) from e -class SnowflakeConnection: +class SnowflakeConnection(Closeable): _connection: NativeSnowflakeConnection def __init__(self, connection: NativeSnowflakeConnection): diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index 1445d02aa49dbd..e11073d77b46eb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -1,3 +1,4 @@ +import contextlib import dataclasses import functools import json @@ -17,6 +18,7 @@ BaseTimeWindowConfig, BucketDuration, ) +from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.report import Report from datahub.ingestion.api.source import Source, SourceReport @@ -121,7 +123,7 @@ class SnowflakeQueriesSourceReport(SourceReport): queries_extractor: Optional[SnowflakeQueriesExtractorReport] = None -class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin): +class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable): def __init__( self, connection: SnowflakeConnection, @@ -143,28 +145,33 @@ def __init__( self._structured_report = structured_report - self.aggregator = SqlParsingAggregator( - platform=self.identifiers.platform, - platform_instance=self.identifiers.identifier_config.platform_instance, - env=self.identifiers.identifier_config.env, - schema_resolver=schema_resolver, - graph=graph, - eager_graph_load=False, - generate_lineage=self.config.include_lineage, - generate_queries=self.config.include_queries, - generate_usage_statistics=self.config.include_usage_statistics, - generate_query_usage_statistics=self.config.include_query_usage_statistics, - usage_config=BaseUsageConfig( - bucket_duration=self.config.window.bucket_duration, - start_time=self.config.window.start_time, - end_time=self.config.window.end_time, - user_email_pattern=self.config.user_email_pattern, - # TODO make the rest of the fields configurable - ), - generate_operations=self.config.include_operations, - is_temp_table=self.is_temp_table, - is_allowed_table=self.is_allowed_table, - format_queries=False, + # The exit stack helps ensure that we close all the resources we open. 
+ self._exit_stack = contextlib.ExitStack() + + self.aggregator: SqlParsingAggregator = self._exit_stack.enter_context( + SqlParsingAggregator( + platform=self.identifiers.platform, + platform_instance=self.identifiers.identifier_config.platform_instance, + env=self.identifiers.identifier_config.env, + schema_resolver=schema_resolver, + graph=graph, + eager_graph_load=False, + generate_lineage=self.config.include_lineage, + generate_queries=self.config.include_queries, + generate_usage_statistics=self.config.include_usage_statistics, + generate_query_usage_statistics=self.config.include_query_usage_statistics, + usage_config=BaseUsageConfig( + bucket_duration=self.config.window.bucket_duration, + start_time=self.config.window.start_time, + end_time=self.config.window.end_time, + user_email_pattern=self.config.user_email_pattern, + # TODO make the rest of the fields configurable + ), + generate_operations=self.config.include_operations, + is_temp_table=self.is_temp_table, + is_allowed_table=self.is_allowed_table, + format_queries=False, + ) ) self.report.sql_aggregator = self.aggregator.report @@ -248,6 +255,10 @@ def get_workunits_internal( self.aggregator.add(query) yield from auto_workunit(self.aggregator.gen_metadata()) + if not use_cached_audit_log: + queries.close() + shared_connection.close() + audit_log_file.unlink(missing_ok=True) def fetch_copy_history(self) -> Iterable[KnownLineageMapping]: # Derived from _populate_external_lineage_from_copy_history. @@ -426,6 +437,9 @@ def _parse_audit_log_row(self, row: Dict[str, Any]) -> PreparsedQuery: ) return entry + def close(self) -> None: + self._exit_stack.close() + class SnowflakeQueriesSource(Source): def __init__(self, ctx: PipelineContext, config: SnowflakeQueriesSourceConfig): @@ -468,6 +482,10 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: def get_report(self) -> SnowflakeQueriesSourceReport: return self.report + def close(self) -> None: + self.connection.close() + self.queries_extractor.close() + # Make sure we don't try to generate too much info for a single query. _MAX_TABLES_PER_QUERY = 20 diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 0d7881f36554d1..dd7f73268fdc4f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -1,3 +1,4 @@ +import contextlib import functools import json import logging @@ -149,7 +150,12 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): cached_domains=[k for k in self.config.domain], graph=self.ctx.graph ) - self.connection = self.config.get_connection() + # The exit stack helps ensure that we close all the resources we open. 
+ self._exit_stack = contextlib.ExitStack() + + self.connection: SnowflakeConnection = self._exit_stack.enter_context( + self.config.get_connection() + ) # For database, schema, tables, views, etc self.data_dictionary = SnowflakeDataDictionary(connection=self.connection) @@ -157,25 +163,27 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): self.aggregator: Optional[SqlParsingAggregator] = None if self.config.use_queries_v2 or self.config.include_table_lineage: - self.aggregator = SqlParsingAggregator( - platform=self.identifiers.platform, - platform_instance=self.config.platform_instance, - env=self.config.env, - graph=self.ctx.graph, - eager_graph_load=( - # If we're ingestion schema metadata for tables/views, then we will populate - # schemas into the resolver as we go. We only need to do a bulk fetch - # if we're not ingesting schema metadata as part of ingestion. - not ( - self.config.include_technical_schema - and self.config.include_tables - and self.config.include_views - ) - and not self.config.lazy_schema_resolver - ), - generate_usage_statistics=False, - generate_operations=False, - format_queries=self.config.format_sql_queries, + self.aggregator = self._exit_stack.enter_context( + SqlParsingAggregator( + platform=self.identifiers.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + graph=self.ctx.graph, + eager_graph_load=( + # If we're ingestion schema metadata for tables/views, then we will populate + # schemas into the resolver as we go. We only need to do a bulk fetch + # if we're not ingesting schema metadata as part of ingestion. + not ( + self.config.include_technical_schema + and self.config.include_tables + and self.config.include_views + ) + and not self.config.lazy_schema_resolver + ), + generate_usage_statistics=False, + generate_operations=False, + format_queries=self.config.format_sql_queries, + ) ) self.report.sql_aggregator = self.aggregator.report @@ -191,14 +199,16 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): pipeline_name=self.ctx.pipeline_name, run_id=self.ctx.run_id, ) - self.lineage_extractor = SnowflakeLineageExtractor( - config, - self.report, - connection=self.connection, - filters=self.filters, - identifiers=self.identifiers, - redundant_run_skip_handler=redundant_lineage_run_skip_handler, - sql_aggregator=self.aggregator, + self.lineage_extractor = self._exit_stack.enter_context( + SnowflakeLineageExtractor( + config, + self.report, + connection=self.connection, + filters=self.filters, + identifiers=self.identifiers, + redundant_run_skip_handler=redundant_lineage_run_skip_handler, + sql_aggregator=self.aggregator, + ) ) self.usage_extractor: Optional[SnowflakeUsageExtractor] = None @@ -213,13 +223,15 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): pipeline_name=self.ctx.pipeline_name, run_id=self.ctx.run_id, ) - self.usage_extractor = SnowflakeUsageExtractor( - config, - self.report, - connection=self.connection, - filter=self.filters, - identifiers=self.identifiers, - redundant_run_skip_handler=redundant_usage_run_skip_handler, + self.usage_extractor = self._exit_stack.enter_context( + SnowflakeUsageExtractor( + config, + self.report, + connection=self.connection, + filter=self.filters, + identifiers=self.identifiers, + redundant_run_skip_handler=redundant_usage_run_skip_handler, + ) ) self.profiling_state_handler: Optional[ProfilingHandler] = None @@ -444,10 +456,6 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: 
def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self._snowflake_clear_ocsp_cache() - self.connection = self.config.get_connection() - if self.connection is None: - return - self.inspect_session_metadata(self.connection) snowsight_url_builder = None @@ -513,7 +521,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: schema_resolver = self.aggregator._schema_resolver - queries_extractor = SnowflakeQueriesExtractor( + queries_extractor: SnowflakeQueriesExtractor = SnowflakeQueriesExtractor( connection=self.connection, config=SnowflakeQueriesExtractorConfig( window=self.config, @@ -535,6 +543,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: # it should be pretty straightforward to refactor this and only initialize the aggregator once. self.report.queries_extractor = queries_extractor.report yield from queries_extractor.get_workunits_internal() + queries_extractor.close() else: if self.config.include_table_lineage and self.lineage_extractor: @@ -723,7 +732,4 @@ def _snowflake_clear_ocsp_cache(self) -> None: def close(self) -> None: super().close() StatefulIngestionSourceBase.close(self) - if self.lineage_extractor: - self.lineage_extractor.close() - if self.usage_extractor: - self.usage_extractor.close() + self._exit_stack.close() diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index 5f2709fe426605..0b7ad14a8c1b41 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -275,6 +275,8 @@ class SqlAggregatorReport(Report): tool_meta_report: Optional[ToolMetaExtractorReport] = None def compute_stats(self) -> None: + if self._aggregator._closed: + return self.schema_resolver_count = self._aggregator._schema_resolver.schema_count() self.num_unique_query_fingerprints = len(self._aggregator._query_map) @@ -345,6 +347,7 @@ def __init__( # The exit stack helps ensure that we close all the resources we open. self._exit_stack = contextlib.ExitStack() + self._closed: bool = False # Set up the schema resolver. 
self._schema_resolver: SchemaResolver @@ -456,12 +459,16 @@ def __init__( shared_connection=self._shared_connection, tablename="query_usage_counts", ) + self._exit_stack.push(self._query_usage_counts) # Tool Extractor self._tool_meta_extractor = ToolMetaExtractor() self.report.tool_meta_report = self._tool_meta_extractor.report def close(self) -> None: + # Compute stats once before closing connections + self.report.compute_stats() + self._closed = True self._exit_stack.close() @property diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py index 9290100b0c521c..ef846f698f156e 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py @@ -1,4 +1,5 @@ import json +import os from datetime import datetime from pathlib import Path from unittest.mock import patch @@ -6,7 +7,9 @@ import pytest from freezegun import freeze_time +from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.bigquery_v2.bigquery_queries import ( + BigQueryQueriesSource, BigQueryQueriesSourceReport, ) from datahub.metadata.urns import CorpUserUrn @@ -93,3 +96,16 @@ def test_queries_ingestion(project_client, client, pytestconfig, monkeypatch, tm output_path=mcp_output_path, golden_path=mcp_golden_path, ) + + +@patch("google.cloud.bigquery.Client") +@patch("google.cloud.resourcemanager_v3.ProjectsClient") +def test_source_close_cleans_tmp(projects_client, client, tmp_path): + with patch("tempfile.tempdir", str(tmp_path)): + source = BigQueryQueriesSource.create( + {"project_ids": ["project1"]}, PipelineContext("run-id") + ) + assert len(os.listdir(tmp_path)) > 0 + # This closes QueriesExtractor which in turn closes SqlParsingAggregator + source.close() + assert len(os.listdir(tmp_path)) == 0 diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py new file mode 100644 index 00000000000000..82f5691bcee3de --- /dev/null +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py @@ -0,0 +1,24 @@ +import os +from unittest.mock import patch + +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.snowflake.snowflake_queries import SnowflakeQueriesSource + + +@patch("snowflake.connector.connect") +def test_source_close_cleans_tmp(snowflake_connect, tmp_path): + with patch("tempfile.tempdir", str(tmp_path)): + source = SnowflakeQueriesSource.create( + { + "connection": { + "account_id": "ABC12345.ap-south-1.aws", + "username": "TST_USR", + "password": "TST_PWD", + } + }, + PipelineContext("run-id"), + ) + assert len(os.listdir(tmp_path)) > 0 + # This closes QueriesExtractor which in turn closes SqlParsingAggregator + source.close() + assert len(os.listdir(tmp_path)) == 0 diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py b/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py index 0d21936a74d072..849d550ef69c57 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py @@ -1,5 +1,7 @@ +import os import pathlib from datetime import datetime, timezone +from unittest.mock import patch import pytest from freezegun import freeze_time @@ -661,3 +663,23 @@ def test_basic_usage(pytestconfig: pytest.Config) -> None: 
outputs=mcps, golden_path=RESOURCE_DIR / "test_basic_usage.json", ) + + +def test_sql_aggreator_close_cleans_tmp(tmp_path): + frozen_timestamp = parse_user_datetime(FROZEN_TIME) + with patch("tempfile.tempdir", str(tmp_path)): + aggregator = SqlParsingAggregator( + platform="redshift", + generate_lineage=False, + generate_usage_statistics=True, + generate_operations=False, + usage_config=BaseUsageConfig( + start_time=get_time_bucket(frozen_timestamp, BucketDuration.DAY), + end_time=frozen_timestamp, + ), + generate_queries=True, + generate_query_usage_statistics=True, + ) + assert len(os.listdir(tmp_path)) > 0 + aggregator.close() + assert len(os.listdir(tmp_path)) == 0 From a85f7869ddebe2ea24d0d689f464abbb8e61a155 Mon Sep 17 00:00:00 2001 From: Shirshanka Das Date: Fri, 18 Oct 2024 01:12:05 -0700 Subject: [PATCH 18/31] =?UTF-8?q?fix(sdk):=20platform=20resource=20-=20sup?= =?UTF-8?q?port=20indexed=20queries=20when=20urns=20are=20i=E2=80=A6=20(#1?= =?UTF-8?q?1660)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../platformresource/platform_resource.py | 41 +++++++++- .../test_platform_resource.py | 74 +++++++++++++++++++ 2 files changed, 113 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py b/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py index 1556a67a9e5555..349b0ff11d84f7 100644 --- a/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py +++ b/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py @@ -85,6 +85,7 @@ def scroll_urns_by_filter( self, entity_type: str, extra_or_filters: List[Dict[str, str]], + extra_and_filters: List[Dict[str, str]] = [], ) -> Iterable[str]: """ Scroll through all urns that match the given filters @@ -92,10 +93,26 @@ def scroll_urns_by_filter( key_aspect = self.ENTITY_KEY_ASPECT_MAP.get(entity_type) assert key_aspect, f"No key aspect found for entity type {entity_type}" + if extra_or_filters and extra_and_filters: + raise ValueError( + "Only one of extra_or_filters and extra_and_filters should be provided" + ) count = 1000 - query = " OR ".join( - [f"{filter['field']}:{filter['value']}" for filter in extra_or_filters] + query = ( + " OR ".join( + [ + f"{filter['field']}:\"{filter['value']}\"" + for filter in extra_or_filters + ] + ) + if extra_or_filters + else " AND ".join( + [ + f"{filter['field']}:\"{filter['value']}\"" + for filter in extra_and_filters + ] + ) ) scroll_id = None while True: @@ -252,3 +269,23 @@ def search_by_key( def delete(self, graph_client: DataHubGraph, hard: bool = True) -> None: graph_client.delete_entity(str(PlatformResourceUrn(self.id)), hard=hard) + + @staticmethod + def search_by_filters( + graph_client: DataHubGraph, + and_filters: List[Dict[str, str]] = [], + or_filters: List[Dict[str, str]] = [], + ) -> Iterable["PlatformResource"]: + if and_filters and or_filters: + raise ValueError( + "Only one of and_filters and or_filters should be provided" + ) + openapi_client = OpenAPIGraphClient(graph_client) + for urn in openapi_client.scroll_urns_by_filter( + entity_type="platformResource", + extra_or_filters=or_filters if or_filters else [], + extra_and_filters=and_filters if and_filters else [], + ): + platform_resource = PlatformResource.from_datahub(graph_client, urn) + if platform_resource: + yield platform_resource diff --git a/smoke-test/tests/platform_resources/test_platform_resource.py 
b/smoke-test/tests/platform_resources/test_platform_resource.py index 7c53f72d843c93..7ebfd4d6ea15b4 100644 --- a/smoke-test/tests/platform_resources/test_platform_resource.py +++ b/smoke-test/tests/platform_resources/test_platform_resource.py @@ -112,3 +112,77 @@ def test_platform_resource_non_existent(graph_client, test_id): graph_client=graph_client, ) assert platform_resource is None + + +def test_platform_resource_urn_secondary_key(graph_client, test_id): + key = PlatformResourceKey( + platform=f"test_platform_{test_id}", + resource_type=f"test_resource_type_{test_id}", + primary_key=f"test_primary_key_{test_id}", + ) + dataset_urn = ( + f"urn:li:dataset:(urn:li:dataPlatform:test,test_secondary_key_{test_id},PROD)" + ) + platform_resource = PlatformResource.create( + key=key, + value={"test_key": f"test_value_{test_id}"}, + secondary_keys=[dataset_urn], + ) + platform_resource.to_datahub(graph_client) + wait_for_writes_to_sync() + + read_platform_resources = [ + r + for r in PlatformResource.search_by_key( + graph_client, dataset_urn, primary=False + ) + ] + assert len(read_platform_resources) == 1 + assert read_platform_resources[0] == platform_resource + + +def test_platform_resource_listing_by_resource_type(graph_client, test_id): + # Generate two resources with the same resource type + key1 = PlatformResourceKey( + platform=f"test_platform_{test_id}", + resource_type=f"test_resource_type_{test_id}", + primary_key=f"test_primary_key_1_{test_id}", + ) + platform_resource1 = PlatformResource.create( + key=key1, + value={"test_key": f"test_value_1_{test_id}"}, + ) + platform_resource1.to_datahub(graph_client) + + key2 = PlatformResourceKey( + platform=f"test_platform_{test_id}", + resource_type=f"test_resource_type_{test_id}", + primary_key=f"test_primary_key_2_{test_id}", + ) + platform_resource2 = PlatformResource.create( + key=key2, + value={"test_key": f"test_value_2_{test_id}"}, + ) + platform_resource2.to_datahub(graph_client) + + wait_for_writes_to_sync() + + search_results = [ + r + for r in PlatformResource.search_by_filters( + graph_client, + and_filters=[ + { + "field": "resourceType", + "condition": "EQUAL", + "value": key1.resource_type, + } + ], + ) + ] + assert len(search_results) == 2 + + read_platform_resource_1 = next(r for r in search_results if r.id == key1.id) + read_platform_resource_2 = next(r for r in search_results if r.id == key2.id) + assert read_platform_resource_1 == platform_resource1 + assert read_platform_resource_2 == platform_resource2 From c96a7d5b1cd2495ec3369167da7f6c841643e781 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Fri, 18 Oct 2024 03:13:18 -0700 Subject: [PATCH 19/31] fix(ingest/dbt): Prevent lineage cycles when parsing sql of dbt models (#11666) --- .../ingestion/source/dbt/dbt_common.py | 6 +++ .../tests/unit/test_dbt_source.py | 42 +++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index 4cd3c934ce6348..c95d0e545c5989 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -1989,6 +1989,11 @@ def _translate_dbt_name_to_upstream_urn(dbt_name: str) -> str: time=mce_builder.get_sys_time(), actor=_DEFAULT_ACTOR, ) + sibling_urn = node.get_urn( + self.config.target_platform, + self.config.env, + self.config.target_platform_instance, + ) return UpstreamLineageClass( upstreams=[ UpstreamClass( 
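The hunk above computes `sibling_urn`, the warehouse-platform URN of the dbt model itself; the hunk that follows uses it to drop self-references from the upstream list so that SQL-parsed lineage cannot create a cycle between a model and its own sibling dataset. A minimal sketch of that guard, using hypothetical URN strings rather than the patch's `node.get_urn(...)` call (the real condition additionally checks `node.node_type == "model"`):

```python
# Hypothetical URNs for illustration; the patch derives sibling_urn from
# node.get_urn(target_platform, env, target_platform_instance).
sibling_urn = "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.model1,PROD)"
upstream_urns = [
    "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.results,PROD)",
    sibling_urn,  # SQL parsing of "SELECT ... FROM model1" resolves the model itself
]

# Keep every upstream except the model's own warehouse sibling, which would
# otherwise produce a lineage edge from the model back to itself.
upstreams = [urn for urn in upstream_urns if urn != sibling_urn]
assert sibling_urn not in upstreams
```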
@@ -1997,6 +2002,7 @@ def _translate_dbt_name_to_upstream_urn(dbt_name: str) -> str: auditStamp=auditStamp, ) for upstream in upstream_urns + if not (node.node_type == "model" and upstream == sibling_urn) ], fineGrainedLineages=( (cll or None) if self.config.include_column_lineage else None diff --git a/metadata-ingestion/tests/unit/test_dbt_source.py b/metadata-ingestion/tests/unit/test_dbt_source.py index 7d01ecd034523d..f0d4c3408271f7 100644 --- a/metadata-ingestion/tests/unit/test_dbt_source.py +++ b/metadata-ingestion/tests/unit/test_dbt_source.py @@ -10,6 +10,7 @@ from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.dbt import dbt_cloud from datahub.ingestion.source.dbt.dbt_cloud import DBTCloudConfig +from datahub.ingestion.source.dbt.dbt_common import DBTNode from datahub.ingestion.source.dbt.dbt_core import ( DBTCoreConfig, DBTCoreSource, @@ -253,6 +254,47 @@ def test_dbt_config_prefer_sql_parser_lineage(): assert config.prefer_sql_parser_lineage is True +def test_dbt_prefer_sql_parser_lineage_no_self_reference(): + ctx = PipelineContext(run_id="test-run-id") + config = DBTCoreConfig.parse_obj( + { + **create_base_dbt_config(), + "skip_sources_in_lineage": True, + "prefer_sql_parser_lineage": True, + } + ) + source: DBTCoreSource = DBTCoreSource(config, ctx, "dbt") + all_nodes_map = { + "model1": DBTNode( + name="model1", + database=None, + schema=None, + alias=None, + comment="", + description="", + language=None, + raw_code=None, + dbt_adapter="postgres", + dbt_name="model1", + dbt_file_path=None, + dbt_package_name=None, + node_type="model", + materialization="table", + max_loaded_at=None, + catalog_type=None, + missing_from_catalog=False, + owner=None, + compiled_code="SELECT d FROM results WHERE d > (SELECT MAX(d) FROM model1)", + ), + } + source._infer_schemas_and_update_cll(all_nodes_map) + upstream_lineage = source._create_lineage_aspect_for_dbt_node( + all_nodes_map["model1"], all_nodes_map + ) + assert upstream_lineage is not None + assert len(upstream_lineage.upstreams) == 1 + + def test_dbt_s3_config(): # test missing aws config config_dict: dict = { From b7f7f6e1d3a5d6a13c0ae9eb8eade0cee5855e19 Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Fri, 18 Oct 2024 20:16:08 +0200 Subject: [PATCH 20/31] fix(ingest/dagster): Fix JobSnapshot import is broken (#11672) --- .../datahub_dagster_plugin/client/dagster_generator.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py index a2cf159dd12f6e..df123b127e0405 100644 --- a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py +++ b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py @@ -12,7 +12,13 @@ TableSchemaMetadataValue, ) from dagster._core.execution.stats import RunStepKeyStatsSnapshot, StepEventStatus -from dagster._core.snap import JobSnapshot + +try: + from dagster._core.snap import JobSnapshot # type: ignore[attr-defined] +except ImportError: + # Import changed since Dagster 1.8.12 to this -> https://github.com/dagster-io/dagster/commit/29a37d1f0260cfd112849633d1096ffc916d6c95 + from dagster._core.snap import JobSnap as JobSnapshot + from dagster._core.snap.node import OpDefSnap from dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatsSnapshot from 
datahub.api.entities.datajob import DataFlow, DataJob From 44e5c43612ebc08c73cb91c044eaad47b5e991fc Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Fri, 18 Oct 2024 12:01:39 -0700 Subject: [PATCH 21/31] feat(ingest/transformer/domain): Add support for on conflict do nothing to dataset domain transformers (#11649) --- .../docs/transformer/dataset_transformer.md | 28 +++---- .../src/datahub/ingestion/graph/client.py | 1 + .../ingestion/transformer/dataset_domain.py | 41 ++++++---- .../tests/unit/test_transform_dataset.py | 76 +++++++++++++++++++ 4 files changed, 119 insertions(+), 27 deletions(-) diff --git a/metadata-ingestion/docs/transformer/dataset_transformer.md b/metadata-ingestion/docs/transformer/dataset_transformer.md index d48c6d2c1ab5b4..66274ce64a8d29 100644 --- a/metadata-ingestion/docs/transformer/dataset_transformer.md +++ b/metadata-ingestion/docs/transformer/dataset_transformer.md @@ -122,12 +122,13 @@ transformers: ``` ## Simple Add Dataset ownership ### Config Details -| Field | Required | Type | Default | Description | -|--------------------|----------|--------------|-------------|---------------------------------------------------------------------| -| `owner_urns` | ✅ | list[string] | | List of owner urns. | -| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) | -| `replace_existing` | | boolean | `false` | Whether to remove ownership from entity sent by ingestion source. | -| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | +| Field | Required | Type | Default | Description | +|--------------------|----------|--------------|-------------|------------------------------------------------------------------------------------------------------------| +| `owner_urns` | ✅ | list[string] | | List of owner urns. | +| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) | +| `replace_existing` | | boolean | `false` | Whether to remove ownership from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | +| `on_conflict` | | enum | `DO_UPDATE` | Whether to make changes if domains already exist. If set to DO_NOTHING, `semantics` setting is irrelevant. | For transformer behaviour on `replace_existing` and `semantics`, please refer section [Relationship Between replace_existing And semantics](#relationship-between-replace_existing-and-semantics). @@ -191,13 +192,14 @@ transformers: ## Pattern Add Dataset ownership ### Config Details -| Field | Required | Type | Default | Description | -|--------------------|----------|----------------------|-------------|-----------------------------------------------------------------------------------------| -| `owner_pattern` | ✅ | map[regx, list[urn]] | | entity urn with regular expression and list of owners urn apply to matching entity urn. | -| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) | -| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | -| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | -| `is_container` | | bool | `false` | Whether to also consider a container or not. If true, then ownership will be attached to both the dataset and its container. 
| +| Field | Required | Type | Default | Description | +|--------------------|----------|----------------------|-------------|------------------------------------------------------------------------------------------------------------------------------| +| `owner_pattern` | ✅ | map[regx, list[urn]] | | entity urn with regular expression and list of owners urn apply to matching entity urn. | +| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) | +| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | +| `is_container` | | bool | `false` | Whether to also consider a container or not. If true, then ownership will be attached to both the dataset and its container. | +| `on_conflict` | | enum | `DO_UPDATE` | Whether to make changes if domains already exist. If set to DO_NOTHING, `semantics` setting is irrelevant. | let’s suppose we’d like to append a series of users who we know to own a different dataset from a data source but aren't detected during normal ingestion. To do so, we can use the `pattern_add_dataset_ownership` module that’s included in the ingestion framework. This will match the pattern to `urn` of the dataset and assign the respective owners. diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index e8fae6254ae885..1d2528a24c4e57 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -351,6 +351,7 @@ def get_tags(self, entity_urn: str) -> Optional[GlobalTagsClass]: def get_glossary_terms(self, entity_urn: str) -> Optional[GlossaryTermsClass]: return self.get_aspect(entity_urn=entity_urn, aspect_type=GlossaryTermsClass) + @functools.lru_cache(maxsize=1) def get_domain(self, entity_urn: str) -> Optional[DomainsClass]: return self.get_aspect(entity_urn=entity_urn, aspect_type=DomainsClass) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py b/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py index 6a838248152650..6b78b71eaa78e9 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py @@ -1,6 +1,8 @@ import logging +from enum import auto from typing import Callable, Dict, List, Optional, Sequence, Union, cast +from datahub.configuration._config_enum import ConfigEnum from datahub.configuration.common import ( ConfigurationError, KeyValuePattern, @@ -23,6 +25,13 @@ logger = logging.getLogger(__name__) +class TransformerOnConflict(ConfigEnum): + """Describes the behavior of the transformer when writing an aspect that already exists.""" + + DO_UPDATE = auto() # On conflict, apply the new aspect + DO_NOTHING = auto() # On conflict, do not apply the new aspect + + class AddDatasetDomainSemanticsConfig(TransformerSemanticsConfigModel): get_domains_to_add: Union[ Callable[[str], DomainsClass], @@ -32,10 +41,12 @@ class AddDatasetDomainSemanticsConfig(TransformerSemanticsConfigModel): _resolve_domain_fn = pydantic_resolve_key("get_domains_to_add") is_container: bool = False + on_conflict: TransformerOnConflict = TransformerOnConflict.DO_UPDATE class SimpleDatasetDomainSemanticsConfig(TransformerSemanticsConfigModel): domains: List[str] + on_conflict: 
TransformerOnConflict = TransformerOnConflict.DO_UPDATE class PatternDatasetDomainSemanticsConfig(TransformerSemanticsConfigModel): @@ -80,12 +91,13 @@ def get_domain_class( @staticmethod def _merge_with_server_domains( - graph: DataHubGraph, urn: str, mce_domain: Optional[DomainsClass] + graph: Optional[DataHubGraph], urn: str, mce_domain: Optional[DomainsClass] ) -> Optional[DomainsClass]: if not mce_domain or not mce_domain.domains: # nothing to add, no need to consult server return None + assert graph server_domain = graph.get_domain(entity_urn=urn) if server_domain: # compute patch @@ -155,7 +167,7 @@ def transform_aspect( self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect] ) -> Optional[Aspect]: in_domain_aspect: DomainsClass = cast(DomainsClass, aspect) - domain_aspect = DomainsClass(domains=[]) + domain_aspect: DomainsClass = DomainsClass(domains=[]) # Check if we have received existing aspect if in_domain_aspect is not None and self.config.replace_existing is False: domain_aspect.domains.extend(in_domain_aspect.domains) @@ -164,16 +176,18 @@ def transform_aspect( domain_aspect.domains.extend(domain_to_add.domains) - if self.config.semantics == TransformerSemantics.PATCH: - assert self.ctx.graph - patch_domain_aspect: Optional[ - DomainsClass - ] = AddDatasetDomain._merge_with_server_domains( - self.ctx.graph, entity_urn, domain_aspect - ) - return cast(Optional[Aspect], patch_domain_aspect) - - return cast(Optional[Aspect], domain_aspect) + final_aspect: Optional[DomainsClass] = domain_aspect + if domain_aspect.domains: + if self.config.on_conflict == TransformerOnConflict.DO_NOTHING: + assert self.ctx.graph + server_domain = self.ctx.graph.get_domain(entity_urn) + if server_domain and server_domain.domains: + return None + if self.config.semantics == TransformerSemantics.PATCH: + final_aspect = AddDatasetDomain._merge_with_server_domains( + self.ctx.graph, entity_urn, domain_aspect + ) + return cast(Optional[Aspect], final_aspect) class SimpleAddDatasetDomain(AddDatasetDomain): @@ -186,8 +200,7 @@ def __init__( domains = AddDatasetDomain.get_domain_class(ctx.graph, config.domains) generic_config = AddDatasetDomainSemanticsConfig( get_domains_to_add=lambda _: domains, - semantics=config.semantics, - replace_existing=config.replace_existing, + **config.dict(exclude={"domains"}), ) super().__init__(generic_config, ctx) diff --git a/metadata-ingestion/tests/unit/test_transform_dataset.py b/metadata-ingestion/tests/unit/test_transform_dataset.py index 2e2e85b5d18113..4e9a38cb37ae63 100644 --- a/metadata-ingestion/tests/unit/test_transform_dataset.py +++ b/metadata-ingestion/tests/unit/test_transform_dataset.py @@ -56,6 +56,7 @@ from datahub.ingestion.transformer.dataset_domain import ( PatternAddDatasetDomain, SimpleAddDatasetDomain, + TransformerOnConflict, ) from datahub.ingestion.transformer.dataset_domain_based_on_tags import ( DatasetTagDomainMapper, @@ -2498,6 +2499,81 @@ def fake_get_domain(entity_urn: str) -> models.DomainsClass: assert server_domain in transformed_aspect.domains +def test_simple_add_dataset_domain_on_conflict_do_nothing( + pytestconfig, tmp_path, mock_time, mock_datahub_graph_instance +): + acryl_domain = builder.make_domain_urn("acryl.io") + datahub_domain = builder.make_domain_urn("datahubproject.io") + server_domain = builder.make_domain_urn("test.io") + + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph_instance + + # Return fake aspect to simulate server behaviour + def 
fake_get_domain(entity_urn: str) -> models.DomainsClass: + return models.DomainsClass(domains=[server_domain]) + + pipeline_context.graph.get_domain = fake_get_domain # type: ignore + + output = run_dataset_transformer_pipeline( + transformer_type=SimpleAddDatasetDomain, + aspect=models.DomainsClass(domains=[datahub_domain]), + config={ + "replace_existing": False, + "semantics": TransformerSemantics.PATCH, + "domains": [acryl_domain], + "on_conflict": TransformerOnConflict.DO_NOTHING, + }, + pipeline_context=pipeline_context, + ) + + assert len(output) == 1 + assert output[0] is not None + assert output[0].record is not None + assert isinstance(output[0].record, EndOfStream) + + +def test_simple_add_dataset_domain_on_conflict_do_nothing_no_conflict( + pytestconfig, tmp_path, mock_time, mock_datahub_graph_instance +): + acryl_domain = builder.make_domain_urn("acryl.io") + datahub_domain = builder.make_domain_urn("datahubproject.io") + irrelevant_domain = builder.make_domain_urn("test.io") + + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph_instance + + # Return fake aspect to simulate server behaviour + def fake_get_domain(entity_urn: str) -> models.DomainsClass: + return models.DomainsClass(domains=[]) + + pipeline_context.graph.get_domain = fake_get_domain # type: ignore + + output = run_dataset_transformer_pipeline( + transformer_type=SimpleAddDatasetDomain, + aspect=models.DomainsClass(domains=[datahub_domain]), + config={ + "replace_existing": False, + "semantics": TransformerSemantics.PATCH, + "domains": [acryl_domain], + "on_conflict": TransformerOnConflict.DO_NOTHING, + }, + pipeline_context=pipeline_context, + ) + + assert len(output) == 2 + assert output[0] is not None + assert output[0].record is not None + assert isinstance(output[0].record, MetadataChangeProposalWrapper) + assert output[0].record.aspect is not None + assert isinstance(output[0].record.aspect, models.DomainsClass) + transformed_aspect = cast(models.DomainsClass, output[0].record.aspect) + assert len(transformed_aspect.domains) == 2 + assert datahub_domain in transformed_aspect.domains + assert acryl_domain in transformed_aspect.domains + assert irrelevant_domain not in transformed_aspect.domains + + def test_pattern_add_dataset_domain_aspect_name(mock_datahub_graph_instance): pipeline_context: PipelineContext = PipelineContext( run_id="test_simple_add_dataset_domain" From 3e8c666567e72571524c68fc0d853dabd856def7 Mon Sep 17 00:00:00 2001 From: Jay Feldman <8128360+feldjay@users.noreply.github.com> Date: Fri, 18 Oct 2024 15:58:52 -0400 Subject: [PATCH 22/31] fix(ingest/looker): Remove bad imports from looker_common (#11663) --- .../src/datahub/ingestion/source/looker/looker_common.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 3cbb13375229b9..317b212f7fa96d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -929,7 +929,6 @@ def from_api( # noqa: C901 reporter: SourceReport, source_config: LookerDashboardSourceConfig, ) -> Optional["LookerExplore"]: # noqa: C901 - from datahub.ingestion.source.looker.lookml_source import _BASE_PROJECT_NAME try: explore = client.lookml_model_explore(model, explore_name) @@ -1194,7 +1193,6 @@ def _to_metadata_events( # noqa: C901 ) -> 
Optional[List[Union[MetadataChangeEvent, MetadataChangeProposalWrapper]]]: # We only generate MCE-s for explores that contain from clauses and do NOT contain joins # All other explores (passthrough explores and joins) end in correct resolution of lineage, and don't need additional nodes in the graph. - from datahub.ingestion.source.looker.lookml_source import _BASE_PROJECT_NAME dataset_snapshot = DatasetSnapshot( urn=self.get_explore_urn(config), From f723f5131517be3c742f03122d56dc95bea7f31e Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 18 Oct 2024 13:05:06 -0700 Subject: [PATCH 23/31] feat(ingest/looker): include project name in model/explore properties (#11664) Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> --- .../ingestion/source/looker/looker_common.py | 16 ++-- .../ingestion/source/looker/looker_source.py | 41 ++++++---- .../looker/golden_looker_mces.json | 7 ++ .../looker/golden_test_allow_ingest.json | 4 + ...olden_test_external_project_view_mces.json | 4 + .../looker/golden_test_file_path_ingest.json | 4 + ...olden_test_folder_path_pattern_ingest.json | 4 + .../golden_test_independent_look_ingest.json | 82 +++++++++++-------- .../looker/golden_test_ingest.json | 4 + .../looker/golden_test_ingest_joins.json | 4 + .../golden_test_ingest_unaliased_joins.json | 4 + ...en_test_non_personal_independent_look.json | 7 ++ .../looker_mces_golden_deleted_stateful.json | 16 ++-- .../looker/looker_mces_usage_history.json | 4 + 14 files changed, 135 insertions(+), 66 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 317b212f7fa96d..3d1683100474e8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -1205,15 +1205,19 @@ def _to_metadata_events( # noqa: C901 dataset_snapshot.aspects.append(browse_paths) dataset_snapshot.aspects.append(StatusClass(removed=False)) - custom_properties = {} - if self.label is not None: - custom_properties["looker.explore.label"] = str(self.label) - if self.source_file is not None: - custom_properties["looker.explore.file"] = str(self.source_file) + custom_properties = { + "project": self.project_name, + "model": self.model_name, + "looker.explore.label": self.label, + "looker.explore.name": self.name, + "looker.explore.file": self.source_file, + } dataset_props = DatasetPropertiesClass( name=str(self.label) if self.label else LookerUtil._display_name(self.name), description=self.description, - customProperties=custom_properties, + customProperties={ + k: str(v) for k, v in custom_properties.items() if v is not None + }, ) dataset_props.externalUrl = self._get_url(base_url) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index f269ccf1cd98f8..e42ac7b61c1777 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -139,26 +139,21 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase): """ platform = "looker" - source_config: LookerDashboardSourceConfig - reporter: LookerDashboardSourceReport - user_registry: LookerUserRegistry - reachable_look_registry: Set[ - str - ] # Keep track of look-id which are reachable from Dashboard def __init__(self, config: 
LookerDashboardSourceConfig, ctx: PipelineContext): super().__init__(config, ctx) - self.source_config = config - self.reporter = LookerDashboardSourceReport() + self.source_config: LookerDashboardSourceConfig = config + self.reporter: LookerDashboardSourceReport = LookerDashboardSourceReport() self.looker_api: LookerAPI = LookerAPI(self.source_config) - self.user_registry = LookerUserRegistry(self.looker_api) - self.explore_registry = LookerExploreRegistry( + self.user_registry: LookerUserRegistry = LookerUserRegistry(self.looker_api) + self.explore_registry: LookerExploreRegistry = LookerExploreRegistry( self.looker_api, self.reporter, self.source_config ) self.reporter._looker_explore_registry = self.explore_registry self.reporter._looker_api = self.looker_api - self.reachable_look_registry = set() + # Keep track of look-id which are reachable from Dashboard + self.reachable_look_registry: Set[str] = set() # (model, explore) -> list of charts/looks/dashboards that reference this explore # The list values are used purely for debugging purposes. @@ -868,21 +863,31 @@ def _make_explore_metadata_events( ) -> Iterable[ Union[MetadataChangeEvent, MetadataChangeProposalWrapper, MetadataWorkUnit] ]: - if self.source_config.emit_used_explores_only: - explores_to_fetch = list(self.reachable_explores.keys()) - else: + if not self.source_config.emit_used_explores_only: explores_to_fetch = list(self.list_all_explores()) + else: + # We don't keep track of project names for each explore right now. + # Because project names are just used for a custom property, it's + # fine to set them to None. + # TODO: Track project names for each explore. + explores_to_fetch = [ + (None, model, explore) + for (model, explore) in self.reachable_explores.keys() + ] explores_to_fetch.sort() processed_models: List[str] = [] - for model, _ in explores_to_fetch: + for project_name, model, _ in explores_to_fetch: if model not in processed_models: model_key = gen_model_key(self.source_config, model) yield from gen_containers( container_key=model_key, name=model, sub_types=[BIContainerSubTypes.LOOKML_MODEL], + extra_properties=( + {"project": project_name} if project_name is not None else None + ), ) yield MetadataChangeProposalWrapper( entityUrn=model_key.as_urn(), @@ -896,7 +901,7 @@ def _make_explore_metadata_events( self.reporter.total_explores = len(explores_to_fetch) for future in BackpressureAwareExecutor.map( self.fetch_one_explore, - ((model, explore) for (model, explore) in explores_to_fetch), + ((model, explore) for (_project, model, explore) in explores_to_fetch), max_workers=self.source_config.max_threads, ): events, explore_id, start_time, end_time = future.result() @@ -907,7 +912,7 @@ def _make_explore_metadata_events( f"Running time of fetch_one_explore for {explore_id}: {(end_time - start_time).total_seconds()}" ) - def list_all_explores(self) -> Iterable[Tuple[str, str]]: + def list_all_explores(self) -> Iterable[Tuple[Optional[str], str, str]]: # returns a list of (model, explore) tuples for model in self.looker_api.all_lookml_models(): @@ -916,7 +921,7 @@ def list_all_explores(self) -> Iterable[Tuple[str, str]]: for explore in model.explores: if explore.name is None: continue - yield (model.name, explore.name) + yield (model.project_name, model.name, explore.name) def fetch_one_explore( self, model: str, explore: str diff --git a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json index 5cac7b1bb73b19..a9c445b5986efe 
100644 --- a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json @@ -11,6 +11,7 @@ "description": "lorem ipsum", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -440,7 +441,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "bogus data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/bogus data/my_view", @@ -616,7 +620,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json index 24a738a815cda8..af9c62a2a41803 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json @@ -11,6 +11,7 @@ "description": "lorem ipsum", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -282,7 +283,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json index b1460779da4f5f..b89bc356b48fdc 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json @@ -202,6 +202,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -520,7 +521,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "looker_hub", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json index 74400b9b5cc56b..810fefd8f6cb85 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json @@ -202,6 +202,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -520,7 +521,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "looker_hub", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": 
"my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json index 89241fb52fb634..3d78397f54a235 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json @@ -287,6 +287,7 @@ "description": "third", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -613,7 +614,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json index f178e97e78fa02..5a540e61e768d7 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json @@ -210,6 +210,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -1107,12 +1108,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/data" + "/Explore/sales_model" ] } }, @@ -1124,10 +1125,13 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "sales_model", "looker.explore.label": "My Explore View", + "looker.explore.name": "sales_explore", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/data/my_view", + "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1149,7 +1153,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "my_view", + "schemaName": "sales_explore", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1204,7 +1208,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1223,12 +1227,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/data/my_view" + "renderUrl": "https://looker.company.com/embed/explore/sales_model/sales_explore" } }, "systemMetadata": { @@ 
-1240,12 +1244,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" + "container": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" } }, "systemMetadata": { @@ -1257,7 +1261,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1267,8 +1271,8 @@ "id": "Explore" }, { - "id": "urn:li:container:59a5aa45397364e6882e793f1bc77b42", - "urn": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" + "id": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5", + "urn": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" } ] } @@ -1283,12 +1287,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/order_model" + "/Explore/data" ] } }, @@ -1300,10 +1304,13 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/order_model/order_explore", + "externalUrl": "https://looker.company.com/explore/data/my_view", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1325,7 +1332,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "order_explore", + "schemaName": "my_view", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1380,7 +1387,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1399,12 +1406,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/order_model/order_explore" + "renderUrl": "https://looker.company.com/embed/explore/data/my_view" } }, "systemMetadata": { @@ -1416,12 +1423,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" + "container": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" } }, "systemMetadata": { @@ -1433,7 +1440,7 @@ }, { "entityType": "dataset", - "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1443,8 +1450,8 @@ "id": "Explore" }, { - "id": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60", - "urn": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" + "id": "urn:li:container:59a5aa45397364e6882e793f1bc77b42", + "urn": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" } ] } @@ -1459,12 +1466,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/sales_model" + "/Explore/order_model" ] } }, @@ -1476,10 +1483,13 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "order_model", "looker.explore.label": "My Explore View", + "looker.explore.name": "order_explore", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", + "externalUrl": "https://looker.company.com/explore/order_model/order_explore", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1501,7 +1511,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "sales_explore", + "schemaName": "order_explore", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1556,7 +1566,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1575,12 +1585,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/sales_model/sales_explore" + "renderUrl": "https://looker.company.com/embed/explore/order_model/order_explore" } }, "systemMetadata": { @@ -1592,12 +1602,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" + "container": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" } }, "systemMetadata": { @@ -1609,7 +1619,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1619,8 +1629,8 @@ "id": "Explore" }, { - "id": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5", - "urn": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" + "id": 
"urn:li:container:df4ee66abd19b668c88bfe4408f87e60", + "urn": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" } ] } diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json index d969ef62a96e5f..9ac95b8482a475 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json @@ -229,6 +229,7 @@ "urn:li:chart:(looker,ap-south-1.dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -574,7 +575,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json index 153db363c78280..3a2c6359ea63c2 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json @@ -202,6 +202,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -520,7 +521,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json index 98adbdc5b829e4..007eee348aeaf8 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json @@ -11,6 +11,7 @@ "description": "lorem ipsum", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -282,7 +283,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json b/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json index 63ffdda8c5b6f5..859b9163d7aad6 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json @@ -210,6 +210,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -783,7 +784,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + 
"looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", @@ -959,7 +963,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "sales_model", "looker.explore.label": "My Explore View", + "looker.explore.name": "sales_explore", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json index 567ab78a14754b..8256c984afb274 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json @@ -210,6 +210,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -539,7 +540,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", @@ -810,8 +814,8 @@ } }, { - "entityType": "chart", - "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(looker,dashboards.11)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -827,8 +831,8 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:621eb6e00da9abece0f64522f81be0e7", + "entityType": "chart", + "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -844,8 +848,8 @@ } }, { - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(looker,dashboards.11)", + "entityType": "container", + "entityUrn": "urn:li:container:621eb6e00da9abece0f64522f81be0e7", "changeType": "UPSERT", "aspectName": "status", "aspect": { diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json index 3befb62a631de5..0b3530f9c24629 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json @@ -11,6 +11,7 @@ "description": "lorem ipsum", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -234,7 +235,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", From b41716b91c70407c2385bbad69660f078717a473 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 18 Oct 2024 14:29:03 -0700 Subject: [PATCH 24/31] feat(ingest/fivetran): protect against high sync volume (#11589) --- .../ingestion/source/fivetran/fivetran.py | 21 ++++--- .../source/fivetran/fivetran_log_api.py | 63 ++++++++++--------- .../source/fivetran/fivetran_query.py | 34 +++++++--- .../integration/fivetran/test_fivetran.py | 58 
++++------------- 4 files changed, 87 insertions(+), 89 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py index 704a6f20a5c19b..334bb58ea84f8e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py @@ -27,7 +27,10 @@ PlatformDetail, ) from datahub.ingestion.source.fivetran.data_classes import Connector, Job -from datahub.ingestion.source.fivetran.fivetran_log_api import FivetranLogAPI +from datahub.ingestion.source.fivetran.fivetran_log_api import ( + MAX_JOBS_PER_CONNECTOR, + FivetranLogAPI, +) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, ) @@ -72,11 +75,6 @@ def __init__(self, config: FivetranSourceConfig, ctx: PipelineContext): self.audit_log = FivetranLogAPI(self.config.fivetran_log_config) - # Create and register the stateful ingestion use-case handler. - self.stale_entity_removal_handler = StaleEntityRemovalHandler.create( - self, self.config, self.ctx - ) - def _extend_lineage(self, connector: Connector, datajob: DataJob) -> None: input_dataset_urn_list: List[DatasetUrn] = [] output_dataset_urn_list: List[DatasetUrn] = [] @@ -267,6 +265,13 @@ def _get_connector_workunits( ).as_workunit(is_primary_source=False) # Map Fivetran's job/sync history entity with Datahub's data process entity + if len(connector.jobs) >= MAX_JOBS_PER_CONNECTOR: + self.report.warning( + title="Not all sync history was captured", + message=f"The connector had more than {MAX_JOBS_PER_CONNECTOR} sync runs in the past {self.config.history_sync_lookback_period} days. " + f"Only the most recent {MAX_JOBS_PER_CONNECTOR} syncs were ingested.", + context=f"{connector.connector_name} (connector_id: {connector.connector_id})", + ) for job in connector.jobs: dpi = self._generate_dpi_from_job(job, datajob) yield from self._get_dpi_workunits(job, dpi) @@ -279,7 +284,9 @@ def create(cls, config_dict: dict, ctx: PipelineContext) -> Source: def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), - self.stale_entity_removal_handler.workunit_processor, + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py index 31c16139066e43..5908efe39e2b40 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py @@ -22,6 +22,10 @@ logger: logging.Logger = logging.getLogger(__name__) +# We don't want to generate a massive number of dataProcesses for a single connector. +# This is primarily used as a safeguard to prevent performance issues. 
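The safeguard described in the comment above is enforced by the `MAX_JOBS_PER_CONNECTOR` constant defined just below and by the `ROW_NUMBER() OVER (PARTITION BY connector_id ...)` filter added to the sync-log query later in this patch. A rough in-memory Python analogue of that capping logic, shown only to illustrate the idea (the patch itself does the ranking in SQL):

```python
from typing import Dict, List, Tuple

MAX_JOBS = 1000  # stands in for MAX_JOBS_PER_CONNECTOR

def cap_sync_history(
    syncs: List[Tuple[str, str, float]],  # (connector_id, sync_id, end_time)
    max_jobs: int = MAX_JOBS,
) -> Dict[str, List[Tuple[str, float]]]:
    by_connector: Dict[str, List[Tuple[str, float]]] = {}
    for connector_id, sync_id, end_time in syncs:
        by_connector.setdefault(connector_id, []).append((sync_id, end_time))
    # Keep only the most recent max_jobs syncs per connector, mirroring
    # ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY end_time DESC) <= max_jobs.
    return {
        cid: sorted(rows, key=lambda r: r[1], reverse=True)[:max_jobs]
        for cid, rows in by_connector.items()
    }
```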
+MAX_JOBS_PER_CONNECTOR = 1000 + class FivetranLogAPI: def __init__(self, fivetran_log_config: FivetranLogConfig) -> None: @@ -158,34 +162,32 @@ def _get_table_lineage( return table_lineage_list - def _get_all_connector_sync_logs(self, syncs_interval: int) -> Dict[str, Dict]: - sync_logs = {} - for row in self._query( - self.fivetran_log_query.get_sync_logs_query().format( - db_clause=self.fivetran_log_query.db_clause, - syncs_interval=syncs_interval, - ) - ): - if row[Constant.CONNECTOR_ID] not in sync_logs: - sync_logs[row[Constant.CONNECTOR_ID]] = { - row[Constant.SYNC_ID]: { - row["message_event"]: ( - row[Constant.TIME_STAMP].timestamp(), - row[Constant.MESSAGE_DATA], - ) - } - } - elif row[Constant.SYNC_ID] not in sync_logs[row[Constant.CONNECTOR_ID]]: - sync_logs[row[Constant.CONNECTOR_ID]][row[Constant.SYNC_ID]] = { - row["message_event"]: ( - row[Constant.TIME_STAMP].timestamp(), - row[Constant.MESSAGE_DATA], - ) - } - else: - sync_logs[row[Constant.CONNECTOR_ID]][row[Constant.SYNC_ID]][ - row["message_event"] - ] = (row[Constant.TIME_STAMP].timestamp(), row[Constant.MESSAGE_DATA]) + def _get_all_connector_sync_logs( + self, syncs_interval: int, connector_ids: List[str] + ) -> Dict[str, Dict[str, Dict[str, Tuple[float, Optional[str]]]]]: + sync_logs: Dict[str, Dict[str, Dict[str, Tuple[float, Optional[str]]]]] = {} + + # Format connector_ids as a comma-separated string of quoted IDs + formatted_connector_ids = ", ".join(f"'{id}'" for id in connector_ids) + + query = self.fivetran_log_query.get_sync_logs_query().format( + db_clause=self.fivetran_log_query.db_clause, + syncs_interval=syncs_interval, + max_jobs_per_connector=MAX_JOBS_PER_CONNECTOR, + connector_ids=formatted_connector_ids, + ) + + for row in self._query(query): + connector_id = row[Constant.CONNECTOR_ID] + sync_id = row[Constant.SYNC_ID] + + if connector_id not in sync_logs: + sync_logs[connector_id] = {} + + sync_logs[connector_id][sync_id] = { + "sync_start": (row["start_time"].timestamp(), None), + "sync_end": (row["end_time"].timestamp(), row["end_message_data"]), + } return sync_logs @@ -244,7 +246,10 @@ def _fill_connectors_table_lineage(self, connectors: List[Connector]) -> None: def _fill_connectors_jobs( self, connectors: List[Connector], syncs_interval: int ) -> None: - sync_logs = self._get_all_connector_sync_logs(syncs_interval) + connector_ids = [connector.connector_id for connector in connectors] + sync_logs = self._get_all_connector_sync_logs( + syncs_interval, connector_ids=connector_ids + ) for connector in connectors: connector.jobs = self._get_jobs_list(sync_logs.get(connector.connector_id)) diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py index d965f53ff554b3..c4680b4b1037a2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py @@ -37,14 +37,32 @@ def get_users_query(self) -> str: def get_sync_logs_query(self) -> str: return """ - SELECT connector_id, - sync_id, - message_event, - message_data, - time_stamp - FROM {db_clause}log - WHERE message_event in ('sync_start', 'sync_end') - and time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days'""" + WITH ranked_syncs AS ( + SELECT + connector_id, + sync_id, + MAX(CASE WHEN message_event = 'sync_start' THEN time_stamp END) as start_time, + MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time, + 
MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data, + ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY MAX(time_stamp) DESC) as rn + FROM {db_clause}log + WHERE message_event in ('sync_start', 'sync_end') + AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days' + AND connector_id IN ({connector_ids}) + GROUP BY connector_id, sync_id + ) + SELECT + connector_id, + sync_id, + start_time, + end_time, + end_message_data + FROM ranked_syncs + WHERE rn <= {max_jobs_per_connector} + AND start_time IS NOT NULL + AND end_time IS NOT NULL + ORDER BY connector_id, end_time DESC + """ def get_table_lineage_query(self) -> str: return f""" diff --git a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py index 0f5d098ee39c4a..33ac09e69a3c0a 100644 --- a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py +++ b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py @@ -101,64 +101,32 @@ def default_query_results( } ] elif query == fivetran_log_query.get_sync_logs_query().format( - db_clause=fivetran_log_query.db_clause, syncs_interval=7 + db_clause=fivetran_log_query.db_clause, + syncs_interval=7, + max_jobs_per_connector=1000, + connector_ids="'calendar_elected'", ): return [ { "connector_id": "calendar_elected", "sync_id": "4c9a03d6-eded-4422-a46a-163266e58243", - "message_event": "sync_start", - "message_data": None, - "time_stamp": datetime.datetime(2023, 9, 20, 6, 37, 32, 606000), + "start_time": datetime.datetime(2023, 9, 20, 6, 37, 32, 606000), + "end_time": datetime.datetime(2023, 9, 20, 6, 38, 5, 56000), + "end_message_data": '"{\\"status\\":\\"SUCCESSFUL\\"}"', }, { "connector_id": "calendar_elected", "sync_id": "f773d1e9-c791-48f4-894f-8cf9b3dfc834", - "message_event": "sync_start", - "message_data": None, - "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 30, 345000), + "start_time": datetime.datetime(2023, 10, 3, 14, 35, 30, 345000), + "end_time": datetime.datetime(2023, 10, 3, 14, 35, 31, 512000), + "end_message_data": '"{\\"reason\\":\\"Sync has been cancelled because of a user action in the dashboard.Standard Config updated.\\",\\"status\\":\\"CANCELED\\"}"', }, { "connector_id": "calendar_elected", "sync_id": "63c2fc85-600b-455f-9ba0-f576522465be", - "message_event": "sync_start", - "message_data": None, - "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 55, 401000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "e773e1e9-c791-46f4-894f-8ch9b3dfc832", - "message_event": "sync_start", - "message_data": None, - "time_stamp": datetime.datetime(2023, 10, 3, 14, 37, 5, 403000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "4c9a03d6-eded-4422-a46a-163266e58243", - "message_event": "sync_end", - "message_data": '"{\\"status\\":\\"SUCCESSFUL\\"}"', - "time_stamp": datetime.datetime(2023, 9, 20, 6, 38, 5, 56000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "f773d1e9-c791-48f4-894f-8cf9b3dfc834", - "message_event": "sync_end", - "message_data": '"{\\"reason\\":\\"Sync has been cancelled because of a user action in the dashboard.Standard Config updated.\\",\\"status\\":\\"CANCELED\\"}"', - "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 31, 512000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "63c2fc85-600b-455f-9ba0-f576522465be", - "message_event": "sync_end", - "message_data": '"{\\"reason\\":\\"java.lang.RuntimeException: FATAL: too many connections for role 
\\\\\\"hxwraqld\\\\\\"\\",\\"taskType\\":\\"reconnect\\",\\"status\\":\\"FAILURE_WITH_TASK\\"}"', - "time_stamp": datetime.datetime(2023, 10, 3, 14, 36, 29, 678000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "e773e1e9-c791-46f4-894f-8ch9b3dfc832", - "message_event": "sync_end", - "message_data": None, - "time_stamp": datetime.datetime(2023, 10, 3, 14, 37, 35, 478000), + "start_time": datetime.datetime(2023, 10, 3, 14, 35, 55, 401000), + "end_time": datetime.datetime(2023, 10, 3, 14, 36, 29, 678000), + "end_message_data": '"{\\"reason\\":\\"java.lang.RuntimeException: FATAL: too many connections for role \\\\\\"hxwraqld\\\\\\"\\",\\"taskType\\":\\"reconnect\\",\\"status\\":\\"FAILURE_WITH_TASK\\"}"', }, ] # Unreachable code From df7456564810eea07fff0b74112c5cba0951fc0a Mon Sep 17 00:00:00 2001 From: Shirshanka Das Date: Sat, 19 Oct 2024 14:53:28 -0700 Subject: [PATCH 25/31] feat(sdk):platform-resource - complex queries (#11675) --- .../platformresource/platform_resource.py | 193 ++++++------ .../src/datahub/utilities/openapi_utils.py | 69 +++++ .../src/datahub/utilities/search_utils.py | 285 ++++++++++++++++++ .../test_platform_resource.py | 15 + .../tests/unit/utilities/test_search_utils.py | 71 +++++ .../test_platform_resource.py | 78 ++++- 6 files changed, 617 insertions(+), 94 deletions(-) create mode 100644 metadata-ingestion/src/datahub/utilities/openapi_utils.py create mode 100644 metadata-ingestion/src/datahub/utilities/search_utils.py create mode 100644 metadata-ingestion/tests/unit/utilities/test_search_utils.py diff --git a/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py b/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py index 349b0ff11d84f7..0f7b10a067053a 100644 --- a/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py +++ b/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py @@ -1,5 +1,5 @@ import logging -from typing import Dict, Iterable, List, Optional, Union +from typing import Callable, Dict, Iterable, List, Optional, Tuple, Type, Union, cast from avrogen.dict_wrapper import DictWrapper from pydantic import BaseModel @@ -14,7 +14,14 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import DatahubKey from datahub.ingestion.graph.client import DataHubGraph -from datahub.metadata.urns import PlatformResourceUrn +from datahub.metadata.urns import DataPlatformUrn, PlatformResourceUrn, Urn +from datahub.utilities.openapi_utils import OpenAPIGraphClient +from datahub.utilities.search_utils import ( + ElasticDocumentQuery, + ElasticsearchQueryBuilder, + LogicalOperator, + SearchField, +) logger = logging.getLogger(__name__) @@ -69,71 +76,75 @@ def to_resource_info(self) -> models.PlatformResourceInfoClass: ) -class OpenAPIGraphClient: +class DataPlatformInstanceUrn: + """ + A simple implementation of a URN class for DataPlatformInstance. + Since this is not present in the URN registry, we need to implement it here. 
+ """ - ENTITY_KEY_ASPECT_MAP = { - aspect_type.ASPECT_INFO.get("keyForEntity"): name - for name, aspect_type in models.ASPECT_NAME_MAP.items() - if aspect_type.ASPECT_INFO.get("keyForEntity") - } + @staticmethod + def create_from_id(platform_instance_urn: str) -> Urn: + if platform_instance_urn.startswith("urn:li:platformInstance:"): + string_urn = platform_instance_urn + else: + string_urn = f"urn:li:platformInstance:{platform_instance_urn}" + return Urn.from_string(string_urn) - def __init__(self, graph: DataHubGraph): - self.graph = graph - self.openapi_base = graph._gms_server.rstrip("/") + "/openapi/v3" - def scroll_urns_by_filter( - self, - entity_type: str, - extra_or_filters: List[Dict[str, str]], - extra_and_filters: List[Dict[str, str]] = [], - ) -> Iterable[str]: - """ - Scroll through all urns that match the given filters - """ +class UrnSearchField(SearchField): + """ + A search field that supports URN values. + TODO: Move this to search_utils after we make this more generic. + """ - key_aspect = self.ENTITY_KEY_ASPECT_MAP.get(entity_type) - assert key_aspect, f"No key aspect found for entity type {entity_type}" - if extra_or_filters and extra_and_filters: - raise ValueError( - "Only one of extra_or_filters and extra_and_filters should be provided" - ) + def __init__(self, field_name: str, urn_value_extractor: Callable[[str], Urn]): + self.urn_value_extractor = urn_value_extractor + super().__init__(field_name) - count = 1000 - query = ( - " OR ".join( - [ - f"{filter['field']}:\"{filter['value']}\"" - for filter in extra_or_filters - ] - ) - if extra_or_filters - else " AND ".join( - [ - f"{filter['field']}:\"{filter['value']}\"" - for filter in extra_and_filters - ] - ) + def get_search_value(self, value: str) -> str: + return str(self.urn_value_extractor(value)) + + +class PlatformResourceSearchField(SearchField): + def __init__(self, field_name: str): + super().__init__(field_name) + + @classmethod + def from_search_field( + cls, search_field: SearchField + ) -> "PlatformResourceSearchField": + # pretends to be a class method, but just returns the input + return search_field # type: ignore + + +class PlatformResourceSearchFields: + PRIMARY_KEY = PlatformResourceSearchField("primaryKey") + RESOURCE_TYPE = PlatformResourceSearchField("resourceType") + SECONDARY_KEYS = PlatformResourceSearchField("secondaryKeys") + PLATFORM = PlatformResourceSearchField.from_search_field( + UrnSearchField( + field_name="platform.keyword", + urn_value_extractor=DataPlatformUrn.create_from_id, ) - scroll_id = None - while True: - response = self.graph._get_generic( - self.openapi_base + f"/entity/{entity_type.lower()}", - params={ - "systemMetadata": "false", - "includeSoftDelete": "false", - "skipCache": "false", - "aspects": [key_aspect], - "scrollId": scroll_id, - "count": count, - "query": query, - }, - ) - entities = response.get("entities", []) - scroll_id = response.get("scrollId") - for entity in entities: - yield entity["urn"] - if not scroll_id: - break + ) + PLATFORM_INSTANCE = PlatformResourceSearchField.from_search_field( + UrnSearchField( + field_name="platformInstance.keyword", + urn_value_extractor=DataPlatformInstanceUrn.create_from_id, + ) + ) + + +class ElasticPlatformResourceQuery(ElasticDocumentQuery[PlatformResourceSearchField]): + def __init__(self): + super().__init__() + + @classmethod + def create_from( + cls: Type["ElasticPlatformResourceQuery"], + *args: Tuple[Union[str, PlatformResourceSearchField], str], + ) -> "ElasticPlatformResourceQuery": + return 
cast(ElasticPlatformResourceQuery, super().create_from(*args)) class PlatformResource(BaseModel): @@ -147,6 +158,12 @@ def remove( cls, key: PlatformResourceKey, ) -> "PlatformResource": + """ + Creates a PlatformResource object with the removed status set to True. + Removed PlatformResource objects are used to soft-delete resources from + the graph. + To hard-delete a resource, use the delete method. + """ return cls( id=key.id, removed=True, @@ -240,28 +257,38 @@ def from_datahub( @staticmethod def search_by_key( - graph_client: DataHubGraph, key: str, primary: bool = True + graph_client: DataHubGraph, + key: str, + primary: bool = True, + is_exact: bool = True, ) -> Iterable["PlatformResource"]: - extra_or_filters = [] - extra_or_filters.append( - { - "field": "primaryKey", - "condition": "EQUAL", - "value": key, - } + """ + Searches for PlatformResource entities by primary or secondary key. + + :param graph_client: DataHubGraph client + :param key: The key to search for + :param primary: Whether to search for primary only or expand the search + to secondary keys (default: True) + :param is_exact: Whether to search for an exact match (default: True) + :return: An iterable of PlatformResource objects + """ + + elastic_platform_resource_group = ( + ElasticPlatformResourceQuery.create_from() + .group(LogicalOperator.OR) + .add_field_match( + PlatformResourceSearchFields.PRIMARY_KEY, key, is_exact=is_exact + ) ) if not primary: # we expand the search to secondary keys - extra_or_filters.append( - { - "field": "secondaryKeys", - "condition": "EQUAL", - "value": key, - } + elastic_platform_resource_group.add_field_match( + PlatformResourceSearchFields.SECONDARY_KEYS, key, is_exact=is_exact ) + query = elastic_platform_resource_group.end() openapi_client = OpenAPIGraphClient(graph_client) for urn in openapi_client.scroll_urns_by_filter( entity_type="platformResource", - extra_or_filters=extra_or_filters, + query=query, ): platform_resource = PlatformResource.from_datahub(graph_client, urn) if platform_resource: @@ -273,18 +300,16 @@ def delete(self, graph_client: DataHubGraph, hard: bool = True) -> None: @staticmethod def search_by_filters( graph_client: DataHubGraph, - and_filters: List[Dict[str, str]] = [], - or_filters: List[Dict[str, str]] = [], + query: Union[ + ElasticPlatformResourceQuery, + ElasticDocumentQuery, + ElasticsearchQueryBuilder, + ], ) -> Iterable["PlatformResource"]: - if and_filters and or_filters: - raise ValueError( - "Only one of and_filters and or_filters should be provided" - ) openapi_client = OpenAPIGraphClient(graph_client) for urn in openapi_client.scroll_urns_by_filter( entity_type="platformResource", - extra_or_filters=or_filters if or_filters else [], - extra_and_filters=and_filters if and_filters else [], + query=query, ): platform_resource = PlatformResource.from_datahub(graph_client, urn) if platform_resource: diff --git a/metadata-ingestion/src/datahub/utilities/openapi_utils.py b/metadata-ingestion/src/datahub/utilities/openapi_utils.py new file mode 100644 index 00000000000000..e704ff7f84cbbc --- /dev/null +++ b/metadata-ingestion/src/datahub/utilities/openapi_utils.py @@ -0,0 +1,69 @@ +import logging +from typing import Iterable, Union + +import datahub.metadata.schema_classes as models +from datahub.ingestion.graph.client import DataHubGraph +from datahub.utilities.search_utils import ( + ElasticDocumentQuery, + ElasticsearchQueryBuilder, +) + +logger = logging.getLogger(__name__) + + +class OpenAPIGraphClient: + """ + An experimental client for the 
DataHubGraph that uses the OpenAPI endpoints + to query entities and aspects. + Does not support all features of the DataHubGraph. + API is subject to change. + + DO NOT USE THIS UNLESS YOU KNOW WHAT YOU ARE DOING. + """ + + ENTITY_KEY_ASPECT_MAP = { + aspect_type.ASPECT_INFO.get("keyForEntity"): name + for name, aspect_type in models.ASPECT_NAME_MAP.items() + if aspect_type.ASPECT_INFO.get("keyForEntity") + } + + def __init__(self, graph: DataHubGraph): + self.graph = graph + self.openapi_base = graph._gms_server.rstrip("/") + "/openapi/v3" + + def scroll_urns_by_filter( + self, + entity_type: str, + query: Union[ElasticDocumentQuery, ElasticsearchQueryBuilder], + ) -> Iterable[str]: + """ + Scroll through all urns that match the given filters. + + """ + + key_aspect = self.ENTITY_KEY_ASPECT_MAP.get(entity_type) + assert key_aspect, f"No key aspect found for entity type {entity_type}" + + count = 1000 + string_query = query.build() + scroll_id = None + logger.debug(f"Scrolling with query: {string_query}") + while True: + response = self.graph._get_generic( + self.openapi_base + f"/entity/{entity_type.lower()}", + params={ + "systemMetadata": "false", + "includeSoftDelete": "false", + "skipCache": "false", + "aspects": [key_aspect], + "scrollId": scroll_id, + "count": count, + "query": string_query, + }, + ) + entities = response.get("entities", []) + scroll_id = response.get("scrollId") + for entity in entities: + yield entity["urn"] + if not scroll_id: + break diff --git a/metadata-ingestion/src/datahub/utilities/search_utils.py b/metadata-ingestion/src/datahub/utilities/search_utils.py new file mode 100644 index 00000000000000..0bd88addd86600 --- /dev/null +++ b/metadata-ingestion/src/datahub/utilities/search_utils.py @@ -0,0 +1,285 @@ +import logging +import re +from enum import Enum +from typing import Generic, List, Optional, Tuple, Type, TypeVar, Union + +logger = logging.getLogger(__name__) + + +class LogicalOperator(Enum): + AND = "AND" + OR = "OR" + + +class SearchField: + def __init__(self, field_name: str): + self.field_name = field_name + + def get_search_value(self, value: str) -> str: + return value + + def __str__(self) -> str: + return self.field_name + + def __repr__(self) -> str: + return self.__str__() + + @classmethod + def from_string_field(cls, field_name: str) -> "SearchField": + return cls(field_name) + + +class QueryNode: + def __init__(self, operator: Optional[LogicalOperator] = None): + self.operator = operator + self.children: List[Union[QueryNode, str]] = [] + + def add_child(self, child: Union["QueryNode", str]) -> None: + self.children.append(child) + + def build(self) -> str: + if not self.children: + return "" + + if self.operator is None: + return ( + self.children[0] + if isinstance(self.children[0], str) + else self.children[0].build() + ) + + child_queries = [] + for child in self.children: + if isinstance(child, str): + child_queries.append(child) + else: + child_queries.append(child.build()) + + joined_queries = f" {self.operator.value} ".join(child_queries) + return f"({joined_queries})" if len(child_queries) > 1 else joined_queries + + +class ElasticsearchQueryBuilder: + SPECIAL_CHARACTERS = r'+-=&|> None: + self.root = QueryNode(operator=operator) + + @classmethod + def escape_special_characters(cls, value: str) -> str: + """ + Escape special characters in the search term. 
+ """ + return re.sub(f"([{re.escape(cls.SPECIAL_CHARACTERS)}])", r"\\\1", value) + + def _create_term( + self, field: SearchField, value: str, is_exact: bool = False + ) -> str: + escaped_value = self.escape_special_characters(field.get_search_value(value)) + field_name: str = field.field_name + if is_exact: + return f'{field_name}:"{escaped_value}"' + return f"{field_name}:{escaped_value}" + + def add_field_match( + self, field: SearchField, value: str, is_exact: bool = True + ) -> "ElasticsearchQueryBuilder": + term = self._create_term(field, value, is_exact) + self.root.add_child(term) + return self + + def add_field_not_match( + self, field: SearchField, value: str, is_exact: bool = True + ) -> "ElasticsearchQueryBuilder": + term = f"-{self._create_term(field, value, is_exact)}" + self.root.add_child(term) + return self + + def add_range( + self, + field: str, + min_value: Optional[str] = None, + max_value: Optional[str] = None, + include_min: bool = True, + include_max: bool = True, + ) -> "ElasticsearchQueryBuilder": + min_bracket = "[" if include_min else "{" + max_bracket = "]" if include_max else "}" + min_val = min_value if min_value is not None else "*" + max_val = max_value if max_value is not None else "*" + range_query = f"{field}:{min_bracket}{min_val} TO {max_val}{max_bracket}" + self.root.add_child(range_query) + return self + + def add_wildcard(self, field: str, pattern: str) -> "ElasticsearchQueryBuilder": + wildcard_query = f"{field}:{pattern}" + self.root.add_child(wildcard_query) + return self + + def add_fuzzy( + self, field: str, value: str, fuzziness: int = 2 + ) -> "ElasticsearchQueryBuilder": + fuzzy_query = f"{field}:{value}~{fuzziness}" + self.root.add_child(fuzzy_query) + return self + + def add_boost( + self, field: str, value: str, boost: float + ) -> "ElasticsearchQueryBuilder": + boosted_query = f"{field}:{value}^{boost}" + self.root.add_child(boosted_query) + return self + + def group(self, operator: LogicalOperator) -> "QueryGroup": + return QueryGroup(self, operator) + + def build(self) -> str: + return self.root.build() + + +class QueryGroup: + def __init__(self, parent: ElasticsearchQueryBuilder, operator: LogicalOperator): + self.parent = parent + self.node = QueryNode(operator) + self.parent.root.add_child(self.node) + + def add_field_match( + self, field: Union[str, SearchField], value: str, is_exact: bool = True + ) -> "QueryGroup": + if isinstance(field, str): + field = SearchField.from_string_field(field) + term = self.parent._create_term(field, value, is_exact) + self.node.add_child(term) + return self + + def add_field_not_match( + self, field: Union[str, SearchField], value: str, is_exact: bool = True + ) -> "QueryGroup": + if isinstance(field, str): + field = SearchField.from_string_field(field) + term = f"-{self.parent._create_term(field, value, is_exact)}" + self.node.add_child(term) + return self + + def add_range( + self, + field: str, + min_value: Optional[str] = None, + max_value: Optional[str] = None, + include_min: bool = True, + include_max: bool = True, + ) -> "QueryGroup": + min_bracket = "[" if include_min else "{" + max_bracket = "]" if include_max else "}" + min_val = min_value if min_value is not None else "*" + max_val = max_value if max_value is not None else "*" + range_query = f"{field}:{min_bracket}{min_val} TO {max_val}{max_bracket}" + self.node.add_child(range_query) + return self + + def add_wildcard(self, field: str, pattern: str) -> "QueryGroup": + wildcard_query = f"{field}:{pattern}" + 
self.node.add_child(wildcard_query) + return self + + def add_fuzzy(self, field: str, value: str, fuzziness: int = 2) -> "QueryGroup": + fuzzy_query = f"{field}:{value}~{fuzziness}" + self.node.add_child(fuzzy_query) + return self + + def add_boost(self, field: str, value: str, boost: float) -> "QueryGroup": + boosted_query = f"{field}:{value}^{boost}" + self.node.add_child(boosted_query) + return self + + def group(self, operator: LogicalOperator) -> "QueryGroup": + new_group = QueryGroup(self.parent, operator) + self.node.add_child(new_group.node) + return new_group + + def end(self) -> ElasticsearchQueryBuilder: + return self.parent + + +SF = TypeVar("SF", bound=SearchField) + + +class ElasticDocumentQuery(Generic[SF]): + def __init__(self) -> None: + self.query_builder = ElasticsearchQueryBuilder() + + @classmethod + def create_from( + cls: Type["ElasticDocumentQuery[SF]"], + *args: Tuple[Union[str, SF], str], + ) -> "ElasticDocumentQuery[SF]": + instance = cls() + for arg in args: + if isinstance(arg, SearchField): + # If the value is empty, we treat it as a wildcard search + logger.info(f"Adding wildcard search for field {arg}") + instance.add_wildcard(arg, "*") + elif isinstance(arg, tuple) and len(arg) == 2: + field, value = arg + assert isinstance(value, str) + if isinstance(field, SearchField): + instance.add_field_match(field, value) + elif isinstance(field, str): + instance.add_field_match( + SearchField.from_string_field(field), value + ) + else: + raise ValueError("Invalid field type {}".format(type(field))) + return instance + + def add_field_match( + self, field: Union[str, SearchField], value: str, is_exact: bool = True + ) -> "ElasticDocumentQuery": + if isinstance(field, str): + field = SearchField.from_string_field(field) + self.query_builder.add_field_match(field, value, is_exact) + return self + + def add_field_not_match( + self, field: SearchField, value: str, is_exact: bool = True + ) -> "ElasticDocumentQuery": + self.query_builder.add_field_not_match(field, value, is_exact) + return self + + def add_range( + self, + field: SearchField, + min_value: Optional[str] = None, + max_value: Optional[str] = None, + include_min: bool = True, + include_max: bool = True, + ) -> "ElasticDocumentQuery": + field_name: str = field.field_name # type: ignore + self.query_builder.add_range( + field_name, min_value, max_value, include_min, include_max + ) + return self + + def add_wildcard(self, field: SearchField, pattern: str) -> "ElasticDocumentQuery": + field_name: str = field.field_name # type: ignore + self.query_builder.add_wildcard(field_name, pattern) + return self + + def add_fuzzy( + self, field: SearchField, value: str, fuzziness: int = 2 + ) -> "ElasticDocumentQuery": + field_name: str = field.field_name # type: ignore + self.query_builder.add_fuzzy(field_name, value, fuzziness) + return self + + def add_boost( + self, field: SearchField, value: str, boost: float + ) -> "ElasticDocumentQuery": + self.query_builder.add_boost(field.field_name, value, boost) + return self + + def group(self, operator: LogicalOperator) -> QueryGroup: + return self.query_builder.group(operator) + + def build(self) -> str: + return self.query_builder.build() diff --git a/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py b/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py index e6c9a9466d62b4..a84e373dbe72c2 100644 --- a/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py +++ 
b/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py @@ -4,9 +4,12 @@ import datahub.metadata.schema_classes as models from datahub.api.entities.platformresource.platform_resource import ( + ElasticPlatformResourceQuery, PlatformResource, PlatformResourceKey, + PlatformResourceSearchFields, ) +from datahub.utilities.search_utils import LogicalOperator def test_platform_resource_dict(): @@ -179,3 +182,15 @@ class TestModel(BaseModel): ).encode("utf-8") assert platform_resource_info_mcp.aspect.value.schemaType == "JSON" assert platform_resource_info_mcp.aspect.value.schemaRef == TestModel.__name__ + + +def test_platform_resource_filters(): + + query = ( + ElasticPlatformResourceQuery.create_from() + .group(LogicalOperator.AND) + .add_field_match(PlatformResourceSearchFields.PRIMARY_KEY, "test_1") + .add_field_match(PlatformResourceSearchFields.RESOURCE_TYPE, "server") + .end() + ) + assert query.build() == '(primaryKey:"test_1" AND resourceType:"server")' diff --git a/metadata-ingestion/tests/unit/utilities/test_search_utils.py b/metadata-ingestion/tests/unit/utilities/test_search_utils.py new file mode 100644 index 00000000000000..6fa2e46c7f20e8 --- /dev/null +++ b/metadata-ingestion/tests/unit/utilities/test_search_utils.py @@ -0,0 +1,71 @@ +from datahub.utilities.search_utils import ( + ElasticDocumentQuery, + LogicalOperator, + SearchField, +) + + +def test_simple_and_filters(): + query = ( + ElasticDocumentQuery.create_from() + .group(LogicalOperator.AND) + .add_field_match("field1", "value1") + .add_field_match("field2", "value2") + .end() + ) + + assert query.build() == '(field1:"value1" AND field2:"value2")' + + +def test_simple_or_filters(): + query = ( + ElasticDocumentQuery.create_from() + .group(LogicalOperator.OR) + .add_field_match("field1", "value1") + .add_field_match("field2", "value2") + .end() + ) + + assert query.build() == '(field1:"value1" OR field2:"value2")' + + # Use SearchFilter to create this query + query = ( + ElasticDocumentQuery.create_from() + .group(LogicalOperator.OR) + .add_field_match(SearchField.from_string_field("field1"), "value1") + .add_field_match(SearchField.from_string_field("field2"), "value2") + .end() + ) + assert query.build() == '(field1:"value1" OR field2:"value2")' + + +def test_simple_field_match(): + query: ElasticDocumentQuery = ElasticDocumentQuery.create_from( + ("field1", "value1:1") + ) + assert query.build() == 'field1:"value1\\:1"' + + # Another way to create the same query + query = ElasticDocumentQuery.create_from() + query.add_field_match("field1", "value1:1") + assert query.build() == 'field1:"value1\\:1"' + + +def test_negation(): + query = ( + ElasticDocumentQuery.create_from() + .group(LogicalOperator.AND) + .add_field_match("field1", "value1") + .add_field_not_match("field2", "value2") + .end() + ) + + assert query.build() == '(field1:"value1" AND -field2:"value2")' + + +def test_multi_arg_create_from(): + query: ElasticDocumentQuery = ElasticDocumentQuery.create_from( + ("field1", "value1"), + ("field2", "value2"), + ) + assert query.build() == '(field1:"value1" AND field2:"value2")' diff --git a/smoke-test/tests/platform_resources/test_platform_resource.py b/smoke-test/tests/platform_resources/test_platform_resource.py index 7ebfd4d6ea15b4..39d15f2e8dea6d 100644 --- a/smoke-test/tests/platform_resources/test_platform_resource.py +++ b/smoke-test/tests/platform_resources/test_platform_resource.py @@ -5,8 +5,10 @@ import pytest from datahub.api.entities.platformresource.platform_resource 
import ( + ElasticPlatformResourceQuery, PlatformResource, PlatformResourceKey, + PlatformResourceSearchFields, ) from tests.utils import wait_for_healthcheck_util, wait_for_writes_to_sync @@ -42,7 +44,12 @@ def cleanup_resources(graph_client): logger.warning(f"Failed to delete resource: {e}") # Additional cleanup for any resources that might have been missed - for resource in PlatformResource.search_by_key(graph_client, "test_"): + for resource in PlatformResource.search_by_filters( + graph_client, + ElasticPlatformResourceQuery.create_from().add_wildcard( + PlatformResourceSearchFields.PRIMARY_KEY, "test_*" + ), + ): try: resource.delete(graph_client) except Exception as e: @@ -114,7 +121,7 @@ def test_platform_resource_non_existent(graph_client, test_id): assert platform_resource is None -def test_platform_resource_urn_secondary_key(graph_client, test_id): +def test_platform_resource_urn_secondary_key(graph_client, test_id, cleanup_resources): key = PlatformResourceKey( platform=f"test_platform_{test_id}", resource_type=f"test_resource_type_{test_id}", @@ -129,6 +136,7 @@ def test_platform_resource_urn_secondary_key(graph_client, test_id): secondary_keys=[dataset_urn], ) platform_resource.to_datahub(graph_client) + cleanup_resources.append(platform_resource) wait_for_writes_to_sync() read_platform_resources = [ @@ -141,7 +149,9 @@ def test_platform_resource_urn_secondary_key(graph_client, test_id): assert read_platform_resources[0] == platform_resource -def test_platform_resource_listing_by_resource_type(graph_client, test_id): +def test_platform_resource_listing_by_resource_type( + graph_client, test_id, cleanup_resources +): # Generate two resources with the same resource type key1 = PlatformResourceKey( platform=f"test_platform_{test_id}", @@ -171,13 +181,9 @@ def test_platform_resource_listing_by_resource_type(graph_client, test_id): r for r in PlatformResource.search_by_filters( graph_client, - and_filters=[ - { - "field": "resourceType", - "condition": "EQUAL", - "value": key1.resource_type, - } - ], + query=ElasticPlatformResourceQuery.create_from( + (PlatformResourceSearchFields.RESOURCE_TYPE, key1.resource_type) + ), ) ] assert len(search_results) == 2 @@ -186,3 +192,55 @@ def test_platform_resource_listing_by_resource_type(graph_client, test_id): read_platform_resource_2 = next(r for r in search_results if r.id == key2.id) assert read_platform_resource_1 == platform_resource1 assert read_platform_resource_2 == platform_resource2 + + +def test_platform_resource_listing_complex_queries(graph_client, test_id): + # Generate two resources with the same resource type + key1 = PlatformResourceKey( + platform=f"test_platform1_{test_id}", + resource_type=f"test_resource_type_{test_id}", + primary_key=f"test_primary_key_1_{test_id}", + ) + platform_resource1 = PlatformResource.create( + key=key1, + value={"test_key": f"test_value_1_{test_id}"}, + ) + platform_resource1.to_datahub(graph_client) + + key2 = PlatformResourceKey( + platform=f"test_platform2_{test_id}", + resource_type=f"test_resource_type_{test_id}", + primary_key=f"test_primary_key_2_{test_id}", + ) + platform_resource2 = PlatformResource.create( + key=key2, + value={"test_key": f"test_value_2_{test_id}"}, + ) + platform_resource2.to_datahub(graph_client) + + wait_for_writes_to_sync() + from datahub.api.entities.platformresource.platform_resource import ( + ElasticPlatformResourceQuery, + LogicalOperator, + PlatformResourceSearchFields, + ) + + query = ( + ElasticPlatformResourceQuery.create_from() + 
.group(LogicalOperator.AND) + .add_field_match(PlatformResourceSearchFields.RESOURCE_TYPE, key1.resource_type) + .add_field_not_match(PlatformResourceSearchFields.PLATFORM, key1.platform) + .end() + ) + + search_results = [ + r + for r in PlatformResource.search_by_filters( + graph_client, + query=query, + ) + ] + assert len(search_results) == 1 + + read_platform_resource = search_results[0] + assert read_platform_resource == platform_resource2 From 1957753fa287ec17f5e7ab3e6e6d1d84440ed93d Mon Sep 17 00:00:00 2001 From: deepgarg-visa <149145061+deepgarg-visa@users.noreply.github.com> Date: Sun, 20 Oct 2024 06:58:32 +0530 Subject: [PATCH 26/31] fix(docs): fix businessattributes doc (#11653) --- docs/businessattributes.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/businessattributes.md b/docs/businessattributes.md index 1744f48f879e82..3e912e7e609805 100644 --- a/docs/businessattributes.md +++ b/docs/businessattributes.md @@ -28,7 +28,6 @@ Taking the example of "United States- Social Security Number", if an application What you need to create/update and associate business attributes to dataset schema field * **Manage Business Attributes** platform privilege to create/update/delete business attributes. -* **Edit Dataset Column Business Attribute** metadata privilege to associate business attributes to dataset schema field. ## Using Business Attributes As of now Business Attributes can only be created through UI From 9a98e495434a6a73cb2459f2436a2855a987f5b0 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Sun, 20 Oct 2024 23:59:45 -0700 Subject: [PATCH 27/31] feat(ingest/fivetran): add safeguards on table/column lineage (#11674) --- .../ingestion/source/fivetran/config.py | 19 +-- .../ingestion/source/fivetran/data_classes.py | 2 +- .../ingestion/source/fivetran/fivetran.py | 23 ++- .../source/fivetran/fivetran_log_api.py | 86 +++++------ .../source/fivetran/fivetran_query.py | 143 +++++++++++------- .../integration/fivetran/test_fivetran.py | 6 +- 6 files changed, 156 insertions(+), 123 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py index 02eb096b240f52..2fb5ffd16ea34c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py @@ -1,6 +1,6 @@ +import dataclasses import logging -from dataclasses import dataclass, field as dataclass_field -from typing import Dict, List, Optional +from typing import Dict, Optional import pydantic from pydantic import Field, root_validator @@ -23,6 +23,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionConfigBase, ) +from datahub.utilities.lossy_collections import LossyList from datahub.utilities.perf_timer import PerfTimer logger = logging.getLogger(__name__) @@ -114,24 +115,24 @@ def validate_destination_platfrom_and_config(cls, values: Dict) -> Dict: return values -@dataclass +@dataclasses.dataclass class MetadataExtractionPerfReport(Report): - connectors_metadata_extraction_sec: PerfTimer = dataclass_field( + connectors_metadata_extraction_sec: PerfTimer = dataclasses.field( default_factory=PerfTimer ) - connectors_lineage_extraction_sec: PerfTimer = dataclass_field( + connectors_lineage_extraction_sec: PerfTimer = dataclasses.field( default_factory=PerfTimer ) - connectors_jobs_extraction_sec: PerfTimer = dataclass_field( + connectors_jobs_extraction_sec: PerfTimer = dataclasses.field( 
default_factory=PerfTimer ) -@dataclass +@dataclasses.dataclass class FivetranSourceReport(StaleEntityRemovalSourceReport): connectors_scanned: int = 0 - filtered_connectors: List[str] = dataclass_field(default_factory=list) - metadata_extraction_perf: MetadataExtractionPerfReport = dataclass_field( + filtered_connectors: LossyList[str] = dataclasses.field(default_factory=LossyList) + metadata_extraction_perf: MetadataExtractionPerfReport = dataclasses.field( default_factory=MetadataExtractionPerfReport ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py index 18de2b01edd3b7..046aa9efe3f59b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py @@ -24,7 +24,7 @@ class Connector: sync_frequency: int destination_id: str user_id: str - table_lineage: List[TableLineage] + lineage: List[TableLineage] jobs: List["Job"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py index 334bb58ea84f8e..c27ec57c2e99ec 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py @@ -27,9 +27,10 @@ PlatformDetail, ) from datahub.ingestion.source.fivetran.data_classes import Connector, Job -from datahub.ingestion.source.fivetran.fivetran_log_api import ( +from datahub.ingestion.source.fivetran.fivetran_log_api import FivetranLogAPI +from datahub.ingestion.source.fivetran.fivetran_query import ( MAX_JOBS_PER_CONNECTOR, - FivetranLogAPI, + MAX_TABLE_LINEAGE_PER_CONNECTOR, ) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, @@ -106,13 +107,21 @@ def _extend_lineage(self, connector: Connector, datajob: DataJob) -> None: f"Fivetran connector source type: {connector.connector_type} is not supported to mapped with Datahub dataset entity." ) - for table_lineage in connector.table_lineage: + if len(connector.lineage) >= MAX_TABLE_LINEAGE_PER_CONNECTOR: + self.report.warning( + title="Table lineage truncated", + message=f"The connector had more than {MAX_TABLE_LINEAGE_PER_CONNECTOR} table lineage entries. 
" + f"Only the most recent {MAX_TABLE_LINEAGE_PER_CONNECTOR} entries were ingested.", + context=f"{connector.connector_name} (connector_id: {connector.connector_id})", + ) + + for lineage in connector.lineage: input_dataset_urn = DatasetUrn.create_from_ids( platform_id=source_platform, table_name=( - f"{source_database.lower()}.{table_lineage.source_table}" + f"{source_database.lower()}.{lineage.source_table}" if source_database - else table_lineage.source_table + else lineage.source_table ), env=source_platform_detail.env, platform_instance=source_platform_detail.platform_instance, @@ -121,14 +130,14 @@ def _extend_lineage(self, connector: Connector, datajob: DataJob) -> None: output_dataset_urn = DatasetUrn.create_from_ids( platform_id=self.config.fivetran_log_config.destination_platform, - table_name=f"{self.audit_log.fivetran_log_database.lower()}.{table_lineage.destination_table}", + table_name=f"{self.audit_log.fivetran_log_database.lower()}.{lineage.destination_table}", env=destination_platform_detail.env, platform_instance=destination_platform_detail.platform_instance, ) output_dataset_urn_list.append(output_dataset_urn) if self.config.include_column_lineage: - for column_lineage in table_lineage.column_lineage: + for column_lineage in lineage.column_lineage: fine_grained_lineage.append( FineGrainedLineage( upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py index 5908efe39e2b40..b55c8bbbd607fa 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py @@ -1,6 +1,7 @@ import functools import json import logging +from collections import defaultdict from typing import Any, Dict, List, Optional, Tuple import sqlglot @@ -22,10 +23,6 @@ logger: logging.Logger = logging.getLogger(__name__) -# We don't want to generate a massive number of dataProcesses for a single connector. -# This is primarily used as a safeguard to prevent performance issues. 
-MAX_JOBS_PER_CONNECTOR = 1000 - class FivetranLogAPI: def __init__(self, fivetran_log_config: FivetranLogConfig) -> None: @@ -91,55 +88,51 @@ def _query(self, query: str) -> List[Dict]: resp = self.engine.execute(query) return [row for row in resp] - def _get_column_lineage_metadata(self) -> Dict[str, List]: + def _get_column_lineage_metadata(self) -> Dict[Tuple[str, str], List]: """ - Return's dict of column lineage metadata with key as '-' + Returns dict of column lineage metadata with key as (, ) """ - all_column_lineage: Dict[str, List] = {} + all_column_lineage = defaultdict(list) column_lineage_result = self._query( self.fivetran_log_query.get_column_lineage_query() ) for column_lineage in column_lineage_result: - key = f"{column_lineage[Constant.SOURCE_TABLE_ID]}-{column_lineage[Constant.DESTINATION_TABLE_ID]}" - if key not in all_column_lineage: - all_column_lineage[key] = [column_lineage] - else: - all_column_lineage[key].append(column_lineage) - return all_column_lineage + key = ( + column_lineage[Constant.SOURCE_TABLE_ID], + column_lineage[Constant.DESTINATION_TABLE_ID], + ) + all_column_lineage[key].append(column_lineage) + return dict(all_column_lineage) - def _get_connectors_table_lineage_metadata(self) -> Dict[str, List]: + def _get_table_lineage_metadata(self) -> Dict[str, List]: """ - Return's dict of table lineage metadata with key as 'CONNECTOR_ID' + Returns dict of table lineage metadata with key as 'CONNECTOR_ID' """ - connectors_table_lineage_metadata: Dict[str, List] = {} + connectors_table_lineage_metadata = defaultdict(list) table_lineage_result = self._query( self.fivetran_log_query.get_table_lineage_query() ) for table_lineage in table_lineage_result: - if ( + connectors_table_lineage_metadata[ table_lineage[Constant.CONNECTOR_ID] - not in connectors_table_lineage_metadata - ): - connectors_table_lineage_metadata[ - table_lineage[Constant.CONNECTOR_ID] - ] = [table_lineage] - else: - connectors_table_lineage_metadata[ - table_lineage[Constant.CONNECTOR_ID] - ].append(table_lineage) - return connectors_table_lineage_metadata + ].append(table_lineage) + return dict(connectors_table_lineage_metadata) - def _get_table_lineage( + def _extract_connector_lineage( self, - column_lineage_metadata: Dict[str, List], table_lineage_result: Optional[List], + column_lineage_metadata: Dict[Tuple[str, str], List], ) -> List[TableLineage]: table_lineage_list: List[TableLineage] = [] if table_lineage_result is None: return table_lineage_list for table_lineage in table_lineage_result: + # Join the column lineage into the table lineage. 
column_lineage_result = column_lineage_metadata.get( - f"{table_lineage[Constant.SOURCE_TABLE_ID]}-{table_lineage[Constant.DESTINATION_TABLE_ID]}" + ( + table_lineage[Constant.SOURCE_TABLE_ID], + table_lineage[Constant.DESTINATION_TABLE_ID], + ) ) column_lineage_list: List[ColumnLineage] = [] if column_lineage_result: @@ -152,6 +145,7 @@ def _get_table_lineage( ) for column_lineage in column_lineage_result ] + table_lineage_list.append( TableLineage( source_table=f"{table_lineage[Constant.SOURCE_SCHEMA_NAME]}.{table_lineage[Constant.SOURCE_TABLE_NAME]}", @@ -167,14 +161,9 @@ def _get_all_connector_sync_logs( ) -> Dict[str, Dict[str, Dict[str, Tuple[float, Optional[str]]]]]: sync_logs: Dict[str, Dict[str, Dict[str, Tuple[float, Optional[str]]]]] = {} - # Format connector_ids as a comma-separated string of quoted IDs - formatted_connector_ids = ", ".join(f"'{id}'" for id in connector_ids) - - query = self.fivetran_log_query.get_sync_logs_query().format( - db_clause=self.fivetran_log_query.db_clause, + query = self.fivetran_log_query.get_sync_logs_query( syncs_interval=syncs_interval, - max_jobs_per_connector=MAX_JOBS_PER_CONNECTOR, - connector_ids=formatted_connector_ids, + connector_ids=connector_ids, ) for row in self._query(query): @@ -234,13 +223,13 @@ def get_user_email(self, user_id: str) -> Optional[str]: return None return self._get_users().get(user_id) - def _fill_connectors_table_lineage(self, connectors: List[Connector]) -> None: - table_lineage_metadata = self._get_connectors_table_lineage_metadata() + def _fill_connectors_lineage(self, connectors: List[Connector]) -> None: + table_lineage_metadata = self._get_table_lineage_metadata() column_lineage_metadata = self._get_column_lineage_metadata() for connector in connectors: - connector.table_lineage = self._get_table_lineage( - column_lineage_metadata=column_lineage_metadata, + connector.lineage = self._extract_connector_lineage( table_lineage_result=table_lineage_metadata.get(connector.connector_id), + column_lineage_metadata=column_lineage_metadata, ) def _fill_connectors_jobs( @@ -262,6 +251,7 @@ def get_allowed_connectors_list( ) -> List[Connector]: connectors: List[Connector] = [] with report.metadata_extraction_perf.connectors_metadata_extraction_sec: + logger.info("Fetching connector list") connector_list = self._query(self.fivetran_log_query.get_connectors_query()) for connector in connector_list: if not connector_patterns.allowed(connector[Constant.CONNECTOR_NAME]): @@ -279,12 +269,20 @@ def get_allowed_connectors_list( sync_frequency=connector[Constant.SYNC_FREQUENCY], destination_id=connector[Constant.DESTINATION_ID], user_id=connector[Constant.CONNECTING_USER_ID], - table_lineage=[], - jobs=[], + lineage=[], # filled later + jobs=[], # filled later ) ) + + if not connectors: + # Some of our queries don't work well when there's no connectors, since + # we push down connector id filters. 
+ return [] + with report.metadata_extraction_perf.connectors_lineage_extraction_sec: - self._fill_connectors_table_lineage(connectors) + logger.info("Fetching connector lineage") + self._fill_connectors_lineage(connectors) with report.metadata_extraction_perf.connectors_jobs_extraction_sec: + logger.info("Fetching connector job run history") self._fill_connectors_jobs(connectors, syncs_interval) return connectors diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py index c4680b4b1037a2..c9e329b706768f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py @@ -1,3 +1,11 @@ +from typing import List + +# Safeguards to prevent fetching massive amounts of data. +MAX_TABLE_LINEAGE_PER_CONNECTOR = 100 +MAX_COLUMN_LINEAGE_PER_CONNECTOR = 3000 +MAX_JOBS_PER_CONNECTOR = 1000 + + class FivetranLogQuery: # Note: All queries are written in Snowflake SQL. # They will be transpiled to the target database's SQL dialect at runtime. @@ -24,69 +32,88 @@ def get_connectors_query(self) -> str: destination_id FROM {self.db_clause}connector WHERE - _fivetran_deleted = FALSE\ + _fivetran_deleted = FALSE """ def get_users_query(self) -> str: - return f""" - SELECT id as user_id, - given_name, - family_name, - email - FROM {self.db_clause}user""" + return f"""\ +SELECT id as user_id, +given_name, +family_name, +email +FROM {self.db_clause}user +""" - def get_sync_logs_query(self) -> str: - return """ - WITH ranked_syncs AS ( - SELECT - connector_id, - sync_id, - MAX(CASE WHEN message_event = 'sync_start' THEN time_stamp END) as start_time, - MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time, - MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data, - ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY MAX(time_stamp) DESC) as rn - FROM {db_clause}log - WHERE message_event in ('sync_start', 'sync_end') - AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days' - AND connector_id IN ({connector_ids}) - GROUP BY connector_id, sync_id - ) - SELECT - connector_id, - sync_id, - start_time, - end_time, - end_message_data - FROM ranked_syncs - WHERE rn <= {max_jobs_per_connector} - AND start_time IS NOT NULL - AND end_time IS NOT NULL - ORDER BY connector_id, end_time DESC - """ + def get_sync_logs_query( + self, + syncs_interval: int, + connector_ids: List[str], + ) -> str: + # Format connector_ids as a comma-separated string of quoted IDs + formatted_connector_ids = ", ".join(f"'{id}'" for id in connector_ids) + + return f"""\ +WITH ranked_syncs AS ( + SELECT + connector_id, + sync_id, + MAX(CASE WHEN message_event = 'sync_start' THEN time_stamp END) as start_time, + MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time, + MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data, + ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY MAX(time_stamp) DESC) as rn + FROM {self.db_clause}log + WHERE message_event in ('sync_start', 'sync_end') + AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days' + AND connector_id IN ({formatted_connector_ids}) + GROUP BY connector_id, sync_id +) +SELECT + connector_id, + sync_id, + start_time, + end_time, + end_message_data +FROM ranked_syncs +WHERE rn <= {MAX_JOBS_PER_CONNECTOR} + AND start_time IS NOT NULL + AND end_time IS NOT NULL 
+ORDER BY connector_id, end_time DESC +""" def get_table_lineage_query(self) -> str: - return f""" - SELECT stm.connector_id as connector_id, - stm.id as source_table_id, - stm.name as source_table_name, - ssm.name as source_schema_name, - dtm.id as destination_table_id, - dtm.name as destination_table_name, - dsm.name as destination_schema_name - FROM {self.db_clause}table_lineage as tl - JOIN {self.db_clause}source_table_metadata as stm on tl.source_table_id = stm.id - JOIN {self.db_clause}destination_table_metadata as dtm on tl.destination_table_id = dtm.id - JOIN {self.db_clause}source_schema_metadata as ssm on stm.schema_id = ssm.id - JOIN {self.db_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id""" + return f"""\ +SELECT + stm.connector_id as connector_id, + stm.id as source_table_id, + stm.name as source_table_name, + ssm.name as source_schema_name, + dtm.id as destination_table_id, + dtm.name as destination_table_name, + dsm.name as destination_schema_name +FROM {self.db_clause}table_lineage as tl +JOIN {self.db_clause}source_table_metadata as stm on tl.source_table_id = stm.id +JOIN {self.db_clause}destination_table_metadata as dtm on tl.destination_table_id = dtm.id +JOIN {self.db_clause}source_schema_metadata as ssm on stm.schema_id = ssm.id +JOIN {self.db_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id +QUALIFY ROW_NUMBER() OVER (PARTITION BY stm.connector_id ORDER BY tl.created_at DESC) <= {MAX_TABLE_LINEAGE_PER_CONNECTOR} +ORDER BY stm.connector_id, tl.created_at DESC +""" def get_column_lineage_query(self) -> str: - return f""" - SELECT scm.table_id as source_table_id, - dcm.table_id as destination_table_id, - scm.name as source_column_name, - dcm.name as destination_column_name - FROM {self.db_clause}column_lineage as cl - JOIN {self.db_clause}source_column_metadata as scm - on cl.source_column_id = scm.id - JOIN {self.db_clause}destination_column_metadata as dcm - on cl.destination_column_id = dcm.id""" + return f"""\ +SELECT + scm.table_id as source_table_id, + dcm.table_id as destination_table_id, + scm.name as source_column_name, + dcm.name as destination_column_name +FROM {self.db_clause}column_lineage as cl +JOIN {self.db_clause}source_column_metadata as scm + ON cl.source_column_id = scm.id +JOIN {self.db_clause}destination_column_metadata as dcm + ON cl.destination_column_id = dcm.id +-- Only joining source_table_metadata to get the connector_id. 
+JOIN {self.db_clause}source_table_metadata as stm + ON scm.table_id = stm.id +QUALIFY ROW_NUMBER() OVER (PARTITION BY stm.connector_id ORDER BY cl.created_at DESC) <= {MAX_COLUMN_LINEAGE_PER_CONNECTOR} +ORDER BY stm.connector_id, cl.created_at DESC +""" diff --git a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py index 33ac09e69a3c0a..e72162b12e48fd 100644 --- a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py +++ b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py @@ -100,11 +100,9 @@ def default_query_results( "email": "abc.xyz@email.com", } ] - elif query == fivetran_log_query.get_sync_logs_query().format( - db_clause=fivetran_log_query.db_clause, + elif query == fivetran_log_query.get_sync_logs_query( syncs_interval=7, - max_jobs_per_connector=1000, - connector_ids="'calendar_elected'", + connector_ids=["calendar_elected"], ): return [ { From 725d76a8ff8084865d6407c63bcf40ef813a72b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20L=C3=BCdin?= <13187726+Masterchen09@users.noreply.github.com> Date: Mon, 21 Oct 2024 09:00:09 +0200 Subject: [PATCH 28/31] fix(ui): show DataHub logo for DataHub sources in ingestion souces list (#11658) Co-authored-by: Shirshanka Das --- .../src/app/ingest/source/builder/constants.ts | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/datahub-web-react/src/app/ingest/source/builder/constants.ts b/datahub-web-react/src/app/ingest/source/builder/constants.ts index b67ca388c10546..0e0ba8b22e37ef 100644 --- a/datahub-web-react/src/app/ingest/source/builder/constants.ts +++ b/datahub-web-react/src/app/ingest/source/builder/constants.ts @@ -35,6 +35,7 @@ import csvLogo from '../../../../images/csv-logo.png'; import qlikLogo from '../../../../images/qliklogo.png'; import sigmaLogo from '../../../../images/sigmalogo.png'; import sacLogo from '../../../../images/saclogo.svg'; +import datahubLogo from '../../../../images/datahublogo.png'; export const ATHENA = 'athena'; export const ATHENA_URN = `urn:li:dataPlatform:${ATHENA}`; @@ -125,6 +126,11 @@ export const SIGMA = 'sigma'; export const SIGMA_URN = `urn:li:dataPlatform:${SIGMA}`; export const SAC = 'sac'; export const SAC_URN = `urn:li:dataPlatform:${SAC}`; +export const DATAHUB = 'datahub'; +export const DATAHUB_GC = 'datahub-gc'; +export const DATAHUB_LINEAGE_FILE = 'datahub-lineage-file'; +export const DATAHUB_BUSINESS_GLOSSARY = 'datahub-business-glossary'; +export const DATAHUB_URN = `urn:li:dataPlatform:${DATAHUB}`; export const PLATFORM_URN_TO_LOGO = { [ATHENA_URN]: athenaLogo, @@ -165,6 +171,7 @@ export const PLATFORM_URN_TO_LOGO = { [QLIK_SENSE_URN]: qlikLogo, [SIGMA_URN]: sigmaLogo, [SAC_URN]: sacLogo, + [DATAHUB_URN]: datahubLogo, }; export const SOURCE_TO_PLATFORM_URN = { @@ -178,5 +185,7 @@ export const SOURCE_TO_PLATFORM_URN = { [SNOWFLAKE_USAGE]: SNOWFLAKE_URN, [STARBURST_TRINO_USAGE]: TRINO_URN, [DBT_CLOUD]: DBT_URN, - [VERTICA]: VERTICA_URN, + [DATAHUB_GC]: DATAHUB_URN, + [DATAHUB_LINEAGE_FILE]: DATAHUB_URN, + [DATAHUB_BUSINESS_GLOSSARY]: DATAHUB_URN, }; From 4adfba1a45ac91b48afd9da2bbfa705417a074ef Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 28 Jun 2024 15:08:48 -0700 Subject: [PATCH 29/31] build(deps): bump express from 4.18.2 to 4.19.2 in /docs-website (#10128) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> From 
dd487de00bf84405bae5c73a96690efc86a765eb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 28 Jun 2024 15:08:48 -0700 Subject: [PATCH 30/31] build(deps): bump express from 4.18.2 to 4.19.2 in /docs-website (#10128) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> From 76386ae85885d0c3d08e6ed99537bead01508a41 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 28 Jun 2024 15:08:48 -0700 Subject: [PATCH 31/31] build(deps): bump express from 4.18.2 to 4.19.2 in /docs-website (#10128) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
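
A minimal usage sketch of the complex-query API introduced in [PATCH 25/31], mirroring the patterns exercised by its unit and smoke tests above. The GMS server URL and the DataHubGraph/DatahubClientConfig connection setup are illustrative assumptions and are not part of the patches; the ElasticPlatformResourceQuery builder and PlatformResource.search_by_filters calls follow the code added in that patch.

    from datahub.api.entities.platformresource.platform_resource import (
        ElasticPlatformResourceQuery,
        PlatformResource,
        PlatformResourceSearchFields,
    )
    from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig
    from datahub.utilities.search_utils import LogicalOperator

    # Connect to a DataHub GMS instance (URL is a placeholder for illustration).
    graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

    # Build a structured query instead of hand-assembled and_filters/or_filters dicts:
    # resourceType == "server" AND primaryKey == "my_key".
    query = (
        ElasticPlatformResourceQuery.create_from()
        .group(LogicalOperator.AND)
        .add_field_match(PlatformResourceSearchFields.RESOURCE_TYPE, "server")
        .add_field_match(PlatformResourceSearchFields.PRIMARY_KEY, "my_key")
        .end()
    )

    # The builder renders to an Elasticsearch query string under the hood.
    print(query.build())  # (resourceType:"server" AND primaryKey:"my_key")

    # Scroll through matching platformResource entities via the OpenAPI-backed search.
    for resource in PlatformResource.search_by_filters(graph, query=query):
        print(resource.id)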