From 5ecfa89cedac543a248837d7eb6368ecff4ef180 Mon Sep 17 00:00:00 2001
From: treff7es <treff7es@gmail.com>
Date: Wed, 6 Nov 2024 15:24:37 +0100
Subject: [PATCH] Fix performance issue with profiling by not adding anymore
 the column patterns but deny explicitly column types

---
 .../source/bigquery_v2/bigquery_schema.py     | 33 +++++++++++--------
 .../source/bigquery_v2/bigquery_schema_gen.py | 29 +++-------------
 .../ingestion/source/bigquery_v2/profiler.py  |  6 ----
 .../ingestion/source/ge_data_profiler.py      |  2 ++
 4 files changed, 27 insertions(+), 43 deletions(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py
index 58317b108bef4..836fda8a0ca3a 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py
@@ -118,7 +118,6 @@ class BigqueryTable(BaseTable):
     active_billable_bytes: Optional[int] = None
     long_term_billable_bytes: Optional[int] = None
     partition_info: Optional[PartitionInfo] = None
-    columns_ignore_from_profiling: List[str] = field(default_factory=list)
     external: bool = False
     constraints: List[BigqueryTableConstraint] = field(default_factory=list)
     table_type: Optional[str] = None
@@ -541,18 +540,26 @@ def get_table_constraints_for_dataset(
                         table_name=constraint.table_name,
                         type=constraint.constraint_type,
                         field_path=constraint.column_name,
-                        referenced_project_id=constraint.referenced_catalog
-                        if constraint.constraint_type == "FOREIGN KEY"
-                        else None,
-                        referenced_dataset=constraint.referenced_schema
-                        if constraint.constraint_type == "FOREIGN KEY"
-                        else None,
-                        referenced_table_name=constraint.referenced_table
-                        if constraint.constraint_type == "FOREIGN KEY"
-                        else None,
-                        referenced_column_name=constraint.referenced_column
-                        if constraint.constraint_type == "FOREIGN KEY"
-                        else None,
+                        referenced_project_id=(
+                            constraint.referenced_catalog
+                            if constraint.constraint_type == "FOREIGN KEY"
+                            else None
+                        ),
+                        referenced_dataset=(
+                            constraint.referenced_schema
+                            if constraint.constraint_type == "FOREIGN KEY"
+                            else None
+                        ),
+                        referenced_table_name=(
+                            constraint.referenced_table
+                            if constraint.constraint_type == "FOREIGN KEY"
+                            else None
+                        ),
+                        referenced_column_name=(
+                            constraint.referenced_column
+                            if constraint.constraint_type == "FOREIGN KEY"
+                            else None
+                        ),
                     )
                 )
             self.report.num_get_table_constraints_for_dataset_api_requests += 1
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py
index 907e5c12e99a1..d48a19cd5d54b 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py
@@ -598,18 +598,6 @@ def _process_schema(
                     dataset_name=dataset_name,
                 )
 
-    # This method is used to generate the ignore list for datatypes the profiler doesn't support we have to do it here
-    # because the profiler doesn't have access to columns
-    def generate_profile_ignore_list(self, columns: List[BigqueryColumn]) -> List[str]:
-        ignore_list: List[str] = []
-        for column in columns:
-            if not column.data_type or any(
-                word in column.data_type.lower()
-                for word in ["array", "struct", "geography", "json"]
-            ):
-                ignore_list.append(column.field_path)
-        return ignore_list
-
     def _process_table(
         self,
         table: BigqueryTable,
@@ -631,15 +619,6 @@ def _process_table(
             )
         table.column_count = len(columns)
 
-        # We only collect profile ignore list if profiling is enabled and profile_table_level_only is false
-        if (
-            self.config.is_profiling_enabled()
-            and not self.config.profiling.profile_table_level_only
-        ):
-            table.columns_ignore_from_profiling = self.generate_profile_ignore_list(
-                columns
-            )
-
         if not table.column_count:
             logger.warning(
                 f"Table doesn't have any column or unable to get columns for table: {table_identifier}"
@@ -1161,9 +1140,11 @@ def gen_schema_metadata(
             # fields=[],
             fields=self.gen_schema_fields(
                 columns,
-                table.constraints
-                if (isinstance(table, BigqueryTable) and table.constraints)
-                else [],
+                (
+                    table.constraints
+                    if (isinstance(table, BigqueryTable) and table.constraints)
+                    else []
+                ),
             ),
             foreignKeys=foreign_keys if foreign_keys else None,
         )
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py
index 6af8166fbf70c..182ae2265cb16 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py
@@ -166,12 +166,6 @@ def get_workunits(
                 normalized_table_name = BigqueryTableIdentifier(
                     project_id=project_id, dataset=dataset, table=table.name
                 ).get_table_name()
-                for column in table.columns_ignore_from_profiling:
-                    # Profiler has issues with complex types (array, struct, geography, json), so we deny those types from profiling
-                    # We also filter columns without data type as it means that column is part of a complex type.
-                    self.config.profile_pattern.deny.append(
-                        f"^{normalized_table_name}.{column}$"
-                    )
 
                 if table.external and not self.config.profiling.profile_external_tables:
                     self.report.profiling_skipped_other[f"{project_id}.{dataset}"] += 1
diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
index d175fce04a52c..1f2190570b393 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
@@ -1397,6 +1397,8 @@ def _get_ge_dataset(
 def _get_column_types_to_ignore(dialect_name: str) -> List[str]:
     if dialect_name.lower() == POSTGRESQL:
         return ["JSON"]
+    elif dialect_name.lower() == BIGQUERY:
+        return ["ARRAY", "STRUCT", "GEOGRAPHY", "JSON"]
 
     return []