From 5ecfa89cedac543a248837d7eb6368ecff4ef180 Mon Sep 17 00:00:00 2001 From: treff7es Date: Wed, 6 Nov 2024 15:24:37 +0100 Subject: [PATCH] Fix performance issue with profiling by not adding anymore the column patterns but deny explicitly column types --- .../source/bigquery_v2/bigquery_schema.py | 33 +++++++++++-------- .../source/bigquery_v2/bigquery_schema_gen.py | 29 +++------------- .../ingestion/source/bigquery_v2/profiler.py | 6 ---- .../ingestion/source/ge_data_profiler.py | 2 ++ 4 files changed, 27 insertions(+), 43 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py index 58317b108bef4..836fda8a0ca3a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py @@ -118,7 +118,6 @@ class BigqueryTable(BaseTable): active_billable_bytes: Optional[int] = None long_term_billable_bytes: Optional[int] = None partition_info: Optional[PartitionInfo] = None - columns_ignore_from_profiling: List[str] = field(default_factory=list) external: bool = False constraints: List[BigqueryTableConstraint] = field(default_factory=list) table_type: Optional[str] = None @@ -541,18 +540,26 @@ def get_table_constraints_for_dataset( table_name=constraint.table_name, type=constraint.constraint_type, field_path=constraint.column_name, - referenced_project_id=constraint.referenced_catalog - if constraint.constraint_type == "FOREIGN KEY" - else None, - referenced_dataset=constraint.referenced_schema - if constraint.constraint_type == "FOREIGN KEY" - else None, - referenced_table_name=constraint.referenced_table - if constraint.constraint_type == "FOREIGN KEY" - else None, - referenced_column_name=constraint.referenced_column - if constraint.constraint_type == "FOREIGN KEY" - else None, + referenced_project_id=( + constraint.referenced_catalog + if constraint.constraint_type == "FOREIGN KEY" + else None + ), + referenced_dataset=( + constraint.referenced_schema + if constraint.constraint_type == "FOREIGN KEY" + else None + ), + referenced_table_name=( + constraint.referenced_table + if constraint.constraint_type == "FOREIGN KEY" + else None + ), + referenced_column_name=( + constraint.referenced_column + if constraint.constraint_type == "FOREIGN KEY" + else None + ), ) ) self.report.num_get_table_constraints_for_dataset_api_requests += 1 diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py index 907e5c12e99a1..d48a19cd5d54b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py @@ -598,18 +598,6 @@ def _process_schema( dataset_name=dataset_name, ) - # This method is used to generate the ignore list for datatypes the profiler doesn't support we have to do it here - # because the profiler doesn't have access to columns - def generate_profile_ignore_list(self, columns: List[BigqueryColumn]) -> List[str]: - ignore_list: List[str] = [] - for column in columns: - if not column.data_type or any( - word in column.data_type.lower() - for word in ["array", "struct", "geography", "json"] - ): - ignore_list.append(column.field_path) - return ignore_list - def _process_table( self, table: BigqueryTable, @@ -631,15 +619,6 @@ def _process_table( ) table.column_count = len(columns) - # We only collect profile ignore list if profiling is enabled and profile_table_level_only is false - if ( - self.config.is_profiling_enabled() - and not self.config.profiling.profile_table_level_only - ): - table.columns_ignore_from_profiling = self.generate_profile_ignore_list( - columns - ) - if not table.column_count: logger.warning( f"Table doesn't have any column or unable to get columns for table: {table_identifier}" @@ -1161,9 +1140,11 @@ def gen_schema_metadata( # fields=[], fields=self.gen_schema_fields( columns, - table.constraints - if (isinstance(table, BigqueryTable) and table.constraints) - else [], + ( + table.constraints + if (isinstance(table, BigqueryTable) and table.constraints) + else [] + ), ), foreignKeys=foreign_keys if foreign_keys else None, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py index 6af8166fbf70c..182ae2265cb16 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py @@ -166,12 +166,6 @@ def get_workunits( normalized_table_name = BigqueryTableIdentifier( project_id=project_id, dataset=dataset, table=table.name ).get_table_name() - for column in table.columns_ignore_from_profiling: - # Profiler has issues with complex types (array, struct, geography, json), so we deny those types from profiling - # We also filter columns without data type as it means that column is part of a complex type. - self.config.profile_pattern.deny.append( - f"^{normalized_table_name}.{column}$" - ) if table.external and not self.config.profiling.profile_external_tables: self.report.profiling_skipped_other[f"{project_id}.{dataset}"] += 1 diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py index d175fce04a52c..1f2190570b393 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py @@ -1397,6 +1397,8 @@ def _get_ge_dataset( def _get_column_types_to_ignore(dialect_name: str) -> List[str]: if dialect_name.lower() == POSTGRESQL: return ["JSON"] + elif dialect_name.lower() == BIGQUERY: + return ["ARRAY", "STRUCT", "GEOGRAPHY", "JSON"] return []