Fix performance issue with profiling by not adding the column patterns anymore but explicitly denying column types
treff7es committed Nov 6, 2024
1 parent a67981b commit 5ecfa89
Showing 4 changed files with 27 additions and 43 deletions.
@@ -118,7 +118,6 @@ class BigqueryTable(BaseTable):
active_billable_bytes: Optional[int] = None
long_term_billable_bytes: Optional[int] = None
partition_info: Optional[PartitionInfo] = None
- columns_ignore_from_profiling: List[str] = field(default_factory=list)
external: bool = False
constraints: List[BigqueryTableConstraint] = field(default_factory=list)
table_type: Optional[str] = None
@@ -541,18 +540,26 @@ def get_table_constraints_for_dataset(
table_name=constraint.table_name,
type=constraint.constraint_type,
field_path=constraint.column_name,
- referenced_project_id=constraint.referenced_catalog
- if constraint.constraint_type == "FOREIGN KEY"
- else None,
- referenced_dataset=constraint.referenced_schema
- if constraint.constraint_type == "FOREIGN KEY"
- else None,
- referenced_table_name=constraint.referenced_table
- if constraint.constraint_type == "FOREIGN KEY"
- else None,
- referenced_column_name=constraint.referenced_column
- if constraint.constraint_type == "FOREIGN KEY"
- else None,
+ referenced_project_id=(
+     constraint.referenced_catalog
+     if constraint.constraint_type == "FOREIGN KEY"
+     else None
+ ),
+ referenced_dataset=(
+     constraint.referenced_schema
+     if constraint.constraint_type == "FOREIGN KEY"
+     else None
+ ),
+ referenced_table_name=(
+     constraint.referenced_table
+     if constraint.constraint_type == "FOREIGN KEY"
+     else None
+ ),
+ referenced_column_name=(
+     constraint.referenced_column
+     if constraint.constraint_type == "FOREIGN KEY"
+     else None
+ ),
)
)
self.report.num_get_table_constraints_for_dataset_api_requests += 1
@@ -598,18 +598,6 @@ def _process_schema(
dataset_name=dataset_name,
)

- # This method is used to generate the ignore list for datatypes the profiler doesn't support we have to do it here
- # because the profiler doesn't have access to columns
- def generate_profile_ignore_list(self, columns: List[BigqueryColumn]) -> List[str]:
-     ignore_list: List[str] = []
-     for column in columns:
-         if not column.data_type or any(
-             word in column.data_type.lower()
-             for word in ["array", "struct", "geography", "json"]
-         ):
-             ignore_list.append(column.field_path)
-     return ignore_list

def _process_table(
self,
table: BigqueryTable,
@@ -631,15 +619,6 @@
)
table.column_count = len(columns)

- # We only collect profile ignore list if profiling is enabled and profile_table_level_only is false
- if (
-     self.config.is_profiling_enabled()
-     and not self.config.profiling.profile_table_level_only
- ):
-     table.columns_ignore_from_profiling = self.generate_profile_ignore_list(
-         columns
-     )

if not table.column_count:
logger.warning(
f"Table doesn't have any column or unable to get columns for table: {table_identifier}"
@@ -1161,9 +1140,11 @@ def gen_schema_metadata(
# fields=[],
fields=self.gen_schema_fields(
columns,
- table.constraints
- if (isinstance(table, BigqueryTable) and table.constraints)
- else [],
+ (
+     table.constraints
+     if (isinstance(table, BigqueryTable) and table.constraints)
+     else []
+ ),
),
foreignKeys=foreign_keys if foreign_keys else None,
)
@@ -166,12 +166,6 @@ def get_workunits(
normalized_table_name = BigqueryTableIdentifier(
project_id=project_id, dataset=dataset, table=table.name
).get_table_name()
- for column in table.columns_ignore_from_profiling:
-     # Profiler has issues with complex types (array, struct, geography, json), so we deny those types from profiling
-     # We also filter columns without data type as it means that column is part of a complex type.
-     self.config.profile_pattern.deny.append(
-         f"^{normalized_table_name}.{column}$"
-     )

if table.external and not self.config.profiling.profile_external_tables:
self.report.profiling_skipped_other[f"{project_id}.{dataset}"] += 1
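
For context on the performance issue: the loop removed here appended one anchored regex per complex-typed column to the shared profile_pattern.deny list, so the deny list (and every later pattern check against it) kept growing as more tables were profiled in a run. A rough, hypothetical sketch of that growth in plain Python (the add_table and is_denied helpers are illustrations, not DataHub APIs):

import re
from typing import List

deny: List[str] = []  # stands in for the shared self.config.profile_pattern.deny

def add_table(table_name: str, complex_columns: List[str]) -> None:
    # Mirrors the removed loop: one anchored pattern per complex column.
    for column in complex_columns:
        deny.append(f"^{table_name}.{column}$")

def is_denied(field_path: str) -> bool:
    # Each check scans every pattern accumulated so far.
    return any(re.match(pattern, field_path) for pattern in deny)

# After 1,000 tables with 5 complex columns each, every column check
# walks up to 5,000 regexes.
for i in range(1000):
    add_table(f"project.dataset.table_{i}", [f"col_{j}" for j in range(5)])
print(len(deny))  # 5000
print(is_denied("project.dataset.table_999.col_4"))  # True
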
@@ -1397,6 +1397,8 @@ def _get_ge_dataset(
def _get_column_types_to_ignore(dialect_name: str) -> List[str]:
if dialect_name.lower() == POSTGRESQL:
return ["JSON"]
+ elif dialect_name.lower() == BIGQUERY:
+     return ["ARRAY", "STRUCT", "GEOGRAPHY", "JSON"]

return []

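With this commit, handling of complex types lives entirely in the GE profiler: for BigQuery, _get_column_types_to_ignore now returns the unsupported types (ARRAY, STRUCT, GEOGRAPHY, JSON), and columns of those types are skipped at profiling time instead of being turned into per-column deny patterns. A minimal sketch of applying such a type-based filter, assuming a simple {column_name: type_name} mapping; select_profilable_columns and the dialect constant values are hypothetical, not the actual call site in the profiler:

from typing import Dict, List

POSTGRESQL = "postgresql"  # assumed constant values; the diff only references the names
BIGQUERY = "bigquery"

def _get_column_types_to_ignore(dialect_name: str) -> List[str]:
    # Same shape as the added branch in the diff: BigQuery complex types are ignored.
    if dialect_name.lower() == POSTGRESQL:
        return ["JSON"]
    elif dialect_name.lower() == BIGQUERY:
        return ["ARRAY", "STRUCT", "GEOGRAPHY", "JSON"]
    return []

def select_profilable_columns(columns: Dict[str, str], dialect_name: str) -> List[str]:
    # Hypothetical helper: keep only columns whose type is not in the ignore list.
    ignored = set(_get_column_types_to_ignore(dialect_name))
    return [name for name, type_name in columns.items() if type_name.upper() not in ignored]

print(select_profilable_columns(
    {"id": "INT64", "geo": "GEOGRAPHY", "payload": "JSON"},
    "bigquery",
))  # ['id']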
