From b84601c9380a996e36dcf6fda77e422bedc9ee18 Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Wed, 9 Aug 2023 20:45:24 +0530 Subject: [PATCH 01/11] feat(bigquery): add better timers around every API call --- .../ingestion/source/bigquery_v2/bigquery.py | 75 ++--- .../source/bigquery_v2/bigquery_audit_api.py | 208 +++++++++++++ .../source/bigquery_v2/bigquery_config.py | 73 +++-- .../source/bigquery_v2/bigquery_report.py | 27 +- .../source/bigquery_v2/bigquery_schema.py | 278 ++++++++++-------- .../ingestion/source/bigquery_v2/common.py | 34 --- .../ingestion/source/bigquery_v2/lineage.py | 31 +- .../ingestion/source/bigquery_v2/usage.py | 221 ++------------ .../source/snowflake/snowflake_v2.py | 21 +- .../src/datahub/utilities/perf_timer.py | 66 ++++- .../tests/unit/test_bigquery_source.py | 35 ++- .../unit/test_bigqueryv2_usage_source.py | 11 +- .../tests/unit/utilities/test_perf_timer.py | 41 +++ 13 files changed, 651 insertions(+), 470 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_api.py create mode 100644 metadata-ingestion/tests/unit/utilities/test_perf_timer.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index d1f39a3ba1ba6..442e1e525fdf6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -54,7 +54,6 @@ from datahub.ingestion.source.bigquery_v2.common import ( BQ_EXTERNAL_DATASET_URL_TEMPLATE, BQ_EXTERNAL_TABLE_URL_TEMPLATE, - get_bigquery_client, ) from datahub.ingestion.source.bigquery_v2.lineage import ( BigqueryLineageExtractor, @@ -227,6 +226,8 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): set_dataset_urn_to_lower(self.config.convert_urns_to_lowercase) + self.bigquery_data_dictionary = BigQueryDataDictionary(self.report) + # For database, schema, tables, views, etc self.lineage_extractor = BigqueryLineageExtractor(config, self.report) self.usage_extractor = BigQueryUsageExtractor( @@ -271,6 +272,7 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): platform=self.platform, env=self.config.env ) + self.add_config_to_report() atexit.register(cleanup, config) @classmethod @@ -295,18 +297,20 @@ def metadata_read_capability_test( for project_id in project_ids: try: logger.info((f"Metadata read capability test for project {project_id}")) - client: bigquery.Client = get_bigquery_client(config) + client: bigquery.Client = config.get_bigquery_client() assert client - result = BigQueryDataDictionary.get_datasets_for_project_id( - client, project_id, 10 + report = BigQueryV2Report() + bigquery_data_dictionary = BigQueryDataDictionary(report) + bigquery_data_dictionary.set_client(client) + result = bigquery_data_dictionary.get_datasets_for_project_id( + project_id, 10 ) if len(result) == 0: return CapabilityReport( capable=False, failure_reason=f"Dataset query returned empty dataset. 
It is either empty or no dataset in project {project_id}", ) - tables = BigQueryDataDictionary.get_tables_for_dataset( - conn=client, + tables = bigquery_data_dictionary.get_tables_for_dataset( project_id=project_id, dataset_name=result[0].name, tables={}, @@ -378,7 +382,7 @@ def test_connection(config_dict: dict) -> TestConnectionReport: try: connection_conf = BigQueryV2Config.parse_obj_allow_extras(config_dict) - client: bigquery.Client = get_bigquery_client(connection_conf) + client: bigquery.Client = connection_conf.get_bigquery_client() assert client test_report.basic_connectivity = BigqueryV2Source.connectivity_test(client) @@ -498,17 +502,17 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: - conn: bigquery.Client = get_bigquery_client(self.config) - self.add_config_to_report() + bq_client: bigquery.Client = self.config.get_bigquery_client() + self.bigquery_data_dictionary.set_client(bq_client) - projects = self._get_projects(conn) + projects = self._get_projects() if not projects: return for project_id in projects: self.report.set_ingestion_stage(project_id.id, "Metadata Extraction") logger.info(f"Processing project: {project_id.id}") - yield from self._process_project(conn, project_id) + yield from self._process_project(project_id) if self._should_ingest_usage(): yield from self.usage_extractor.get_usage_workunits( @@ -563,7 +567,7 @@ def _should_ingest_lineage(self) -> bool: ) return True - def _get_projects(self, conn: bigquery.Client) -> List[BigqueryProject]: + def _get_projects(self) -> List[BigqueryProject]: logger.info("Getting projects") if self.config.project_ids or self.config.project_id: project_ids = self.config.project_ids or [self.config.project_id] # type: ignore @@ -572,11 +576,11 @@ def _get_projects(self, conn: bigquery.Client) -> List[BigqueryProject]: for project_id in project_ids ] else: - return list(self._get_project_list(conn)) + return list(self._query_project_list()) - def _get_project_list(self, conn: bigquery.Client) -> Iterable[BigqueryProject]: + def _query_project_list(self) -> Iterable[BigqueryProject]: try: - projects = BigQueryDataDictionary.get_projects(conn) + projects = self.bigquery_data_dictionary.get_projects() except Exception as e: logger.error(f"Error getting projects. {e}", exc_info=True) projects = [] @@ -597,7 +601,7 @@ def _get_project_list(self, conn: bigquery.Client) -> Iterable[BigqueryProject]: self.report.report_dropped(project.id) def _process_project( - self, conn: bigquery.Client, bigquery_project: BigqueryProject + self, bigquery_project: BigqueryProject ) -> Iterable[MetadataWorkUnit]: db_tables: Dict[str, List[BigqueryTable]] = {} db_views: Dict[str, List[BigqueryView]] = {} @@ -608,7 +612,7 @@ def _process_project( try: bigquery_project.datasets = ( - BigQueryDataDictionary.get_datasets_for_project_id(conn, project_id) + self.bigquery_data_dictionary.get_datasets_for_project_id(project_id) ) except Exception as e: error_message = f"Unable to get datasets for project {project_id}, skipping. 
The error was: {e}" @@ -642,7 +646,7 @@ def _process_project( try: # db_tables and db_views are populated in the this method yield from self._process_schema( - conn, project_id, bigquery_dataset, db_tables, db_views + project_id, bigquery_dataset, db_tables, db_views ) except Exception as e: @@ -735,7 +739,6 @@ def generate_lineage(self, project_id: str) -> Iterable[MetadataWorkUnit]: def _process_schema( self, - conn: bigquery.Client, project_id: str, bigquery_dataset: BigqueryDataset, db_tables: Dict[str, List[BigqueryTable]], @@ -749,8 +752,7 @@ def _process_schema( columns = None if self.config.include_tables or self.config.include_views: - columns = BigQueryDataDictionary.get_columns_for_dataset( - conn, + columns = self.bigquery_data_dictionary.get_columns_for_dataset( project_id=project_id, dataset_name=dataset_name, column_limit=self.config.column_limit, @@ -759,7 +761,7 @@ def _process_schema( if self.config.include_tables: db_tables[dataset_name] = list( - self.get_tables_for_dataset(conn, project_id, dataset_name) + self.get_tables_for_dataset(project_id, dataset_name) ) for table in db_tables[dataset_name]: @@ -772,7 +774,9 @@ def _process_schema( ) elif self.config.include_table_lineage or self.config.include_usage_statistics: # Need table_refs to calculate lineage and usage - for table_item in conn.list_tables(f"{project_id}.{dataset_name}"): + for table_item in self.bigquery_data_dictionary.list_tables( + dataset_name, project_id + ): identifier = BigqueryTableIdentifier( project_id=project_id, dataset=dataset_name, @@ -792,8 +796,8 @@ def _process_schema( if self.config.include_views: db_views[dataset_name] = list( - BigQueryDataDictionary.get_views_for_dataset( - conn, project_id, dataset_name, self.config.is_profiling_enabled() + self.bigquery_data_dictionary.get_views_for_dataset( + project_id, dataset_name, self.config.is_profiling_enabled() ) ) @@ -1205,7 +1209,6 @@ def get_report(self) -> BigQueryV2Report: def get_tables_for_dataset( self, - conn: bigquery.Client, project_id: str, dataset_name: str, ) -> Iterable[BigqueryTable]: @@ -1224,14 +1227,15 @@ def get_tables_for_dataset( # We get the list of tables in the dataset to get core table properties and to be able to process the tables in batches # We collect only the latest shards from sharded tables (tables with _YYYYMMDD suffix) and ignore temporary tables - table_items = self.get_core_table_details(conn, dataset_name, project_id) + table_items = self.get_core_table_details( + dataset_name, project_id, self.config.temp_table_dataset_prefix + ) items_to_get: Dict[str, TableListItem] = {} for table_item in table_items.keys(): items_to_get[table_item] = table_items[table_item] if len(items_to_get) % max_batch_size == 0: - yield from BigQueryDataDictionary.get_tables_for_dataset( - conn, + yield from self.bigquery_data_dictionary.get_tables_for_dataset( project_id, dataset_name, items_to_get, @@ -1240,8 +1244,7 @@ def get_tables_for_dataset( items_to_get.clear() if items_to_get: - yield from BigQueryDataDictionary.get_tables_for_dataset( - conn, + yield from self.bigquery_data_dictionary.get_tables_for_dataset( project_id, dataset_name, items_to_get, @@ -1253,13 +1256,15 @@ def get_tables_for_dataset( ) def get_core_table_details( - self, conn: bigquery.Client, dataset_name: str, project_id: str + self, dataset_name: str, project_id: str, temp_table_dataset_prefix: str ) -> Dict[str, TableListItem]: table_items: Dict[str, TableListItem] = {} # Dict to store sharded table and the last seen max shard id sharded_tables: 
Dict[str, TableListItem] = {} - for table in conn.list_tables(f"{project_id}.{dataset_name}"): + for table in self.bigquery_data_dictionary.list_tables( + dataset_name, project_id + ): table_identifier = BigqueryTableIdentifier( project_id=project_id, dataset=dataset_name, @@ -1296,9 +1301,7 @@ def get_core_table_details( if stored_shard < shard: sharded_tables[table_name] = table continue - elif str(table_identifier).startswith( - self.config.temp_table_dataset_prefix - ): + elif str(table_identifier).startswith(temp_table_dataset_prefix): logger.debug(f"Dropping temporary table {table_identifier.table}") self.report.report_dropped(table_identifier.raw_table_name()) continue diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_api.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_api.py new file mode 100644 index 0000000000000..09a9098ced338 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_api.py @@ -0,0 +1,208 @@ +import logging +import textwrap +from datetime import datetime +from typing import Iterable, List, Optional + +from google.cloud import bigquery +from google.cloud.logging_v2.client import Client as GCPLoggingClient +from ratelimiter import RateLimiter + +from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( + BQ_AUDIT_V2, + BQ_FILTER_RULE_TEMPLATE, + AuditLogEntry, + BigQueryAuditMetadata, +) +from datahub.ingestion.source.bigquery_v2.bigquery_report import ( + BigQueryAuditLogApiPerfReport, +) +from datahub.ingestion.source.bigquery_v2.common import ( + BQ_DATE_SHARD_FORMAT, + BQ_DATETIME_FORMAT, +) + +logger: logging.Logger = logging.getLogger(__name__) + + +class BigQueryAuditLogApi: + def __init__( + self, + report: BigQueryAuditLogApiPerfReport, + rate_limit: bool, + requests_per_min: int, + ) -> None: + self.api_perf_report = report + self.rate_limit = rate_limit + self.requests_per_min = requests_per_min + + def get_exported_bigquery_audit_metadata( + self, + bigquery_client: bigquery.Client, + bigquery_audit_metadata_datasets: Optional[List[str]], + use_date_sharded_audit_log_tables: bool, + start_time: datetime, + end_time: datetime, + limit: Optional[int] = None, + ) -> Iterable[BigQueryAuditMetadata]: + if bigquery_audit_metadata_datasets is None: + return + + audit_start_time = start_time.strftime(BQ_DATETIME_FORMAT) + audit_start_date = start_time.strftime(BQ_DATE_SHARD_FORMAT) + + audit_end_time = end_time.strftime(BQ_DATETIME_FORMAT) + audit_end_date = end_time.strftime(BQ_DATE_SHARD_FORMAT) + + with self.api_perf_report.get_exported_bigquery_audit_metadata as current_timer: + for dataset in bigquery_audit_metadata_datasets: + logger.info( + f"Start loading log entries from BigQueryAuditMetadata in {dataset}" + ) + + query = bigquery_audit_metadata_query_template( + dataset, + use_date_sharded_audit_log_tables, + limit=limit, + ).format( + start_time=audit_start_time, + end_time=audit_end_time, + start_date=audit_start_date, + end_date=audit_end_date, + ) + + query_job = bigquery_client.query(query) + logger.info( + f"Finished loading log entries from BigQueryAuditMetadata in {dataset}" + ) + with current_timer.pause_timer(): + if self.rate_limit: + with RateLimiter(max_calls=self.requests_per_min, period=60): + yield from query_job + else: + yield from query_job + + def get_bigquery_log_entries_via_gcp_logging( + self, + client: GCPLoggingClient, + start_time: datetime, + end_time: datetime, + log_page_size: int, + limit: Optional[int] = 
None, + ) -> Iterable[AuditLogEntry]: + filter = self._generate_filter(start_time, end_time) + logger.debug(filter) + + list_entries: Iterable[AuditLogEntry] + rate_limiter: Optional[RateLimiter] = None + if self.rate_limit: + # client.list_entries is a generator, does api calls to GCP Logging when it runs out of entries and needs to fetch more from GCP Logging + # to properly ratelimit we multiply the page size by the number of requests per minute + rate_limiter = RateLimiter( + max_calls=self.requests_per_min * log_page_size, + period=60, + ) + + with self.api_perf_report.get_bigquery_log_entries_via_gcp_logging as current_timer: + list_entries = client.list_entries( + filter_=filter, + page_size=log_page_size, + max_results=limit, + ) + + for i, entry in enumerate(list_entries): + if i == 0: + logger.info( + f"Starting log load from GCP Logging for {client.project}" + ) + if i % 1000 == 0: + logger.info( + f"Loaded {i} log entries from GCP Log for {client.project}" + ) + + with current_timer.pause_timer(): + if rate_limiter: + with rate_limiter: + yield entry + else: + yield entry + + def _generate_filter(self, start_time: datetime, end_time: datetime) -> str: + audit_start_time = (start_time).strftime(BQ_DATETIME_FORMAT) + + audit_end_time = (end_time).strftime(BQ_DATETIME_FORMAT) + + filter = BQ_AUDIT_V2[BQ_FILTER_RULE_TEMPLATE].format( + start_time=audit_start_time, end_time=audit_end_time + ) + return filter + + +def bigquery_audit_metadata_query_template( + dataset: str, + use_date_sharded_tables: bool, + limit: Optional[int] = None, +) -> str: + """ + Receives a dataset (with project specified) and returns a query template that is used to query exported + v2 AuditLogs containing protoPayloads of type BigQueryAuditMetadata. + :param dataset: the dataset to query against in the form of $PROJECT.$DATASET + :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log + tables + :param limit: maximum number of events to query for + :return: a query template, when supplied start_time and end_time, can be used to query audit logs from BigQuery + """ + + limit_text = f"limit {limit}" if limit else "" + + shard_condition = "" + if use_date_sharded_tables: + from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`" + shard_condition = ( + """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """ + ) + else: + from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`" + + # Deduplicates insertId via QUALIFY, see: + # https://cloud.google.com/logging/docs/reference/v2/rest/v2/LogEntry, insertId field + query = f""" + SELECT + timestamp, + logName, + insertId, + protopayload_auditlog AS protoPayload, + protopayload_auditlog.metadataJson AS metadata + FROM + {from_table} + WHERE ( + timestamp >= "{{start_time}}" + AND timestamp < "{{end_time}}" + ) + {shard_condition} + AND protopayload_auditlog.serviceName="bigquery.googleapis.com" + AND + ( + ( + protopayload_auditlog.methodName IN + ( + "google.cloud.bigquery.v2.JobService.Query", + "google.cloud.bigquery.v2.JobService.InsertJob" + ) + AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE" + AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL + AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL + AND ( + JSON_EXTRACT_ARRAY(protopayload_auditlog.metadataJson, + 
"$.jobChange.job.jobStats.queryStats.referencedTables") IS NOT NULL + OR + JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig.destinationTable") IS NOT NULL + ) + ) + OR + JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.tableDataRead.reason") = "JOB" + ) + QUALIFY ROW_NUMBER() OVER (PARTITION BY insertId, timestamp, logName) = 1 + {limit_text}; + """ + + return textwrap.dedent(query) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index 7287dc1b67d73..fa689d571c8cc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -4,9 +4,11 @@ from typing import Any, Dict, List, Optional import pydantic +from google.cloud import bigquery +from google.cloud.logging_v2.client import Client as GCPLoggingClient from pydantic import Field, PositiveInt, PrivateAttr, root_validator -from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.validate_field_removal import pydantic_removed_field from datahub.ingestion.source.sql.sql_config import SQLAlchemyConfig from datahub.ingestion.source.state.stateful_ingestion_base import ( @@ -35,7 +37,52 @@ class BigQueryUsageConfig(BaseUsageConfig): ) +class BigQueryConnectionConfig(ConfigModel): + credential: Optional[BigQueryCredential] = Field( + description="BigQuery credential informations" + ) + + _credentials_path: Optional[str] = PrivateAttr(None) + + extra_client_options: Dict[str, Any] = Field( + default={}, + description="Additional options to pass to google.cloud.logging_v2.client.Client.", + ) + + project_on_behalf: Optional[str] = Field( + default=None, + description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. If not passed, falls back to the project associated with the service account.", + ) + + def __init__(self, **data: Any): + super().__init__(**data) + + if self.credential: + self._credentials_path = self.credential.create_credential_temp_file() + logger.debug( + f"Creating temporary credential file at {self._credentials_path}" + ) + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path + + def get_bigquery_client(config) -> bigquery.Client: + client_options = config.extra_client_options + return bigquery.Client(config.project_on_behalf, **client_options) + + def make_gcp_logging_client( + self, project_id: Optional[str] = None + ) -> GCPLoggingClient: + # See https://github.com/googleapis/google-cloud-python/issues/2674 for + # why we disable gRPC here. + client_options = self.extra_client_options.copy() + client_options["_use_grpc"] = False + if project_id is not None: + return GCPLoggingClient(**client_options, project=project_id) + else: + return GCPLoggingClient(**client_options) + + class BigQueryV2Config( + BigQueryConnectionConfig, BigQueryBaseConfig, SQLAlchemyConfig, StatefulUsageConfigMixin, @@ -115,11 +162,6 @@ class BigQueryV2Config( ), ) - project_on_behalf: Optional[str] = Field( - default=None, - description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. 
If not passed, falls back to the project associated with the service account.", - ) - storage_project_id: None = Field(default=None, hidden_from_docs=True) lineage_use_sql_parser: bool = Field( @@ -173,14 +215,8 @@ def validate_column_lineage(cls, v: bool, values: Dict[str, Any]) -> bool: default=1000, description="The number of log item will be queried per page for lineage collection", ) - credential: Optional[BigQueryCredential] = Field( - description="BigQuery credential informations" - ) + # extra_client_options, include_table_lineage and max_query_duration are relevant only when computing the lineage. - extra_client_options: Dict[str, Any] = Field( - default={}, - description="Additional options to pass to google.cloud.logging_v2.client.Client.", - ) include_table_lineage: Optional[bool] = Field( default=True, description="Option to enable/disable lineage generation. Is enabled by default.", @@ -202,7 +238,6 @@ def validate_column_lineage(cls, v: bool, values: Dict[str, Any]) -> bool: default=False, description="Whether to read date sharded tables or time partitioned tables when extracting usage from exported audit logs.", ) - _credentials_path: Optional[str] = PrivateAttr(None) _cache_path: Optional[str] = PrivateAttr(None) @@ -223,16 +258,6 @@ def validate_column_lineage(cls, v: bool, values: Dict[str, Any]) -> bool: description="Maximum number of entries for the in-memory caches of FileBacked data structures.", ) - def __init__(self, **data: Any): - super().__init__(**data) - - if self.credential: - self._credentials_path = self.credential.create_credential_temp_file() - logger.debug( - f"Creating temporary credential file at {self._credentials_path}" - ) - os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path - @root_validator(pre=False) def profile_default_settings(cls, values: Dict) -> Dict: # Extra default SQLAlchemy option for better connection pooling and threading. 
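The hunk above relocates client construction from the module-level helpers in common.py onto the config object itself. A minimal usage sketch, assuming only the call patterns visible elsewhere in this patch (BigQueryV2Config.parse_obj, get_bigquery_client, make_gcp_logging_client); the project id and config values are illustrative only:

    from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config

    config = BigQueryV2Config.parse_obj({"project_ids": ["my-project"]})   # illustrative config
    bq_client = config.get_bigquery_client()                               # google.cloud.bigquery.Client
    logging_client = config.make_gcp_logging_client("my-project")          # GCPLoggingClient (gRPC disabled)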
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py index b57e691411f75..fc725e0cda3c4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py @@ -15,8 +15,24 @@ logger: logging.Logger = logging.getLogger(__name__) +class BigQueryApiPerfReport: + list_projects = PerfTimer() + get_datasets_for_project = PerfTimer() + get_columns_for_dataset = PerfTimer() + get_tables_for_dataset = PerfTimer() + list_tables = PerfTimer() + get_views_for_dataset = PerfTimer() + + +class BigQueryAuditLogApiPerfReport: + get_exported_bigquery_audit_metadata = PerfTimer() + get_bigquery_log_entries_via_gcp_logging = PerfTimer() + + @dataclass -class BigQueryV2Report(ProfilingSqlReport): +class BigQueryV2Report( + ProfilingSqlReport, BigQueryApiPerfReport, BigQueryAuditLogApiPerfReport +): num_total_lineage_entries: TopKDict[str, int] = field(default_factory=TopKDict) num_skipped_lineage_entries_missing_data: TopKDict[str, int] = field( default_factory=int_top_k_dict @@ -53,10 +69,11 @@ class BigQueryV2Report(ProfilingSqlReport): log_page_size: Optional[pydantic.PositiveInt] = None use_exported_bigquery_audit_metadata: Optional[bool] = None end_time: Optional[datetime] = None - log_entry_start_time: Optional[str] = None - log_entry_end_time: Optional[str] = None - audit_start_time: Optional[str] = None - audit_end_time: Optional[str] = None + # TODO: remove one or replace by lineage ones + log_entry_start_time: Optional[datetime] = None + log_entry_end_time: Optional[datetime] = None + audit_start_time: Optional[datetime] = None + audit_end_time: Optional[datetime] = None upstream_lineage: LossyDict = field(default_factory=LossyDict) partition_info: Dict[str, str] = field(default_factory=TopKDict) profile_table_selection_criteria: Dict[str, str] = field(default_factory=TopKDict) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py index 2450dbd0e2391..68dd0eeefe09b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py @@ -13,7 +13,10 @@ ) from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier -from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report +from datahub.ingestion.source.bigquery_v2.bigquery_report import ( + BigQueryApiPerfReport, + BigQueryV2Report, +) from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable, BaseView logger: logging.Logger = logging.getLogger(__name__) @@ -345,26 +348,41 @@ class BigqueryQuery: class BigQueryDataDictionary: + def __init__(self, report: BigQueryApiPerfReport) -> None: + self.bq_client: Optional[bigquery.Client] = None + self.api_perf_report = report + + def set_client(self, bq_client: bigquery.Client) -> None: + self.bq_client = bq_client + + def get_client(self) -> bigquery.Client: + assert self.bq_client is not None + return self.bq_client + @staticmethod def get_query_result(conn: bigquery.Client, query: str) -> RowIterator: logger.debug(f"Query : {query}") resp = conn.query(query) return resp.result() - @staticmethod - def get_projects(conn: bigquery.Client) -> List[BigqueryProject]: - projects = 
conn.list_projects() + def get_projects(self) -> List[BigqueryProject]: + with self.api_perf_report.list_projects: + projects = self.get_client().list_projects() - return [ - BigqueryProject(id=p.project_id, name=p.friendly_name) for p in projects - ] + return [ + BigqueryProject(id=p.project_id, name=p.friendly_name) for p in projects + ] - @staticmethod def get_datasets_for_project_id( - conn: bigquery.Client, project_id: str, maxResults: Optional[int] = None + self, project_id: str, maxResults: Optional[int] = None ) -> List[BigqueryDataset]: - datasets = conn.list_datasets(project_id, max_results=maxResults) - return [BigqueryDataset(name=d.dataset_id, labels=d.labels) for d in datasets] + with self.api_perf_report.get_datasets_for_project: + datasets = self.get_client().list_datasets( + project_id, max_results=maxResults + ) + return [ + BigqueryDataset(name=d.dataset_id, labels=d.labels) for d in datasets + ] @staticmethod def get_datasets_for_project_id_with_information_schema( @@ -391,56 +409,69 @@ def get_datasets_for_project_id_with_information_schema( for s in schemas ] - @staticmethod + def list_tables( + self, dataset_name: str, project_id: str + ) -> Iterator[TableListItem]: + with self.api_perf_report.list_tables as current_timer: + for table in self.get_client().list_tables(f"{project_id}.{dataset_name}"): + with current_timer.pause_timer(): + yield table + def get_tables_for_dataset( - conn: bigquery.Client, + self, project_id: str, dataset_name: str, tables: Dict[str, TableListItem], with_data_read_permission: bool = False, report: Optional[BigQueryV2Report] = None, ) -> Iterator[BigqueryTable]: - filter: str = ", ".join(f"'{table}'" for table in tables.keys()) - - if with_data_read_permission: - # Tables are ordered by name and table suffix to make sure we always process the latest sharded table - # and skip the others. Sharded tables are tables with suffix _20220102 - cur = BigQueryDataDictionary.get_query_result( - conn, - BigqueryQuery.tables_for_dataset.format( - project_id=project_id, - dataset_name=dataset_name, - table_filter=f" and t.table_name in ({filter})" if filter else "", - ), - ) - else: - # Tables are ordered by name and table suffix to make sure we always process the latest sharded table - # and skip the others. Sharded tables are tables with suffix _20220102 - cur = BigQueryDataDictionary.get_query_result( - conn, - BigqueryQuery.tables_for_dataset_without_partition_data.format( - project_id=project_id, - dataset_name=dataset_name, - table_filter=f" and t.table_name in ({filter})" if filter else "", - ), - ) - - for table in cur: - try: - yield BigQueryDataDictionary._make_bigquery_table( - table, tables.get(table.table_name) + with self.api_perf_report.get_tables_for_dataset as current_timer: + filter: str = ", ".join(f"'{table}'" for table in tables.keys()) + + if with_data_read_permission: + # Tables are ordered by name and table suffix to make sure we always process the latest sharded table + # and skip the others. 
Sharded tables are tables with suffix _20220102 + cur = BigQueryDataDictionary.get_query_result( + self.get_client(), + BigqueryQuery.tables_for_dataset.format( + project_id=project_id, + dataset_name=dataset_name, + table_filter=f" and t.table_name in ({filter})" + if filter + else "", + ), ) - except Exception as e: - table_name = f"{project_id}.{dataset_name}.{table.table_name}" - logger.warning( - f"Error while processing table {table_name}", - exc_info=True, + else: + # Tables are ordered by name and table suffix to make sure we always process the latest sharded table + # and skip the others. Sharded tables are tables with suffix _20220102 + cur = BigQueryDataDictionary.get_query_result( + self.get_client(), + BigqueryQuery.tables_for_dataset_without_partition_data.format( + project_id=project_id, + dataset_name=dataset_name, + table_filter=f" and t.table_name in ({filter})" + if filter + else "", + ), ) - if report: - report.report_warning( - "metadata-extraction", - f"Failed to get table {table_name}: {e}", + + for table in cur: + try: + with current_timer.pause_timer(): + yield BigQueryDataDictionary._make_bigquery_table( + table, tables.get(table.table_name) + ) + except Exception as e: + table_name = f"{project_id}.{dataset_name}.{table.table_name}" + logger.warning( + f"Error while processing table {table_name}", + exc_info=True, ) + if report: + report.report_warning( + "metadata-extraction", + f"Failed to get table {table_name}: {e}", + ) @staticmethod def _make_bigquery_table( @@ -480,43 +511,44 @@ def _make_bigquery_table( long_term_billable_bytes=table.get("long_term_billable_bytes"), ) - @staticmethod def get_views_for_dataset( - conn: bigquery.Client, + self, project_id: str, dataset_name: str, has_data_read: bool, report: Optional[BigQueryV2Report] = None, ) -> Iterator[BigqueryView]: - if has_data_read: - cur = BigQueryDataDictionary.get_query_result( - conn, - BigqueryQuery.views_for_dataset.format( - project_id=project_id, dataset_name=dataset_name - ), - ) - else: - cur = BigQueryDataDictionary.get_query_result( - conn, - BigqueryQuery.views_for_dataset_without_data_read.format( - project_id=project_id, dataset_name=dataset_name - ), - ) - - for table in cur: - try: - yield BigQueryDataDictionary._make_bigquery_view(table) - except Exception as e: - view_name = f"{project_id}.{dataset_name}.{table.table_name}" - logger.warning( - f"Error while processing view {view_name}", - exc_info=True, + with self.api_perf_report.get_views_for_dataset as current_timer: + if has_data_read: + cur = BigQueryDataDictionary.get_query_result( + self.get_client(), + BigqueryQuery.views_for_dataset.format( + project_id=project_id, dataset_name=dataset_name + ), + ) + else: + cur = BigQueryDataDictionary.get_query_result( + self.get_client(), + BigqueryQuery.views_for_dataset_without_data_read.format( + project_id=project_id, dataset_name=dataset_name + ), ) - if report: - report.report_warning( - "metadata-extraction", - f"Failed to get view {view_name}: {e}", + + for table in cur: + try: + with current_timer.pause_timer(): + yield BigQueryDataDictionary._make_bigquery_view(table) + except Exception as e: + view_name = f"{project_id}.{dataset_name}.{table.table_name}" + logger.warning( + f"Error while processing view {view_name}", + exc_info=True, ) + if report: + report.report_warning( + "metadata-extraction", + f"Failed to get view {view_name}: {e}", + ) @staticmethod def _make_bigquery_view(view: bigquery.Row) -> BigqueryView: @@ -533,58 +565,58 @@ def _make_bigquery_view(view: 
bigquery.Row) -> BigqueryView: materialized=view.table_type == BigqueryTableType.MATERIALIZED_VIEW, ) - @staticmethod def get_columns_for_dataset( - conn: bigquery.Client, + self, project_id: str, dataset_name: str, column_limit: int, run_optimized_column_query: bool = False, ) -> Optional[Dict[str, List[BigqueryColumn]]]: columns: Dict[str, List[BigqueryColumn]] = defaultdict(list) - try: - cur = BigQueryDataDictionary.get_query_result( - conn, - BigqueryQuery.columns_for_dataset.format( - project_id=project_id, dataset_name=dataset_name - ) - if not run_optimized_column_query - else BigqueryQuery.optimized_columns_for_dataset.format( - project_id=project_id, - dataset_name=dataset_name, - column_limit=column_limit, - ), - ) - except Exception as e: - logger.warning(f"Columns for dataset query failed with exception: {e}") - # Error - Information schema query returned too much data. - # Please repeat query with more selective predicates. - return None - - last_seen_table: str = "" - for column in cur: - if ( - column_limit - and column.table_name in columns - and len(columns[column.table_name]) >= column_limit - ): - if last_seen_table != column.table_name: - logger.warning( - f"{project_id}.{dataset_name}.{column.table_name} contains more than {column_limit} columns, only processing {column_limit} columns" - ) - last_seen_table = column.table_name - else: - columns[column.table_name].append( - BigqueryColumn( - name=column.column_name, - ordinal_position=column.ordinal_position, - field_path=column.field_path, - is_nullable=column.is_nullable == "YES", - data_type=column.data_type, - comment=column.comment, - is_partition_column=column.is_partitioning_column == "YES", + with self.api_perf_report.get_columns_for_dataset: + try: + cur = BigQueryDataDictionary.get_query_result( + self.get_client(), + BigqueryQuery.columns_for_dataset.format( + project_id=project_id, dataset_name=dataset_name ) + if not run_optimized_column_query + else BigqueryQuery.optimized_columns_for_dataset.format( + project_id=project_id, + dataset_name=dataset_name, + column_limit=column_limit, + ), ) + except Exception as e: + logger.warning(f"Columns for dataset query failed with exception: {e}") + # Error - Information schema query returned too much data. + # Please repeat query with more selective predicates. 
+ return None + + last_seen_table: str = "" + for column in cur: + if ( + column_limit + and column.table_name in columns + and len(columns[column.table_name]) >= column_limit + ): + if last_seen_table != column.table_name: + logger.warning( + f"{project_id}.{dataset_name}.{column.table_name} contains more than {column_limit} columns, only processing {column_limit} columns" + ) + last_seen_table = column.table_name + else: + columns[column.table_name].append( + BigqueryColumn( + name=column.column_name, + ordinal_position=column.ordinal_position, + field_path=column.field_path, + is_nullable=column.is_nullable == "YES", + data_type=column.data_type, + comment=column.comment, + is_partition_column=column.is_partitioning_column == "YES", + ) + ) return columns diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/common.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/common.py index 4ff509858b87d..e38ab07855b8b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/common.py @@ -1,39 +1,5 @@ -from typing import Any, Dict, Optional - -from google.cloud import bigquery -from google.cloud.logging_v2.client import Client as GCPLoggingClient - -from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config - BQ_DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ" BQ_DATE_SHARD_FORMAT = "%Y%m%d" BQ_EXTERNAL_TABLE_URL_TEMPLATE = "https://console.cloud.google.com/bigquery?project={project}&ws=!1m5!1m4!4m3!1s{project}!2s{dataset}!3s{table}" BQ_EXTERNAL_DATASET_URL_TEMPLATE = "https://console.cloud.google.com/bigquery?project={project}&ws=!1m4!1m3!3m2!1s{project}!2s{dataset}" - - -def _make_gcp_logging_client( - project_id: Optional[str] = None, extra_client_options: Dict[str, Any] = {} -) -> GCPLoggingClient: - # See https://github.com/googleapis/google-cloud-python/issues/2674 for - # why we disable gRPC here. - client_options = extra_client_options.copy() - client_options["_use_grpc"] = False - if project_id is not None: - return GCPLoggingClient(**client_options, project=project_id) - else: - return GCPLoggingClient(**client_options) - - -def get_bigquery_client(config: BigQueryV2Config) -> bigquery.Client: - client_options = config.extra_client_options - return bigquery.Client(config.project_on_behalf, **client_options) - - -def get_sql_alchemy_url(config: BigQueryV2Config) -> str: - if config.project_on_behalf: - return f"bigquery://{config.project_on_behalf}" - # When project_id is not set, we will attempt to detect the project ID - # based on the credentials or environment variables. - # See https://github.com/mxmzdlv/pybigquery#authentication. 
- return "bigquery://" diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index 255a673026252..210018e55be15 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -26,8 +26,6 @@ from datahub.ingestion.source.bigquery_v2.common import ( BQ_DATE_SHARD_FORMAT, BQ_DATETIME_FORMAT, - _make_gcp_logging_client, - get_bigquery_client, ) from datahub.metadata.schema_classes import ( AuditStampClass, @@ -133,7 +131,6 @@ def _follow_column_lineage( def make_lineage_edges_from_parsing_result( sql_lineage: SqlParsingResult, audit_stamp: datetime, lineage_type: str ) -> List[LineageEdge]: - # Note: This ignores the out_tables section of the sql parsing result. audit_stamp = datetime.now(timezone.utc) @@ -295,7 +292,7 @@ def lineage_via_catalog_lineage_api( try: lineage_client: lineage_v1.LineageClient = lineage_v1.LineageClient() - bigquery_client: BigQueryClient = get_bigquery_client(self.config) + bigquery_client: BigQueryClient = self.config.get_bigquery_client() # Filtering datasets datasets = list(bigquery_client.list_datasets(project_id)) project_tables = [] @@ -381,12 +378,12 @@ def _get_parsed_audit_log_events(self, project_id: str) -> Iterable[QueryEvent]: parse_fn: Callable[[Any], Optional[Union[ReadEvent, QueryEvent]]] if self.config.use_exported_bigquery_audit_metadata: logger.info("Populating lineage info via exported GCP audit logs") - bq_client = get_bigquery_client(self.config) + bq_client = self.config.get_bigquery_client() entries = self._get_exported_bigquery_audit_metadata(bq_client) parse_fn = self._parse_exported_bigquery_audit_metadata else: logger.info("Populating lineage info via exported GCP audit logs") - logging_client = _make_gcp_logging_client(project_id) + logging_client = self.config.make_gcp_logging_client(project_id) entries = self._get_bigquery_log_entries(logging_client) parse_fn = self._parse_bigquery_log_entries @@ -406,15 +403,13 @@ def _get_bigquery_log_entries( ) -> Iterable[AuditLogEntry]: self.report.num_total_log_entries[client.project] = 0 # Add a buffer to start and end time to account for delays in logging events. 
- start_time = (self.config.start_time - self.config.max_query_duration).strftime( - BQ_DATETIME_FORMAT - ) - self.report.log_entry_start_time = start_time + corrected_start_time = self.config.start_time - self.config.max_query_duration + start_time = corrected_start_time.strftime(BQ_DATETIME_FORMAT) + self.report.log_entry_start_time = corrected_start_time - end_time = (self.config.end_time + self.config.max_query_duration).strftime( - BQ_DATETIME_FORMAT - ) - self.report.log_entry_end_time = end_time + corrected_end_time = self.config.end_time + self.config.max_query_duration + end_time = corrected_end_time.strftime(BQ_DATETIME_FORMAT) + self.report.log_entry_end_time = corrected_end_time filter = self.BQ_FILTER_RULE_TEMPLATE_V2.format( start_time=start_time, @@ -465,12 +460,12 @@ def _get_exported_bigquery_audit_metadata( corrected_start_time = self.config.start_time - self.config.max_query_duration start_time = corrected_start_time.strftime(BQ_DATETIME_FORMAT) start_date = corrected_start_time.strftime(BQ_DATE_SHARD_FORMAT) - self.report.audit_start_time = start_time + self.report.audit_start_time = corrected_start_time corrected_end_time = self.config.end_time + self.config.max_query_duration end_time = corrected_end_time.strftime(BQ_DATETIME_FORMAT) end_date = corrected_end_time.strftime(BQ_DATE_SHARD_FORMAT) - self.report.audit_end_time = end_time + self.report.audit_end_time = corrected_end_time for dataset in self.config.bigquery_audit_metadata_datasets: logger.info( @@ -827,8 +822,8 @@ def test_capability(self, project_id: str) -> None: f"Connection test got one exported_bigquery_audit_metadata {entry}" ) else: - gcp_logging_client: GCPLoggingClient = _make_gcp_logging_client( - project_id, self.config.extra_client_options + gcp_logging_client: GCPLoggingClient = self.config.make_gcp_logging_client( + project_id ) for entry in self._get_bigquery_log_entries(gcp_logging_client, limit=1): logger.debug(f"Connection test got one audit metadata entry {entry}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py index 1081dd8eec1ec..20014f2f3fac4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py @@ -2,7 +2,6 @@ import json import logging import os -import textwrap import time import uuid from dataclasses import dataclass @@ -21,9 +20,6 @@ ) import humanfriendly -from google.cloud.bigquery import Client as BigQueryClient -from google.cloud.logging_v2.client import Client as GCPLoggingClient -from ratelimiter import RateLimiter from datahub.configuration.time_window_config import get_time_bucket from datahub.emitter.mce_builder import make_user_urn @@ -32,8 +28,6 @@ from datahub.ingestion.api.source_helpers import auto_empty_dataset_usage_statistics from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( - BQ_AUDIT_V2, - BQ_FILTER_RULE_TEMPLATE, AuditEvent, AuditLogEntry, BigQueryAuditMetadata, @@ -42,14 +36,9 @@ QueryEvent, ReadEvent, ) +from datahub.ingestion.source.bigquery_v2.bigquery_audit_api import BigQueryAuditLogApi from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.common import ( - BQ_DATE_SHARD_FORMAT, - BQ_DATETIME_FORMAT, - 
_make_gcp_logging_client, - get_bigquery_client, -) from datahub.ingestion.source.usage.usage_common import ( TOTAL_BUDGET_FOR_QUERY_LIST, make_usage_workunit, @@ -101,77 +90,6 @@ class OperationalDataMeta: custom_type: Optional[str] = None -def bigquery_audit_metadata_query_template( - dataset: str, - use_date_sharded_tables: bool, - limit: Optional[int] = None, -) -> str: - """ - Receives a dataset (with project specified) and returns a query template that is used to query exported - v2 AuditLogs containing protoPayloads of type BigQueryAuditMetadata. - :param dataset: the dataset to query against in the form of $PROJECT.$DATASET - :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log - tables - :param limit: maximum number of events to query for - :return: a query template, when supplied start_time and end_time, can be used to query audit logs from BigQuery - """ - - limit_text = f"limit {limit}" if limit else "" - - shard_condition = "" - if use_date_sharded_tables: - from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`" - shard_condition = ( - """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """ - ) - else: - from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`" - - # Deduplicates insertId via QUALIFY, see: - # https://cloud.google.com/logging/docs/reference/v2/rest/v2/LogEntry, insertId field - query = f""" - SELECT - timestamp, - logName, - insertId, - protopayload_auditlog AS protoPayload, - protopayload_auditlog.metadataJson AS metadata - FROM - {from_table} - WHERE ( - timestamp >= "{{start_time}}" - AND timestamp < "{{end_time}}" - ) - {shard_condition} - AND protopayload_auditlog.serviceName="bigquery.googleapis.com" - AND - ( - ( - protopayload_auditlog.methodName IN - ( - "google.cloud.bigquery.v2.JobService.Query", - "google.cloud.bigquery.v2.JobService.InsertJob" - ) - AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE" - AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL - AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL - AND ( - JSON_EXTRACT_ARRAY(protopayload_auditlog.metadataJson, - "$.jobChange.job.jobStats.queryStats.referencedTables") IS NOT NULL - OR - JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig.destinationTable") IS NOT NULL - ) - ) - OR - JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.tableDataRead.reason") = "JOB" - ) - QUALIFY ROW_NUMBER() OVER (PARTITION BY insertId, timestamp, logName) = 1 - {limit_text}; - """ - - return textwrap.dedent(query) - - class BigQueryUsageState(Closeable): read_events: FileBackedDict[ReadEvent] query_events: FileBackedDict[QueryEvent] @@ -617,109 +535,6 @@ def _store_usage_event( return True return False - def _get_exported_bigquery_audit_metadata( - self, - bigquery_client: BigQueryClient, - limit: Optional[int] = None, - ) -> Iterable[BigQueryAuditMetadata]: - if self.config.bigquery_audit_metadata_datasets is None: - return - - corrected_start_time = self.config.start_time - self.config.max_query_duration - start_time = corrected_start_time.strftime(BQ_DATETIME_FORMAT) - start_date = corrected_start_time.strftime(BQ_DATE_SHARD_FORMAT) - self.report.audit_start_time = start_time - - corrected_end_time = self.config.end_time + self.config.max_query_duration - end_time = 
corrected_end_time.strftime(BQ_DATETIME_FORMAT) - end_date = corrected_end_time.strftime(BQ_DATE_SHARD_FORMAT) - self.report.audit_end_time = end_time - - for dataset in self.config.bigquery_audit_metadata_datasets: - logger.info( - f"Start loading log entries from BigQueryAuditMetadata in {dataset}" - ) - - query = bigquery_audit_metadata_query_template( - dataset, - self.config.use_date_sharded_audit_log_tables, - limit=limit, - ).format( - start_time=start_time, - end_time=end_time, - start_date=start_date, - end_date=end_date, - ) - - query_job = bigquery_client.query(query) - logger.info( - f"Finished loading log entries from BigQueryAuditMetadata in {dataset}" - ) - if self.config.rate_limit: - with RateLimiter(max_calls=self.config.requests_per_min, period=60): - yield from query_job - else: - yield from query_job - - def _get_bigquery_log_entries_via_gcp_logging( - self, client: GCPLoggingClient, limit: Optional[int] = None - ) -> Iterable[AuditLogEntry]: - - filter = self._generate_filter(BQ_AUDIT_V2) - logger.debug(filter) - - list_entries: Iterable[AuditLogEntry] - rate_limiter: Optional[RateLimiter] = None - if self.config.rate_limit: - # client.list_entries is a generator, does api calls to GCP Logging when it runs out of entries and needs to fetch more from GCP Logging - # to properly ratelimit we multiply the page size by the number of requests per minute - rate_limiter = RateLimiter( - max_calls=self.config.requests_per_min * self.config.log_page_size, - period=60, - ) - - list_entries = client.list_entries( - filter_=filter, - page_size=self.config.log_page_size, - max_results=limit, - ) - - for i, entry in enumerate(list_entries): - if i == 0: - logger.info(f"Starting log load from GCP Logging for {client.project}") - if i % 1000 == 0: - logger.info(f"Loaded {i} log entries from GCP Log for {client.project}") - self.report.total_query_log_entries += 1 - - if rate_limiter: - with rate_limiter: - yield entry - else: - yield entry - - logger.info( - f"Finished loading {self.report.total_query_log_entries} log entries from GCP Logging for {client.project}" - ) - - def _generate_filter(self, audit_templates: Dict[str, str]) -> str: - # We adjust the filter values a bit, since we need to make sure that the join - # between query events and read events is complete. For example, this helps us - # handle the case where the read happens within our time range but the query - # completion event is delayed and happens after the configured end time. - - start_time = (self.config.start_time - self.config.max_query_duration).strftime( - BQ_DATETIME_FORMAT - ) - self.report.log_entry_start_time = start_time - end_time = (self.config.end_time + self.config.max_query_duration).strftime( - BQ_DATETIME_FORMAT - ) - self.report.log_entry_end_time = end_time - filter = audit_templates[BQ_FILTER_RULE_TEMPLATE].format( - start_time=start_time, end_time=end_time - ) - return filter - @staticmethod def _get_destination_table(event: AuditEvent) -> Optional[BigQueryTableRef]: if ( @@ -954,25 +769,45 @@ def _parse_exported_bigquery_audit_metadata( def _get_parsed_bigquery_log_events( self, project_id: str, limit: Optional[int] = None ) -> Iterable[AuditEvent]: + audit_log_api = BigQueryAuditLogApi( + self.report, self.config.rate_limit, self.config.requests_per_min + ) + # We adjust the filter values a bit, since we need to make sure that the join + # between query events and read events is complete. 
For example, this helps us + # handle the case where the read happens within our time range but the query + # completion event is delayed and happens after the configured end time. + corrected_start_time = self.config.start_time - self.config.max_query_duration + corrected_end_time = self.config.end_time + -self.config.max_query_duration + self.report.audit_start_time = corrected_start_time + self.report.audit_end_time = corrected_end_time + parse_fn: Callable[[Any], Optional[AuditEvent]] if self.config.use_exported_bigquery_audit_metadata: - bq_client = get_bigquery_client(self.config) - entries = self._get_exported_bigquery_audit_metadata( + bq_client = self.config.get_bigquery_client() + + entries = audit_log_api.get_exported_bigquery_audit_metadata( bigquery_client=bq_client, + bigquery_audit_metadata_datasets=self.config.bigquery_audit_metadata_datasets, + use_date_sharded_audit_log_tables=self.config.use_date_sharded_audit_log_tables, + start_time=corrected_start_time, + end_time=corrected_end_time, limit=limit, ) parse_fn = self._parse_exported_bigquery_audit_metadata else: - logging_client = _make_gcp_logging_client( - project_id, self.config.extra_client_options - ) - entries = self._get_bigquery_log_entries_via_gcp_logging( - logging_client, limit=limit + logging_client = self.config.make_gcp_logging_client(project_id) + entries = audit_log_api.get_bigquery_log_entries_via_gcp_logging( + logging_client, + start_time=corrected_start_time, + end_time=corrected_end_time, + log_page_size=self.config.log_page_size, + limit=limit, ) parse_fn = self._parse_bigquery_log_entry for entry in entries: try: + self.report.total_query_log_entries += 1 event = parse_fn(entry) if event: yield event diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 7dd51d5b20e8e..dd8b0c75302c2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -520,15 +520,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.connection.close() - lru_cache_functions: List[Callable] = [ - self.data_dictionary.get_tables_for_database, - self.data_dictionary.get_views_for_database, - self.data_dictionary.get_columns_for_schema, - self.data_dictionary.get_pk_constraints_for_schema, - self.data_dictionary.get_fk_constraints_for_schema, - ] - for func in lru_cache_functions: - self.report.lru_cache_info[func.__name__] = func.cache_info()._asdict() # type: ignore + self.report_cache_info() # TODO: The checkpoint state for stale entity detection can be committed here. 
@@ -593,6 +585,17 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: yield from self.usage_extractor.get_usage_workunits(discovered_datasets) + def report_cache_info(self): + lru_cache_functions: List[Callable] = [ + self.data_dictionary.get_tables_for_database, + self.data_dictionary.get_views_for_database, + self.data_dictionary.get_columns_for_schema, + self.data_dictionary.get_pk_constraints_for_schema, + self.data_dictionary.get_fk_constraints_for_schema, + ] + for func in lru_cache_functions: + self.report.lru_cache_info[func.__name__] = func.cache_info()._asdict() # type: ignore + def report_warehouse_failure(self): if self.config.warehouse is not None: self.report_error( diff --git a/metadata-ingestion/src/datahub/utilities/perf_timer.py b/metadata-ingestion/src/datahub/utilities/perf_timer.py index 3fac1d68c3a9e..46eb0e25e4fbf 100644 --- a/metadata-ingestion/src/datahub/utilities/perf_timer.py +++ b/metadata-ingestion/src/datahub/utilities/perf_timer.py @@ -6,21 +6,53 @@ class PerfTimer(AbstractContextManager): """ A context manager that gives easy access to elapsed time for performance measurement. + """ - start_time: Optional[float] = None - end_time: Optional[float] = None + def __init__(self) -> None: + self.start_time: Optional[float] = None + self.end_time: Optional[float] = None + self._past_active_time: float = 0 + self.paused: Optional[bool] = None def start(self) -> None: + # TODO + # assert ( + # self.end_time is None + # ), "Can not start a finished timer. Did you accidentally re-use this timer ?" + + if self.end_time is not None: + self._past_active_time = self.elapsed_seconds() + self.start_time = time.perf_counter() self.end_time = None + if self.paused: + self.paused = False + + def pause_timer(self) -> "PerfTimer": + assert ( + not self.paused and not self.end_time + ), "Can not pause a paused/stopped timer" + assert ( + self.start_time is not None + ), "Can not pause a timer that hasn't started. Did you forget to start the timer ?" + self._past_active_time = self.elapsed_seconds() + self.start_time = None + self.end_time = None + self.paused = True + return self def finish(self) -> None: - assert self.start_time is not None + assert ( + self.start_time is not None + ), "Can not stop a timer that hasn't started. Did you forget to start the timer ?" self.end_time = time.perf_counter() def __enter__(self) -> "PerfTimer": - self.start() + if self.paused: # Entering paused timer context, NO OP + pass + else: + self.start() return self def __exit__( @@ -29,16 +61,34 @@ def __exit__( exc: Any, traceback: Any, ) -> Optional[bool]: - self.finish() + if self.paused: # Exiting paused timer context, resume timer + self.start() + else: + self.finish() return None def elapsed_seconds(self) -> float: """ Returns the elapsed time in seconds. """ + if self.paused: + return self._past_active_time - assert self.start_time is not None + assert self.start_time is not None, "Did you forget to start the timer ?" 
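+        # Total elapsed time is the span since the last (re)start plus any active time accumulated before a pause.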
if self.end_time is None: - return time.perf_counter() - self.start_time + return (time.perf_counter() - self.start_time) + (self._past_active_time) + else: + return (self.end_time - self.start_time) + self._past_active_time + + def __repr__(self) -> str: + return repr(self.as_obj()) + + def __str__(self) -> str: + return self.__repr__() + + def as_obj(self) -> Optional[str]: + if self.start_time is None: + return None else: - return self.end_time - self.start_time + time_taken = self.elapsed_seconds() + return f"{time_taken:.3f} seconds" diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py index fc8ca166b105a..bc9a3f41a9655 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_source.py +++ b/metadata-ingestion/tests/unit/test_bigquery_source.py @@ -18,6 +18,7 @@ BigQueryTableRef, ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config +from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( BigQueryDataDictionary, BigqueryProject, @@ -100,7 +101,7 @@ def test_get_projects_with_project_ids(client_mock): } ) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test1")) - assert source._get_projects(client_mock) == [ + assert source._get_projects() == [ BigqueryProject("test-1", "test-1"), BigqueryProject("test-2", "test-2"), ] @@ -110,7 +111,7 @@ def test_get_projects_with_project_ids(client_mock): {"project_ids": ["test-1", "test-2"], "project_id": "test-3"} ) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test2")) - assert source._get_projects(client_mock) == [ + assert source._get_projects() == [ BigqueryProject("test-1", "test-1"), BigqueryProject("test-2", "test-2"), ] @@ -125,7 +126,7 @@ def test_get_projects_with_project_ids_overrides_project_id_pattern(): } ) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) - projects = source._get_projects(MagicMock()) + projects = source._get_projects() assert projects == [ BigqueryProject(id="test-project", name="test-project"), BigqueryProject(id="test-project-2", name="test-project-2"), @@ -156,7 +157,7 @@ def test_get_dataplatform_instance_aspect_returns_project_id(): def test_get_projects_with_single_project_id(client_mock): config = BigQueryV2Config.parse_obj({"project_id": "test-3"}) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test1")) - assert source._get_projects(client_mock) == [ + assert source._get_projects() == [ BigqueryProject("test-3", "test-3"), ] assert client_mock.list_projects.call_count == 0 @@ -177,7 +178,8 @@ def test_get_projects_by_list(client_mock): config = BigQueryV2Config.parse_obj({}) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test1")) - assert source._get_projects(client_mock) == [ + source.bigquery_data_dictionary.set_client(client_mock) + assert source._get_projects() == [ BigqueryProject("test-1", "one"), BigqueryProject("test-2", "two"), ] @@ -195,7 +197,7 @@ def test_get_projects_filter_by_pattern(get_projects_mock): {"project_id_pattern": {"deny": ["^test-project$"]}} ) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) - projects = source._get_projects(MagicMock()) + projects = source._get_projects() assert projects == [ BigqueryProject(id="test-project-2", name="Test Project 2"), ] @@ -209,7 +211,7 @@ def test_get_projects_list_empty(get_projects_mock): {"project_id_pattern": 
{"deny": ["^test-project$"]}} ) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) - projects = source._get_projects(MagicMock()) + projects = source._get_projects() assert len(source.report.failures) == 1 assert projects == [] @@ -227,7 +229,7 @@ def test_get_projects_list_failure( source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) caplog.records.clear() with caplog.at_level(logging.ERROR): - projects = source._get_projects(MagicMock()) + projects = source._get_projects() assert len(caplog.records) == 1 assert error_str in caplog.records[0].msg assert len(source.report.failures) == 1 @@ -242,7 +244,7 @@ def test_get_projects_list_fully_filtered(get_projects_mock): {"project_id_pattern": {"deny": ["^test-project$"]}} ) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) - projects = source._get_projects(MagicMock()) + projects = source._get_projects() assert len(source.report.failures) == 0 assert projects == [] @@ -496,10 +498,11 @@ def test_table_processing_logic(client_mock, data_dictionary_mock): data_dictionary_mock.get_tables_for_dataset.return_value = None source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) + source.bigquery_data_dictionary.set_client(client_mock) _ = list( source.get_tables_for_dataset( - conn=client_mock, project_id="test-project", dataset_name="test-dataset" + project_id="test-project", dataset_name="test-dataset" ) ) @@ -507,7 +510,7 @@ def test_table_processing_logic(client_mock, data_dictionary_mock): # args only available from python 3.8 and that's why call_args_list is sooo ugly tables: Dict[str, TableListItem] = data_dictionary_mock.call_args_list[0][0][ - 3 + 2 ] # alternatively for table in tables.keys(): assert table in ["test-table", "test-sharded-table_20220102"] @@ -568,10 +571,11 @@ def test_table_processing_logic_date_named_tables(client_mock, data_dictionary_m data_dictionary_mock.get_tables_for_dataset.return_value = None source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) + source.bigquery_data_dictionary.set_client(client_mock) _ = list( source.get_tables_for_dataset( - conn=client_mock, project_id="test-project", dataset_name="test-dataset" + project_id="test-project", dataset_name="test-dataset" ) ) @@ -579,7 +583,7 @@ def test_table_processing_logic_date_named_tables(client_mock, data_dictionary_m # args only available from python 3.8 and that's why call_args_list is sooo ugly tables: Dict[str, TableListItem] = data_dictionary_mock.call_args_list[0][0][ - 3 + 2 ] # alternatively for table in tables.keys(): assert tables[table].table_id in ["test-table", "20220103"] @@ -651,9 +655,10 @@ def test_get_views_for_dataset( ) ) query_mock.return_value = [row1, row2] + bigquery_data_dictionary = BigQueryDataDictionary(BigQueryV2Report()) + bigquery_data_dictionary.set_client(client_mock) - views = BigQueryDataDictionary.get_views_for_dataset( - conn=client_mock, + views = bigquery_data_dictionary.get_views_for_dataset( project_id="test-project", dataset_name="test-dataset", has_data_read=False, diff --git a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py index 6ee1f05f0582c..8c50619bee53d 100644 --- a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py +++ b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py @@ -4,13 +4,12 @@ from freezegun import freeze_time from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( - 
BQ_AUDIT_V2, BigqueryTableIdentifier, BigQueryTableRef, ) +from datahub.ingestion.source.bigquery_v2.bigquery_audit_api import BigQueryAuditLogApi from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor FROZEN_TIME = "2021-07-20 00:00:00" @@ -111,10 +110,12 @@ def test_bigqueryv2_filters(): OR protoPayload.metadata.tableDataRead.reason = "JOB" )""" # noqa: W293 - source = BigQueryUsageExtractor( - config, BigQueryV2Report(), dataset_urn_builder=lambda _: "" + api = BigQueryAuditLogApi( + BigQueryV2Report(), config.rate_limit, config.requests_per_min ) - filter: str = source._generate_filter(BQ_AUDIT_V2) + corrected_start_time = config.start_time - config.max_query_duration + corrected_end_time = config.end_time + config.max_query_duration + filter: str = api._generate_filter(corrected_start_time, corrected_end_time) assert filter == expected_filter diff --git a/metadata-ingestion/tests/unit/utilities/test_perf_timer.py b/metadata-ingestion/tests/unit/utilities/test_perf_timer.py new file mode 100644 index 0000000000000..9fbd3a7b5d9cd --- /dev/null +++ b/metadata-ingestion/tests/unit/utilities/test_perf_timer.py @@ -0,0 +1,41 @@ +import time + +from datahub.utilities.perf_timer import PerfTimer + + +def test_perf_timer_simple(): + with PerfTimer() as timer: + time.sleep(1) + assert round(timer.elapsed_seconds()) == 1 + + assert round(timer.elapsed_seconds()) == 1 + + +def test_perf_timer_paused_timer(): + with PerfTimer() as current_timer: + time.sleep(1) + assert round(current_timer.elapsed_seconds()) == 1 + with current_timer.pause_timer(): + time.sleep(2) + assert round(current_timer.elapsed_seconds()) == 1 + assert round(current_timer.elapsed_seconds()) == 1 + time.sleep(1) + + assert round(current_timer.elapsed_seconds()) == 2 + + +def test_generator_with_paused_timer(): + def generator_function(): + with PerfTimer() as inner_timer: + time.sleep(1) + for i in range(10): + time.sleep(0.2) + with inner_timer.pause_timer(): + time.sleep(0.2) + yield i + assert round(inner_timer.elapsed_seconds()) == 1 + 0.2 * 10 + + with PerfTimer() as outer_timer: + seq = generator_function() + list([i for i in seq]) + assert round(outer_timer.elapsed_seconds()) == 1 + 0.2 * 10 + 0.2 * 10 From 95bbcbed84b7e91b52626ecd606953c0b68d6bd8 Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Thu, 10 Aug 2023 18:09:17 +0530 Subject: [PATCH 02/11] wip, timers not added for unused methods - remove these ? 
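Context for reviewers on the timer pattern this series relies on: each API
wrapper is expected to time only the round trips to BigQuery / GCP Logging,
pausing while callers consume the yielded results so that downstream parsing
does not inflate the reported API time. A minimal sketch of that pattern is
below; ExampleApi and its timer field are illustrative stand-ins, and only
PerfTimer and its pause/resume behaviour come from this series.

    from typing import Iterable, Iterator

    from datahub.utilities.perf_timer import PerfTimer


    class ExampleApi:
        def __init__(self) -> None:
            # stands in for a perf-report timer field such as list_log_entries
            self.list_entries_timer = PerfTimer()

        def list_entries(self, rows: Iterable[int]) -> Iterator[int]:
            # the outer context times the API call; the inner pause excludes
            # whatever the consumer of this generator does with each row
            with self.list_entries_timer as timer:
                for row in rows:  # stands in for client.list_entries(...)
                    with timer.pause_timer():
                        yield row


    api = ExampleApi()
    assert list(api.list_entries(range(3))) == [0, 1, 2]
    print(api.list_entries_timer.elapsed_seconds())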
--- .../ingestion/source/bigquery_v2/bigquery.py | 138 +----- .../source/bigquery_v2/bigquery_audit.py | 43 -- ...audit_api.py => bigquery_audit_log_api.py} | 176 ++++++-- .../source/bigquery_v2/bigquery_config.py | 13 +- .../source/bigquery_v2/bigquery_report.py | 21 +- ...query_schema.py => bigquery_schema_api.py} | 45 +- .../ingestion/source/bigquery_v2/lineage.py | 401 +++++++++--------- .../ingestion/source/bigquery_v2/profiler.py | 2 +- .../ingestion/source/bigquery_v2/usage.py | 27 +- .../integration/bigquery_v2/test_bigquery.py | 6 +- .../tests/unit/test_bigquery_lineage.py | 5 +- .../tests/unit/test_bigquery_profiler.py | 2 +- .../tests/unit/test_bigquery_source.py | 20 +- .../unit/test_bigqueryv2_usage_source.py | 10 +- 14 files changed, 460 insertions(+), 449 deletions(-) rename metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/{bigquery_audit_api.py => bigquery_audit_log_api.py} (54%) rename metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/{bigquery_schema.py => bigquery_schema_api.py} (95%) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 442e1e525fdf6..a38117cce346d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -5,7 +5,7 @@ import re import traceback from collections import defaultdict -from datetime import datetime, timedelta, timezone +from datetime import datetime, timedelta from typing import Dict, Iterable, List, Optional, Set, Type, Union, cast from google.cloud import bigquery @@ -43,22 +43,19 @@ ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( +from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( BigqueryColumn, - BigQueryDataDictionary, BigqueryDataset, BigqueryProject, BigqueryTable, + BigQueryTechnicalSchemaApi, BigqueryView, ) from datahub.ingestion.source.bigquery_v2.common import ( BQ_EXTERNAL_DATASET_URL_TEMPLATE, BQ_EXTERNAL_TABLE_URL_TEMPLATE, ) -from datahub.ingestion.source.bigquery_v2.lineage import ( - BigqueryLineageExtractor, - make_lineage_edges_from_parsing_result, -) +from datahub.ingestion.source.bigquery_v2.lineage import BigqueryLineageExtractor from datahub.ingestion.source.bigquery_v2.profiler import BigqueryProfiler from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor from datahub.ingestion.source.common.subtypes import ( @@ -88,7 +85,6 @@ ) from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( DatasetProperties, - UpstreamLineage, ViewProperties, ) from datahub.metadata.com.linkedin.pegasus2avro.schema import ( @@ -107,11 +103,9 @@ ) from datahub.metadata.schema_classes import ( DataPlatformInstanceClass, - DatasetLineageTypeClass, GlobalTagsClass, TagAssociationClass, ) -from datahub.specific.dataset import DatasetPatchBuilder from datahub.utilities.file_backed_collections import FileBackedDict from datahub.utilities.hive_schema_to_avro import ( HiveColumnToAvroConverter, @@ -120,7 +114,7 @@ from datahub.utilities.mapping import Constants from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.registries.domain_registry import DomainRegistry -from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage +from 
datahub.utilities.sqlglot_lineage import SchemaResolver from datahub.utilities.time import datetime_to_ts_millis logger: logging.Logger = logging.getLogger(__name__) @@ -226,10 +220,14 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): set_dataset_urn_to_lower(self.config.convert_urns_to_lowercase) - self.bigquery_data_dictionary = BigQueryDataDictionary(self.report) + self.bigquery_data_dictionary = BigQueryTechnicalSchemaApi(self.report) # For database, schema, tables, views, etc - self.lineage_extractor = BigqueryLineageExtractor(config, self.report) + self.lineage_extractor = BigqueryLineageExtractor( + config, + self.report, + dataset_urn_builder=self.gen_dataset_urn_from_ref, + ) self.usage_extractor = BigQueryUsageExtractor( config, self.report, dataset_urn_builder=self.gen_dataset_urn_from_ref ) @@ -300,7 +298,7 @@ def metadata_read_capability_test( client: bigquery.Client = config.get_bigquery_client() assert client report = BigQueryV2Report() - bigquery_data_dictionary = BigQueryDataDictionary(report) + bigquery_data_dictionary = BigQueryTechnicalSchemaApi(report) bigquery_data_dictionary.set_client(client) result = bigquery_data_dictionary.get_datasets_for_project_id( project_id, 10 @@ -336,7 +334,9 @@ def lineage_capability_test( project_ids: List[str], report: BigQueryV2Report, ) -> CapabilityReport: - lineage_extractor = BigqueryLineageExtractor(connection_conf, report) + lineage_extractor = BigqueryLineageExtractor( + connection_conf, report, lambda ref: "" + ) for project_id in project_ids: try: logger.info(f"Lineage capability test for project {project_id}") @@ -520,9 +520,12 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ) if self._should_ingest_lineage(): - for project in projects: - self.report.set_ingestion_stage(project.id, "Lineage Extraction") - yield from self.generate_lineage(project.id) + yield from self.lineage_extractor.get_lineage_workunits( + projects, + self.sql_parser_schema_resolver, + self.view_definition_ids, + self.table_refs, + ) def _should_ingest_usage(self) -> bool: if not self.config.include_usage_statistics: @@ -671,72 +674,6 @@ def _process_project( tables=db_tables, ) - def generate_lineage(self, project_id: str) -> Iterable[MetadataWorkUnit]: - logger.info(f"Generate lineage for {project_id}") - lineage = self.lineage_extractor.calculate_lineage_for_project( - project_id, - sql_parser_schema_resolver=self.sql_parser_schema_resolver, - ) - - if self.config.lineage_parse_view_ddl: - for view, view_definition_id in self.view_definition_ids[ - project_id - ].items(): - view_definition = self.view_definitions[view_definition_id] - raw_view_lineage = sqlglot_lineage( - view_definition, - schema_resolver=self.sql_parser_schema_resolver, - default_db=project_id, - ) - if raw_view_lineage.debug_info.table_error: - logger.debug( - f"Failed to parse lineage for view {view}: {raw_view_lineage.debug_info.table_error}" - ) - self.report.num_view_definitions_failed_parsing += 1 - self.report.view_definitions_parsing_failures.append( - f"Table-level sql parsing error for view {view}: {raw_view_lineage.debug_info.table_error}" - ) - continue - elif raw_view_lineage.debug_info.column_error: - self.report.num_view_definitions_failed_column_parsing += 1 - self.report.view_definitions_parsing_failures.append( - f"Column-level sql parsing error for view {view}: {raw_view_lineage.debug_info.column_error}" - ) - else: - self.report.num_view_definitions_parsed += 1 - - # For views, we override the upstreams obtained by parsing 
audit logs - # as they may contain indirectly referenced tables. - ts = datetime.now(timezone.utc) - lineage[view] = set( - make_lineage_edges_from_parsing_result( - raw_view_lineage, - audit_stamp=ts, - lineage_type=DatasetLineageTypeClass.VIEW, - ) - ) - - for lineage_key in lineage.keys(): - if lineage_key not in self.table_refs: - continue - - table_ref = BigQueryTableRef.from_string_name(lineage_key) - dataset_urn = self.gen_dataset_urn( - project_id=table_ref.table_identifier.project_id, - dataset_name=table_ref.table_identifier.dataset, - table=table_ref.table_identifier.get_table_display_name(), - ) - - lineage_info = self.lineage_extractor.get_lineage_for_table( - bq_table=table_ref, - bq_table_urn=dataset_urn, - platform=self.platform, - lineage_metadata=lineage, - ) - - if lineage_info: - yield from self.gen_lineage(dataset_urn, lineage_info) - def _process_schema( self, project_id: str, @@ -1069,39 +1006,6 @@ def gen_dataset_workunits( domain_config=self.config.domain, ) - def gen_lineage( - self, - dataset_urn: str, - upstream_lineage: Optional[UpstreamLineage] = None, - ) -> Iterable[MetadataWorkUnit]: - if upstream_lineage is None: - return - - if upstream_lineage is not None: - if self.config.incremental_lineage: - patch_builder: DatasetPatchBuilder = DatasetPatchBuilder( - urn=dataset_urn - ) - for upstream in upstream_lineage.upstreams: - patch_builder.add_upstream_lineage(upstream) - - yield from [ - MetadataWorkUnit( - id=f"upstreamLineage-for-{dataset_urn}", - mcp_raw=mcp, - ) - for mcp in patch_builder.build() - ] - else: - if not self.config.extract_column_lineage: - upstream_lineage.fineGrainedLineages = None - - yield from [ - MetadataChangeProposalWrapper( - entityUrn=dataset_urn, aspect=upstream_lineage - ).as_workunit() - ] - def gen_tags_aspect_workunit( self, dataset_urn: str, tags_to_add: List[str] ) -> MetadataWorkUnit: diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py index 0f9b37c93feaa..b0ac77201b415 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py @@ -13,48 +13,6 @@ get_first_missing_key_any, ) -BQ_FILTER_RULE_TEMPLATE = "BQ_FILTER_RULE_TEMPLATE" - -BQ_AUDIT_V2 = { - BQ_FILTER_RULE_TEMPLATE: """ -resource.type=("bigquery_project" OR "bigquery_dataset") -AND -timestamp >= "{start_time}" -AND -timestamp < "{end_time}" -AND protoPayload.serviceName="bigquery.googleapis.com" -AND -( - ( - protoPayload.methodName= - ( - "google.cloud.bigquery.v2.JobService.Query" - OR - "google.cloud.bigquery.v2.JobService.InsertJob" - ) - AND protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE" - AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:* - AND protoPayload.metadata.jobChange.job.jobConfig.queryConfig:* - AND - ( - ( - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:* - AND NOT protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables =~ "projects/.*/datasets/.*/tables/__TABLES__|__TABLES_SUMMARY__|INFORMATION_SCHEMA.*" - ) - OR - ( - protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable:* - ) - ) - ) - OR - protoPayload.metadata.tableDataRead.reason = "JOB" -) -""".strip( - "\t \n" - ), -} - AuditLogEntry = Any # BigQueryAuditMetadata is the v2 format in which audit logs are exported to BigQuery @@ -606,7 +564,6 @@ def from_query_event( 
query_event: QueryEvent, debug_include_full_payloads: bool = False, ) -> "ReadEvent": - readEvent = ReadEvent( actor_email=query_event.actor_email, timestamp=query_event.timestamp, diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_api.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py similarity index 54% rename from metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_api.py rename to metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py index 09a9098ced338..048f90de7ee0a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py @@ -1,15 +1,13 @@ import logging import textwrap from datetime import datetime -from typing import Iterable, List, Optional +from typing import Callable, Iterable, List, Optional from google.cloud import bigquery from google.cloud.logging_v2.client import Client as GCPLoggingClient from ratelimiter import RateLimiter from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( - BQ_AUDIT_V2, - BQ_FILTER_RULE_TEMPLATE, AuditLogEntry, BigQueryAuditMetadata, ) @@ -24,6 +22,7 @@ logger: logging.Logger = logging.getLogger(__name__) +# TODO: separation of api/extractor classes - client wise, functionality wise ? class BigQueryAuditLogApi: def __init__( self, @@ -31,13 +30,17 @@ def __init__( rate_limit: bool, requests_per_min: int, ) -> None: - self.api_perf_report = report + self.report = report self.rate_limit = rate_limit self.requests_per_min = requests_per_min + # TODO; should we refractor and move this to schema api , as this uses bigquery client ? def get_exported_bigquery_audit_metadata( self, bigquery_client: bigquery.Client, + bigquery_audit_metadata_query_template: Callable[ + [str, bool, Optional[int]], str + ], bigquery_audit_metadata_datasets: Optional[List[str]], use_date_sharded_audit_log_tables: bool, start_time: datetime, @@ -53,7 +56,7 @@ def get_exported_bigquery_audit_metadata( audit_end_time = end_time.strftime(BQ_DATETIME_FORMAT) audit_end_date = end_time.strftime(BQ_DATE_SHARD_FORMAT) - with self.api_perf_report.get_exported_bigquery_audit_metadata as current_timer: + with self.report.get_exported_log_entries as current_timer: for dataset in bigquery_audit_metadata_datasets: logger.info( f"Start loading log entries from BigQueryAuditMetadata in {dataset}" @@ -62,7 +65,7 @@ def get_exported_bigquery_audit_metadata( query = bigquery_audit_metadata_query_template( dataset, use_date_sharded_audit_log_tables, - limit=limit, + limit, ).format( start_time=audit_start_time, end_time=audit_end_time, @@ -74,22 +77,24 @@ def get_exported_bigquery_audit_metadata( logger.info( f"Finished loading log entries from BigQueryAuditMetadata in {dataset}" ) - with current_timer.pause_timer(): - if self.rate_limit: - with RateLimiter(max_calls=self.requests_per_min, period=60): - yield from query_job - else: - yield from query_job + + if self.rate_limit: + with RateLimiter(max_calls=self.requests_per_min, period=60): + for entry in query_job: + with current_timer.pause_timer(): + yield entry + else: + for entry in query_job: + with current_timer.pause_timer(): + yield entry def get_bigquery_log_entries_via_gcp_logging( self, client: GCPLoggingClient, - start_time: datetime, - end_time: datetime, + filter: str, log_page_size: int, limit: Optional[int] = None, ) -> Iterable[AuditLogEntry]: - filter = 
self._generate_filter(start_time, end_time) logger.debug(filter) list_entries: Iterable[AuditLogEntry] @@ -102,7 +107,7 @@ def get_bigquery_log_entries_via_gcp_logging( period=60, ) - with self.api_perf_report.get_bigquery_log_entries_via_gcp_logging as current_timer: + with self.report.list_log_entries as current_timer: list_entries = client.list_entries( filter_=filter, page_size=log_page_size, @@ -114,6 +119,7 @@ def get_bigquery_log_entries_via_gcp_logging( logger.info( f"Starting log load from GCP Logging for {client.project}" ) + if i % 1000 == 0: logger.info( f"Loaded {i} log entries from GCP Log for {client.project}" @@ -126,18 +132,12 @@ def get_bigquery_log_entries_via_gcp_logging( else: yield entry - def _generate_filter(self, start_time: datetime, end_time: datetime) -> str: - audit_start_time = (start_time).strftime(BQ_DATETIME_FORMAT) - - audit_end_time = (end_time).strftime(BQ_DATETIME_FORMAT) - - filter = BQ_AUDIT_V2[BQ_FILTER_RULE_TEMPLATE].format( - start_time=audit_start_time, end_time=audit_end_time - ) - return filter + logger.info( + f"Finished loading log entries from GCP Log for {client.project}" + ) -def bigquery_audit_metadata_query_template( +def bigquery_audit_metadata_query_template_usage( dataset: str, use_date_sharded_tables: bool, limit: Optional[int] = None, @@ -206,3 +206,127 @@ def bigquery_audit_metadata_query_template( """ return textwrap.dedent(query) + + +def bigquery_audit_metadata_query_template_lineage( + dataset: str, use_date_sharded_tables: bool, limit: Optional[int] = None +) -> str: + """ + Receives a dataset (with project specified) and returns a query template that is used to query exported + AuditLogs containing protoPayloads of type BigQueryAuditMetadata. + Include only those that: + - have been completed (jobStatus.jobState = "DONE") + - do not contain errors (jobStatus.errorResults is none) + :param dataset: the dataset to query against in the form of $PROJECT.$DATASET + :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log + tables + :param limit: set a limit for the maximum event to return. 
It is used for connection testing currently + :return: a query template, when supplied start_time and end_time, can be used to query audit logs from BigQuery + """ + limit_text = f"limit {limit}" if limit else "" + + shard_condition = "" + if use_date_sharded_tables: + from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`" + shard_condition = ( + """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """ + ) + else: + from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`" + + query = f""" + SELECT + timestamp, + logName, + insertId, + protopayload_auditlog AS protoPayload, + protopayload_auditlog.metadataJson AS metadata + FROM + {from_table} + WHERE ( + timestamp >= "{{start_time}}" + AND timestamp < "{{end_time}}" + ) + {shard_condition} + AND protopayload_auditlog.serviceName="bigquery.googleapis.com" + AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE" + AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL + AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL + {limit_text}; + """ + + return textwrap.dedent(query) + + +BQ_FILTER_RULE_TEMPLATE_V2_USAGE = """ +resource.type=("bigquery_project" OR "bigquery_dataset") +AND +timestamp >= "{start_time}" +AND +timestamp < "{end_time}" +AND protoPayload.serviceName="bigquery.googleapis.com" +AND +( + ( + protoPayload.methodName= + ( + "google.cloud.bigquery.v2.JobService.Query" + OR + "google.cloud.bigquery.v2.JobService.InsertJob" + ) + AND protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE" + AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:* + AND protoPayload.metadata.jobChange.job.jobConfig.queryConfig:* + AND + ( + ( + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:* + AND NOT protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables =~ "projects/.*/datasets/.*/tables/__TABLES__|__TABLES_SUMMARY__|INFORMATION_SCHEMA.*" + ) + OR + ( + protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable:* + ) + ) + ) + OR + protoPayload.metadata.tableDataRead.reason = "JOB" +) +""".strip( + "\t \n" +) + +BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE = """ +resource.type=("bigquery_project") +AND +( + protoPayload.methodName= + ( + "google.cloud.bigquery.v2.JobService.Query" + OR + "google.cloud.bigquery.v2.JobService.InsertJob" + ) + AND + protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE" + AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:* + AND ( + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:* + OR + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedViews:* + ) + AND ( + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/_.*/tables/anon.*" + AND + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/INFORMATION_SCHEMA.*" + AND + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/__TABLES__" + AND + protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable !~ "projects/.*/datasets/_.*/tables/anon.*" + ) + +) +AND +timestamp >= "{start_time}" +AND +timestamp < "{end_time}" +""".strip() diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py 
b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index fa689d571c8cc..84fdead338ee6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -6,7 +6,7 @@ import pydantic from google.cloud import bigquery from google.cloud.logging_v2.client import Client as GCPLoggingClient -from pydantic import Field, PositiveInt, PrivateAttr, root_validator +from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.validate_field_removal import pydantic_removed_field @@ -266,6 +266,17 @@ def profile_default_settings(cls, values: Dict) -> Dict: return values + @validator("bigquery_audit_metadata_datasets") + def validate_bigquery_audit_metadata_datasets( + cls, v: Optional[List[str]], values: Dict + ) -> Dict: + if values.get("use_exported_bigquery_audit_metadata"): + assert ( + v and len(v) > 0 + ), "`bigquery_audit_metadata_datasets` should be set if using `use_exported_bigquery_audit_metadata` for usage/lineage." + + return values + @root_validator(pre=False) def backward_compatibility_configs_set(cls, values: Dict) -> Dict: project_id = values.get("project_id") diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py index fc725e0cda3c4..62fd23bb1b68a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py @@ -17,7 +17,7 @@ class BigQueryApiPerfReport: list_projects = PerfTimer() - get_datasets_for_project = PerfTimer() + list_datasets = PerfTimer() get_columns_for_dataset = PerfTimer() get_tables_for_dataset = PerfTimer() list_tables = PerfTimer() @@ -25,8 +25,8 @@ class BigQueryApiPerfReport: class BigQueryAuditLogApiPerfReport: - get_exported_bigquery_audit_metadata = PerfTimer() - get_bigquery_log_entries_via_gcp_logging = PerfTimer() + get_exported_log_entries = PerfTimer() + list_log_entries = PerfTimer() @dataclass @@ -46,8 +46,12 @@ class BigQueryV2Report( num_skipped_lineage_entries_other: TopKDict[str, int] = field( default_factory=int_top_k_dict ) - num_total_log_entries: TopKDict[str, int] = field(default_factory=int_top_k_dict) - num_parsed_log_entries: TopKDict[str, int] = field(default_factory=int_top_k_dict) + num_lineage_total_log_entries: TopKDict[str, int] = field( + default_factory=int_top_k_dict + ) + num_lineage_parsed_log_entries: TopKDict[str, int] = field( + default_factory=int_top_k_dict + ) num_lineage_log_parse_failures: TopKDict[str, int] = field( default_factory=int_top_k_dict ) @@ -57,7 +61,14 @@ class BigQueryV2Report( lineage_mem_size: Dict[str, str] = field(default_factory=TopKDict) lineage_extraction_sec: Dict[str, float] = field(default_factory=TopKDict) usage_extraction_sec: Dict[str, float] = field(default_factory=TopKDict) + num_usage_total_log_entries: TopKDict[str, int] = field( + default_factory=int_top_k_dict + ) + num_usage_parsed_log_entries: TopKDict[str, int] = field( + default_factory=int_top_k_dict + ) usage_error_count: Dict[str, int] = field(default_factory=int_top_k_dict) + num_usage_resources_dropped: int = 0 num_usage_operations_dropped: int = 0 operation_dropped: LossyList[str] = field(default_factory=LossyList) diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py similarity index 95% rename from metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py rename to metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py index 68dd0eeefe09b..e7fcd5bd390c0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py @@ -347,11 +347,14 @@ class BigqueryQuery: table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC""" -class BigQueryDataDictionary: +# TODO: remove static methods from here +# TODO: move queries into separate file +class BigQueryTechnicalSchemaApi: def __init__(self, report: BigQueryApiPerfReport) -> None: self.bq_client: Optional[bigquery.Client] = None self.api_perf_report = report + # TODO: remove need to set_client. maybe pass in constructor? def set_client(self, bq_client: bigquery.Client) -> None: self.bq_client = bq_client @@ -359,10 +362,9 @@ def get_client(self) -> bigquery.Client: assert self.bq_client is not None return self.bq_client - @staticmethod - def get_query_result(conn: bigquery.Client, query: str) -> RowIterator: + def get_query_result(self, query: str) -> RowIterator: logger.debug(f"Query : {query}") - resp = conn.query(query) + resp = self.get_client().query(query) return resp.result() def get_projects(self) -> List[BigqueryProject]: @@ -376,7 +378,7 @@ def get_projects(self) -> List[BigqueryProject]: def get_datasets_for_project_id( self, project_id: str, maxResults: Optional[int] = None ) -> List[BigqueryDataset]: - with self.api_perf_report.get_datasets_for_project: + with self.api_perf_report.list_datasets: datasets = self.get_client().list_datasets( project_id, max_results=maxResults ) @@ -384,9 +386,9 @@ def get_datasets_for_project_id( BigqueryDataset(name=d.dataset_id, labels=d.labels) for d in datasets ] - @staticmethod + # This is not used anywhere def get_datasets_for_project_id_with_information_schema( - conn: bigquery.Client, project_id: str + self, project_id: str ) -> List[BigqueryDataset]: """ This method is not used as of now, due to below limitation. @@ -394,8 +396,7 @@ def get_datasets_for_project_id_with_information_schema( We'll need Region wise separate queries to fetch all datasets https://cloud.google.com/bigquery/docs/information-schema-datasets-schemata """ - schemas = BigQueryDataDictionary.get_query_result( - conn, + schemas = self.get_query_result( BigqueryQuery.datasets_for_project_id.format(project_id=project_id), ) return [ @@ -431,8 +432,7 @@ def get_tables_for_dataset( if with_data_read_permission: # Tables are ordered by name and table suffix to make sure we always process the latest sharded table # and skip the others. Sharded tables are tables with suffix _20220102 - cur = BigQueryDataDictionary.get_query_result( - self.get_client(), + cur = self.get_query_result( BigqueryQuery.tables_for_dataset.format( project_id=project_id, dataset_name=dataset_name, @@ -444,8 +444,7 @@ def get_tables_for_dataset( else: # Tables are ordered by name and table suffix to make sure we always process the latest sharded table # and skip the others. 
Sharded tables are tables with suffix _20220102 - cur = BigQueryDataDictionary.get_query_result( - self.get_client(), + cur = self.get_query_result( BigqueryQuery.tables_for_dataset_without_partition_data.format( project_id=project_id, dataset_name=dataset_name, @@ -458,7 +457,7 @@ def get_tables_for_dataset( for table in cur: try: with current_timer.pause_timer(): - yield BigQueryDataDictionary._make_bigquery_table( + yield BigQueryTechnicalSchemaApi._make_bigquery_table( table, tables.get(table.table_name) ) except Exception as e: @@ -520,15 +519,13 @@ def get_views_for_dataset( ) -> Iterator[BigqueryView]: with self.api_perf_report.get_views_for_dataset as current_timer: if has_data_read: - cur = BigQueryDataDictionary.get_query_result( - self.get_client(), + cur = self.get_query_result( BigqueryQuery.views_for_dataset.format( project_id=project_id, dataset_name=dataset_name ), ) else: - cur = BigQueryDataDictionary.get_query_result( - self.get_client(), + cur = self.get_query_result( BigqueryQuery.views_for_dataset_without_data_read.format( project_id=project_id, dataset_name=dataset_name ), @@ -537,7 +534,7 @@ def get_views_for_dataset( for table in cur: try: with current_timer.pause_timer(): - yield BigQueryDataDictionary._make_bigquery_view(table) + yield BigQueryTechnicalSchemaApi._make_bigquery_view(table) except Exception as e: view_name = f"{project_id}.{dataset_name}.{table.table_name}" logger.warning( @@ -575,8 +572,7 @@ def get_columns_for_dataset( columns: Dict[str, List[BigqueryColumn]] = defaultdict(list) with self.api_perf_report.get_columns_for_dataset: try: - cur = BigQueryDataDictionary.get_query_result( - self.get_client(), + cur = self.get_query_result( BigqueryQuery.columns_for_dataset.format( project_id=project_id, dataset_name=dataset_name ) @@ -620,14 +616,13 @@ def get_columns_for_dataset( return columns - @staticmethod + # This is not used anywhere def get_columns_for_table( - conn: bigquery.Client, + self, table_identifier: BigqueryTableIdentifier, column_limit: Optional[int], ) -> List[BigqueryColumn]: - cur = BigQueryDataDictionary.get_query_result( - conn, + cur = self.get_query_result( BigqueryQuery.columns_for_table.format(table_identifier=table_identifier), ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index 210018e55be15..d1bc687ebaca0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -1,7 +1,6 @@ import collections import itertools import logging -import textwrap from dataclasses import dataclass from datetime import datetime, timezone from typing import Any, Callable, Dict, FrozenSet, Iterable, List, Optional, Set, Union @@ -10,9 +9,10 @@ from google.cloud.bigquery import Client as BigQueryClient from google.cloud.datacatalog import lineage_v1 from google.cloud.logging_v2.client import Client as GCPLoggingClient -from ratelimiter import RateLimiter from datahub.emitter import mce_builder +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( AuditLogEntry, BigQueryAuditMetadata, @@ -21,12 +21,18 @@ QueryEvent, ReadEvent, ) +from datahub.ingestion.source.bigquery_v2.bigquery_audit_log_api import ( + BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE, + BigQueryAuditLogApi, + 
bigquery_audit_metadata_query_template_lineage, +) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.common import ( - BQ_DATE_SHARD_FORMAT, - BQ_DATETIME_FORMAT, +from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( + BigqueryProject, + BigQueryTechnicalSchemaApi, ) +from datahub.ingestion.source.bigquery_v2.common import BQ_DATETIME_FORMAT from datahub.metadata.schema_classes import ( AuditStampClass, DatasetLineageTypeClass, @@ -36,6 +42,7 @@ UpstreamClass, UpstreamLineageClass, ) +from datahub.specific.dataset import DatasetPatchBuilder from datahub.utilities import memory_footprint from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.sqlglot_lineage import ( @@ -177,98 +184,132 @@ def make_lineage_edges_from_parsing_result( class BigqueryLineageExtractor: - BQ_FILTER_RULE_TEMPLATE_V2 = """ -resource.type=("bigquery_project") -AND -( - protoPayload.methodName= - ( - "google.cloud.bigquery.v2.JobService.Query" - OR - "google.cloud.bigquery.v2.JobService.InsertJob" - ) - AND - protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE" - AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:* - AND ( - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:* - OR - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedViews:* - ) - AND ( - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/_.*/tables/anon.*" - AND - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/INFORMATION_SCHEMA.*" - AND - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/__TABLES__" - AND - protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable !~ "projects/.*/datasets/_.*/tables/anon.*" - ) - -) -AND -timestamp >= "{start_time}" -AND -timestamp < "{end_time}" -""".strip() - - def __init__(self, config: BigQueryV2Config, report: BigQueryV2Report): + def __init__( + self, + config: BigQueryV2Config, + report: BigQueryV2Report, + dataset_urn_builder: Callable[[BigQueryTableRef], str], + ): self.config = config self.report = report + self.dataset_urn_builder = dataset_urn_builder def error(self, log: logging.Logger, key: str, reason: str) -> None: self.report.report_warning(key, reason) log.error(f"{key} => {reason}") - @staticmethod - def bigquery_audit_metadata_query_template( - dataset: str, use_date_sharded_tables: bool, limit: Optional[int] = None - ) -> str: - """ - Receives a dataset (with project specified) and returns a query template that is used to query exported - AuditLogs containing protoPayloads of type BigQueryAuditMetadata. - Include only those that: - - have been completed (jobStatus.jobState = "DONE") - - do not contain errors (jobStatus.errorResults is none) - :param dataset: the dataset to query against in the form of $PROJECT.$DATASET - :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log - tables - :param limit: set a limit for the maximum event to return. 
It is used for connection testing currently - :return: a query template, when supplied start_time and end_time, can be used to query audit logs from BigQuery - """ - limit_text = f"limit {limit}" if limit else "" - - shard_condition = "" - if use_date_sharded_tables: - from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`" - shard_condition = ( - """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """ + def get_lineage_workunits( + self, + projects: List[BigqueryProject], + sql_parser_schema_resolver: SchemaResolver, + view_definition_ids: Dict[str, Dict[str, str]], + table_refs: Set[str], + ) -> Iterable[MetadataWorkUnit]: + for project in projects: + self.report.set_ingestion_stage(project.id, "Lineage Extraction") + yield from self.generate_lineage( + project.id, + sql_parser_schema_resolver, + view_definition_ids[project.id], + table_refs, ) - else: - from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`" - - query = f""" - SELECT - timestamp, - logName, - insertId, - protopayload_auditlog AS protoPayload, - protopayload_auditlog.metadataJson AS metadata - FROM - {from_table} - WHERE ( - timestamp >= "{{start_time}}" - AND timestamp < "{{end_time}}" + + def generate_lineage( + self, + project_id: str, + sql_parser_schema_resolver: SchemaResolver, + view_definition_ids: Dict[str, str], + table_refs: Set[str], + ) -> Iterable[MetadataWorkUnit]: + logger.info(f"Generate lineage for {project_id}") + lineage = self.calculate_lineage_for_project( + project_id, sql_parser_schema_resolver=sql_parser_schema_resolver + ) + + if self.config.lineage_parse_view_ddl: + for view, view_definition_id in view_definition_ids.items(): + view_definition = view_definition_ids[view_definition_id] + raw_view_lineage = sqlglot_lineage( + view_definition, + schema_resolver=sql_parser_schema_resolver, + default_db=project_id, + ) + if raw_view_lineage.debug_info.table_error: + logger.debug( + f"Failed to parse lineage for view {view}: {raw_view_lineage.debug_info.table_error}" + ) + self.report.num_view_definitions_failed_parsing += 1 + self.report.view_definitions_parsing_failures.append( + f"Table-level sql parsing error for view {view}: {raw_view_lineage.debug_info.table_error}" + ) + continue + elif raw_view_lineage.debug_info.column_error: + self.report.num_view_definitions_failed_column_parsing += 1 + self.report.view_definitions_parsing_failures.append( + f"Column-level sql parsing error for view {view}: {raw_view_lineage.debug_info.column_error}" + ) + else: + self.report.num_view_definitions_parsed += 1 + + # For views, we override the upstreams obtained by parsing audit logs + # as they may contain indirectly referenced tables. 
+ ts = datetime.now(timezone.utc) + lineage[view] = set( + make_lineage_edges_from_parsing_result( + raw_view_lineage, + audit_stamp=ts, + lineage_type=DatasetLineageTypeClass.VIEW, + ) + ) + + for lineage_key in lineage.keys(): + if lineage_key not in table_refs: + continue + + table_ref = BigQueryTableRef.from_string_name(lineage_key) + dataset_urn = self.dataset_urn_builder(table_ref) + + lineage_info = self.get_lineage_for_table( + bq_table=table_ref, + bq_table_urn=dataset_urn, + lineage_metadata=lineage, ) - {shard_condition} - AND protopayload_auditlog.serviceName="bigquery.googleapis.com" - AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE" - AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL - AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL - {limit_text}; - """ - return textwrap.dedent(query) + if lineage_info: + yield from self.gen_lineage(dataset_urn, lineage_info) + + def gen_lineage( + self, + dataset_urn: str, + upstream_lineage: Optional[UpstreamLineageClass] = None, + ) -> Iterable[MetadataWorkUnit]: + if upstream_lineage is None: + return + + if upstream_lineage is not None: + if self.config.incremental_lineage: + patch_builder: DatasetPatchBuilder = DatasetPatchBuilder( + urn=dataset_urn + ) + for upstream in upstream_lineage.upstreams: + patch_builder.add_upstream_lineage(upstream) + + yield from [ + MetadataWorkUnit( + id=f"upstreamLineage-for-{dataset_urn}", + mcp_raw=mcp, + ) + for mcp in patch_builder.build() + ] + else: + if not self.config.extract_column_lineage: + upstream_lineage.fineGrainedLineages = None + + yield from [ + MetadataChangeProposalWrapper( + entityUrn=dataset_urn, aspect=upstream_lineage + ).as_workunit() + ] def lineage_via_catalog_lineage_api( self, project_id: str @@ -292,22 +333,26 @@ def lineage_via_catalog_lineage_api( try: lineage_client: lineage_v1.LineageClient = lineage_v1.LineageClient() - bigquery_client: BigQueryClient = self.config.get_bigquery_client() + + data_dictionary = BigQueryTechnicalSchemaApi(self.report) + data_dictionary.set_client(self.config.get_bigquery_client()) # Filtering datasets - datasets = list(bigquery_client.list_datasets(project_id)) + datasets = list(data_dictionary.get_datasets_for_project_id(project_id)) project_tables = [] for dataset in datasets: # Enables only tables where type is TABLE, VIEW or MATERIALIZED_VIEW (not EXTERNAL) project_tables.extend( [ table - for table in bigquery_client.list_tables(dataset.dataset_id) + for table in data_dictionary.list_tables( + dataset.name, project_id + ) if table.table_type in ["TABLE", "VIEW", "MATERIALIZED_VIEW"] ] ) # Convert project tables to .. 
format - project_tables = list( + project_table_names = list( map( lambda table: "{}.{}.{}".format( table.project, table.dataset_id, table.table_id @@ -318,7 +363,7 @@ def lineage_via_catalog_lineage_api( lineage_map: Dict[str, Set[LineageEdge]] = {} curr_date = datetime.now() - for table in project_tables: + for table in project_table_names: logger.info("Creating lineage map for table %s", table) upstreams = set() downstream_table = lineage_v1.EntityReference() @@ -375,126 +420,61 @@ def lineage_via_catalog_lineage_api( raise e def _get_parsed_audit_log_events(self, project_id: str) -> Iterable[QueryEvent]: + audit_log_api = BigQueryAuditLogApi( + self.report, self.config.rate_limit, self.config.requests_per_min + ) + # We adjust the filter values a bit, since we need to make sure that the join + # between query events and read events is complete. For example, this helps us + # handle the case where the read happens within our time range but the query + # completion event is delayed and happens after the configured end time. + corrected_start_time = self.config.start_time - self.config.max_query_duration + corrected_end_time = self.config.end_time + -self.config.max_query_duration + self.report.audit_start_time = corrected_start_time + self.report.audit_end_time = corrected_end_time + parse_fn: Callable[[Any], Optional[Union[ReadEvent, QueryEvent]]] if self.config.use_exported_bigquery_audit_metadata: logger.info("Populating lineage info via exported GCP audit logs") bq_client = self.config.get_bigquery_client() - entries = self._get_exported_bigquery_audit_metadata(bq_client) + # TODO: make this call simpler + entries = audit_log_api.get_exported_bigquery_audit_metadata( + bigquery_client=bq_client, + bigquery_audit_metadata_query_template=bigquery_audit_metadata_query_template_lineage, + bigquery_audit_metadata_datasets=self.config.bigquery_audit_metadata_datasets, + use_date_sharded_audit_log_tables=self.config.use_date_sharded_audit_log_tables, + start_time=corrected_start_time, + end_time=corrected_end_time, + ) parse_fn = self._parse_exported_bigquery_audit_metadata else: logger.info("Populating lineage info via exported GCP audit logs") + logging_client = self.config.make_gcp_logging_client(project_id) - entries = self._get_bigquery_log_entries(logging_client) + logger.info( + f"Start loading log entries from BigQuery for {project_id} " + f"with start_time={corrected_start_time} and end_time={corrected_end_time}" + ) + entries = audit_log_api.get_bigquery_log_entries_via_gcp_logging( + logging_client, + BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE.format( + corrected_start_time.strftime(BQ_DATETIME_FORMAT), + corrected_end_time.strftime(BQ_DATETIME_FORMAT), + ), + self.config.log_page_size, + ) parse_fn = self._parse_bigquery_log_entries for entry in entries: - self.report.num_total_log_entries[project_id] += 1 + self.report.num_lineage_total_log_entries[project_id] += 1 try: event = parse_fn(entry) if event: - self.report.num_parsed_log_entries[project_id] += 1 + self.report.num_lineage_parsed_log_entries[project_id] += 1 yield event except Exception as e: logger.warning(f"Unable to parse log entry `{entry}`: {e}") self.report.num_lineage_log_parse_failures[project_id] += 1 - def _get_bigquery_log_entries( - self, client: GCPLoggingClient, limit: Optional[int] = None - ) -> Iterable[AuditLogEntry]: - self.report.num_total_log_entries[client.project] = 0 - # Add a buffer to start and end time to account for delays in logging events. 
- corrected_start_time = self.config.start_time - self.config.max_query_duration - start_time = corrected_start_time.strftime(BQ_DATETIME_FORMAT) - self.report.log_entry_start_time = corrected_start_time - - corrected_end_time = self.config.end_time + self.config.max_query_duration - end_time = corrected_end_time.strftime(BQ_DATETIME_FORMAT) - self.report.log_entry_end_time = corrected_end_time - - filter = self.BQ_FILTER_RULE_TEMPLATE_V2.format( - start_time=start_time, - end_time=end_time, - ) - - logger.info( - f"Start loading log entries from BigQuery for {client.project} with start_time={start_time} and end_time={end_time}" - ) - - if self.config.rate_limit: - with RateLimiter(max_calls=self.config.requests_per_min, period=60): - entries = client.list_entries( - filter_=filter, - page_size=self.config.log_page_size, - max_results=limit, - ) - else: - entries = client.list_entries( - filter_=filter, page_size=self.config.log_page_size, max_results=limit - ) - - logger.info( - f"Start iterating over log entries from BigQuery for {client.project}" - ) - for entry in entries: - self.report.num_total_log_entries[client.project] += 1 - if self.report.num_total_log_entries[client.project] % 1000 == 0: - logger.info( - f"{self.report.num_total_log_entries[client.project]} log entries loaded for project {client.project} so far..." - ) - yield entry - - logger.info( - f"Finished loading {self.report.num_total_log_entries[client.project]} log entries from BigQuery project {client.project} so far" - ) - - def _get_exported_bigquery_audit_metadata( - self, bigquery_client: BigQueryClient, limit: Optional[int] = None - ) -> Iterable[BigQueryAuditMetadata]: - if self.config.bigquery_audit_metadata_datasets is None: - self.error( - logger, "audit-metadata", "bigquery_audit_metadata_datasets not set" - ) - self.report.bigquery_audit_metadata_datasets_missing = True - return - - corrected_start_time = self.config.start_time - self.config.max_query_duration - start_time = corrected_start_time.strftime(BQ_DATETIME_FORMAT) - start_date = corrected_start_time.strftime(BQ_DATE_SHARD_FORMAT) - self.report.audit_start_time = corrected_start_time - - corrected_end_time = self.config.end_time + self.config.max_query_duration - end_time = corrected_end_time.strftime(BQ_DATETIME_FORMAT) - end_date = corrected_end_time.strftime(BQ_DATE_SHARD_FORMAT) - self.report.audit_end_time = corrected_end_time - - for dataset in self.config.bigquery_audit_metadata_datasets: - logger.info( - f"Start loading log entries from BigQueryAuditMetadata in {dataset}" - ) - - query: str = self.bigquery_audit_metadata_query_template( - dataset=dataset, - use_date_sharded_tables=self.config.use_date_sharded_audit_log_tables, - limit=limit, - ).format( - start_time=start_time, - end_time=end_time, - start_date=start_date, - end_date=end_date, - ) - - query_job = bigquery_client.query(query) - - logger.info( - f"Finished loading log entries from BigQueryAuditMetadata in {dataset}" - ) - - if self.config.rate_limit: - with RateLimiter(max_calls=self.config.requests_per_min, period=60): - yield from query_job - else: - yield from query_job - # Currently we only parse JobCompleted events but in future we would want to parse other # events to also create field level lineage. 
def _parse_bigquery_log_entries( @@ -749,7 +729,6 @@ def get_lineage_for_table( bq_table: BigQueryTableRef, bq_table_urn: str, lineage_metadata: Dict[str, Set[LineageEdge]], - platform: str, ) -> Optional[UpstreamLineageClass]: upstream_list: List[UpstreamClass] = [] fine_grained_lineages: List[FineGrainedLineageClass] = [] @@ -757,12 +736,7 @@ def get_lineage_for_table( # even if the lineage is same but the order is different. for upstream in sorted(self.get_upstream_tables(bq_table, lineage_metadata)): upstream_table = BigQueryTableRef.from_string_name(upstream.table) - upstream_table_urn = mce_builder.make_dataset_urn_with_platform_instance( - platform, - upstream_table.table_identifier.get_table_name(), - self.config.platform_instance, - self.config.env, - ) + upstream_table_urn = self.dataset_urn_builder(upstream_table) # Generate table-level lineage. upstream_table_class = UpstreamClass( @@ -812,12 +786,21 @@ def get_lineage_for_table( return None def test_capability(self, project_id: str) -> None: + audit_log_api = BigQueryAuditLogApi( + self.report, self.config.rate_limit, self.config.requests_per_min + ) + if self.config.use_exported_bigquery_audit_metadata: bigquery_client: BigQueryClient = BigQueryClient(project=project_id) - entries = self._get_exported_bigquery_audit_metadata( - bigquery_client=bigquery_client, limit=1 - ) - for entry in entries: + for entry in audit_log_api.get_exported_bigquery_audit_metadata( + bigquery_client=bigquery_client, + bigquery_audit_metadata_query_template=bigquery_audit_metadata_query_template_lineage, + bigquery_audit_metadata_datasets=self.config.bigquery_audit_metadata_datasets, + use_date_sharded_audit_log_tables=self.config.use_date_sharded_audit_log_tables, + start_time=self.config.start_time, + end_time=self.config.end_time, + limit=1, + ): logger.debug( f"Connection test got one exported_bigquery_audit_metadata {entry}" ) @@ -825,5 +808,13 @@ def test_capability(self, project_id: str) -> None: gcp_logging_client: GCPLoggingClient = self.config.make_gcp_logging_client( project_id ) - for entry in self._get_bigquery_log_entries(gcp_logging_client, limit=1): + for entry in audit_log_api.get_bigquery_log_entries_via_gcp_logging( + gcp_logging_client, + filter=BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE.format( + self.config.start_time.strftime(BQ_DATETIME_FORMAT), + self.config.end_time.strftime(BQ_DATETIME_FORMAT), + ), + log_page_size=self.config.log_page_size, + limit=1, + ): logger.debug(f"Connection test got one audit metadata entry {entry}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py index c9dcb4fe35c3f..f825bbf666b64 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py @@ -11,7 +11,7 @@ from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( +from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( RANGE_PARTITION_NAME, BigqueryTable, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py index 20014f2f3fac4..51d74168c4970 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py @@ -36,9 +36,14 @@ QueryEvent, ReadEvent, ) -from datahub.ingestion.source.bigquery_v2.bigquery_audit_api import BigQueryAuditLogApi +from datahub.ingestion.source.bigquery_v2.bigquery_audit_log_api import ( + BQ_FILTER_RULE_TEMPLATE_V2_USAGE, + BigQueryAuditLogApi, + bigquery_audit_metadata_query_template_usage, +) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report +from datahub.ingestion.source.bigquery_v2.common import BQ_DATETIME_FORMAT from datahub.ingestion.source.usage.usage_common import ( TOTAL_BUDGET_FOR_QUERY_LIST, make_usage_workunit, @@ -286,7 +291,8 @@ class BigQueryUsageExtractor: * Aggregation of these statistics into buckets, by day or hour granularity :::note - 1. Depending on the compliance policies setup for the bigquery instance, sometimes logging.read permission is not sufficient. In that case, use either admin or private log viewer permission. + 1. Depending on the compliance policies setup for the bigquery instance, sometimes logging.read permission is not sufficient. + In that case, use either admin or private log viewer permission. ::: """ @@ -788,6 +794,7 @@ def _get_parsed_bigquery_log_events( entries = audit_log_api.get_exported_bigquery_audit_metadata( bigquery_client=bq_client, bigquery_audit_metadata_datasets=self.config.bigquery_audit_metadata_datasets, + bigquery_audit_metadata_query_template=bigquery_audit_metadata_query_template_usage, use_date_sharded_audit_log_tables=self.config.use_date_sharded_audit_log_tables, start_time=corrected_start_time, end_time=corrected_end_time, @@ -796,10 +803,13 @@ def _get_parsed_bigquery_log_events( parse_fn = self._parse_exported_bigquery_audit_metadata else: logging_client = self.config.make_gcp_logging_client(project_id) + logger.info( + f"Start loading log entries from BigQuery for {project_id} " + f"with start_time={corrected_start_time} and end_time={corrected_end_time}" + ) entries = audit_log_api.get_bigquery_log_entries_via_gcp_logging( logging_client, - start_time=corrected_start_time, - end_time=corrected_end_time, + filter=self._generate_filter(corrected_start_time, corrected_end_time), log_page_size=self.config.log_page_size, limit=limit, ) @@ -807,9 +817,10 @@ def _get_parsed_bigquery_log_events( for entry in entries: try: - self.report.total_query_log_entries += 1 + self.report.num_usage_total_log_entries[project_id] += 1 event = parse_fn(entry) if event: + self.report.num_usage_parsed_log_entries[project_id] += 1 yield event except Exception as e: logger.warning( @@ -820,6 +831,12 @@ def _get_parsed_bigquery_log_events( f"log-parse-{project_id}", e, group="usage-log-parse" ) + def _generate_filter(self, corrected_start_time, corrected_end_time): + return BQ_FILTER_RULE_TEMPLATE_V2_USAGE.format( + start_time=corrected_start_time.strftime(BQ_DATETIME_FORMAT), + end_time=corrected_end_time.strftime(BQ_DATETIME_FORMAT), + ) + def get_tables_from_query( self, default_project: str, query: str ) -> Optional[List[BigQueryTableRef]]: diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py index 3bda6c5cce84b..ba3ea06b07623 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py +++ 
b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py @@ -4,7 +4,7 @@ from freezegun import freeze_time from google.cloud.bigquery.table import TableListItem -from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( +from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( BigqueryDataset, BigqueryTable, ) @@ -16,13 +16,13 @@ @freeze_time(FROZEN_TIME) @patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema.BigQueryDataDictionary.get_tables_for_dataset" + "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_tables_for_dataset" ) @patch( "datahub.ingestion.source.bigquery_v2.bigquery.BigqueryV2Source.get_core_table_details" ) @patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema.BigQueryDataDictionary.get_datasets_for_project_id" + "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_datasets_for_project_id" ) @patch("google.cloud.bigquery.Client") def test_bigquery_v2_ingest( diff --git a/metadata-ingestion/tests/unit/test_bigquery_lineage.py b/metadata-ingestion/tests/unit/test_bigquery_lineage.py index c9308fd89ef72..aab923585b6fb 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_lineage.py +++ b/metadata-ingestion/tests/unit/test_bigquery_lineage.py @@ -17,7 +17,9 @@ def test_lineage_with_timestamps(): config = BigQueryV2Config() report = BigQueryV2Report() - extractor: BigqueryLineageExtractor = BigqueryLineageExtractor(config, report) + extractor: BigqueryLineageExtractor = BigqueryLineageExtractor( + config, report, lambda x: "" + ) lineage_entries: List[QueryEvent] = [ QueryEvent( timestamp=datetime.datetime.now(tz=datetime.timezone.utc), @@ -86,7 +88,6 @@ def test_lineage_with_timestamps(): bq_table=bq_table, bq_table_urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,my_project.my_dataset.my_table,PROD)", lineage_metadata=lineage_map, - platform="bigquery", ) assert upstream_lineage assert len(upstream_lineage.upstreams) == 4 diff --git a/metadata-ingestion/tests/unit/test_bigquery_profiler.py b/metadata-ingestion/tests/unit/test_bigquery_profiler.py index a2aec8df93d09..a723b6d475ae3 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_profiler.py +++ b/metadata-ingestion/tests/unit/test_bigquery_profiler.py @@ -2,7 +2,7 @@ from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( +from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( BigqueryColumn, BigqueryTable, PartitionInfo, diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py index bc9a3f41a9655..84f218074d99b 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_source.py +++ b/metadata-ingestion/tests/unit/test_bigquery_source.py @@ -19,9 +19,9 @@ ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( - BigQueryDataDictionary, +from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( BigqueryProject, + BigQueryTechnicalSchemaApi, BigqueryView, ) from datahub.ingestion.source.bigquery_v2.lineage import ( @@ -186,7 +186,7 @@ def test_get_projects_by_list(client_mock): assert client_mock.list_projects.call_count == 1 
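The updated lineage test above stubs the extractor's new third constructor argument with a no-op lambda; in the source itself that argument is a dataset-URN builder, replacing the inline make_dataset_urn_with_platform_instance call removed from get_lineage_for_table earlier in this patch. A minimal sketch of such a callable, assuming a BigQueryV2Config instance is available (the factory and builder names below are illustrative, not part of the change; import paths follow the ones already used by the connector):

    import datahub.emitter.mce_builder as mce_builder
    from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigQueryTableRef
    from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config

    def make_dataset_urn_builder(config: BigQueryV2Config):
        # Returns a BigQueryTableRef -> URN callable with the shape the
        # lineage extractor now expects for its dataset_urn_builder argument.
        def builder(table_ref: BigQueryTableRef) -> str:
            return mce_builder.make_dataset_urn_with_platform_instance(
                "bigquery",
                table_ref.table_identifier.get_table_name(),
                config.platform_instance,
                config.env,
            )

        return builder
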
-@patch.object(BigQueryDataDictionary, "get_projects") +@patch.object(BigQueryTechnicalSchemaApi, "get_projects") def test_get_projects_filter_by_pattern(get_projects_mock): get_projects_mock.return_value = [ BigqueryProject("test-project", "Test Project"), @@ -203,7 +203,7 @@ def test_get_projects_filter_by_pattern(get_projects_mock): ] -@patch.object(BigQueryDataDictionary, "get_projects") +@patch.object(BigQueryTechnicalSchemaApi, "get_projects") def test_get_projects_list_empty(get_projects_mock): get_projects_mock.return_value = [] @@ -216,7 +216,7 @@ def test_get_projects_list_empty(get_projects_mock): assert projects == [] -@patch.object(BigQueryDataDictionary, "get_projects") +@patch.object(BigQueryTechnicalSchemaApi, "get_projects") def test_get_projects_list_failure( get_projects_mock: MagicMock, caplog: pytest.LogCaptureFixture ) -> None: @@ -236,7 +236,7 @@ def test_get_projects_list_failure( assert projects == [] -@patch.object(BigQueryDataDictionary, "get_projects") +@patch.object(BigQueryTechnicalSchemaApi, "get_projects") def test_get_projects_list_fully_filtered(get_projects_mock): get_projects_mock.return_value = [BigqueryProject("test-project", "Test Project")] @@ -445,7 +445,7 @@ def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstr @patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema.BigQueryDataDictionary.get_tables_for_dataset" + "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_tables_for_dataset" ) @patch("google.cloud.bigquery.client.Client") def test_table_processing_logic(client_mock, data_dictionary_mock): @@ -517,7 +517,7 @@ def test_table_processing_logic(client_mock, data_dictionary_mock): @patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema.BigQueryDataDictionary.get_tables_for_dataset" + "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_tables_for_dataset" ) @patch("google.cloud.bigquery.client.Client") def test_table_processing_logic_date_named_tables(client_mock, data_dictionary_mock): @@ -625,7 +625,7 @@ def bigquery_view_2() -> BigqueryView: @patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema.BigQueryDataDictionary.get_query_result" + "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_query_result" ) @patch("google.cloud.bigquery.client.Client") def test_get_views_for_dataset( @@ -655,7 +655,7 @@ def test_get_views_for_dataset( ) ) query_mock.return_value = [row1, row2] - bigquery_data_dictionary = BigQueryDataDictionary(BigQueryV2Report()) + bigquery_data_dictionary = BigQueryTechnicalSchemaApi(BigQueryV2Report()) bigquery_data_dictionary.set_client(client_mock) views = bigquery_data_dictionary.get_views_for_dataset( diff --git a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py index 8c50619bee53d..4cf42da4395f9 100644 --- a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py +++ b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py @@ -7,9 +7,9 @@ BigqueryTableIdentifier, BigQueryTableRef, ) -from datahub.ingestion.source.bigquery_v2.bigquery_audit_api import BigQueryAuditLogApi from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report +from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor FROZEN_TIME = "2021-07-20 00:00:00" @@ -110,12 +110,12 @@ def 
test_bigqueryv2_filters(): OR protoPayload.metadata.tableDataRead.reason = "JOB" )""" # noqa: W293 - api = BigQueryAuditLogApi( - BigQueryV2Report(), config.rate_limit, config.requests_per_min - ) + corrected_start_time = config.start_time - config.max_query_duration corrected_end_time = config.end_time + config.max_query_duration - filter: str = api._generate_filter(corrected_start_time, corrected_end_time) + filter: str = BigQueryUsageExtractor( + config, BigQueryV2Report(), lambda x: "" + )._generate_filter(corrected_start_time, corrected_end_time) assert filter == expected_filter From 76ddc3f878f37c0f95e96ec7094d944b977cf80d Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Fri, 11 Aug 2023 18:02:25 +0530 Subject: [PATCH 03/11] refractor in lineage.py --- .../ingestion/source/bigquery_v2/bigquery.py | 12 +- .../bigquery_v2/bigquery_audit_log_api.py | 5 +- .../source/bigquery_v2/bigquery_report.py | 3 - .../source/bigquery_v2/bigquery_schema_api.py | 240 +------------- .../ingestion/source/bigquery_v2/lineage.py | 302 +++++++++--------- .../ingestion/source/bigquery_v2/queries.py | 224 +++++++++++++ .../tests/unit/test_bigquery_source.py | 123 +++++-- 7 files changed, 492 insertions(+), 417 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index a38117cce346d..4ff4648657959 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -220,7 +220,9 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): set_dataset_urn_to_lower(self.config.convert_urns_to_lowercase) - self.bigquery_data_dictionary = BigQueryTechnicalSchemaApi(self.report) + self.bigquery_data_dictionary = BigQueryTechnicalSchemaApi( + self.report, self.config.get_bigquery_client() + ) # For database, schema, tables, views, etc self.lineage_extractor = BigqueryLineageExtractor( @@ -298,8 +300,7 @@ def metadata_read_capability_test( client: bigquery.Client = config.get_bigquery_client() assert client report = BigQueryV2Report() - bigquery_data_dictionary = BigQueryTechnicalSchemaApi(report) - bigquery_data_dictionary.set_client(client) + bigquery_data_dictionary = BigQueryTechnicalSchemaApi(report, client) result = bigquery_data_dictionary.get_datasets_for_project_id( project_id, 10 ) @@ -502,9 +503,6 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: - bq_client: bigquery.Client = self.config.get_bigquery_client() - self.bigquery_data_dictionary.set_client(bq_client) - projects = self._get_projects() if not projects: return @@ -521,7 +519,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: if self._should_ingest_lineage(): yield from self.lineage_extractor.get_lineage_workunits( - projects, + [p.id for p in projects], self.sql_parser_schema_resolver, self.view_definition_ids, self.table_refs, diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py index 048f90de7ee0a..b017b1d08a1ee 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py @@ -22,7 +22,9 @@ logger: logging.Logger = logging.getLogger(__name__) -# TODO: separation of api/extractor classes - client wise, functionality wise ? +# Api interfaces are separated based on functionality they provide +# rather than the underlying bigquery client that is used to +# provide the functionality. class BigQueryAuditLogApi: def __init__( self, @@ -34,7 +36,6 @@ def __init__( self.rate_limit = rate_limit self.requests_per_min = requests_per_min - # TODO; should we refractor and move this to schema api , as this uses bigquery client ? def get_exported_bigquery_audit_metadata( self, bigquery_client: bigquery.Client, diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py index 62fd23bb1b68a..6d5822723ec64 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py @@ -80,9 +80,6 @@ class BigQueryV2Report( log_page_size: Optional[pydantic.PositiveInt] = None use_exported_bigquery_audit_metadata: Optional[bool] = None end_time: Optional[datetime] = None - # TODO: remove one or replace by lineage ones - log_entry_start_time: Optional[datetime] = None - log_entry_end_time: Optional[datetime] = None audit_start_time: Optional[datetime] = None audit_end_time: Optional[datetime] = None upstream_lineage: LossyDict = field(default_factory=LossyDict) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py index e7fcd5bd390c0..b627af15ca213 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py @@ -17,21 +17,15 @@ BigQueryApiPerfReport, BigQueryV2Report, ) +from datahub.ingestion.source.bigquery_v2.queries import ( + BigqueryQuery, + BigqueryTableType, +) from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable, BaseView logger: logging.Logger = logging.getLogger(__name__) -class BigqueryTableType: - # See https://cloud.google.com/bigquery/docs/information-schema-tables#schema - BASE_TABLE = "BASE TABLE" - EXTERNAL = "EXTERNAL" - VIEW = "VIEW" - MATERIALIZED_VIEW = "MATERIALIZED VIEW" - CLONE = "CLONE" - SNAPSHOT = "SNAPSHOT" - - @dataclass class BigqueryColumn(BaseColumn): field_path: str @@ -131,233 +125,11 @@ class BigqueryProject: datasets: List[BigqueryDataset] = field(default_factory=list) -class BigqueryQuery: - show_datasets: str = ( - "select schema_name from `{project_id}`.INFORMATION_SCHEMA.SCHEMATA" - ) - - datasets_for_project_id: str = """ -select - s.CATALOG_NAME as catalog_name, - s.schema_name as table_schema, - s.location as location, - s.CREATION_TIME as created, - s.LAST_MODIFIED_TIME as last_altered, - o.OPTION_VALUE as comment -from - `{project_id}`.INFORMATION_SCHEMA.SCHEMATA as s - left join `{project_id}`.INFORMATION_SCHEMA.SCHEMATA_OPTIONS as o on o.schema_name = s.schema_name - and o.option_name = "description" -order by - s.schema_name -""" - - # https://cloud.google.com/bigquery/docs/information-schema-table-storage?hl=en - tables_for_dataset = f""" -SELECT - t.table_catalog as table_catalog, - t.table_schema as table_schema, - t.table_name as table_name, - t.table_type as 
table_type, - t.creation_time as created, - ts.last_modified_time as last_altered, - tos.OPTION_VALUE as comment, - is_insertable_into, - ddl, - row_count, - size_bytes as bytes, - num_partitions, - max_partition_id, - active_billable_bytes, - long_term_billable_bytes, - REGEXP_EXTRACT(t.table_name, r".*_(\\d+)$") as table_suffix, - REGEXP_REPLACE(t.table_name, r"_(\\d+)$", "") as table_base - -FROM - `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t - join `{{project_id}}`.`{{dataset_name}}`.__TABLES__ as ts on ts.table_id = t.TABLE_NAME - left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema - and t.TABLE_NAME = tos.TABLE_NAME - and tos.OPTION_NAME = "description" - left join ( - select - table_name, - sum(case when partition_id not in ('__NULL__', '__UNPARTITIONED__', '__STREAMING_UNPARTITIONED__') then 1 else 0 END) as num_partitions, - max(case when partition_id not in ('__NULL__', '__UNPARTITIONED__', '__STREAMING_UNPARTITIONED__') then partition_id else NULL END) as max_partition_id, - sum(total_rows) as total_rows, - sum(case when storage_tier = 'LONG_TERM' then total_billable_bytes else 0 end) as long_term_billable_bytes, - sum(case when storage_tier = 'ACTIVE' then total_billable_bytes else 0 end) as active_billable_bytes, - from - `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.PARTITIONS - group by - table_name) as p on - t.table_name = p.table_name -WHERE - table_type in ('{BigqueryTableType.BASE_TABLE}', '{BigqueryTableType.EXTERNAL}') -{{table_filter}} -order by - table_schema ASC, - table_base ASC, - table_suffix DESC -""" - - tables_for_dataset_without_partition_data = f""" -SELECT - t.table_catalog as table_catalog, - t.table_schema as table_schema, - t.table_name as table_name, - t.table_type as table_type, - t.creation_time as created, - tos.OPTION_VALUE as comment, - is_insertable_into, - ddl, - REGEXP_EXTRACT(t.table_name, r".*_(\\d+)$") as table_suffix, - REGEXP_REPLACE(t.table_name, r"_(\\d+)$", "") as table_base - -FROM - `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t - left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema - and t.TABLE_NAME = tos.TABLE_NAME - and tos.OPTION_NAME = "description" -WHERE - table_type in ('{BigqueryTableType.BASE_TABLE}', '{BigqueryTableType.EXTERNAL}') -{{table_filter}} -order by - table_schema ASC, - table_base ASC, - table_suffix DESC -""" - - views_for_dataset: str = f""" -SELECT - t.table_catalog as table_catalog, - t.table_schema as table_schema, - t.table_name as table_name, - t.table_type as table_type, - t.creation_time as created, - ts.last_modified_time as last_altered, - tos.OPTION_VALUE as comment, - is_insertable_into, - ddl as view_definition, - row_count, - size_bytes -FROM - `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t - join `{{project_id}}`.`{{dataset_name}}`.__TABLES__ as ts on ts.table_id = t.TABLE_NAME - left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema - and t.TABLE_NAME = tos.TABLE_NAME - and tos.OPTION_NAME = "description" -WHERE - table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}') -order by - table_schema ASC, - table_name ASC -""" - - views_for_dataset_without_data_read: str = f""" -SELECT - t.table_catalog as table_catalog, - t.table_schema as table_schema, - t.table_name as table_name, - t.table_type as 
table_type, - t.creation_time as created, - tos.OPTION_VALUE as comment, - is_insertable_into, - ddl as view_definition -FROM - `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t - left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema - and t.TABLE_NAME = tos.TABLE_NAME - and tos.OPTION_NAME = "description" -WHERE - table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}') -order by - table_schema ASC, - table_name ASC -""" - - columns_for_dataset: str = """ -select - c.table_catalog as table_catalog, - c.table_schema as table_schema, - c.table_name as table_name, - c.column_name as column_name, - c.ordinal_position as ordinal_position, - cfp.field_path as field_path, - c.is_nullable as is_nullable, - CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, - description as comment, - c.is_hidden as is_hidden, - c.is_partitioning_column as is_partitioning_column -from - `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMNS c - join `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name - and cfp.column_name = c.column_name -ORDER BY - table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC""" - - optimized_columns_for_dataset: str = """ -select * from -(select - c.table_catalog as table_catalog, - c.table_schema as table_schema, - c.table_name as table_name, - c.column_name as column_name, - c.ordinal_position as ordinal_position, - cfp.field_path as field_path, - c.is_nullable as is_nullable, - CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, - description as comment, - c.is_hidden as is_hidden, - c.is_partitioning_column as is_partitioning_column, - -- We count the columns to be able limit it later - row_number() over (partition by c.table_catalog, c.table_schema, c.table_name order by c.ordinal_position asc, c.data_type DESC) as column_num, - -- Getting the maximum shard for each table - row_number() over (partition by c.table_catalog, c.table_schema, ifnull(REGEXP_EXTRACT(c.table_name, r'(.*)_\\d{{8}}$'), c.table_name), cfp.field_path order by c.table_catalog, c.table_schema asc, c.table_name desc) as shard_num -from - `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMNS c - join `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name - and cfp.column_name = c.column_name - ) --- We filter column limit + 1 to make sure we warn about the limit being reached but not reading too much data -where column_num <= {column_limit} and shard_num = 1 -ORDER BY - table_catalog, table_schema, table_name, ordinal_position, column_num ASC, data_type DESC""" - - columns_for_table: str = """ -select - c.table_catalog as table_catalog, - c.table_schema as table_schema, - c.table_name as table_name, - c.column_name as column_name, - c.ordinal_position as ordinal_position, - cfp.field_path as field_path, - c.is_nullable as is_nullable, - CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, - c.is_hidden as is_hidden, - c.is_partitioning_column as is_partitioning_column, - description as comment -from - `{table_identifier.project_id}`.`{table_identifier.dataset}`.INFORMATION_SCHEMA.COLUMNS as c - join `{table_identifier.project_id}`.`{table_identifier.dataset}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name - and cfp.column_name 
= c.column_name -where - c.table_name = '{table_identifier.table}' -ORDER BY - table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC""" - - -# TODO: remove static methods from here -# TODO: move queries into separate file class BigQueryTechnicalSchemaApi: - def __init__(self, report: BigQueryApiPerfReport) -> None: - self.bq_client: Optional[bigquery.Client] = None + def __init__(self, report: BigQueryApiPerfReport, client: bigquery.Client) -> None: + self.bq_client = client self.api_perf_report = report - # TODO: remove need to set_client. maybe pass in constructor? - def set_client(self, bq_client: bigquery.Client) -> None: - self.bq_client = bq_client - def get_client(self) -> bigquery.Client: assert self.bq_client is not None return self.bq_client diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index d1bc687ebaca0..4dc01bb1c7232 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -6,7 +6,6 @@ from typing import Any, Callable, Dict, FrozenSet, Iterable, List, Optional, Set, Union import humanfriendly -from google.cloud.bigquery import Client as BigQueryClient from google.cloud.datacatalog import lineage_v1 from google.cloud.logging_v2.client import Client as GCPLoggingClient @@ -29,7 +28,6 @@ from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( - BigqueryProject, BigQueryTechnicalSchemaApi, ) from datahub.ingestion.source.bigquery_v2.common import BQ_DATETIME_FORMAT @@ -193,6 +191,9 @@ def __init__( self.config = config self.report = report self.dataset_urn_builder = dataset_urn_builder + self.audit_log_api = BigQueryAuditLogApi( + report, self.config.rate_limit, self.config.requests_per_min + ) def error(self, log: logging.Logger, key: str, reason: str) -> None: self.report.report_warning(key, reason) @@ -200,17 +201,37 @@ def error(self, log: logging.Logger, key: str, reason: str) -> None: def get_lineage_workunits( self, - projects: List[BigqueryProject], + projects: List[str], sql_parser_schema_resolver: SchemaResolver, view_definition_ids: Dict[str, Dict[str, str]], table_refs: Set[str], ) -> Iterable[MetadataWorkUnit]: + views_skip_audit_log_lineage: Set[str] = set() + if self.config.lineage_parse_view_ddl: + view_lineage: Dict[str, Set[LineageEdge]] = {} + for project in projects: + self.populate_view_lineage_with_sql_parsing( + view_lineage, + view_definition_ids[project], + sql_parser_schema_resolver, + project, + ) + + views_skip_audit_log_lineage.update(view_lineage.keys()) + for lineage_key in view_lineage.keys(): + yield from self.gen_lineage_workunits_for_table( + view_lineage, BigQueryTableRef.from_string_name(lineage_key) + ) + + if self.config.use_exported_bigquery_audit_metadata: + projects = ["*"] # project_id not used when using exported metadata + for project in projects: - self.report.set_ingestion_stage(project.id, "Lineage Extraction") + self.report.set_ingestion_stage(project, "Lineage Extraction") yield from self.generate_lineage( - project.id, + project, sql_parser_schema_resolver, - view_definition_ids[project.id], + views_skip_audit_log_lineage, table_refs, ) @@ -218,65 +239,104 @@ def generate_lineage( self, project_id: str, 
sql_parser_schema_resolver: SchemaResolver, - view_definition_ids: Dict[str, str], + views_skip_audit_log_lineage: Set[str], table_refs: Set[str], ) -> Iterable[MetadataWorkUnit]: logger.info(f"Generate lineage for {project_id}") - lineage = self.calculate_lineage_for_project( - project_id, sql_parser_schema_resolver=sql_parser_schema_resolver - ) - - if self.config.lineage_parse_view_ddl: - for view, view_definition_id in view_definition_ids.items(): - view_definition = view_definition_ids[view_definition_id] - raw_view_lineage = sqlglot_lineage( - view_definition, - schema_resolver=sql_parser_schema_resolver, - default_db=project_id, - ) - if raw_view_lineage.debug_info.table_error: - logger.debug( - f"Failed to parse lineage for view {view}: {raw_view_lineage.debug_info.table_error}" - ) - self.report.num_view_definitions_failed_parsing += 1 - self.report.view_definitions_parsing_failures.append( - f"Table-level sql parsing error for view {view}: {raw_view_lineage.debug_info.table_error}" - ) - continue - elif raw_view_lineage.debug_info.column_error: - self.report.num_view_definitions_failed_column_parsing += 1 - self.report.view_definitions_parsing_failures.append( - f"Column-level sql parsing error for view {view}: {raw_view_lineage.debug_info.column_error}" - ) + with PerfTimer() as timer: + try: + if self.config.extract_lineage_from_catalog: + lineage = self.lineage_via_catalog_lineage_api(project_id) else: - self.report.num_view_definitions_parsed += 1 - - # For views, we override the upstreams obtained by parsing audit logs - # as they may contain indirectly referenced tables. - ts = datetime.now(timezone.utc) - lineage[view] = set( - make_lineage_edges_from_parsing_result( - raw_view_lineage, - audit_stamp=ts, - lineage_type=DatasetLineageTypeClass.VIEW, + events = self._get_parsed_audit_log_events(project_id) + lineage = self._create_lineage_map( + events, sql_parser_schema_resolver ) + except Exception as e: + if project_id: + self.report.lineage_failed_extraction.append(project_id) + self.error( + logger, + "lineage", + f"{project_id}: {e}", ) + lineage = {} + + self.report.lineage_metadata_entries[project_id] = len(lineage) + logger.info(f"Built lineage map containing {len(lineage)} entries.") + logger.debug(f"lineage metadata is {lineage}") + self.report.lineage_extraction_sec[project_id] = round( + timer.elapsed_seconds(), 2 + ) + self.report.lineage_mem_size[project_id] = humanfriendly.format_size( + memory_footprint.total_size(lineage) + ) for lineage_key in lineage.keys(): - if lineage_key not in table_refs: + # For views, we do not use the upstreams obtained by parsing audit logs + # as they may contain indirectly referenced tables. 
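+            # (View lineage for the keys in views_skip_audit_log_lineage has
+            # already been emitted from the SQL parsing pass in get_lineage_workunits.)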
+ if ( + lineage_key not in table_refs + or lineage_key in views_skip_audit_log_lineage + ): continue - table_ref = BigQueryTableRef.from_string_name(lineage_key) - dataset_urn = self.dataset_urn_builder(table_ref) + yield from self.gen_lineage_workunits_for_table( + lineage, BigQueryTableRef.from_string_name(lineage_key) + ) - lineage_info = self.get_lineage_for_table( - bq_table=table_ref, - bq_table_urn=dataset_urn, - lineage_metadata=lineage, + def populate_view_lineage_with_sql_parsing( + self, + view_lineage: Dict[str, Set[LineageEdge]], + view_definition_ids: Dict[str, str], + sql_parser_schema_resolver: SchemaResolver, + default_project: str, + ) -> None: + for view, view_definition_id in view_definition_ids.items(): + view_definition = view_definition_ids[view_definition_id] + raw_view_lineage = sqlglot_lineage( + view_definition, + schema_resolver=sql_parser_schema_resolver, + default_db=default_project, ) + if raw_view_lineage.debug_info.table_error: + logger.debug( + f"Failed to parse lineage for view {view}: {raw_view_lineage.debug_info.table_error}" + ) + self.report.num_view_definitions_failed_parsing += 1 + self.report.view_definitions_parsing_failures.append( + f"Table-level sql parsing error for view {view}: {raw_view_lineage.debug_info.table_error}" + ) + continue + elif raw_view_lineage.debug_info.column_error: + self.report.num_view_definitions_failed_column_parsing += 1 + self.report.view_definitions_parsing_failures.append( + f"Column-level sql parsing error for view {view}: {raw_view_lineage.debug_info.column_error}" + ) + else: + self.report.num_view_definitions_parsed += 1 + + ts = datetime.now(timezone.utc) + view_lineage[view] = set( + make_lineage_edges_from_parsing_result( + raw_view_lineage, + audit_stamp=ts, + lineage_type=DatasetLineageTypeClass.VIEW, + ) + ) + + def gen_lineage_workunits_for_table( + self, lineage: dict[str, Set[LineageEdge]], table_ref: BigQueryTableRef + ) -> Iterable[MetadataWorkUnit]: + dataset_urn = self.dataset_urn_builder(table_ref) - if lineage_info: - yield from self.gen_lineage(dataset_urn, lineage_info) + lineage_info = self.get_lineage_for_table( + bq_table=table_ref, + bq_table_urn=dataset_urn, + lineage_metadata=lineage, + ) + if lineage_info: + yield from self.gen_lineage(dataset_urn, lineage_info) def gen_lineage( self, @@ -334,8 +394,10 @@ def lineage_via_catalog_lineage_api( try: lineage_client: lineage_v1.LineageClient = lineage_v1.LineageClient() - data_dictionary = BigQueryTechnicalSchemaApi(self.report) - data_dictionary.set_client(self.config.get_bigquery_client()) + data_dictionary = BigQueryTechnicalSchemaApi( + self.report, self.config.get_bigquery_client() + ) + # Filtering datasets datasets = list(data_dictionary.get_datasets_for_project_id(project_id)) project_tables = [] @@ -420,9 +482,6 @@ def lineage_via_catalog_lineage_api( raise e def _get_parsed_audit_log_events(self, project_id: str) -> Iterable[QueryEvent]: - audit_log_api = BigQueryAuditLogApi( - self.report, self.config.rate_limit, self.config.requests_per_min - ) # We adjust the filter values a bit, since we need to make sure that the join # between query events and read events is complete. 
For example, this helps us # handle the case where the read happens within our time range but the query @@ -434,33 +493,11 @@ def _get_parsed_audit_log_events(self, project_id: str) -> Iterable[QueryEvent]: parse_fn: Callable[[Any], Optional[Union[ReadEvent, QueryEvent]]] if self.config.use_exported_bigquery_audit_metadata: - logger.info("Populating lineage info via exported GCP audit logs") - bq_client = self.config.get_bigquery_client() - # TODO: make this call simpler - entries = audit_log_api.get_exported_bigquery_audit_metadata( - bigquery_client=bq_client, - bigquery_audit_metadata_query_template=bigquery_audit_metadata_query_template_lineage, - bigquery_audit_metadata_datasets=self.config.bigquery_audit_metadata_datasets, - use_date_sharded_audit_log_tables=self.config.use_date_sharded_audit_log_tables, - start_time=corrected_start_time, - end_time=corrected_end_time, - ) + self.get_exported_log_entries(corrected_start_time, corrected_end_time) parse_fn = self._parse_exported_bigquery_audit_metadata else: - logger.info("Populating lineage info via exported GCP audit logs") - - logging_client = self.config.make_gcp_logging_client(project_id) - logger.info( - f"Start loading log entries from BigQuery for {project_id} " - f"with start_time={corrected_start_time} and end_time={corrected_end_time}" - ) - entries = audit_log_api.get_bigquery_log_entries_via_gcp_logging( - logging_client, - BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE.format( - corrected_start_time.strftime(BQ_DATETIME_FORMAT), - corrected_end_time.strftime(BQ_DATETIME_FORMAT), - ), - self.config.log_page_size, + entries = self.get_log_entries_via_gcp_logging( + project_id, corrected_start_time, corrected_end_time ) parse_fn = self._parse_bigquery_log_entries @@ -475,6 +512,42 @@ def _get_parsed_audit_log_events(self, project_id: str) -> Iterable[QueryEvent]: logger.warning(f"Unable to parse log entry `{entry}`: {e}") self.report.num_lineage_log_parse_failures[project_id] += 1 + def get_exported_log_entries( + self, corrected_start_time, corrected_end_time, limit=None + ): + logger.info("Populating lineage info via exported GCP audit logs") + bq_client = self.config.get_bigquery_client() + entries = self.audit_log_api.get_exported_bigquery_audit_metadata( + bigquery_client=bq_client, + bigquery_audit_metadata_query_template=bigquery_audit_metadata_query_template_lineage, + bigquery_audit_metadata_datasets=self.config.bigquery_audit_metadata_datasets, + use_date_sharded_audit_log_tables=self.config.use_date_sharded_audit_log_tables, + start_time=corrected_start_time, + end_time=corrected_end_time, + limit=limit, + ) + return entries + + def get_log_entries_via_gcp_logging( + self, project_id, corrected_start_time, corrected_end_time + ): + logger.info("Populating lineage info via exported GCP audit logs") + + logging_client = self.config.make_gcp_logging_client(project_id) + logger.info( + f"Start loading log entries from BigQuery for {project_id} " + f"with start_time={corrected_start_time} and end_time={corrected_end_time}" + ) + entries = self.audit_log_api.get_bigquery_log_entries_via_gcp_logging( + logging_client, + BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE.format( + corrected_start_time.strftime(BQ_DATETIME_FORMAT), + corrected_end_time.strftime(BQ_DATETIME_FORMAT), + ), + self.config.log_page_size, + ) + return entries + # Currently we only parse JobCompleted events but in future we would want to parse other # events to also create field level lineage. 
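The generate_lineage rework earlier in this file wraps the audit-log fetch and lineage-map construction in a PerfTimer and records the elapsed seconds per project, while the schema API hunks below pause the timer with pause_timer() so that per-row handling of yielded results is not counted against the API call. A small self-contained sketch of that timing pattern, assuming PerfTimer is importable from datahub.utilities.perf_timer and exposes the pause_timer() and elapsed_seconds() helpers used in these hunks (fetch_rows and process are hypothetical stand-ins, not part of the change):

    import time

    from datahub.utilities.perf_timer import PerfTimer

    def fetch_rows():
        # Stand-in for a remote API call whose duration should be measured.
        time.sleep(0.1)
        return [1, 2, 3]

    def process(row):
        # Stand-in for local handling that should not count as API time.
        time.sleep(0.05)

    with PerfTimer() as timer:
        for row in fetch_rows():
            with timer.pause_timer():
                process(row)
        # Reports roughly the fetch time only; the paused work is excluded.
        print(round(timer.elapsed_seconds(), 2))
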
def _parse_bigquery_log_entries( @@ -616,38 +689,6 @@ def _create_lineage_map( logger.info("Exiting create lineage map function") return lineage_map - def _compute_bigquery_lineage( - self, - project_id: str, - sql_parser_schema_resolver: SchemaResolver, - ) -> Dict[str, Set[LineageEdge]]: - lineage_metadata: Dict[str, Set[LineageEdge]] - try: - if self.config.extract_lineage_from_catalog: - lineage_metadata = self.lineage_via_catalog_lineage_api(project_id) - else: - events = self._get_parsed_audit_log_events(project_id) - lineage_metadata = self._create_lineage_map( - events, sql_parser_schema_resolver - ) - except Exception as e: - if project_id: - self.report.lineage_failed_extraction.append(project_id) - self.error( - logger, - "lineage", - f"{project_id}: {e}", - ) - lineage_metadata = {} - - self.report.lineage_mem_size[project_id] = humanfriendly.format_size( - memory_footprint.total_size(lineage_metadata) - ) - self.report.lineage_metadata_entries[project_id] = len(lineage_metadata) - logger.info(f"Built lineage map containing {len(lineage_metadata)} entries.") - logger.debug(f"lineage metadata is {lineage_metadata}") - return lineage_metadata - def get_upstream_tables( self, bq_table: BigQueryTableRef, @@ -708,22 +749,6 @@ def get_upstream_tables( return set(upstreams.values()) - def calculate_lineage_for_project( - self, - project_id: str, - sql_parser_schema_resolver: SchemaResolver, - ) -> Dict[str, Set[LineageEdge]]: - with PerfTimer() as timer: - lineage = self._compute_bigquery_lineage( - project_id, sql_parser_schema_resolver - ) - - self.report.lineage_extraction_sec[project_id] = round( - timer.elapsed_seconds(), 2 - ) - - return lineage - def get_lineage_for_table( self, bq_table: BigQueryTableRef, @@ -786,19 +811,10 @@ def get_lineage_for_table( return None def test_capability(self, project_id: str) -> None: - audit_log_api = BigQueryAuditLogApi( - self.report, self.config.rate_limit, self.config.requests_per_min - ) - if self.config.use_exported_bigquery_audit_metadata: - bigquery_client: BigQueryClient = BigQueryClient(project=project_id) - for entry in audit_log_api.get_exported_bigquery_audit_metadata( - bigquery_client=bigquery_client, - bigquery_audit_metadata_query_template=bigquery_audit_metadata_query_template_lineage, - bigquery_audit_metadata_datasets=self.config.bigquery_audit_metadata_datasets, - use_date_sharded_audit_log_tables=self.config.use_date_sharded_audit_log_tables, - start_time=self.config.start_time, - end_time=self.config.end_time, + for entry in self.get_exported_log_entries( + self.config.start_time, + self.config.end_time, limit=1, ): logger.debug( @@ -808,7 +824,7 @@ def test_capability(self, project_id: str) -> None: gcp_logging_client: GCPLoggingClient = self.config.make_gcp_logging_client( project_id ) - for entry in audit_log_api.get_bigquery_log_entries_via_gcp_logging( + for entry in self.audit_log_api.get_bigquery_log_entries_via_gcp_logging( gcp_logging_client, filter=BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE.format( self.config.start_time.strftime(BQ_DATETIME_FORMAT), diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py new file mode 100644 index 0000000000000..e04ea679584dc --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py @@ -0,0 +1,224 @@ +class BigqueryTableType: + # See https://cloud.google.com/bigquery/docs/information-schema-tables#schema + BASE_TABLE = "BASE TABLE" + EXTERNAL = 
"EXTERNAL" + VIEW = "VIEW" + MATERIALIZED_VIEW = "MATERIALIZED VIEW" + CLONE = "CLONE" + SNAPSHOT = "SNAPSHOT" + + +class BigqueryQuery: + show_datasets: str = ( + "select schema_name from `{project_id}`.INFORMATION_SCHEMA.SCHEMATA" + ) + + datasets_for_project_id: str = """ +select + s.CATALOG_NAME as catalog_name, + s.schema_name as table_schema, + s.location as location, + s.CREATION_TIME as created, + s.LAST_MODIFIED_TIME as last_altered, + o.OPTION_VALUE as comment +from + `{project_id}`.INFORMATION_SCHEMA.SCHEMATA as s + left join `{project_id}`.INFORMATION_SCHEMA.SCHEMATA_OPTIONS as o on o.schema_name = s.schema_name + and o.option_name = "description" +order by + s.schema_name +""" + + # https://cloud.google.com/bigquery/docs/information-schema-table-storage?hl=en + tables_for_dataset = f""" +SELECT + t.table_catalog as table_catalog, + t.table_schema as table_schema, + t.table_name as table_name, + t.table_type as table_type, + t.creation_time as created, + ts.last_modified_time as last_altered, + tos.OPTION_VALUE as comment, + is_insertable_into, + ddl, + row_count, + size_bytes as bytes, + num_partitions, + max_partition_id, + active_billable_bytes, + long_term_billable_bytes, + REGEXP_EXTRACT(t.table_name, r".*_(\\d+)$") as table_suffix, + REGEXP_REPLACE(t.table_name, r"_(\\d+)$", "") as table_base + +FROM + `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t + join `{{project_id}}`.`{{dataset_name}}`.__TABLES__ as ts on ts.table_id = t.TABLE_NAME + left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema + and t.TABLE_NAME = tos.TABLE_NAME + and tos.OPTION_NAME = "description" + left join ( + select + table_name, + sum(case when partition_id not in ('__NULL__', '__UNPARTITIONED__', '__STREAMING_UNPARTITIONED__') then 1 else 0 END) as num_partitions, + max(case when partition_id not in ('__NULL__', '__UNPARTITIONED__', '__STREAMING_UNPARTITIONED__') then partition_id else NULL END) as max_partition_id, + sum(total_rows) as total_rows, + sum(case when storage_tier = 'LONG_TERM' then total_billable_bytes else 0 end) as long_term_billable_bytes, + sum(case when storage_tier = 'ACTIVE' then total_billable_bytes else 0 end) as active_billable_bytes, + from + `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.PARTITIONS + group by + table_name) as p on + t.table_name = p.table_name +WHERE + table_type in ('{BigqueryTableType.BASE_TABLE}', '{BigqueryTableType.EXTERNAL}') +{{table_filter}} +order by + table_schema ASC, + table_base ASC, + table_suffix DESC +""" + + tables_for_dataset_without_partition_data = f""" +SELECT + t.table_catalog as table_catalog, + t.table_schema as table_schema, + t.table_name as table_name, + t.table_type as table_type, + t.creation_time as created, + tos.OPTION_VALUE as comment, + is_insertable_into, + ddl, + REGEXP_EXTRACT(t.table_name, r".*_(\\d+)$") as table_suffix, + REGEXP_REPLACE(t.table_name, r"_(\\d+)$", "") as table_base + +FROM + `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t + left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema + and t.TABLE_NAME = tos.TABLE_NAME + and tos.OPTION_NAME = "description" +WHERE + table_type in ('{BigqueryTableType.BASE_TABLE}', '{BigqueryTableType.EXTERNAL}') +{{table_filter}} +order by + table_schema ASC, + table_base ASC, + table_suffix DESC +""" + + views_for_dataset: str = f""" +SELECT + t.table_catalog as table_catalog, + t.table_schema as 
table_schema, + t.table_name as table_name, + t.table_type as table_type, + t.creation_time as created, + ts.last_modified_time as last_altered, + tos.OPTION_VALUE as comment, + is_insertable_into, + ddl as view_definition, + row_count, + size_bytes +FROM + `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t + join `{{project_id}}`.`{{dataset_name}}`.__TABLES__ as ts on ts.table_id = t.TABLE_NAME + left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema + and t.TABLE_NAME = tos.TABLE_NAME + and tos.OPTION_NAME = "description" +WHERE + table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}') +order by + table_schema ASC, + table_name ASC +""" + + views_for_dataset_without_data_read: str = f""" +SELECT + t.table_catalog as table_catalog, + t.table_schema as table_schema, + t.table_name as table_name, + t.table_type as table_type, + t.creation_time as created, + tos.OPTION_VALUE as comment, + is_insertable_into, + ddl as view_definition +FROM + `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t + left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema + and t.TABLE_NAME = tos.TABLE_NAME + and tos.OPTION_NAME = "description" +WHERE + table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}') +order by + table_schema ASC, + table_name ASC +""" + + columns_for_dataset: str = """ +select + c.table_catalog as table_catalog, + c.table_schema as table_schema, + c.table_name as table_name, + c.column_name as column_name, + c.ordinal_position as ordinal_position, + cfp.field_path as field_path, + c.is_nullable as is_nullable, + CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, + description as comment, + c.is_hidden as is_hidden, + c.is_partitioning_column as is_partitioning_column +from + `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMNS c + join `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name + and cfp.column_name = c.column_name +ORDER BY + table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC""" + + optimized_columns_for_dataset: str = """ +select * from +(select + c.table_catalog as table_catalog, + c.table_schema as table_schema, + c.table_name as table_name, + c.column_name as column_name, + c.ordinal_position as ordinal_position, + cfp.field_path as field_path, + c.is_nullable as is_nullable, + CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, + description as comment, + c.is_hidden as is_hidden, + c.is_partitioning_column as is_partitioning_column, + -- We count the columns to be able limit it later + row_number() over (partition by c.table_catalog, c.table_schema, c.table_name order by c.ordinal_position asc, c.data_type DESC) as column_num, + -- Getting the maximum shard for each table + row_number() over (partition by c.table_catalog, c.table_schema, ifnull(REGEXP_EXTRACT(c.table_name, r'(.*)_\\d{{8}}$'), c.table_name), cfp.field_path order by c.table_catalog, c.table_schema asc, c.table_name desc) as shard_num +from + `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMNS c + join `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name + and cfp.column_name = c.column_name + ) +-- We filter column limit + 1 to make sure we warn about the limit being reached 
but not reading too much data +where column_num <= {column_limit} and shard_num = 1 +ORDER BY + table_catalog, table_schema, table_name, ordinal_position, column_num ASC, data_type DESC""" + + columns_for_table: str = """ +select + c.table_catalog as table_catalog, + c.table_schema as table_schema, + c.table_name as table_name, + c.column_name as column_name, + c.ordinal_position as ordinal_position, + cfp.field_path as field_path, + c.is_nullable as is_nullable, + CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, + c.is_hidden as is_hidden, + c.is_partitioning_column as is_partitioning_column, + description as comment +from + `{table_identifier.project_id}`.`{table_identifier.dataset}`.INFORMATION_SCHEMA.COLUMNS as c + join `{table_identifier.project_id}`.`{table_identifier.dataset}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name + and cfp.column_name = c.column_name +where + c.table_name = '{table_identifier.table}' +ORDER BY + table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC""" diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py index 84f218074d99b..4c4996ea59ed2 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_source.py +++ b/metadata-ingestion/tests/unit/test_bigquery_source.py @@ -93,8 +93,12 @@ def test_bigquery_uri_with_credential(): raise e -@patch("google.cloud.bigquery.client.Client") -def test_get_projects_with_project_ids(client_mock): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_get_projects_with_project_ids(get_bigquery_client): + client_mock = MagicMock() + get_bigquery_client.return_value = client_mock config = BigQueryV2Config.parse_obj( { "project_ids": ["test-1", "test-2"], @@ -118,7 +122,12 @@ def test_get_projects_with_project_ids(client_mock): assert client_mock.list_projects.call_count == 0 -def test_get_projects_with_project_ids_overrides_project_id_pattern(): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_get_projects_with_project_ids_overrides_project_id_pattern( + get_bigquery_client, +): config = BigQueryV2Config.parse_obj( { "project_ids": ["test-project", "test-project-2"], @@ -133,7 +142,10 @@ def test_get_projects_with_project_ids_overrides_project_id_pattern(): ] -def test_get_dataplatform_instance_aspect_returns_project_id(): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_get_dataplatform_instance_aspect_returns_project_id(get_bigquery_client): project_id = "project_id" expected_instance = ( f"urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,{project_id})" @@ -153,8 +165,12 @@ def test_get_dataplatform_instance_aspect_returns_project_id(): assert metadata.aspect.instance == expected_instance -@patch("google.cloud.bigquery.client.Client") -def test_get_projects_with_single_project_id(client_mock): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_get_projects_with_single_project_id(get_bigquery_client): + client_mock = MagicMock() + get_bigquery_client.return_value = client_mock config = BigQueryV2Config.parse_obj({"project_id": "test-3"}) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test1")) assert source._get_projects() == [ @@ -163,8 +179,12 @@ def 
test_get_projects_with_single_project_id(client_mock): assert client_mock.list_projects.call_count == 0 -@patch("google.cloud.bigquery.client.Client") -def test_get_projects_by_list(client_mock): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_get_projects_by_list(get_bigquery_client): + client_mock = MagicMock() + get_bigquery_client.return_value = client_mock client_mock.list_projects.return_value = [ SimpleNamespace( project_id="test-1", @@ -178,7 +198,6 @@ def test_get_projects_by_list(client_mock): config = BigQueryV2Config.parse_obj({}) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test1")) - source.bigquery_data_dictionary.set_client(client_mock) assert source._get_projects() == [ BigqueryProject("test-1", "one"), BigqueryProject("test-2", "two"), @@ -187,7 +206,10 @@ def test_get_projects_by_list(client_mock): @patch.object(BigQueryTechnicalSchemaApi, "get_projects") -def test_get_projects_filter_by_pattern(get_projects_mock): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_get_projects_filter_by_pattern(get_bigquery_client, get_projects_mock): get_projects_mock.return_value = [ BigqueryProject("test-project", "Test Project"), BigqueryProject("test-project-2", "Test Project 2"), @@ -204,7 +226,10 @@ def test_get_projects_filter_by_pattern(get_projects_mock): @patch.object(BigQueryTechnicalSchemaApi, "get_projects") -def test_get_projects_list_empty(get_projects_mock): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_get_projects_list_empty(get_bigquery_client, get_projects_mock): get_projects_mock.return_value = [] config = BigQueryV2Config.parse_obj( @@ -217,8 +242,13 @@ def test_get_projects_list_empty(get_projects_mock): @patch.object(BigQueryTechnicalSchemaApi, "get_projects") +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) def test_get_projects_list_failure( - get_projects_mock: MagicMock, caplog: pytest.LogCaptureFixture + get_bigquery_client: MagicMock, + get_projects_mock: MagicMock, + caplog: pytest.LogCaptureFixture, ) -> None: error_str = "my error" get_projects_mock.side_effect = GoogleAPICallError(error_str) @@ -237,7 +267,10 @@ def test_get_projects_list_failure( @patch.object(BigQueryTechnicalSchemaApi, "get_projects") -def test_get_projects_list_fully_filtered(get_projects_mock): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_get_projects_list_fully_filtered(get_projects_mock, get_bigquery_client): get_projects_mock.return_value = [BigqueryProject("test-project", "Test Project")] config = BigQueryV2Config.parse_obj( @@ -249,7 +282,10 @@ def test_get_projects_list_fully_filtered(get_projects_mock): assert projects == [] -def test_simple_upstream_table_generation(): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_simple_upstream_table_generation(get_bigquery_client): a: BigQueryTableRef = BigQueryTableRef( BigqueryTableIdentifier( project_id="test-project", dataset="test-dataset", table="a" @@ -280,7 +316,12 @@ def test_simple_upstream_table_generation(): assert list(upstreams)[0].table == str(b) -def test_upstream_table_generation_with_temporary_table_without_temp_upstream(): +@patch( + 
"datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_upstream_table_generation_with_temporary_table_without_temp_upstream( + get_bigquery_client, +): a: BigQueryTableRef = BigQueryTableRef( BigqueryTableIdentifier( project_id="test-project", dataset="test-dataset", table="a" @@ -310,7 +351,10 @@ def test_upstream_table_generation_with_temporary_table_without_temp_upstream(): assert list(upstreams) == [] -def test_upstream_table_column_lineage_with_temp_table(): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_upstream_table_column_lineage_with_temp_table(get_bigquery_client): from datahub.ingestion.api.common import PipelineContext a: BigQueryTableRef = BigQueryTableRef( @@ -384,7 +428,12 @@ def test_upstream_table_column_lineage_with_temp_table(): assert upstream.column_confidence == 0.7 -def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstream(): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstream( + get_bigquery_client, +): a: BigQueryTableRef = BigQueryTableRef( BigqueryTableIdentifier( project_id="test-project", dataset="test-dataset", table="a" @@ -447,8 +496,12 @@ def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstr @patch( "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_tables_for_dataset" ) -@patch("google.cloud.bigquery.client.Client") -def test_table_processing_logic(client_mock, data_dictionary_mock): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_table_processing_logic(get_bigquery_client, data_dictionary_mock): + client_mock = MagicMock() + get_bigquery_client.return_value = client_mock config = BigQueryV2Config.parse_obj( { "project_id": "test-project", @@ -498,7 +551,6 @@ def test_table_processing_logic(client_mock, data_dictionary_mock): data_dictionary_mock.get_tables_for_dataset.return_value = None source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) - source.bigquery_data_dictionary.set_client(client_mock) _ = list( source.get_tables_for_dataset( @@ -519,8 +571,14 @@ def test_table_processing_logic(client_mock, data_dictionary_mock): @patch( "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_tables_for_dataset" ) -@patch("google.cloud.bigquery.client.Client") -def test_table_processing_logic_date_named_tables(client_mock, data_dictionary_mock): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_table_processing_logic_date_named_tables( + get_bigquery_client, data_dictionary_mock +): + client_mock = MagicMock() + get_bigquery_client.return_value = client_mock # test that tables with date names are processed correctly config = BigQueryV2Config.parse_obj( { @@ -571,7 +629,6 @@ def test_table_processing_logic_date_named_tables(client_mock, data_dictionary_m data_dictionary_mock.get_tables_for_dataset.return_value = None source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) - source.bigquery_data_dictionary.set_client(client_mock) _ = list( source.get_tables_for_dataset( @@ -627,13 +684,17 @@ def bigquery_view_2() -> BigqueryView: @patch( 
"datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_query_result" ) -@patch("google.cloud.bigquery.client.Client") +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) def test_get_views_for_dataset( - client_mock: Mock, + get_bigquery_client: Mock, query_mock: Mock, bigquery_view_1: BigqueryView, bigquery_view_2: BigqueryView, ) -> None: + client_mock = MagicMock() + get_bigquery_client.return_value = client_mock assert bigquery_view_1.last_altered row1 = create_row( dict( @@ -655,8 +716,9 @@ def test_get_views_for_dataset( ) ) query_mock.return_value = [row1, row2] - bigquery_data_dictionary = BigQueryTechnicalSchemaApi(BigQueryV2Report()) - bigquery_data_dictionary.set_client(client_mock) + bigquery_data_dictionary = BigQueryTechnicalSchemaApi( + BigQueryV2Report(), client_mock + ) views = bigquery_data_dictionary.get_views_for_dataset( project_id="test-project", @@ -667,7 +729,12 @@ def test_get_views_for_dataset( @patch.object(BigqueryV2Source, "gen_dataset_workunits", lambda *args, **kwargs: []) -def test_gen_view_dataset_workunits(bigquery_view_1, bigquery_view_2): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_gen_view_dataset_workunits( + get_bigquery_client, bigquery_view_1, bigquery_view_2 +): project_id = "test-project" dataset_name = "test-dataset" config = BigQueryV2Config.parse_obj( From e57f134e0a7ba880e103928ad8af825befa2c1d8 Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Mon, 14 Aug 2023 16:54:41 +0530 Subject: [PATCH 04/11] report composition vs inheritance --- .../ingestion/source/bigquery_v2/bigquery.py | 10 +- .../source/bigquery_v2/bigquery_report.py | 23 ++- .../source/bigquery_v2/bigquery_schema_api.py | 12 +- .../ingestion/source/bigquery_v2/lineage.py | 12 +- .../ingestion/source/bigquery_v2/usage.py | 4 +- .../tests/unit/test_bigquery_source.py | 140 +++++++----------- 6 files changed, 83 insertions(+), 118 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 4ff4648657959..542e153303257 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -47,8 +47,8 @@ BigqueryColumn, BigqueryDataset, BigqueryProject, + BigQuerySchemaApi, BigqueryTable, - BigQueryTechnicalSchemaApi, BigqueryView, ) from datahub.ingestion.source.bigquery_v2.common import ( @@ -220,8 +220,8 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): set_dataset_urn_to_lower(self.config.convert_urns_to_lowercase) - self.bigquery_data_dictionary = BigQueryTechnicalSchemaApi( - self.report, self.config.get_bigquery_client() + self.bigquery_data_dictionary = BigQuerySchemaApi( + self.report.schema_api_perf, self.config.get_bigquery_client() ) # For database, schema, tables, views, etc @@ -300,7 +300,9 @@ def metadata_read_capability_test( client: bigquery.Client = config.get_bigquery_client() assert client report = BigQueryV2Report() - bigquery_data_dictionary = BigQueryTechnicalSchemaApi(report, client) + bigquery_data_dictionary = BigQuerySchemaApi( + report.schema_api_perf, client + ) result = bigquery_data_dictionary.get_datasets_for_project_id( project_id, 10 ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py 
b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py index 6d5822723ec64..bf11045f24c24 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py @@ -1,5 +1,4 @@ import collections -import dataclasses import logging from dataclasses import dataclass, field from datetime import datetime, timezone @@ -15,7 +14,7 @@ logger: logging.Logger = logging.getLogger(__name__) -class BigQueryApiPerfReport: +class BigQuerySchemaApiPerfReport: list_projects = PerfTimer() list_datasets = PerfTimer() get_columns_for_dataset = PerfTimer() @@ -30,9 +29,7 @@ class BigQueryAuditLogApiPerfReport: @dataclass -class BigQueryV2Report( - ProfilingSqlReport, BigQueryApiPerfReport, BigQueryAuditLogApiPerfReport -): +class BigQueryV2Report(ProfilingSqlReport): num_total_lineage_entries: TopKDict[str, int] = field(default_factory=TopKDict) num_skipped_lineage_entries_missing_data: TopKDict[str, int] = field( default_factory=int_top_k_dict @@ -106,16 +103,18 @@ class BigQueryV2Report( num_view_definitions_failed_column_parsing: int = 0 view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList) - read_reasons_stat: Counter[str] = dataclasses.field( - default_factory=collections.Counter - ) - operation_types_stat: Counter[str] = dataclasses.field( - default_factory=collections.Counter - ) + read_reasons_stat: Counter[str] = field(default_factory=collections.Counter) + operation_types_stat: Counter[str] = field(default_factory=collections.Counter) + usage_state_size: Optional[str] = None ingestion_stage: Optional[str] = None ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict) - + schema_api_perf: BigQuerySchemaApiPerfReport = field( + default_factory=BigQuerySchemaApiPerfReport + ) + audit_log_api_perf: BigQueryAuditLogApiPerfReport = field( + default_factory=BigQueryAuditLogApiPerfReport + ) _timer: Optional[PerfTimer] = field( default=None, init=False, repr=False, compare=False ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py index b627af15ca213..9db6b27aa24e3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py @@ -14,7 +14,7 @@ from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier from datahub.ingestion.source.bigquery_v2.bigquery_report import ( - BigQueryApiPerfReport, + BigQuerySchemaApiPerfReport, BigQueryV2Report, ) from datahub.ingestion.source.bigquery_v2.queries import ( @@ -125,8 +125,10 @@ class BigqueryProject: datasets: List[BigqueryDataset] = field(default_factory=list) -class BigQueryTechnicalSchemaApi: - def __init__(self, report: BigQueryApiPerfReport, client: bigquery.Client) -> None: +class BigQuerySchemaApi: + def __init__( + self, report: BigQuerySchemaApiPerfReport, client: bigquery.Client + ) -> None: self.bq_client = client self.api_perf_report = report @@ -229,7 +231,7 @@ def get_tables_for_dataset( for table in cur: try: with current_timer.pause_timer(): - yield BigQueryTechnicalSchemaApi._make_bigquery_table( + yield BigQuerySchemaApi._make_bigquery_table( table, tables.get(table.table_name) ) except Exception as e: @@ -306,7 +308,7 @@ def get_views_for_dataset( for table in cur: try: with 
current_timer.pause_timer(): - yield BigQueryTechnicalSchemaApi._make_bigquery_view(table) + yield BigQuerySchemaApi._make_bigquery_view(table) except Exception as e: view_name = f"{project_id}.{dataset_name}.{table.table_name}" logger.warning( diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index 4dc01bb1c7232..bb2f0360c13a6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -27,9 +27,7 @@ ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( - BigQueryTechnicalSchemaApi, -) +from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import BigQuerySchemaApi from datahub.ingestion.source.bigquery_v2.common import BQ_DATETIME_FORMAT from datahub.metadata.schema_classes import ( AuditStampClass, @@ -192,7 +190,9 @@ def __init__( self.report = report self.dataset_urn_builder = dataset_urn_builder self.audit_log_api = BigQueryAuditLogApi( - report, self.config.rate_limit, self.config.requests_per_min + report.audit_log_api_perf, + self.config.rate_limit, + self.config.requests_per_min, ) def error(self, log: logging.Logger, key: str, reason: str) -> None: @@ -394,8 +394,8 @@ def lineage_via_catalog_lineage_api( try: lineage_client: lineage_v1.LineageClient = lineage_v1.LineageClient() - data_dictionary = BigQueryTechnicalSchemaApi( - self.report, self.config.get_bigquery_client() + data_dictionary = BigQuerySchemaApi( + self.report.schema_api_perf, self.config.get_bigquery_client() ) # Filtering datasets diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py index 51d74168c4970..f0bfc2e477371 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py @@ -776,7 +776,9 @@ def _get_parsed_bigquery_log_events( self, project_id: str, limit: Optional[int] = None ) -> Iterable[AuditEvent]: audit_log_api = BigQueryAuditLogApi( - self.report, self.config.rate_limit, self.config.requests_per_min + self.report.audit_log_api_perf, + self.config.rate_limit, + self.config.requests_per_min, ) # We adjust the filter values a bit, since we need to make sure that the join # between query events and read events is complete. 
For example, this helps us diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py index 4c4996ea59ed2..4a7d1eef399d6 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_source.py +++ b/metadata-ingestion/tests/unit/test_bigquery_source.py @@ -21,7 +21,7 @@ from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( BigqueryProject, - BigQueryTechnicalSchemaApi, + BigQuerySchemaApi, BigqueryView, ) from datahub.ingestion.source.bigquery_v2.lineage import ( @@ -93,12 +93,10 @@ def test_bigquery_uri_with_credential(): raise e -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) -def test_get_projects_with_project_ids(get_bigquery_client): +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_get_projects_with_project_ids(get_bq_client_mock): client_mock = MagicMock() - get_bigquery_client.return_value = client_mock + get_bq_client_mock.return_value = client_mock config = BigQueryV2Config.parse_obj( { "project_ids": ["test-1", "test-2"], @@ -122,11 +120,9 @@ def test_get_projects_with_project_ids(get_bigquery_client): assert client_mock.list_projects.call_count == 0 -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) +@patch.object(BigQueryV2Config, "get_bigquery_client") def test_get_projects_with_project_ids_overrides_project_id_pattern( - get_bigquery_client, + get_bq_client_mock, ): config = BigQueryV2Config.parse_obj( { @@ -142,10 +138,8 @@ def test_get_projects_with_project_ids_overrides_project_id_pattern( ] -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) -def test_get_dataplatform_instance_aspect_returns_project_id(get_bigquery_client): +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_get_dataplatform_instance_aspect_returns_project_id(get_bq_client_mock): project_id = "project_id" expected_instance = ( f"urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,{project_id})" @@ -165,12 +159,10 @@ def test_get_dataplatform_instance_aspect_returns_project_id(get_bigquery_client assert metadata.aspect.instance == expected_instance -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) -def test_get_projects_with_single_project_id(get_bigquery_client): +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_get_projects_with_single_project_id(get_bq_client_mock): client_mock = MagicMock() - get_bigquery_client.return_value = client_mock + get_bq_client_mock.return_value = client_mock config = BigQueryV2Config.parse_obj({"project_id": "test-3"}) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test1")) assert source._get_projects() == [ @@ -179,12 +171,10 @@ def test_get_projects_with_single_project_id(get_bigquery_client): assert client_mock.list_projects.call_count == 0 -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) -def test_get_projects_by_list(get_bigquery_client): +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_get_projects_by_list(get_bq_client_mock): client_mock = MagicMock() - get_bigquery_client.return_value = client_mock + get_bq_client_mock.return_value = client_mock client_mock.list_projects.return_value = [ SimpleNamespace( project_id="test-1", @@ -205,11 +195,9 @@ 
def test_get_projects_by_list(get_bigquery_client): assert client_mock.list_projects.call_count == 1 -@patch.object(BigQueryTechnicalSchemaApi, "get_projects") -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) -def test_get_projects_filter_by_pattern(get_bigquery_client, get_projects_mock): +@patch.object(BigQuerySchemaApi, "get_projects") +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_get_projects_filter_by_pattern(get_bq_client_mock, get_projects_mock): get_projects_mock.return_value = [ BigqueryProject("test-project", "Test Project"), BigqueryProject("test-project-2", "Test Project 2"), @@ -225,11 +213,9 @@ def test_get_projects_filter_by_pattern(get_bigquery_client, get_projects_mock): ] -@patch.object(BigQueryTechnicalSchemaApi, "get_projects") -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) -def test_get_projects_list_empty(get_bigquery_client, get_projects_mock): +@patch.object(BigQuerySchemaApi, "get_projects") +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_get_projects_list_empty(get_bq_client_mock, get_projects_mock): get_projects_mock.return_value = [] config = BigQueryV2Config.parse_obj( @@ -241,12 +227,10 @@ def test_get_projects_list_empty(get_bigquery_client, get_projects_mock): assert projects == [] -@patch.object(BigQueryTechnicalSchemaApi, "get_projects") -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) +@patch.object(BigQuerySchemaApi, "get_projects") +@patch.object(BigQueryV2Config, "get_bigquery_client") def test_get_projects_list_failure( - get_bigquery_client: MagicMock, + get_bq_client_mock: MagicMock, get_projects_mock: MagicMock, caplog: pytest.LogCaptureFixture, ) -> None: @@ -266,11 +250,9 @@ def test_get_projects_list_failure( assert projects == [] -@patch.object(BigQueryTechnicalSchemaApi, "get_projects") -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) -def test_get_projects_list_fully_filtered(get_projects_mock, get_bigquery_client): +@patch.object(BigQuerySchemaApi, "get_projects") +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_get_projects_list_fully_filtered(get_projects_mock, get_bq_client_mock): get_projects_mock.return_value = [BigqueryProject("test-project", "Test Project")] config = BigQueryV2Config.parse_obj( @@ -282,10 +264,8 @@ def test_get_projects_list_fully_filtered(get_projects_mock, get_bigquery_client assert projects == [] -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) -def test_simple_upstream_table_generation(get_bigquery_client): +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_simple_upstream_table_generation(get_bq_client_mock): a: BigQueryTableRef = BigQueryTableRef( BigqueryTableIdentifier( project_id="test-project", dataset="test-dataset", table="a" @@ -316,11 +296,9 @@ def test_simple_upstream_table_generation(get_bigquery_client): assert list(upstreams)[0].table == str(b) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) +@patch.object(BigQueryV2Config, "get_bigquery_client") def test_upstream_table_generation_with_temporary_table_without_temp_upstream( - get_bigquery_client, + get_bq_client_mock, ): a: BigQueryTableRef = BigQueryTableRef( BigqueryTableIdentifier( @@ -351,10 +329,8 @@ def 
test_upstream_table_generation_with_temporary_table_without_temp_upstream( assert list(upstreams) == [] -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) -def test_upstream_table_column_lineage_with_temp_table(get_bigquery_client): +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_upstream_table_column_lineage_with_temp_table(get_bq_client_mock): from datahub.ingestion.api.common import PipelineContext a: BigQueryTableRef = BigQueryTableRef( @@ -428,11 +404,9 @@ def test_upstream_table_column_lineage_with_temp_table(get_bigquery_client): assert upstream.column_confidence == 0.7 -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) +@patch.object(BigQueryV2Config, "get_bigquery_client") def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstream( - get_bigquery_client, + get_bq_client_mock, ): a: BigQueryTableRef = BigQueryTableRef( BigqueryTableIdentifier( @@ -493,15 +467,11 @@ def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstr assert sorted_list[1].table == str(e) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_tables_for_dataset" -) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) -def test_table_processing_logic(get_bigquery_client, data_dictionary_mock): +@patch.object(BigQuerySchemaApi, "get_tables_for_dataset") +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_table_processing_logic(get_bq_client_mock, data_dictionary_mock): client_mock = MagicMock() - get_bigquery_client.return_value = client_mock + get_bq_client_mock.return_value = client_mock config = BigQueryV2Config.parse_obj( { "project_id": "test-project", @@ -568,17 +538,13 @@ def test_table_processing_logic(get_bigquery_client, data_dictionary_mock): assert table in ["test-table", "test-sharded-table_20220102"] -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_tables_for_dataset" -) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) +@patch.object(BigQuerySchemaApi, "get_tables_for_dataset") +@patch.object(BigQueryV2Config, "get_bigquery_client") def test_table_processing_logic_date_named_tables( - get_bigquery_client, data_dictionary_mock + get_bq_client_mock, data_dictionary_mock ): client_mock = MagicMock() - get_bigquery_client.return_value = client_mock + get_bq_client_mock.return_value = client_mock # test that tables with date names are processed correctly config = BigQueryV2Config.parse_obj( { @@ -681,20 +647,16 @@ def bigquery_view_2() -> BigqueryView: ) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_query_result" -) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) +@patch.object(BigQuerySchemaApi, "get_query_result") +@patch.object(BigQueryV2Config, "get_bigquery_client") def test_get_views_for_dataset( - get_bigquery_client: Mock, + get_bq_client_mock: Mock, query_mock: Mock, bigquery_view_1: BigqueryView, bigquery_view_2: BigqueryView, ) -> None: client_mock = MagicMock() - get_bigquery_client.return_value = client_mock + get_bq_client_mock.return_value = client_mock assert bigquery_view_1.last_altered row1 = create_row( dict( @@ -716,8 +678,8 @@ def test_get_views_for_dataset( ) ) query_mock.return_value 
= [row1, row2] - bigquery_data_dictionary = BigQueryTechnicalSchemaApi( - BigQueryV2Report(), client_mock + bigquery_data_dictionary = BigQuerySchemaApi( + BigQueryV2Report().schema_api_perf, client_mock ) views = bigquery_data_dictionary.get_views_for_dataset( @@ -729,11 +691,9 @@ def test_get_views_for_dataset( @patch.object(BigqueryV2Source, "gen_dataset_workunits", lambda *args, **kwargs: []) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) +@patch.object(BigQueryV2Config, "get_bigquery_client") def test_gen_view_dataset_workunits( - get_bigquery_client, bigquery_view_1, bigquery_view_2 + get_bq_client_mock, bigquery_view_1, bigquery_view_2 ): project_id = "test-project" dataset_name = "test-dataset" From 38b18bb2c940dc53cd9585b6945e28148145b1ca Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Mon, 14 Aug 2023 20:23:59 +0530 Subject: [PATCH 05/11] more refractor and fixes --- .../ingestion/source/bigquery_v2/bigquery.py | 4 +- .../bigquery_v2/bigquery_audit_log_api.py | 211 +----------------- .../source/bigquery_v2/bigquery_config.py | 6 +- .../source/bigquery_v2/bigquery_schema_api.py | 24 +- .../ingestion/source/bigquery_v2/lineage.py | 20 +- .../ingestion/source/bigquery_v2/queries.py | 199 +++++++++++++++++ .../ingestion/source/bigquery_v2/usage.py | 6 +- 7 files changed, 246 insertions(+), 224 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 542e153303257..67b9d6556b3e8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -299,9 +299,8 @@ def metadata_read_capability_test( logger.info((f"Metadata read capability test for project {project_id}")) client: bigquery.Client = config.get_bigquery_client() assert client - report = BigQueryV2Report() bigquery_data_dictionary = BigQuerySchemaApi( - report.schema_api_perf, client + BigQueryV2Report().schema_api_perf, client ) result = bigquery_data_dictionary.get_datasets_for_project_id( project_id, 10 @@ -524,6 +523,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: [p.id for p in projects], self.sql_parser_schema_resolver, self.view_definition_ids, + self.view_definitions, self.table_refs, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py index b017b1d08a1ee..fcb6200241cd7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py @@ -1,5 +1,4 @@ import logging -import textwrap from datetime import datetime from typing import Callable, Iterable, List, Optional @@ -40,7 +39,12 @@ def get_exported_bigquery_audit_metadata( self, bigquery_client: bigquery.Client, bigquery_audit_metadata_query_template: Callable[ - [str, bool, Optional[int]], str + [ + str, # dataset: str + bool, # use_date_sharded_tables: bool + Optional[int], # limit: Optional[int] = None + ], + str, ], bigquery_audit_metadata_datasets: Optional[List[str]], use_date_sharded_audit_log_tables: bool, @@ -57,6 +61,10 @@ def get_exported_bigquery_audit_metadata( audit_end_time = end_time.strftime(BQ_DATETIME_FORMAT) audit_end_date = end_time.strftime(BQ_DATE_SHARD_FORMAT) + rate_limiter: 
Optional[RateLimiter] = None + if self.rate_limit: + rate_limiter = RateLimiter(max_calls=self.requests_per_min, period=60) + with self.report.get_exported_log_entries as current_timer: for dataset in bigquery_audit_metadata_datasets: logger.info( @@ -79,8 +87,8 @@ def get_exported_bigquery_audit_metadata( f"Finished loading log entries from BigQueryAuditMetadata in {dataset}" ) - if self.rate_limit: - with RateLimiter(max_calls=self.requests_per_min, period=60): + if rate_limiter: + with rate_limiter: for entry in query_job: with current_timer.pause_timer(): yield entry @@ -136,198 +144,3 @@ def get_bigquery_log_entries_via_gcp_logging( logger.info( f"Finished loading log entries from GCP Log for {client.project}" ) - - -def bigquery_audit_metadata_query_template_usage( - dataset: str, - use_date_sharded_tables: bool, - limit: Optional[int] = None, -) -> str: - """ - Receives a dataset (with project specified) and returns a query template that is used to query exported - v2 AuditLogs containing protoPayloads of type BigQueryAuditMetadata. - :param dataset: the dataset to query against in the form of $PROJECT.$DATASET - :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log - tables - :param limit: maximum number of events to query for - :return: a query template, when supplied start_time and end_time, can be used to query audit logs from BigQuery - """ - - limit_text = f"limit {limit}" if limit else "" - - shard_condition = "" - if use_date_sharded_tables: - from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`" - shard_condition = ( - """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """ - ) - else: - from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`" - - # Deduplicates insertId via QUALIFY, see: - # https://cloud.google.com/logging/docs/reference/v2/rest/v2/LogEntry, insertId field - query = f""" - SELECT - timestamp, - logName, - insertId, - protopayload_auditlog AS protoPayload, - protopayload_auditlog.metadataJson AS metadata - FROM - {from_table} - WHERE ( - timestamp >= "{{start_time}}" - AND timestamp < "{{end_time}}" - ) - {shard_condition} - AND protopayload_auditlog.serviceName="bigquery.googleapis.com" - AND - ( - ( - protopayload_auditlog.methodName IN - ( - "google.cloud.bigquery.v2.JobService.Query", - "google.cloud.bigquery.v2.JobService.InsertJob" - ) - AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE" - AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL - AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL - AND ( - JSON_EXTRACT_ARRAY(protopayload_auditlog.metadataJson, - "$.jobChange.job.jobStats.queryStats.referencedTables") IS NOT NULL - OR - JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig.destinationTable") IS NOT NULL - ) - ) - OR - JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.tableDataRead.reason") = "JOB" - ) - QUALIFY ROW_NUMBER() OVER (PARTITION BY insertId, timestamp, logName) = 1 - {limit_text}; - """ - - return textwrap.dedent(query) - - -def bigquery_audit_metadata_query_template_lineage( - dataset: str, use_date_sharded_tables: bool, limit: Optional[int] = None -) -> str: - """ - Receives a dataset (with project specified) and returns a query template that is used to query exported - AuditLogs containing protoPayloads of type 
BigQueryAuditMetadata. - Include only those that: - - have been completed (jobStatus.jobState = "DONE") - - do not contain errors (jobStatus.errorResults is none) - :param dataset: the dataset to query against in the form of $PROJECT.$DATASET - :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log - tables - :param limit: set a limit for the maximum event to return. It is used for connection testing currently - :return: a query template, when supplied start_time and end_time, can be used to query audit logs from BigQuery - """ - limit_text = f"limit {limit}" if limit else "" - - shard_condition = "" - if use_date_sharded_tables: - from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`" - shard_condition = ( - """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """ - ) - else: - from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`" - - query = f""" - SELECT - timestamp, - logName, - insertId, - protopayload_auditlog AS protoPayload, - protopayload_auditlog.metadataJson AS metadata - FROM - {from_table} - WHERE ( - timestamp >= "{{start_time}}" - AND timestamp < "{{end_time}}" - ) - {shard_condition} - AND protopayload_auditlog.serviceName="bigquery.googleapis.com" - AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE" - AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL - AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL - {limit_text}; - """ - - return textwrap.dedent(query) - - -BQ_FILTER_RULE_TEMPLATE_V2_USAGE = """ -resource.type=("bigquery_project" OR "bigquery_dataset") -AND -timestamp >= "{start_time}" -AND -timestamp < "{end_time}" -AND protoPayload.serviceName="bigquery.googleapis.com" -AND -( - ( - protoPayload.methodName= - ( - "google.cloud.bigquery.v2.JobService.Query" - OR - "google.cloud.bigquery.v2.JobService.InsertJob" - ) - AND protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE" - AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:* - AND protoPayload.metadata.jobChange.job.jobConfig.queryConfig:* - AND - ( - ( - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:* - AND NOT protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables =~ "projects/.*/datasets/.*/tables/__TABLES__|__TABLES_SUMMARY__|INFORMATION_SCHEMA.*" - ) - OR - ( - protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable:* - ) - ) - ) - OR - protoPayload.metadata.tableDataRead.reason = "JOB" -) -""".strip( - "\t \n" -) - -BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE = """ -resource.type=("bigquery_project") -AND -( - protoPayload.methodName= - ( - "google.cloud.bigquery.v2.JobService.Query" - OR - "google.cloud.bigquery.v2.JobService.InsertJob" - ) - AND - protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE" - AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:* - AND ( - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:* - OR - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedViews:* - ) - AND ( - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/_.*/tables/anon.*" - AND - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/INFORMATION_SCHEMA.*" - AND - 
protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/__TABLES__" - AND - protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable !~ "projects/.*/datasets/_.*/tables/anon.*" - ) - -) -AND -timestamp >= "{start_time}" -AND -timestamp < "{end_time}" -""".strip() diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index 84fdead338ee6..6449c6ead1e58 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -269,13 +269,13 @@ def profile_default_settings(cls, values: Dict) -> Dict: @validator("bigquery_audit_metadata_datasets") def validate_bigquery_audit_metadata_datasets( cls, v: Optional[List[str]], values: Dict - ) -> Dict: + ) -> Optional[List[str]]: if values.get("use_exported_bigquery_audit_metadata"): assert ( v and len(v) > 0 - ), "`bigquery_audit_metadata_datasets` should be set if using `use_exported_bigquery_audit_metadata` for usage/lineage." + ), "`bigquery_audit_metadata_datasets` should be set if using `use_exported_bigquery_audit_metadata: True`." - return values + return v @root_validator(pre=False) def backward_compatibility_configs_set(cls, values: Dict) -> Dict: diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py index 9db6b27aa24e3..ca3aae7394469 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py @@ -130,7 +130,7 @@ def __init__( self, report: BigQuerySchemaApiPerfReport, client: bigquery.Client ) -> None: self.bq_client = client - self.api_perf_report = report + self.report = report def get_client(self) -> bigquery.Client: assert self.bq_client is not None @@ -142,7 +142,7 @@ def get_query_result(self, query: str) -> RowIterator: return resp.result() def get_projects(self) -> List[BigqueryProject]: - with self.api_perf_report.list_projects: + with self.report.list_projects: projects = self.get_client().list_projects() return [ @@ -152,7 +152,7 @@ def get_projects(self) -> List[BigqueryProject]: def get_datasets_for_project_id( self, project_id: str, maxResults: Optional[int] = None ) -> List[BigqueryDataset]: - with self.api_perf_report.list_datasets: + with self.report.list_datasets: datasets = self.get_client().list_datasets( project_id, max_results=maxResults ) @@ -187,7 +187,7 @@ def get_datasets_for_project_id_with_information_schema( def list_tables( self, dataset_name: str, project_id: str ) -> Iterator[TableListItem]: - with self.api_perf_report.list_tables as current_timer: + with self.report.list_tables as current_timer: for table in self.get_client().list_tables(f"{project_id}.{dataset_name}"): with current_timer.pause_timer(): yield table @@ -200,8 +200,8 @@ def get_tables_for_dataset( with_data_read_permission: bool = False, report: Optional[BigQueryV2Report] = None, ) -> Iterator[BigqueryTable]: - with self.api_perf_report.get_tables_for_dataset as current_timer: - filter: str = ", ".join(f"'{table}'" for table in tables.keys()) + with self.report.get_tables_for_dataset as current_timer: + filter_clause: str = ", ".join(f"'{table}'" for table in tables.keys()) if with_data_read_permission: 
# Tables are ordered by name and table suffix to make sure we always process the latest sharded table @@ -210,8 +210,8 @@ def get_tables_for_dataset( BigqueryQuery.tables_for_dataset.format( project_id=project_id, dataset_name=dataset_name, - table_filter=f" and t.table_name in ({filter})" - if filter + table_filter=f" and t.table_name in ({filter_clause})" + if filter_clause else "", ), ) @@ -222,8 +222,8 @@ def get_tables_for_dataset( BigqueryQuery.tables_for_dataset_without_partition_data.format( project_id=project_id, dataset_name=dataset_name, - table_filter=f" and t.table_name in ({filter})" - if filter + table_filter=f" and t.table_name in ({filter_clause})" + if filter_clause else "", ), ) @@ -291,7 +291,7 @@ def get_views_for_dataset( has_data_read: bool, report: Optional[BigQueryV2Report] = None, ) -> Iterator[BigqueryView]: - with self.api_perf_report.get_views_for_dataset as current_timer: + with self.report.get_views_for_dataset as current_timer: if has_data_read: cur = self.get_query_result( BigqueryQuery.views_for_dataset.format( @@ -344,7 +344,7 @@ def get_columns_for_dataset( run_optimized_column_query: bool = False, ) -> Optional[Dict[str, List[BigqueryColumn]]]: columns: Dict[str, List[BigqueryColumn]] = defaultdict(list) - with self.api_perf_report.get_columns_for_dataset: + with self.report.get_columns_for_dataset: try: cur = self.get_query_result( BigqueryQuery.columns_for_dataset.format( diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index bb2f0360c13a6..c68fe5ef81745 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -21,14 +21,16 @@ ReadEvent, ) from datahub.ingestion.source.bigquery_v2.bigquery_audit_log_api import ( - BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE, BigQueryAuditLogApi, - bigquery_audit_metadata_query_template_lineage, ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import BigQuerySchemaApi from datahub.ingestion.source.bigquery_v2.common import BQ_DATETIME_FORMAT +from datahub.ingestion.source.bigquery_v2.queries import ( + BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE, + bigquery_audit_metadata_query_template_lineage, +) from datahub.metadata.schema_classes import ( AuditStampClass, DatasetLineageTypeClass, @@ -40,6 +42,7 @@ ) from datahub.specific.dataset import DatasetPatchBuilder from datahub.utilities import memory_footprint +from datahub.utilities.file_backed_collections import FileBackedDict from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.sqlglot_lineage import ( SchemaResolver, @@ -204,6 +207,7 @@ def get_lineage_workunits( projects: List[str], sql_parser_schema_resolver: SchemaResolver, view_definition_ids: Dict[str, Dict[str, str]], + view_definitions: FileBackedDict[str], table_refs: Set[str], ) -> Iterable[MetadataWorkUnit]: views_skip_audit_log_lineage: Set[str] = set() @@ -213,6 +217,7 @@ def get_lineage_workunits( self.populate_view_lineage_with_sql_parsing( view_lineage, view_definition_ids[project], + view_definitions, sql_parser_schema_resolver, project, ) @@ -289,11 +294,12 @@ def populate_view_lineage_with_sql_parsing( self, view_lineage: Dict[str, Set[LineageEdge]], view_definition_ids: Dict[str, str], + view_definitions: 
FileBackedDict[str], sql_parser_schema_resolver: SchemaResolver, default_project: str, ) -> None: for view, view_definition_id in view_definition_ids.items(): - view_definition = view_definition_ids[view_definition_id] + view_definition = view_definitions[view_definition_id] raw_view_lineage = sqlglot_lineage( view_definition, schema_resolver=sql_parser_schema_resolver, @@ -493,7 +499,9 @@ def _get_parsed_audit_log_events(self, project_id: str) -> Iterable[QueryEvent]: parse_fn: Callable[[Any], Optional[Union[ReadEvent, QueryEvent]]] if self.config.use_exported_bigquery_audit_metadata: - self.get_exported_log_entries(corrected_start_time, corrected_end_time) + entries = self.get_exported_log_entries( + corrected_start_time, corrected_end_time + ) parse_fn = self._parse_exported_bigquery_audit_metadata else: entries = self.get_log_entries_via_gcp_logging( @@ -541,8 +549,8 @@ def get_log_entries_via_gcp_logging( entries = self.audit_log_api.get_bigquery_log_entries_via_gcp_logging( logging_client, BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE.format( - corrected_start_time.strftime(BQ_DATETIME_FORMAT), - corrected_end_time.strftime(BQ_DATETIME_FORMAT), + start_time=corrected_start_time.strftime(BQ_DATETIME_FORMAT), + end_time=corrected_end_time.strftime(BQ_DATETIME_FORMAT), ), self.config.log_page_size, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py index e04ea679584dc..86b2b9bd4aab8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py @@ -1,3 +1,7 @@ +import textwrap +from typing import Optional + + class BigqueryTableType: # See https://cloud.google.com/bigquery/docs/information-schema-tables#schema BASE_TABLE = "BASE TABLE" @@ -222,3 +226,198 @@ class BigqueryQuery: c.table_name = '{table_identifier.table}' ORDER BY table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC""" + + +BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE = """ +resource.type=("bigquery_project") +AND +( + protoPayload.methodName= + ( + "google.cloud.bigquery.v2.JobService.Query" + OR + "google.cloud.bigquery.v2.JobService.InsertJob" + ) + AND + protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE" + AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:* + AND ( + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:* + OR + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedViews:* + ) + AND ( + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/_.*/tables/anon.*" + AND + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/INFORMATION_SCHEMA.*" + AND + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/__TABLES__" + AND + protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable !~ "projects/.*/datasets/_.*/tables/anon.*" + ) + +) +AND +timestamp >= "{start_time}" +AND +timestamp < "{end_time}" +""".strip() +BQ_FILTER_RULE_TEMPLATE_V2_USAGE = """ +resource.type=("bigquery_project" OR "bigquery_dataset") +AND +timestamp >= "{start_time}" +AND +timestamp < "{end_time}" +AND protoPayload.serviceName="bigquery.googleapis.com" +AND +( + ( + protoPayload.methodName= + ( + "google.cloud.bigquery.v2.JobService.Query" + OR + "google.cloud.bigquery.v2.JobService.InsertJob" + ) + AND 
protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE" + AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:* + AND protoPayload.metadata.jobChange.job.jobConfig.queryConfig:* + AND + ( + ( + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:* + AND NOT protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables =~ "projects/.*/datasets/.*/tables/__TABLES__|__TABLES_SUMMARY__|INFORMATION_SCHEMA.*" + ) + OR + ( + protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable:* + ) + ) + ) + OR + protoPayload.metadata.tableDataRead.reason = "JOB" +) +""".strip( + "\t \n" +) + + +def bigquery_audit_metadata_query_template_lineage( + dataset: str, use_date_sharded_tables: bool, limit: Optional[int] = None +) -> str: + """ + Receives a dataset (with project specified) and returns a query template that is used to query exported + AuditLogs containing protoPayloads of type BigQueryAuditMetadata. + Include only those that: + - have been completed (jobStatus.jobState = "DONE") + - do not contain errors (jobStatus.errorResults is none) + :param dataset: the dataset to query against in the form of $PROJECT.$DATASET + :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log + tables + :param limit: set a limit for the maximum event to return. It is used for connection testing currently + :return: a query template, when supplied start_time and end_time, can be used to query audit logs from BigQuery + """ + limit_text = f"limit {limit}" if limit else "" + + shard_condition = "" + if use_date_sharded_tables: + from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`" + shard_condition = ( + """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """ + ) + else: + from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`" + + query = f""" + SELECT + timestamp, + logName, + insertId, + protopayload_auditlog AS protoPayload, + protopayload_auditlog.metadataJson AS metadata + FROM + {from_table} + WHERE ( + timestamp >= "{{start_time}}" + AND timestamp < "{{end_time}}" + ) + {shard_condition} + AND protopayload_auditlog.serviceName="bigquery.googleapis.com" + AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE" + AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL + AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL + QUALIFY ROW_NUMBER() OVER (PARTITION BY insertId, timestamp, logName) = 1 + {limit_text}; + """ + + return textwrap.dedent(query) + + +def bigquery_audit_metadata_query_template_usage( + dataset: str, + use_date_sharded_tables: bool, + limit: Optional[int] = None, +) -> str: + """ + Receives a dataset (with project specified) and returns a query template that is used to query exported + v2 AuditLogs containing protoPayloads of type BigQueryAuditMetadata. 
+ :param dataset: the dataset to query against in the form of $PROJECT.$DATASET + :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log + tables + :param limit: maximum number of events to query for + :return: a query template, when supplied start_time and end_time, can be used to query audit logs from BigQuery + """ + + limit_text = f"limit {limit}" if limit else "" + + shard_condition = "" + if use_date_sharded_tables: + from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`" + shard_condition = ( + """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """ + ) + else: + from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`" + + # Deduplicates insertId via QUALIFY, see: + # https://cloud.google.com/logging/docs/reference/v2/rest/v2/LogEntry, insertId field + query = f""" + SELECT + timestamp, + logName, + insertId, + protopayload_auditlog AS protoPayload, + protopayload_auditlog.metadataJson AS metadata + FROM + {from_table} + WHERE ( + timestamp >= "{{start_time}}" + AND timestamp < "{{end_time}}" + ) + {shard_condition} + AND protopayload_auditlog.serviceName="bigquery.googleapis.com" + AND + ( + ( + protopayload_auditlog.methodName IN + ( + "google.cloud.bigquery.v2.JobService.Query", + "google.cloud.bigquery.v2.JobService.InsertJob" + ) + AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE" + AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL + AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL + AND ( + JSON_EXTRACT_ARRAY(protopayload_auditlog.metadataJson, + "$.jobChange.job.jobStats.queryStats.referencedTables") IS NOT NULL + OR + JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig.destinationTable") IS NOT NULL + ) + ) + OR + JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.tableDataRead.reason") = "JOB" + ) + QUALIFY ROW_NUMBER() OVER (PARTITION BY insertId, timestamp, logName) = 1 + {limit_text}; + """ + + return textwrap.dedent(query) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py index f0bfc2e477371..3c12fd7216963 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py @@ -37,13 +37,15 @@ ReadEvent, ) from datahub.ingestion.source.bigquery_v2.bigquery_audit_log_api import ( - BQ_FILTER_RULE_TEMPLATE_V2_USAGE, BigQueryAuditLogApi, - bigquery_audit_metadata_query_template_usage, ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.common import BQ_DATETIME_FORMAT +from datahub.ingestion.source.bigquery_v2.queries import ( + BQ_FILTER_RULE_TEMPLATE_V2_USAGE, + bigquery_audit_metadata_query_template_usage, +) from datahub.ingestion.source.usage.usage_common import ( TOTAL_BUDGET_FOR_QUERY_LIST, make_usage_workunit, From 31a3be8ab6be696f2e97826471c3c4d5685daa82 Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Mon, 14 Aug 2023 21:13:50 +0530 Subject: [PATCH 06/11] fix lint, tests --- .../source/bigquery_v2/bigquery_config.py | 2 +- .../ingestion/source/bigquery_v2/lineage.py | 2 +- .../tests/integration/bigquery_v2/test_bigquery.py 
| 14 +++++--------- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index 6449c6ead1e58..6634aacf0426f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -39,7 +39,7 @@ class BigQueryUsageConfig(BaseUsageConfig): class BigQueryConnectionConfig(ConfigModel): credential: Optional[BigQueryCredential] = Field( - description="BigQuery credential informations" + default=None, description="BigQuery credential informations" ) _credentials_path: Optional[str] = PrivateAttr(None) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index c68fe5ef81745..7293443ad811f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -332,7 +332,7 @@ def populate_view_lineage_with_sql_parsing( ) def gen_lineage_workunits_for_table( - self, lineage: dict[str, Set[LineageEdge]], table_ref: BigQueryTableRef + self, lineage: Dict[str, Set[LineageEdge]], table_ref: BigQueryTableRef ) -> Iterable[MetadataWorkUnit]: dataset_urn = self.dataset_urn_builder(table_ref) diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py index ba3ea06b07623..5d5b83f576d31 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py @@ -4,8 +4,10 @@ from freezegun import freeze_time from google.cloud.bigquery.table import TableListItem +from datahub.ingestion.source.bigquery_v2.bigquery import BigqueryV2Source from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( BigqueryDataset, + BigQuerySchemaApi, BigqueryTable, ) from tests.test_helpers import mce_helpers @@ -15,15 +17,9 @@ @freeze_time(FROZEN_TIME) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_tables_for_dataset" -) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery.BigqueryV2Source.get_core_table_details" -) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_datasets_for_project_id" -) +@patch.object(BigQuerySchemaApi, "get_tables_for_dataset") +@patch.object(BigqueryV2Source, "get_core_table_details") +@patch.object(BigQuerySchemaApi, "get_datasets_for_project_id") @patch("google.cloud.bigquery.Client") def test_bigquery_v2_ingest( client, From eaa72a3aa0b2bd9ff9b212529d9042ef65b1e39d Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Tue, 22 Aug 2023 12:16:58 +0530 Subject: [PATCH 07/11] revert rename of bigquery_schema.py to bigquery_schema_api.py --- .../src/datahub/ingestion/source/bigquery_v2/bigquery.py | 2 +- .../bigquery_v2/{bigquery_schema_api.py => bigquery_schema.py} | 0 .../src/datahub/ingestion/source/bigquery_v2/lineage.py | 2 +- .../src/datahub/ingestion/source/bigquery_v2/profiler.py | 2 +- .../tests/integration/bigquery_v2/test_bigquery.py | 2 +- metadata-ingestion/tests/unit/test_bigquery_profiler.py | 2 +- metadata-ingestion/tests/unit/test_bigquery_source.py | 2 +- 7 files changed, 6 insertions(+), 6 deletions(-) rename 
metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/{bigquery_schema_api.py => bigquery_schema.py} (100%) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 67b9d6556b3e8..86cca0c45da5a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -43,7 +43,7 @@ ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( +from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( BigqueryColumn, BigqueryDataset, BigqueryProject, diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py similarity index 100% rename from metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py rename to metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index 7293443ad811f..aaf0f0f39134b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -25,7 +25,7 @@ ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import BigQuerySchemaApi +from datahub.ingestion.source.bigquery_v2.bigquery_schema import BigQuerySchemaApi from datahub.ingestion.source.bigquery_v2.common import BQ_DATETIME_FORMAT from datahub.ingestion.source.bigquery_v2.queries import ( BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE, diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py index f825bbf666b64..c9dcb4fe35c3f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py @@ -11,7 +11,7 @@ from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( +from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( RANGE_PARTITION_NAME, BigqueryTable, ) diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py index 5d5b83f576d31..e5a25d32992b2 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py @@ -5,7 +5,7 @@ from google.cloud.bigquery.table import TableListItem from datahub.ingestion.source.bigquery_v2.bigquery import BigqueryV2Source -from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( +from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( BigqueryDataset, 
BigQuerySchemaApi, BigqueryTable, diff --git a/metadata-ingestion/tests/unit/test_bigquery_profiler.py b/metadata-ingestion/tests/unit/test_bigquery_profiler.py index a723b6d475ae3..a2aec8df93d09 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_profiler.py +++ b/metadata-ingestion/tests/unit/test_bigquery_profiler.py @@ -2,7 +2,7 @@ from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( +from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( BigqueryColumn, BigqueryTable, PartitionInfo, diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py index 4a7d1eef399d6..a954c1768d0e1 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_source.py +++ b/metadata-ingestion/tests/unit/test_bigquery_source.py @@ -19,7 +19,7 @@ ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( +from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( BigqueryProject, BigQuerySchemaApi, BigqueryView, From 1b3d5b58bf9b3bc7a18047522b7373b3b7beff52 Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Mon, 28 Aug 2023 19:06:54 +0530 Subject: [PATCH 08/11] move stateful check inside lineage module --- .../ingestion/source/bigquery_v2/bigquery.py | 34 +++---------------- .../ingestion/source/bigquery_v2/lineage.py | 25 ++++++++++++++ 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index f7c95b729dcb2..0f0b1da8c4e82 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -229,11 +229,11 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): self.report.schema_api_perf, self.config.get_bigquery_client() ) - self.redundant_lineage_run_skip_handler: Optional[ + redundant_lineage_run_skip_handler: Optional[ RedundantLineageRunSkipHandler ] = None if self.config.enable_stateful_lineage_ingestion: - self.redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler( + redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler( source=self, config=self.config, pipeline_name=self.ctx.pipeline_name, @@ -245,7 +245,7 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): config, self.report, dataset_urn_builder=self.gen_dataset_urn_from_ref, - redundant_run_skip_handler=self.redundant_lineage_run_skip_handler, + redundant_run_skip_handler=redundant_lineage_run_skip_handler, ) redundant_usage_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = None @@ -543,7 +543,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: [p.id for p in projects], self.table_refs ) - if self._should_ingest_lineage(): + if self.config.include_table_lineage: yield from self.lineage_extractor.get_lineage_workunits( [p.id for p in projects], self.sql_parser_schema_resolver, @@ -552,32 +552,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.table_refs, ) - if self.redundant_lineage_run_skip_handler: - # Update the checkpoint state for this run. 
- self.redundant_lineage_run_skip_handler.update_state( - self.config.start_time, self.config.end_time - ) - - def _should_ingest_lineage(self) -> bool: - if not self.config.include_table_lineage: - return False - - if ( - self.redundant_lineage_run_skip_handler - and self.redundant_lineage_run_skip_handler.should_skip_this_run( - cur_start_time=self.config.start_time, - cur_end_time=self.config.end_time, - ) - ): - # Skip this run - self.report.report_warning( - "lineage-extraction", - "Skip this run as there was already a run for current ingestion window.", - ) - return False - - return True - def _get_projects(self) -> List[BigqueryProject]: logger.info("Getting projects") if self.config.project_ids or self.config.project_id: diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index 6e01750e5ddec..4a853901f2890 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -231,6 +231,23 @@ def error(self, log: logging.Logger, key: str, reason: str) -> None: self.report.report_warning(key, reason) log.error(f"{key} => {reason}") + def _should_ingest_lineage(self) -> bool: + if ( + self.redundant_run_skip_handler + and self.redundant_run_skip_handler.should_skip_this_run( + cur_start_time=self.config.start_time, + cur_end_time=self.config.end_time, + ) + ): + # Skip this run + self.report.report_warning( + "lineage-extraction", + "Skip this run as there was already a run for current ingestion window.", + ) + return False + + return True + def get_lineage_workunits( self, projects: List[str], @@ -239,6 +256,8 @@ def get_lineage_workunits( view_definitions: FileBackedDict[str], table_refs: Set[str], ) -> Iterable[MetadataWorkUnit]: + if not self._should_ingest_lineage(): + return views_skip_audit_log_lineage: Set[str] = set() if self.config.lineage_parse_view_ddl: view_lineage: Dict[str, Set[LineageEdge]] = {} @@ -269,6 +288,12 @@ def get_lineage_workunits( table_refs, ) + if self.redundant_run_skip_handler: + # Update the checkpoint state for this run. 
+ self.redundant_run_skip_handler.update_state( + self.config.start_time, self.config.end_time + ) + def generate_lineage( self, project_id: str, From 6a2a3d452437a42189fd7f93701a6c9c5ec0371d Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Tue, 5 Sep 2023 17:57:49 +0530 Subject: [PATCH 09/11] merge related changes From this https://github.com/datahub-project/datahub/commit/fa0c43c0313f6239f54879819ffc6c6dc04cbef5 --- .../datahub/ingestion/source/bigquery_v2/bigquery.py | 2 +- .../datahub/ingestion/source/bigquery_v2/lineage.py | 10 +++++----- metadata-ingestion/tests/unit/test_bigquery_lineage.py | 10 +++++++--- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 165a0eea106d1..ff7a47924626d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -541,7 +541,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: yield from self.lineage_extractor.get_lineage_workunits( [p.id for p in projects], self.sql_parser_schema_resolver, - self.view_definition_ids, + self.view_refs_by_project, self.view_definitions, self.table_refs, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index 08a3db2bf6503..98c8cbaf85eec 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -253,7 +253,7 @@ def get_lineage_workunits( self, projects: List[str], sql_parser_schema_resolver: SchemaResolver, - view_definition_ids: Dict[str, Dict[str, str]], + view_refs_by_project: Dict[str, Set[str]], view_definitions: FileBackedDict[str], table_refs: Set[str], ) -> Iterable[MetadataWorkUnit]: @@ -265,7 +265,7 @@ def get_lineage_workunits( for project in projects: self.populate_view_lineage_with_sql_parsing( view_lineage, - view_definition_ids[project], + view_refs_by_project[project], view_definitions, sql_parser_schema_resolver, project, @@ -348,13 +348,13 @@ def generate_lineage( def populate_view_lineage_with_sql_parsing( self, view_lineage: Dict[str, Set[LineageEdge]], - view_definition_ids: Dict[str, str], + view_refs: Set[str], view_definitions: FileBackedDict[str], sql_parser_schema_resolver: SchemaResolver, default_project: str, ) -> None: - for view, view_definition_id in view_definition_ids.items(): - view_definition = view_definitions[view_definition_id] + for view in view_refs: + view_definition = view_definitions[view] raw_view_lineage = sqlglot_lineage( view_definition, schema_resolver=sql_parser_schema_resolver, diff --git a/metadata-ingestion/tests/unit/test_bigquery_lineage.py b/metadata-ingestion/tests/unit/test_bigquery_lineage.py index 566d6fc2cb0c3..e23494963e475 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_lineage.py +++ b/metadata-ingestion/tests/unit/test_bigquery_lineage.py @@ -3,6 +3,7 @@ import pytest +import datahub.emitter.mce_builder as builder from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( BigQueryTableRef, QueryEvent, @@ -81,7 +82,9 @@ def lineage_entries() -> List[QueryEvent]: def test_lineage_with_timestamps(lineage_entries: List[QueryEvent]) -> None: config = BigQueryV2Config() report = BigQueryV2Report() - extractor: BigqueryLineageExtractor = 
BigqueryLineageExtractor(config, report, lambda x: "") + extractor: BigqueryLineageExtractor = BigqueryLineageExtractor( + config, report, lambda x: builder.make_dataset_urn("bigquery", str(x)) + ) bq_table = BigQueryTableRef.from_string_name( "projects/my_project/datasets/my_dataset/tables/my_table" @@ -104,7 +107,9 @@ def test_lineage_with_timestamps(lineage_entries: List[QueryEvent]) -> None: def test_column_level_lineage(lineage_entries: List[QueryEvent]) -> None: config = BigQueryV2Config(extract_column_lineage=True, incremental_lineage=False) report = BigQueryV2Report() - extractor: BigqueryLineageExtractor = BigqueryLineageExtractor(config, report, lambda x: "") + extractor: BigqueryLineageExtractor = BigqueryLineageExtractor( + config, report, lambda x: builder.make_dataset_urn("bigquery", str(x)) + ) bq_table = BigQueryTableRef.from_string_name( "projects/my_project/datasets/my_dataset/tables/my_table" @@ -119,7 +124,6 @@ def test_column_level_lineage(lineage_entries: List[QueryEvent]) -> None: bq_table=bq_table, bq_table_urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,my_project.my_dataset.my_table,PROD)", lineage_metadata=lineage_map, - platform="bigquery", ) assert upstream_lineage assert len(upstream_lineage.upstreams) == 2 From 79f84bac1cb8260d958f378cd69c30028509b0ad Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Wed, 13 Sep 2023 17:43:48 +0530 Subject: [PATCH 10/11] address review comments --- .../ingestion/source/bigquery_v2/bigquery.py | 7 +-- .../bigquery_v2/bigquery_audit_log_api.py | 19 +++----- .../source/bigquery_v2/bigquery_schema.py | 31 ++++++------ .../src/datahub/utilities/perf_timer.py | 47 ++++++++++--------- .../tests/unit/utilities/test_perf_timer.py | 25 ++++++---- 5 files changed, 62 insertions(+), 67 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index ff7a47924626d..ae49a4ba17c11 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -558,12 +558,7 @@ def _get_projects(self) -> List[BigqueryProject]: return list(self._query_project_list()) def _query_project_list(self) -> Iterable[BigqueryProject]: - try: - projects = self.bigquery_data_dictionary.get_projects() - except Exception as e: - logger.error(f"Error getting projects. 
{e}", exc_info=True) - projects = [] - + projects = self.bigquery_data_dictionary.get_projects() if not projects: # Report failure on exception and if empty list is returned self.report.report_failure( "metadata-extraction", diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py index fcb6200241cd7..03b12c61ee5c6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py @@ -87,14 +87,12 @@ def get_exported_bigquery_audit_metadata( f"Finished loading log entries from BigQueryAuditMetadata in {dataset}" ) - if rate_limiter: - with rate_limiter: - for entry in query_job: - with current_timer.pause_timer(): + for entry in query_job: + with current_timer.pause(): + if rate_limiter: + with rate_limiter: yield entry - else: - for entry in query_job: - with current_timer.pause_timer(): + else: yield entry def get_bigquery_log_entries_via_gcp_logging( @@ -124,17 +122,12 @@ def get_bigquery_log_entries_via_gcp_logging( ) for i, entry in enumerate(list_entries): - if i == 0: - logger.info( - f"Starting log load from GCP Logging for {client.project}" - ) - if i % 1000 == 0: logger.info( f"Loaded {i} log entries from GCP Log for {client.project}" ) - with current_timer.pause_timer(): + with current_timer.pause(): if rate_limiter: with rate_limiter: yield entry diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py index 6fd3482b68921..7edc8656360bb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py @@ -133,30 +133,29 @@ def __init__( self.bq_client = client self.report = report - def get_client(self) -> bigquery.Client: - assert self.bq_client is not None - return self.bq_client - def get_query_result(self, query: str) -> RowIterator: logger.debug(f"Query : {query}") - resp = self.get_client().query(query) + resp = self.bq_client.query(query) return resp.result() def get_projects(self) -> List[BigqueryProject]: with self.report.list_projects: - projects = self.get_client().list_projects() + try: + projects = self.bq_client.list_projects() - return [ - BigqueryProject(id=p.project_id, name=p.friendly_name) for p in projects - ] + return [ + BigqueryProject(id=p.project_id, name=p.friendly_name) + for p in projects + ] + except Exception as e: + logger.error(f"Error getting projects. 
{e}", exc_info=True) + return [] def get_datasets_for_project_id( self, project_id: str, maxResults: Optional[int] = None ) -> List[BigqueryDataset]: with self.report.list_datasets: - datasets = self.get_client().list_datasets( - project_id, max_results=maxResults - ) + datasets = self.bq_client.list_datasets(project_id, max_results=maxResults) return [ BigqueryDataset(name=d.dataset_id, labels=d.labels) for d in datasets ] @@ -189,8 +188,8 @@ def list_tables( self, dataset_name: str, project_id: str ) -> Iterator[TableListItem]: with self.report.list_tables as current_timer: - for table in self.get_client().list_tables(f"{project_id}.{dataset_name}"): - with current_timer.pause_timer(): + for table in self.bq_client.list_tables(f"{project_id}.{dataset_name}"): + with current_timer.pause(): yield table def get_tables_for_dataset( @@ -231,7 +230,7 @@ def get_tables_for_dataset( for table in cur: try: - with current_timer.pause_timer(): + with current_timer.pause(): yield BigQuerySchemaApi._make_bigquery_table( table, tables.get(table.table_name) ) @@ -308,7 +307,7 @@ def get_views_for_dataset( for table in cur: try: - with current_timer.pause_timer(): + with current_timer.pause(): yield BigQuerySchemaApi._make_bigquery_view(table) except Exception as e: view_name = f"{project_id}.{dataset_name}.{table.table_name}" diff --git a/metadata-ingestion/src/datahub/utilities/perf_timer.py b/metadata-ingestion/src/datahub/utilities/perf_timer.py index 46eb0e25e4fbf..18384420bfefb 100644 --- a/metadata-ingestion/src/datahub/utilities/perf_timer.py +++ b/metadata-ingestion/src/datahub/utilities/perf_timer.py @@ -1,7 +1,10 @@ +import logging import time from contextlib import AbstractContextManager from typing import Any, Optional +logger: logging.Logger = logging.getLogger(__name__) + class PerfTimer(AbstractContextManager): """ @@ -13,29 +16,19 @@ def __init__(self) -> None: self.start_time: Optional[float] = None self.end_time: Optional[float] = None self._past_active_time: float = 0 - self.paused: Optional[bool] = None + self.paused: bool = False + self._error_state = False def start(self) -> None: - # TODO - # assert ( - # self.end_time is None - # ), "Can not start a finished timer. Did you accidentally re-use this timer ?" - if self.end_time is not None: self._past_active_time = self.elapsed_seconds() self.start_time = time.perf_counter() self.end_time = None - if self.paused: - self.paused = False - - def pause_timer(self) -> "PerfTimer": - assert ( - not self.paused and not self.end_time - ), "Can not pause a paused/stopped timer" - assert ( - self.start_time is not None - ), "Can not pause a timer that hasn't started. Did you forget to start the timer ?" + self.paused = False + + def pause(self) -> "PerfTimer": + self.assert_timer_is_running() self._past_active_time = self.elapsed_seconds() self.start_time = None self.end_time = None @@ -43,9 +36,7 @@ def pause_timer(self) -> "PerfTimer": return self def finish(self) -> None: - assert ( - self.start_time is not None - ), "Can not stop a timer that hasn't started. Did you forget to start the timer ?" + self.assert_timer_is_running() self.end_time = time.perf_counter() def __enter__(self) -> "PerfTimer": @@ -71,15 +62,26 @@ def elapsed_seconds(self) -> float: """ Returns the elapsed time in seconds. """ - if self.paused: + if self.paused or not self.start_time: return self._past_active_time - assert self.start_time is not None, "Did you forget to start the timer ?" 
if self.end_time is None: return (time.perf_counter() - self.start_time) + (self._past_active_time) else: return (self.end_time - self.start_time) + self._past_active_time + def assert_timer_is_running(self) -> None: + """ + Returns true if timer is in running state. + Timer is in NOT in running state if + 1. it has never been started. + 2. it is in paused state. + 3. it had been started and finished in the past but not started again. + """ + if self.start_time is None or self.paused or self.end_time: + self._error_state = True + logger.warning("Did you forget to start the timer ?") + def __repr__(self) -> str: return repr(self.as_obj()) @@ -91,4 +93,5 @@ def as_obj(self) -> Optional[str]: return None else: time_taken = self.elapsed_seconds() - return f"{time_taken:.3f} seconds" + state = " (error)" if self._error_state else "" + return f"{time_taken:.3f} seconds{state}" diff --git a/metadata-ingestion/tests/unit/utilities/test_perf_timer.py b/metadata-ingestion/tests/unit/utilities/test_perf_timer.py index 9fbd3a7b5d9cd..d5fde314c2b57 100644 --- a/metadata-ingestion/tests/unit/utilities/test_perf_timer.py +++ b/metadata-ingestion/tests/unit/utilities/test_perf_timer.py @@ -1,27 +1,32 @@ import time +from functools import partial + +import pytest from datahub.utilities.perf_timer import PerfTimer +approx = partial(pytest.approx, rel=1e-2) + def test_perf_timer_simple(): with PerfTimer() as timer: time.sleep(1) - assert round(timer.elapsed_seconds()) == 1 + assert approx(timer.elapsed_seconds()) == 1 - assert round(timer.elapsed_seconds()) == 1 + assert approx(timer.elapsed_seconds()) == 1 def test_perf_timer_paused_timer(): with PerfTimer() as current_timer: time.sleep(1) - assert round(current_timer.elapsed_seconds()) == 1 - with current_timer.pause_timer(): + assert approx(current_timer.elapsed_seconds()) == 1 + with current_timer.pause(): time.sleep(2) - assert round(current_timer.elapsed_seconds()) == 1 - assert round(current_timer.elapsed_seconds()) == 1 + assert approx(current_timer.elapsed_seconds()) == 1 + assert approx(current_timer.elapsed_seconds()) == 1 time.sleep(1) - assert round(current_timer.elapsed_seconds()) == 2 + assert approx(current_timer.elapsed_seconds()) == 2 def test_generator_with_paused_timer(): @@ -30,12 +35,12 @@ def generator_function(): time.sleep(1) for i in range(10): time.sleep(0.2) - with inner_timer.pause_timer(): + with inner_timer.pause(): time.sleep(0.2) yield i - assert round(inner_timer.elapsed_seconds()) == 1 + 0.2 * 10 + assert approx(inner_timer.elapsed_seconds()) == 1 + 0.2 * 10 with PerfTimer() as outer_timer: seq = generator_function() list([i for i in seq]) - assert round(outer_timer.elapsed_seconds()) == 1 + 0.2 * 10 + 0.2 * 10 + assert approx(outer_timer.elapsed_seconds()) == 1 + 0.2 * 10 + 0.2 * 10 From 0ba4efcf6fac33f8e6f346345aa01770bfeb4cd5 Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Thu, 14 Sep 2023 10:56:25 +0530 Subject: [PATCH 11/11] fix tests --- metadata-ingestion/tests/unit/test_bigquery_source.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py index 4b30478873ae7..4fc6c31626ba8 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_source.py +++ b/metadata-ingestion/tests/unit/test_bigquery_source.py @@ -252,15 +252,15 @@ def test_get_projects_list_empty(get_bq_client_mock, get_projects_mock): assert projects == [] -@patch.object(BigQuerySchemaApi, "get_projects") 
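
    # A minimal usage sketch (not part of the diff) of the reworked PerfTimer,
    # mirroring the audit-log/schema API hunks and the generator test above: the
    # timer brackets the API iteration but is paused while each row is handed to
    # the caller, so elapsed_seconds() reflects only time spent in the API itself.
    # timed_fetch() and its `rows` argument are hypothetical stand-ins.
    from typing import Iterable, Iterator

    from datahub.utilities.perf_timer import PerfTimer


    def timed_fetch(rows: Iterable[int]) -> Iterator[int]:
        with PerfTimer() as timer:
            for row in rows:
                with timer.pause():
                    # Whatever the consumer does with the yielded row is
                    # excluded from the recorded time.
                    yield row
        # The outer with-block finishes the timer; its value can then be
        # attached to a report field such as report.list_tables.
        print(f"API time: {timer.elapsed_seconds():.3f} seconds")
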
@patch.object(BigQueryV2Config, "get_bigquery_client") def test_get_projects_list_failure( get_bq_client_mock: MagicMock, - get_projects_mock: MagicMock, caplog: pytest.LogCaptureFixture, ) -> None: error_str = "my error" - get_projects_mock.side_effect = GoogleAPICallError(error_str) + bq_client_mock = MagicMock() + get_bq_client_mock.return_value = bq_client_mock + bq_client_mock.list_projects.side_effect = GoogleAPICallError(error_str) config = BigQueryV2Config.parse_obj( {"project_id_pattern": {"deny": ["^test-project$"]}}
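
For context, a minimal sketch of the error path this reworked test exercises, under simplified, assumed names (the real methods are BigQuerySchemaApi.get_projects and BigqueryV2Source._query_project_list from the earlier hunks; the failure message below is a placeholder): client errors from list_projects are now caught and logged inside the schema API, which returns an empty list, and the source reports a failure instead of raising.

    import logging

    logger = logging.getLogger(__name__)


    def get_projects_sketch(bq_client):
        # Mirrors the new BigQuerySchemaApi.get_projects behaviour: API errors
        # such as GoogleAPICallError are logged and an empty list is returned.
        try:
            return [p.project_id for p in bq_client.list_projects()]
        except Exception as e:
            logger.error(f"Error getting projects. {e}", exc_info=True)
            return []


    def query_project_list_sketch(bq_client, report):
        # Mirrors BigqueryV2Source._query_project_list: an empty result, whether
        # from an error or a genuinely empty account, is reported as a failure.
        projects = get_projects_sketch(bq_client)
        if not projects:
            report.report_failure("metadata-extraction", "No projects found")  # placeholder message
        return projects
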