From b84601c9380a996e36dcf6fda77e422bedc9ee18 Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Wed, 9 Aug 2023 20:45:24 +0530 Subject: [PATCH 01/11] feat(bigquery): add better timers around every API call --- .../ingestion/source/bigquery_v2/bigquery.py | 75 ++--- .../source/bigquery_v2/bigquery_audit_api.py | 208 +++++++++++++ .../source/bigquery_v2/bigquery_config.py | 73 +++-- .../source/bigquery_v2/bigquery_report.py | 27 +- .../source/bigquery_v2/bigquery_schema.py | 278 ++++++++++-------- .../ingestion/source/bigquery_v2/common.py | 34 --- .../ingestion/source/bigquery_v2/lineage.py | 31 +- .../ingestion/source/bigquery_v2/usage.py | 221 ++------------ .../source/snowflake/snowflake_v2.py | 21 +- .../src/datahub/utilities/perf_timer.py | 66 ++++- .../tests/unit/test_bigquery_source.py | 35 ++- .../unit/test_bigqueryv2_usage_source.py | 11 +- .../tests/unit/utilities/test_perf_timer.py | 41 +++ 13 files changed, 651 insertions(+), 470 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_api.py create mode 100644 metadata-ingestion/tests/unit/utilities/test_perf_timer.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index d1f39a3ba1ba6..442e1e525fdf6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -54,7 +54,6 @@ from datahub.ingestion.source.bigquery_v2.common import ( BQ_EXTERNAL_DATASET_URL_TEMPLATE, BQ_EXTERNAL_TABLE_URL_TEMPLATE, - get_bigquery_client, ) from datahub.ingestion.source.bigquery_v2.lineage import ( BigqueryLineageExtractor, @@ -227,6 +226,8 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): set_dataset_urn_to_lower(self.config.convert_urns_to_lowercase) + self.bigquery_data_dictionary = BigQueryDataDictionary(self.report) + # For database, schema, tables, views, etc self.lineage_extractor = BigqueryLineageExtractor(config, self.report) self.usage_extractor = BigQueryUsageExtractor( @@ -271,6 +272,7 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): platform=self.platform, env=self.config.env ) + self.add_config_to_report() atexit.register(cleanup, config) @classmethod @@ -295,18 +297,20 @@ def metadata_read_capability_test( for project_id in project_ids: try: logger.info((f"Metadata read capability test for project {project_id}")) - client: bigquery.Client = get_bigquery_client(config) + client: bigquery.Client = config.get_bigquery_client() assert client - result = BigQueryDataDictionary.get_datasets_for_project_id( - client, project_id, 10 + report = BigQueryV2Report() + bigquery_data_dictionary = BigQueryDataDictionary(report) + bigquery_data_dictionary.set_client(client) + result = bigquery_data_dictionary.get_datasets_for_project_id( + project_id, 10 ) if len(result) == 0: return CapabilityReport( capable=False, failure_reason=f"Dataset query returned empty dataset. 
It is either empty or no dataset in project {project_id}", ) - tables = BigQueryDataDictionary.get_tables_for_dataset( - conn=client, + tables = bigquery_data_dictionary.get_tables_for_dataset( project_id=project_id, dataset_name=result[0].name, tables={}, @@ -378,7 +382,7 @@ def test_connection(config_dict: dict) -> TestConnectionReport: try: connection_conf = BigQueryV2Config.parse_obj_allow_extras(config_dict) - client: bigquery.Client = get_bigquery_client(connection_conf) + client: bigquery.Client = connection_conf.get_bigquery_client() assert client test_report.basic_connectivity = BigqueryV2Source.connectivity_test(client) @@ -498,17 +502,17 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: - conn: bigquery.Client = get_bigquery_client(self.config) - self.add_config_to_report() + bq_client: bigquery.Client = self.config.get_bigquery_client() + self.bigquery_data_dictionary.set_client(bq_client) - projects = self._get_projects(conn) + projects = self._get_projects() if not projects: return for project_id in projects: self.report.set_ingestion_stage(project_id.id, "Metadata Extraction") logger.info(f"Processing project: {project_id.id}") - yield from self._process_project(conn, project_id) + yield from self._process_project(project_id) if self._should_ingest_usage(): yield from self.usage_extractor.get_usage_workunits( @@ -563,7 +567,7 @@ def _should_ingest_lineage(self) -> bool: ) return True - def _get_projects(self, conn: bigquery.Client) -> List[BigqueryProject]: + def _get_projects(self) -> List[BigqueryProject]: logger.info("Getting projects") if self.config.project_ids or self.config.project_id: project_ids = self.config.project_ids or [self.config.project_id] # type: ignore @@ -572,11 +576,11 @@ def _get_projects(self, conn: bigquery.Client) -> List[BigqueryProject]: for project_id in project_ids ] else: - return list(self._get_project_list(conn)) + return list(self._query_project_list()) - def _get_project_list(self, conn: bigquery.Client) -> Iterable[BigqueryProject]: + def _query_project_list(self) -> Iterable[BigqueryProject]: try: - projects = BigQueryDataDictionary.get_projects(conn) + projects = self.bigquery_data_dictionary.get_projects() except Exception as e: logger.error(f"Error getting projects. {e}", exc_info=True) projects = [] @@ -597,7 +601,7 @@ def _get_project_list(self, conn: bigquery.Client) -> Iterable[BigqueryProject]: self.report.report_dropped(project.id) def _process_project( - self, conn: bigquery.Client, bigquery_project: BigqueryProject + self, bigquery_project: BigqueryProject ) -> Iterable[MetadataWorkUnit]: db_tables: Dict[str, List[BigqueryTable]] = {} db_views: Dict[str, List[BigqueryView]] = {} @@ -608,7 +612,7 @@ def _process_project( try: bigquery_project.datasets = ( - BigQueryDataDictionary.get_datasets_for_project_id(conn, project_id) + self.bigquery_data_dictionary.get_datasets_for_project_id(project_id) ) except Exception as e: error_message = f"Unable to get datasets for project {project_id}, skipping. 
The error was: {e}" @@ -642,7 +646,7 @@ def _process_project( try: # db_tables and db_views are populated in the this method yield from self._process_schema( - conn, project_id, bigquery_dataset, db_tables, db_views + project_id, bigquery_dataset, db_tables, db_views ) except Exception as e: @@ -735,7 +739,6 @@ def generate_lineage(self, project_id: str) -> Iterable[MetadataWorkUnit]: def _process_schema( self, - conn: bigquery.Client, project_id: str, bigquery_dataset: BigqueryDataset, db_tables: Dict[str, List[BigqueryTable]], @@ -749,8 +752,7 @@ def _process_schema( columns = None if self.config.include_tables or self.config.include_views: - columns = BigQueryDataDictionary.get_columns_for_dataset( - conn, + columns = self.bigquery_data_dictionary.get_columns_for_dataset( project_id=project_id, dataset_name=dataset_name, column_limit=self.config.column_limit, @@ -759,7 +761,7 @@ def _process_schema( if self.config.include_tables: db_tables[dataset_name] = list( - self.get_tables_for_dataset(conn, project_id, dataset_name) + self.get_tables_for_dataset(project_id, dataset_name) ) for table in db_tables[dataset_name]: @@ -772,7 +774,9 @@ def _process_schema( ) elif self.config.include_table_lineage or self.config.include_usage_statistics: # Need table_refs to calculate lineage and usage - for table_item in conn.list_tables(f"{project_id}.{dataset_name}"): + for table_item in self.bigquery_data_dictionary.list_tables( + dataset_name, project_id + ): identifier = BigqueryTableIdentifier( project_id=project_id, dataset=dataset_name, @@ -792,8 +796,8 @@ def _process_schema( if self.config.include_views: db_views[dataset_name] = list( - BigQueryDataDictionary.get_views_for_dataset( - conn, project_id, dataset_name, self.config.is_profiling_enabled() + self.bigquery_data_dictionary.get_views_for_dataset( + project_id, dataset_name, self.config.is_profiling_enabled() ) ) @@ -1205,7 +1209,6 @@ def get_report(self) -> BigQueryV2Report: def get_tables_for_dataset( self, - conn: bigquery.Client, project_id: str, dataset_name: str, ) -> Iterable[BigqueryTable]: @@ -1224,14 +1227,15 @@ def get_tables_for_dataset( # We get the list of tables in the dataset to get core table properties and to be able to process the tables in batches # We collect only the latest shards from sharded tables (tables with _YYYYMMDD suffix) and ignore temporary tables - table_items = self.get_core_table_details(conn, dataset_name, project_id) + table_items = self.get_core_table_details( + dataset_name, project_id, self.config.temp_table_dataset_prefix + ) items_to_get: Dict[str, TableListItem] = {} for table_item in table_items.keys(): items_to_get[table_item] = table_items[table_item] if len(items_to_get) % max_batch_size == 0: - yield from BigQueryDataDictionary.get_tables_for_dataset( - conn, + yield from self.bigquery_data_dictionary.get_tables_for_dataset( project_id, dataset_name, items_to_get, @@ -1240,8 +1244,7 @@ def get_tables_for_dataset( items_to_get.clear() if items_to_get: - yield from BigQueryDataDictionary.get_tables_for_dataset( - conn, + yield from self.bigquery_data_dictionary.get_tables_for_dataset( project_id, dataset_name, items_to_get, @@ -1253,13 +1256,15 @@ def get_tables_for_dataset( ) def get_core_table_details( - self, conn: bigquery.Client, dataset_name: str, project_id: str + self, dataset_name: str, project_id: str, temp_table_dataset_prefix: str ) -> Dict[str, TableListItem]: table_items: Dict[str, TableListItem] = {} # Dict to store sharded table and the last seen max shard id sharded_tables: 
Dict[str, TableListItem] = {} - for table in conn.list_tables(f"{project_id}.{dataset_name}"): + for table in self.bigquery_data_dictionary.list_tables( + dataset_name, project_id + ): table_identifier = BigqueryTableIdentifier( project_id=project_id, dataset=dataset_name, @@ -1296,9 +1301,7 @@ def get_core_table_details( if stored_shard < shard: sharded_tables[table_name] = table continue - elif str(table_identifier).startswith( - self.config.temp_table_dataset_prefix - ): + elif str(table_identifier).startswith(temp_table_dataset_prefix): logger.debug(f"Dropping temporary table {table_identifier.table}") self.report.report_dropped(table_identifier.raw_table_name()) continue diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_api.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_api.py new file mode 100644 index 0000000000000..09a9098ced338 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_api.py @@ -0,0 +1,208 @@ +import logging +import textwrap +from datetime import datetime +from typing import Iterable, List, Optional + +from google.cloud import bigquery +from google.cloud.logging_v2.client import Client as GCPLoggingClient +from ratelimiter import RateLimiter + +from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( + BQ_AUDIT_V2, + BQ_FILTER_RULE_TEMPLATE, + AuditLogEntry, + BigQueryAuditMetadata, +) +from datahub.ingestion.source.bigquery_v2.bigquery_report import ( + BigQueryAuditLogApiPerfReport, +) +from datahub.ingestion.source.bigquery_v2.common import ( + BQ_DATE_SHARD_FORMAT, + BQ_DATETIME_FORMAT, +) + +logger: logging.Logger = logging.getLogger(__name__) + + +class BigQueryAuditLogApi: + def __init__( + self, + report: BigQueryAuditLogApiPerfReport, + rate_limit: bool, + requests_per_min: int, + ) -> None: + self.api_perf_report = report + self.rate_limit = rate_limit + self.requests_per_min = requests_per_min + + def get_exported_bigquery_audit_metadata( + self, + bigquery_client: bigquery.Client, + bigquery_audit_metadata_datasets: Optional[List[str]], + use_date_sharded_audit_log_tables: bool, + start_time: datetime, + end_time: datetime, + limit: Optional[int] = None, + ) -> Iterable[BigQueryAuditMetadata]: + if bigquery_audit_metadata_datasets is None: + return + + audit_start_time = start_time.strftime(BQ_DATETIME_FORMAT) + audit_start_date = start_time.strftime(BQ_DATE_SHARD_FORMAT) + + audit_end_time = end_time.strftime(BQ_DATETIME_FORMAT) + audit_end_date = end_time.strftime(BQ_DATE_SHARD_FORMAT) + + with self.api_perf_report.get_exported_bigquery_audit_metadata as current_timer: + for dataset in bigquery_audit_metadata_datasets: + logger.info( + f"Start loading log entries from BigQueryAuditMetadata in {dataset}" + ) + + query = bigquery_audit_metadata_query_template( + dataset, + use_date_sharded_audit_log_tables, + limit=limit, + ).format( + start_time=audit_start_time, + end_time=audit_end_time, + start_date=audit_start_date, + end_date=audit_end_date, + ) + + query_job = bigquery_client.query(query) + logger.info( + f"Finished loading log entries from BigQueryAuditMetadata in {dataset}" + ) + with current_timer.pause_timer(): + if self.rate_limit: + with RateLimiter(max_calls=self.requests_per_min, period=60): + yield from query_job + else: + yield from query_job + + def get_bigquery_log_entries_via_gcp_logging( + self, + client: GCPLoggingClient, + start_time: datetime, + end_time: datetime, + log_page_size: int, + limit: Optional[int] = 
None, + ) -> Iterable[AuditLogEntry]: + filter = self._generate_filter(start_time, end_time) + logger.debug(filter) + + list_entries: Iterable[AuditLogEntry] + rate_limiter: Optional[RateLimiter] = None + if self.rate_limit: + # client.list_entries is a generator, does api calls to GCP Logging when it runs out of entries and needs to fetch more from GCP Logging + # to properly ratelimit we multiply the page size by the number of requests per minute + rate_limiter = RateLimiter( + max_calls=self.requests_per_min * log_page_size, + period=60, + ) + + with self.api_perf_report.get_bigquery_log_entries_via_gcp_logging as current_timer: + list_entries = client.list_entries( + filter_=filter, + page_size=log_page_size, + max_results=limit, + ) + + for i, entry in enumerate(list_entries): + if i == 0: + logger.info( + f"Starting log load from GCP Logging for {client.project}" + ) + if i % 1000 == 0: + logger.info( + f"Loaded {i} log entries from GCP Log for {client.project}" + ) + + with current_timer.pause_timer(): + if rate_limiter: + with rate_limiter: + yield entry + else: + yield entry + + def _generate_filter(self, start_time: datetime, end_time: datetime) -> str: + audit_start_time = (start_time).strftime(BQ_DATETIME_FORMAT) + + audit_end_time = (end_time).strftime(BQ_DATETIME_FORMAT) + + filter = BQ_AUDIT_V2[BQ_FILTER_RULE_TEMPLATE].format( + start_time=audit_start_time, end_time=audit_end_time + ) + return filter + + +def bigquery_audit_metadata_query_template( + dataset: str, + use_date_sharded_tables: bool, + limit: Optional[int] = None, +) -> str: + """ + Receives a dataset (with project specified) and returns a query template that is used to query exported + v2 AuditLogs containing protoPayloads of type BigQueryAuditMetadata. + :param dataset: the dataset to query against in the form of $PROJECT.$DATASET + :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log + tables + :param limit: maximum number of events to query for + :return: a query template, when supplied start_time and end_time, can be used to query audit logs from BigQuery + """ + + limit_text = f"limit {limit}" if limit else "" + + shard_condition = "" + if use_date_sharded_tables: + from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`" + shard_condition = ( + """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """ + ) + else: + from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`" + + # Deduplicates insertId via QUALIFY, see: + # https://cloud.google.com/logging/docs/reference/v2/rest/v2/LogEntry, insertId field + query = f""" + SELECT + timestamp, + logName, + insertId, + protopayload_auditlog AS protoPayload, + protopayload_auditlog.metadataJson AS metadata + FROM + {from_table} + WHERE ( + timestamp >= "{{start_time}}" + AND timestamp < "{{end_time}}" + ) + {shard_condition} + AND protopayload_auditlog.serviceName="bigquery.googleapis.com" + AND + ( + ( + protopayload_auditlog.methodName IN + ( + "google.cloud.bigquery.v2.JobService.Query", + "google.cloud.bigquery.v2.JobService.InsertJob" + ) + AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE" + AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL + AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL + AND ( + JSON_EXTRACT_ARRAY(protopayload_auditlog.metadataJson, + 
"$.jobChange.job.jobStats.queryStats.referencedTables") IS NOT NULL + OR + JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig.destinationTable") IS NOT NULL + ) + ) + OR + JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.tableDataRead.reason") = "JOB" + ) + QUALIFY ROW_NUMBER() OVER (PARTITION BY insertId, timestamp, logName) = 1 + {limit_text}; + """ + + return textwrap.dedent(query) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index 7287dc1b67d73..fa689d571c8cc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -4,9 +4,11 @@ from typing import Any, Dict, List, Optional import pydantic +from google.cloud import bigquery +from google.cloud.logging_v2.client import Client as GCPLoggingClient from pydantic import Field, PositiveInt, PrivateAttr, root_validator -from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.validate_field_removal import pydantic_removed_field from datahub.ingestion.source.sql.sql_config import SQLAlchemyConfig from datahub.ingestion.source.state.stateful_ingestion_base import ( @@ -35,7 +37,52 @@ class BigQueryUsageConfig(BaseUsageConfig): ) +class BigQueryConnectionConfig(ConfigModel): + credential: Optional[BigQueryCredential] = Field( + description="BigQuery credential informations" + ) + + _credentials_path: Optional[str] = PrivateAttr(None) + + extra_client_options: Dict[str, Any] = Field( + default={}, + description="Additional options to pass to google.cloud.logging_v2.client.Client.", + ) + + project_on_behalf: Optional[str] = Field( + default=None, + description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. If not passed, falls back to the project associated with the service account.", + ) + + def __init__(self, **data: Any): + super().__init__(**data) + + if self.credential: + self._credentials_path = self.credential.create_credential_temp_file() + logger.debug( + f"Creating temporary credential file at {self._credentials_path}" + ) + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path + + def get_bigquery_client(config) -> bigquery.Client: + client_options = config.extra_client_options + return bigquery.Client(config.project_on_behalf, **client_options) + + def make_gcp_logging_client( + self, project_id: Optional[str] = None + ) -> GCPLoggingClient: + # See https://github.com/googleapis/google-cloud-python/issues/2674 for + # why we disable gRPC here. + client_options = self.extra_client_options.copy() + client_options["_use_grpc"] = False + if project_id is not None: + return GCPLoggingClient(**client_options, project=project_id) + else: + return GCPLoggingClient(**client_options) + + class BigQueryV2Config( + BigQueryConnectionConfig, BigQueryBaseConfig, SQLAlchemyConfig, StatefulUsageConfigMixin, @@ -115,11 +162,6 @@ class BigQueryV2Config( ), ) - project_on_behalf: Optional[str] = Field( - default=None, - description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. 
If not passed, falls back to the project associated with the service account.", - ) - storage_project_id: None = Field(default=None, hidden_from_docs=True) lineage_use_sql_parser: bool = Field( @@ -173,14 +215,8 @@ def validate_column_lineage(cls, v: bool, values: Dict[str, Any]) -> bool: default=1000, description="The number of log item will be queried per page for lineage collection", ) - credential: Optional[BigQueryCredential] = Field( - description="BigQuery credential informations" - ) + # extra_client_options, include_table_lineage and max_query_duration are relevant only when computing the lineage. - extra_client_options: Dict[str, Any] = Field( - default={}, - description="Additional options to pass to google.cloud.logging_v2.client.Client.", - ) include_table_lineage: Optional[bool] = Field( default=True, description="Option to enable/disable lineage generation. Is enabled by default.", @@ -202,7 +238,6 @@ def validate_column_lineage(cls, v: bool, values: Dict[str, Any]) -> bool: default=False, description="Whether to read date sharded tables or time partitioned tables when extracting usage from exported audit logs.", ) - _credentials_path: Optional[str] = PrivateAttr(None) _cache_path: Optional[str] = PrivateAttr(None) @@ -223,16 +258,6 @@ def validate_column_lineage(cls, v: bool, values: Dict[str, Any]) -> bool: description="Maximum number of entries for the in-memory caches of FileBacked data structures.", ) - def __init__(self, **data: Any): - super().__init__(**data) - - if self.credential: - self._credentials_path = self.credential.create_credential_temp_file() - logger.debug( - f"Creating temporary credential file at {self._credentials_path}" - ) - os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path - @root_validator(pre=False) def profile_default_settings(cls, values: Dict) -> Dict: # Extra default SQLAlchemy option for better connection pooling and threading. 
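The hunk above relocates client construction from the module-level helpers in common.py onto the config object itself. A minimal usage sketch, assuming only the call patterns visible elsewhere in this patch (BigQueryV2Config.parse_obj, get_bigquery_client, make_gcp_logging_client); the project id and config values are illustrative only:

    from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config

    config = BigQueryV2Config.parse_obj({"project_ids": ["my-project"]})   # illustrative config
    bq_client = config.get_bigquery_client()                               # google.cloud.bigquery.Client
    logging_client = config.make_gcp_logging_client("my-project")          # GCPLoggingClient (gRPC disabled)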
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py index b57e691411f75..fc725e0cda3c4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py @@ -15,8 +15,24 @@ logger: logging.Logger = logging.getLogger(__name__) +class BigQueryApiPerfReport: + list_projects = PerfTimer() + get_datasets_for_project = PerfTimer() + get_columns_for_dataset = PerfTimer() + get_tables_for_dataset = PerfTimer() + list_tables = PerfTimer() + get_views_for_dataset = PerfTimer() + + +class BigQueryAuditLogApiPerfReport: + get_exported_bigquery_audit_metadata = PerfTimer() + get_bigquery_log_entries_via_gcp_logging = PerfTimer() + + @dataclass -class BigQueryV2Report(ProfilingSqlReport): +class BigQueryV2Report( + ProfilingSqlReport, BigQueryApiPerfReport, BigQueryAuditLogApiPerfReport +): num_total_lineage_entries: TopKDict[str, int] = field(default_factory=TopKDict) num_skipped_lineage_entries_missing_data: TopKDict[str, int] = field( default_factory=int_top_k_dict @@ -53,10 +69,11 @@ class BigQueryV2Report(ProfilingSqlReport): log_page_size: Optional[pydantic.PositiveInt] = None use_exported_bigquery_audit_metadata: Optional[bool] = None end_time: Optional[datetime] = None - log_entry_start_time: Optional[str] = None - log_entry_end_time: Optional[str] = None - audit_start_time: Optional[str] = None - audit_end_time: Optional[str] = None + # TODO: remove one or replace by lineage ones + log_entry_start_time: Optional[datetime] = None + log_entry_end_time: Optional[datetime] = None + audit_start_time: Optional[datetime] = None + audit_end_time: Optional[datetime] = None upstream_lineage: LossyDict = field(default_factory=LossyDict) partition_info: Dict[str, str] = field(default_factory=TopKDict) profile_table_selection_criteria: Dict[str, str] = field(default_factory=TopKDict) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py index 2450dbd0e2391..68dd0eeefe09b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py @@ -13,7 +13,10 @@ ) from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier -from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report +from datahub.ingestion.source.bigquery_v2.bigquery_report import ( + BigQueryApiPerfReport, + BigQueryV2Report, +) from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable, BaseView logger: logging.Logger = logging.getLogger(__name__) @@ -345,26 +348,41 @@ class BigqueryQuery: class BigQueryDataDictionary: + def __init__(self, report: BigQueryApiPerfReport) -> None: + self.bq_client: Optional[bigquery.Client] = None + self.api_perf_report = report + + def set_client(self, bq_client: bigquery.Client) -> None: + self.bq_client = bq_client + + def get_client(self) -> bigquery.Client: + assert self.bq_client is not None + return self.bq_client + @staticmethod def get_query_result(conn: bigquery.Client, query: str) -> RowIterator: logger.debug(f"Query : {query}") resp = conn.query(query) return resp.result() - @staticmethod - def get_projects(conn: bigquery.Client) -> List[BigqueryProject]: - projects = 
conn.list_projects() + def get_projects(self) -> List[BigqueryProject]: + with self.api_perf_report.list_projects: + projects = self.get_client().list_projects() - return [ - BigqueryProject(id=p.project_id, name=p.friendly_name) for p in projects - ] + return [ + BigqueryProject(id=p.project_id, name=p.friendly_name) for p in projects + ] - @staticmethod def get_datasets_for_project_id( - conn: bigquery.Client, project_id: str, maxResults: Optional[int] = None + self, project_id: str, maxResults: Optional[int] = None ) -> List[BigqueryDataset]: - datasets = conn.list_datasets(project_id, max_results=maxResults) - return [BigqueryDataset(name=d.dataset_id, labels=d.labels) for d in datasets] + with self.api_perf_report.get_datasets_for_project: + datasets = self.get_client().list_datasets( + project_id, max_results=maxResults + ) + return [ + BigqueryDataset(name=d.dataset_id, labels=d.labels) for d in datasets + ] @staticmethod def get_datasets_for_project_id_with_information_schema( @@ -391,56 +409,69 @@ def get_datasets_for_project_id_with_information_schema( for s in schemas ] - @staticmethod + def list_tables( + self, dataset_name: str, project_id: str + ) -> Iterator[TableListItem]: + with self.api_perf_report.list_tables as current_timer: + for table in self.get_client().list_tables(f"{project_id}.{dataset_name}"): + with current_timer.pause_timer(): + yield table + def get_tables_for_dataset( - conn: bigquery.Client, + self, project_id: str, dataset_name: str, tables: Dict[str, TableListItem], with_data_read_permission: bool = False, report: Optional[BigQueryV2Report] = None, ) -> Iterator[BigqueryTable]: - filter: str = ", ".join(f"'{table}'" for table in tables.keys()) - - if with_data_read_permission: - # Tables are ordered by name and table suffix to make sure we always process the latest sharded table - # and skip the others. Sharded tables are tables with suffix _20220102 - cur = BigQueryDataDictionary.get_query_result( - conn, - BigqueryQuery.tables_for_dataset.format( - project_id=project_id, - dataset_name=dataset_name, - table_filter=f" and t.table_name in ({filter})" if filter else "", - ), - ) - else: - # Tables are ordered by name and table suffix to make sure we always process the latest sharded table - # and skip the others. Sharded tables are tables with suffix _20220102 - cur = BigQueryDataDictionary.get_query_result( - conn, - BigqueryQuery.tables_for_dataset_without_partition_data.format( - project_id=project_id, - dataset_name=dataset_name, - table_filter=f" and t.table_name in ({filter})" if filter else "", - ), - ) - - for table in cur: - try: - yield BigQueryDataDictionary._make_bigquery_table( - table, tables.get(table.table_name) + with self.api_perf_report.get_tables_for_dataset as current_timer: + filter: str = ", ".join(f"'{table}'" for table in tables.keys()) + + if with_data_read_permission: + # Tables are ordered by name and table suffix to make sure we always process the latest sharded table + # and skip the others. 
Sharded tables are tables with suffix _20220102 + cur = BigQueryDataDictionary.get_query_result( + self.get_client(), + BigqueryQuery.tables_for_dataset.format( + project_id=project_id, + dataset_name=dataset_name, + table_filter=f" and t.table_name in ({filter})" + if filter + else "", + ), ) - except Exception as e: - table_name = f"{project_id}.{dataset_name}.{table.table_name}" - logger.warning( - f"Error while processing table {table_name}", - exc_info=True, + else: + # Tables are ordered by name and table suffix to make sure we always process the latest sharded table + # and skip the others. Sharded tables are tables with suffix _20220102 + cur = BigQueryDataDictionary.get_query_result( + self.get_client(), + BigqueryQuery.tables_for_dataset_without_partition_data.format( + project_id=project_id, + dataset_name=dataset_name, + table_filter=f" and t.table_name in ({filter})" + if filter + else "", + ), ) - if report: - report.report_warning( - "metadata-extraction", - f"Failed to get table {table_name}: {e}", + + for table in cur: + try: + with current_timer.pause_timer(): + yield BigQueryDataDictionary._make_bigquery_table( + table, tables.get(table.table_name) + ) + except Exception as e: + table_name = f"{project_id}.{dataset_name}.{table.table_name}" + logger.warning( + f"Error while processing table {table_name}", + exc_info=True, ) + if report: + report.report_warning( + "metadata-extraction", + f"Failed to get table {table_name}: {e}", + ) @staticmethod def _make_bigquery_table( @@ -480,43 +511,44 @@ def _make_bigquery_table( long_term_billable_bytes=table.get("long_term_billable_bytes"), ) - @staticmethod def get_views_for_dataset( - conn: bigquery.Client, + self, project_id: str, dataset_name: str, has_data_read: bool, report: Optional[BigQueryV2Report] = None, ) -> Iterator[BigqueryView]: - if has_data_read: - cur = BigQueryDataDictionary.get_query_result( - conn, - BigqueryQuery.views_for_dataset.format( - project_id=project_id, dataset_name=dataset_name - ), - ) - else: - cur = BigQueryDataDictionary.get_query_result( - conn, - BigqueryQuery.views_for_dataset_without_data_read.format( - project_id=project_id, dataset_name=dataset_name - ), - ) - - for table in cur: - try: - yield BigQueryDataDictionary._make_bigquery_view(table) - except Exception as e: - view_name = f"{project_id}.{dataset_name}.{table.table_name}" - logger.warning( - f"Error while processing view {view_name}", - exc_info=True, + with self.api_perf_report.get_views_for_dataset as current_timer: + if has_data_read: + cur = BigQueryDataDictionary.get_query_result( + self.get_client(), + BigqueryQuery.views_for_dataset.format( + project_id=project_id, dataset_name=dataset_name + ), + ) + else: + cur = BigQueryDataDictionary.get_query_result( + self.get_client(), + BigqueryQuery.views_for_dataset_without_data_read.format( + project_id=project_id, dataset_name=dataset_name + ), ) - if report: - report.report_warning( - "metadata-extraction", - f"Failed to get view {view_name}: {e}", + + for table in cur: + try: + with current_timer.pause_timer(): + yield BigQueryDataDictionary._make_bigquery_view(table) + except Exception as e: + view_name = f"{project_id}.{dataset_name}.{table.table_name}" + logger.warning( + f"Error while processing view {view_name}", + exc_info=True, ) + if report: + report.report_warning( + "metadata-extraction", + f"Failed to get view {view_name}: {e}", + ) @staticmethod def _make_bigquery_view(view: bigquery.Row) -> BigqueryView: @@ -533,58 +565,58 @@ def _make_bigquery_view(view: 
bigquery.Row) -> BigqueryView: materialized=view.table_type == BigqueryTableType.MATERIALIZED_VIEW, ) - @staticmethod def get_columns_for_dataset( - conn: bigquery.Client, + self, project_id: str, dataset_name: str, column_limit: int, run_optimized_column_query: bool = False, ) -> Optional[Dict[str, List[BigqueryColumn]]]: columns: Dict[str, List[BigqueryColumn]] = defaultdict(list) - try: - cur = BigQueryDataDictionary.get_query_result( - conn, - BigqueryQuery.columns_for_dataset.format( - project_id=project_id, dataset_name=dataset_name - ) - if not run_optimized_column_query - else BigqueryQuery.optimized_columns_for_dataset.format( - project_id=project_id, - dataset_name=dataset_name, - column_limit=column_limit, - ), - ) - except Exception as e: - logger.warning(f"Columns for dataset query failed with exception: {e}") - # Error - Information schema query returned too much data. - # Please repeat query with more selective predicates. - return None - - last_seen_table: str = "" - for column in cur: - if ( - column_limit - and column.table_name in columns - and len(columns[column.table_name]) >= column_limit - ): - if last_seen_table != column.table_name: - logger.warning( - f"{project_id}.{dataset_name}.{column.table_name} contains more than {column_limit} columns, only processing {column_limit} columns" - ) - last_seen_table = column.table_name - else: - columns[column.table_name].append( - BigqueryColumn( - name=column.column_name, - ordinal_position=column.ordinal_position, - field_path=column.field_path, - is_nullable=column.is_nullable == "YES", - data_type=column.data_type, - comment=column.comment, - is_partition_column=column.is_partitioning_column == "YES", + with self.api_perf_report.get_columns_for_dataset: + try: + cur = BigQueryDataDictionary.get_query_result( + self.get_client(), + BigqueryQuery.columns_for_dataset.format( + project_id=project_id, dataset_name=dataset_name ) + if not run_optimized_column_query + else BigqueryQuery.optimized_columns_for_dataset.format( + project_id=project_id, + dataset_name=dataset_name, + column_limit=column_limit, + ), ) + except Exception as e: + logger.warning(f"Columns for dataset query failed with exception: {e}") + # Error - Information schema query returned too much data. + # Please repeat query with more selective predicates. 
+ return None + + last_seen_table: str = "" + for column in cur: + if ( + column_limit + and column.table_name in columns + and len(columns[column.table_name]) >= column_limit + ): + if last_seen_table != column.table_name: + logger.warning( + f"{project_id}.{dataset_name}.{column.table_name} contains more than {column_limit} columns, only processing {column_limit} columns" + ) + last_seen_table = column.table_name + else: + columns[column.table_name].append( + BigqueryColumn( + name=column.column_name, + ordinal_position=column.ordinal_position, + field_path=column.field_path, + is_nullable=column.is_nullable == "YES", + data_type=column.data_type, + comment=column.comment, + is_partition_column=column.is_partitioning_column == "YES", + ) + ) return columns diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/common.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/common.py index 4ff509858b87d..e38ab07855b8b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/common.py @@ -1,39 +1,5 @@ -from typing import Any, Dict, Optional - -from google.cloud import bigquery -from google.cloud.logging_v2.client import Client as GCPLoggingClient - -from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config - BQ_DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ" BQ_DATE_SHARD_FORMAT = "%Y%m%d" BQ_EXTERNAL_TABLE_URL_TEMPLATE = "https://console.cloud.google.com/bigquery?project={project}&ws=!1m5!1m4!4m3!1s{project}!2s{dataset}!3s{table}" BQ_EXTERNAL_DATASET_URL_TEMPLATE = "https://console.cloud.google.com/bigquery?project={project}&ws=!1m4!1m3!3m2!1s{project}!2s{dataset}" - - -def _make_gcp_logging_client( - project_id: Optional[str] = None, extra_client_options: Dict[str, Any] = {} -) -> GCPLoggingClient: - # See https://github.com/googleapis/google-cloud-python/issues/2674 for - # why we disable gRPC here. - client_options = extra_client_options.copy() - client_options["_use_grpc"] = False - if project_id is not None: - return GCPLoggingClient(**client_options, project=project_id) - else: - return GCPLoggingClient(**client_options) - - -def get_bigquery_client(config: BigQueryV2Config) -> bigquery.Client: - client_options = config.extra_client_options - return bigquery.Client(config.project_on_behalf, **client_options) - - -def get_sql_alchemy_url(config: BigQueryV2Config) -> str: - if config.project_on_behalf: - return f"bigquery://{config.project_on_behalf}" - # When project_id is not set, we will attempt to detect the project ID - # based on the credentials or environment variables. - # See https://github.com/mxmzdlv/pybigquery#authentication. 
- return "bigquery://" diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index 255a673026252..210018e55be15 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -26,8 +26,6 @@ from datahub.ingestion.source.bigquery_v2.common import ( BQ_DATE_SHARD_FORMAT, BQ_DATETIME_FORMAT, - _make_gcp_logging_client, - get_bigquery_client, ) from datahub.metadata.schema_classes import ( AuditStampClass, @@ -133,7 +131,6 @@ def _follow_column_lineage( def make_lineage_edges_from_parsing_result( sql_lineage: SqlParsingResult, audit_stamp: datetime, lineage_type: str ) -> List[LineageEdge]: - # Note: This ignores the out_tables section of the sql parsing result. audit_stamp = datetime.now(timezone.utc) @@ -295,7 +292,7 @@ def lineage_via_catalog_lineage_api( try: lineage_client: lineage_v1.LineageClient = lineage_v1.LineageClient() - bigquery_client: BigQueryClient = get_bigquery_client(self.config) + bigquery_client: BigQueryClient = self.config.get_bigquery_client() # Filtering datasets datasets = list(bigquery_client.list_datasets(project_id)) project_tables = [] @@ -381,12 +378,12 @@ def _get_parsed_audit_log_events(self, project_id: str) -> Iterable[QueryEvent]: parse_fn: Callable[[Any], Optional[Union[ReadEvent, QueryEvent]]] if self.config.use_exported_bigquery_audit_metadata: logger.info("Populating lineage info via exported GCP audit logs") - bq_client = get_bigquery_client(self.config) + bq_client = self.config.get_bigquery_client() entries = self._get_exported_bigquery_audit_metadata(bq_client) parse_fn = self._parse_exported_bigquery_audit_metadata else: logger.info("Populating lineage info via exported GCP audit logs") - logging_client = _make_gcp_logging_client(project_id) + logging_client = self.config.make_gcp_logging_client(project_id) entries = self._get_bigquery_log_entries(logging_client) parse_fn = self._parse_bigquery_log_entries @@ -406,15 +403,13 @@ def _get_bigquery_log_entries( ) -> Iterable[AuditLogEntry]: self.report.num_total_log_entries[client.project] = 0 # Add a buffer to start and end time to account for delays in logging events. 
- start_time = (self.config.start_time - self.config.max_query_duration).strftime( - BQ_DATETIME_FORMAT - ) - self.report.log_entry_start_time = start_time + corrected_start_time = self.config.start_time - self.config.max_query_duration + start_time = corrected_start_time.strftime(BQ_DATETIME_FORMAT) + self.report.log_entry_start_time = corrected_start_time - end_time = (self.config.end_time + self.config.max_query_duration).strftime( - BQ_DATETIME_FORMAT - ) - self.report.log_entry_end_time = end_time + corrected_end_time = self.config.end_time + self.config.max_query_duration + end_time = corrected_end_time.strftime(BQ_DATETIME_FORMAT) + self.report.log_entry_end_time = corrected_end_time filter = self.BQ_FILTER_RULE_TEMPLATE_V2.format( start_time=start_time, @@ -465,12 +460,12 @@ def _get_exported_bigquery_audit_metadata( corrected_start_time = self.config.start_time - self.config.max_query_duration start_time = corrected_start_time.strftime(BQ_DATETIME_FORMAT) start_date = corrected_start_time.strftime(BQ_DATE_SHARD_FORMAT) - self.report.audit_start_time = start_time + self.report.audit_start_time = corrected_start_time corrected_end_time = self.config.end_time + self.config.max_query_duration end_time = corrected_end_time.strftime(BQ_DATETIME_FORMAT) end_date = corrected_end_time.strftime(BQ_DATE_SHARD_FORMAT) - self.report.audit_end_time = end_time + self.report.audit_end_time = corrected_end_time for dataset in self.config.bigquery_audit_metadata_datasets: logger.info( @@ -827,8 +822,8 @@ def test_capability(self, project_id: str) -> None: f"Connection test got one exported_bigquery_audit_metadata {entry}" ) else: - gcp_logging_client: GCPLoggingClient = _make_gcp_logging_client( - project_id, self.config.extra_client_options + gcp_logging_client: GCPLoggingClient = self.config.make_gcp_logging_client( + project_id ) for entry in self._get_bigquery_log_entries(gcp_logging_client, limit=1): logger.debug(f"Connection test got one audit metadata entry {entry}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py index 1081dd8eec1ec..20014f2f3fac4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py @@ -2,7 +2,6 @@ import json import logging import os -import textwrap import time import uuid from dataclasses import dataclass @@ -21,9 +20,6 @@ ) import humanfriendly -from google.cloud.bigquery import Client as BigQueryClient -from google.cloud.logging_v2.client import Client as GCPLoggingClient -from ratelimiter import RateLimiter from datahub.configuration.time_window_config import get_time_bucket from datahub.emitter.mce_builder import make_user_urn @@ -32,8 +28,6 @@ from datahub.ingestion.api.source_helpers import auto_empty_dataset_usage_statistics from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( - BQ_AUDIT_V2, - BQ_FILTER_RULE_TEMPLATE, AuditEvent, AuditLogEntry, BigQueryAuditMetadata, @@ -42,14 +36,9 @@ QueryEvent, ReadEvent, ) +from datahub.ingestion.source.bigquery_v2.bigquery_audit_api import BigQueryAuditLogApi from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.common import ( - BQ_DATE_SHARD_FORMAT, - BQ_DATETIME_FORMAT, - 
_make_gcp_logging_client, - get_bigquery_client, -) from datahub.ingestion.source.usage.usage_common import ( TOTAL_BUDGET_FOR_QUERY_LIST, make_usage_workunit, @@ -101,77 +90,6 @@ class OperationalDataMeta: custom_type: Optional[str] = None -def bigquery_audit_metadata_query_template( - dataset: str, - use_date_sharded_tables: bool, - limit: Optional[int] = None, -) -> str: - """ - Receives a dataset (with project specified) and returns a query template that is used to query exported - v2 AuditLogs containing protoPayloads of type BigQueryAuditMetadata. - :param dataset: the dataset to query against in the form of $PROJECT.$DATASET - :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log - tables - :param limit: maximum number of events to query for - :return: a query template, when supplied start_time and end_time, can be used to query audit logs from BigQuery - """ - - limit_text = f"limit {limit}" if limit else "" - - shard_condition = "" - if use_date_sharded_tables: - from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`" - shard_condition = ( - """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """ - ) - else: - from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`" - - # Deduplicates insertId via QUALIFY, see: - # https://cloud.google.com/logging/docs/reference/v2/rest/v2/LogEntry, insertId field - query = f""" - SELECT - timestamp, - logName, - insertId, - protopayload_auditlog AS protoPayload, - protopayload_auditlog.metadataJson AS metadata - FROM - {from_table} - WHERE ( - timestamp >= "{{start_time}}" - AND timestamp < "{{end_time}}" - ) - {shard_condition} - AND protopayload_auditlog.serviceName="bigquery.googleapis.com" - AND - ( - ( - protopayload_auditlog.methodName IN - ( - "google.cloud.bigquery.v2.JobService.Query", - "google.cloud.bigquery.v2.JobService.InsertJob" - ) - AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE" - AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL - AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL - AND ( - JSON_EXTRACT_ARRAY(protopayload_auditlog.metadataJson, - "$.jobChange.job.jobStats.queryStats.referencedTables") IS NOT NULL - OR - JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig.destinationTable") IS NOT NULL - ) - ) - OR - JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.tableDataRead.reason") = "JOB" - ) - QUALIFY ROW_NUMBER() OVER (PARTITION BY insertId, timestamp, logName) = 1 - {limit_text}; - """ - - return textwrap.dedent(query) - - class BigQueryUsageState(Closeable): read_events: FileBackedDict[ReadEvent] query_events: FileBackedDict[QueryEvent] @@ -617,109 +535,6 @@ def _store_usage_event( return True return False - def _get_exported_bigquery_audit_metadata( - self, - bigquery_client: BigQueryClient, - limit: Optional[int] = None, - ) -> Iterable[BigQueryAuditMetadata]: - if self.config.bigquery_audit_metadata_datasets is None: - return - - corrected_start_time = self.config.start_time - self.config.max_query_duration - start_time = corrected_start_time.strftime(BQ_DATETIME_FORMAT) - start_date = corrected_start_time.strftime(BQ_DATE_SHARD_FORMAT) - self.report.audit_start_time = start_time - - corrected_end_time = self.config.end_time + self.config.max_query_duration - end_time = 
corrected_end_time.strftime(BQ_DATETIME_FORMAT) - end_date = corrected_end_time.strftime(BQ_DATE_SHARD_FORMAT) - self.report.audit_end_time = end_time - - for dataset in self.config.bigquery_audit_metadata_datasets: - logger.info( - f"Start loading log entries from BigQueryAuditMetadata in {dataset}" - ) - - query = bigquery_audit_metadata_query_template( - dataset, - self.config.use_date_sharded_audit_log_tables, - limit=limit, - ).format( - start_time=start_time, - end_time=end_time, - start_date=start_date, - end_date=end_date, - ) - - query_job = bigquery_client.query(query) - logger.info( - f"Finished loading log entries from BigQueryAuditMetadata in {dataset}" - ) - if self.config.rate_limit: - with RateLimiter(max_calls=self.config.requests_per_min, period=60): - yield from query_job - else: - yield from query_job - - def _get_bigquery_log_entries_via_gcp_logging( - self, client: GCPLoggingClient, limit: Optional[int] = None - ) -> Iterable[AuditLogEntry]: - - filter = self._generate_filter(BQ_AUDIT_V2) - logger.debug(filter) - - list_entries: Iterable[AuditLogEntry] - rate_limiter: Optional[RateLimiter] = None - if self.config.rate_limit: - # client.list_entries is a generator, does api calls to GCP Logging when it runs out of entries and needs to fetch more from GCP Logging - # to properly ratelimit we multiply the page size by the number of requests per minute - rate_limiter = RateLimiter( - max_calls=self.config.requests_per_min * self.config.log_page_size, - period=60, - ) - - list_entries = client.list_entries( - filter_=filter, - page_size=self.config.log_page_size, - max_results=limit, - ) - - for i, entry in enumerate(list_entries): - if i == 0: - logger.info(f"Starting log load from GCP Logging for {client.project}") - if i % 1000 == 0: - logger.info(f"Loaded {i} log entries from GCP Log for {client.project}") - self.report.total_query_log_entries += 1 - - if rate_limiter: - with rate_limiter: - yield entry - else: - yield entry - - logger.info( - f"Finished loading {self.report.total_query_log_entries} log entries from GCP Logging for {client.project}" - ) - - def _generate_filter(self, audit_templates: Dict[str, str]) -> str: - # We adjust the filter values a bit, since we need to make sure that the join - # between query events and read events is complete. For example, this helps us - # handle the case where the read happens within our time range but the query - # completion event is delayed and happens after the configured end time. - - start_time = (self.config.start_time - self.config.max_query_duration).strftime( - BQ_DATETIME_FORMAT - ) - self.report.log_entry_start_time = start_time - end_time = (self.config.end_time + self.config.max_query_duration).strftime( - BQ_DATETIME_FORMAT - ) - self.report.log_entry_end_time = end_time - filter = audit_templates[BQ_FILTER_RULE_TEMPLATE].format( - start_time=start_time, end_time=end_time - ) - return filter - @staticmethod def _get_destination_table(event: AuditEvent) -> Optional[BigQueryTableRef]: if ( @@ -954,25 +769,45 @@ def _parse_exported_bigquery_audit_metadata( def _get_parsed_bigquery_log_events( self, project_id: str, limit: Optional[int] = None ) -> Iterable[AuditEvent]: + audit_log_api = BigQueryAuditLogApi( + self.report, self.config.rate_limit, self.config.requests_per_min + ) + # We adjust the filter values a bit, since we need to make sure that the join + # between query events and read events is complete. 
For example, this helps us + # handle the case where the read happens within our time range but the query + # completion event is delayed and happens after the configured end time. + corrected_start_time = self.config.start_time - self.config.max_query_duration + corrected_end_time = self.config.end_time + -self.config.max_query_duration + self.report.audit_start_time = corrected_start_time + self.report.audit_end_time = corrected_end_time + parse_fn: Callable[[Any], Optional[AuditEvent]] if self.config.use_exported_bigquery_audit_metadata: - bq_client = get_bigquery_client(self.config) - entries = self._get_exported_bigquery_audit_metadata( + bq_client = self.config.get_bigquery_client() + + entries = audit_log_api.get_exported_bigquery_audit_metadata( bigquery_client=bq_client, + bigquery_audit_metadata_datasets=self.config.bigquery_audit_metadata_datasets, + use_date_sharded_audit_log_tables=self.config.use_date_sharded_audit_log_tables, + start_time=corrected_start_time, + end_time=corrected_end_time, limit=limit, ) parse_fn = self._parse_exported_bigquery_audit_metadata else: - logging_client = _make_gcp_logging_client( - project_id, self.config.extra_client_options - ) - entries = self._get_bigquery_log_entries_via_gcp_logging( - logging_client, limit=limit + logging_client = self.config.make_gcp_logging_client(project_id) + entries = audit_log_api.get_bigquery_log_entries_via_gcp_logging( + logging_client, + start_time=corrected_start_time, + end_time=corrected_end_time, + log_page_size=self.config.log_page_size, + limit=limit, ) parse_fn = self._parse_bigquery_log_entry for entry in entries: try: + self.report.total_query_log_entries += 1 event = parse_fn(entry) if event: yield event diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 7dd51d5b20e8e..dd8b0c75302c2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -520,15 +520,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.connection.close() - lru_cache_functions: List[Callable] = [ - self.data_dictionary.get_tables_for_database, - self.data_dictionary.get_views_for_database, - self.data_dictionary.get_columns_for_schema, - self.data_dictionary.get_pk_constraints_for_schema, - self.data_dictionary.get_fk_constraints_for_schema, - ] - for func in lru_cache_functions: - self.report.lru_cache_info[func.__name__] = func.cache_info()._asdict() # type: ignore + self.report_cache_info() # TODO: The checkpoint state for stale entity detection can be committed here. 
@@ -593,6 +585,17 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: yield from self.usage_extractor.get_usage_workunits(discovered_datasets) + def report_cache_info(self): + lru_cache_functions: List[Callable] = [ + self.data_dictionary.get_tables_for_database, + self.data_dictionary.get_views_for_database, + self.data_dictionary.get_columns_for_schema, + self.data_dictionary.get_pk_constraints_for_schema, + self.data_dictionary.get_fk_constraints_for_schema, + ] + for func in lru_cache_functions: + self.report.lru_cache_info[func.__name__] = func.cache_info()._asdict() # type: ignore + def report_warehouse_failure(self): if self.config.warehouse is not None: self.report_error( diff --git a/metadata-ingestion/src/datahub/utilities/perf_timer.py b/metadata-ingestion/src/datahub/utilities/perf_timer.py index 3fac1d68c3a9e..46eb0e25e4fbf 100644 --- a/metadata-ingestion/src/datahub/utilities/perf_timer.py +++ b/metadata-ingestion/src/datahub/utilities/perf_timer.py @@ -6,21 +6,53 @@ class PerfTimer(AbstractContextManager): """ A context manager that gives easy access to elapsed time for performance measurement. + """ - start_time: Optional[float] = None - end_time: Optional[float] = None + def __init__(self) -> None: + self.start_time: Optional[float] = None + self.end_time: Optional[float] = None + self._past_active_time: float = 0 + self.paused: Optional[bool] = None def start(self) -> None: + # TODO + # assert ( + # self.end_time is None + # ), "Can not start a finished timer. Did you accidentally re-use this timer ?" + + if self.end_time is not None: + self._past_active_time = self.elapsed_seconds() + self.start_time = time.perf_counter() self.end_time = None + if self.paused: + self.paused = False + + def pause_timer(self) -> "PerfTimer": + assert ( + not self.paused and not self.end_time + ), "Can not pause a paused/stopped timer" + assert ( + self.start_time is not None + ), "Can not pause a timer that hasn't started. Did you forget to start the timer ?" + self._past_active_time = self.elapsed_seconds() + self.start_time = None + self.end_time = None + self.paused = True + return self def finish(self) -> None: - assert self.start_time is not None + assert ( + self.start_time is not None + ), "Can not stop a timer that hasn't started. Did you forget to start the timer ?" self.end_time = time.perf_counter() def __enter__(self) -> "PerfTimer": - self.start() + if self.paused: # Entering paused timer context, NO OP + pass + else: + self.start() return self def __exit__( @@ -29,16 +61,34 @@ def __exit__( exc: Any, traceback: Any, ) -> Optional[bool]: - self.finish() + if self.paused: # Exiting paused timer context, resume timer + self.start() + else: + self.finish() return None def elapsed_seconds(self) -> float: """ Returns the elapsed time in seconds. """ + if self.paused: + return self._past_active_time - assert self.start_time is not None + assert self.start_time is not None, "Did you forget to start the timer ?" 
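+        # Total elapsed time is the span since the last (re)start plus any active time accumulated before a pause.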
if self.end_time is None: - return time.perf_counter() - self.start_time + return (time.perf_counter() - self.start_time) + (self._past_active_time) + else: + return (self.end_time - self.start_time) + self._past_active_time + + def __repr__(self) -> str: + return repr(self.as_obj()) + + def __str__(self) -> str: + return self.__repr__() + + def as_obj(self) -> Optional[str]: + if self.start_time is None: + return None else: - return self.end_time - self.start_time + time_taken = self.elapsed_seconds() + return f"{time_taken:.3f} seconds" diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py index fc8ca166b105a..bc9a3f41a9655 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_source.py +++ b/metadata-ingestion/tests/unit/test_bigquery_source.py @@ -18,6 +18,7 @@ BigQueryTableRef, ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config +from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( BigQueryDataDictionary, BigqueryProject, @@ -100,7 +101,7 @@ def test_get_projects_with_project_ids(client_mock): } ) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test1")) - assert source._get_projects(client_mock) == [ + assert source._get_projects() == [ BigqueryProject("test-1", "test-1"), BigqueryProject("test-2", "test-2"), ] @@ -110,7 +111,7 @@ def test_get_projects_with_project_ids(client_mock): {"project_ids": ["test-1", "test-2"], "project_id": "test-3"} ) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test2")) - assert source._get_projects(client_mock) == [ + assert source._get_projects() == [ BigqueryProject("test-1", "test-1"), BigqueryProject("test-2", "test-2"), ] @@ -125,7 +126,7 @@ def test_get_projects_with_project_ids_overrides_project_id_pattern(): } ) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) - projects = source._get_projects(MagicMock()) + projects = source._get_projects() assert projects == [ BigqueryProject(id="test-project", name="test-project"), BigqueryProject(id="test-project-2", name="test-project-2"), @@ -156,7 +157,7 @@ def test_get_dataplatform_instance_aspect_returns_project_id(): def test_get_projects_with_single_project_id(client_mock): config = BigQueryV2Config.parse_obj({"project_id": "test-3"}) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test1")) - assert source._get_projects(client_mock) == [ + assert source._get_projects() == [ BigqueryProject("test-3", "test-3"), ] assert client_mock.list_projects.call_count == 0 @@ -177,7 +178,8 @@ def test_get_projects_by_list(client_mock): config = BigQueryV2Config.parse_obj({}) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test1")) - assert source._get_projects(client_mock) == [ + source.bigquery_data_dictionary.set_client(client_mock) + assert source._get_projects() == [ BigqueryProject("test-1", "one"), BigqueryProject("test-2", "two"), ] @@ -195,7 +197,7 @@ def test_get_projects_filter_by_pattern(get_projects_mock): {"project_id_pattern": {"deny": ["^test-project$"]}} ) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) - projects = source._get_projects(MagicMock()) + projects = source._get_projects() assert projects == [ BigqueryProject(id="test-project-2", name="Test Project 2"), ] @@ -209,7 +211,7 @@ def test_get_projects_list_empty(get_projects_mock): {"project_id_pattern": 
{"deny": ["^test-project$"]}} ) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) - projects = source._get_projects(MagicMock()) + projects = source._get_projects() assert len(source.report.failures) == 1 assert projects == [] @@ -227,7 +229,7 @@ def test_get_projects_list_failure( source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) caplog.records.clear() with caplog.at_level(logging.ERROR): - projects = source._get_projects(MagicMock()) + projects = source._get_projects() assert len(caplog.records) == 1 assert error_str in caplog.records[0].msg assert len(source.report.failures) == 1 @@ -242,7 +244,7 @@ def test_get_projects_list_fully_filtered(get_projects_mock): {"project_id_pattern": {"deny": ["^test-project$"]}} ) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) - projects = source._get_projects(MagicMock()) + projects = source._get_projects() assert len(source.report.failures) == 0 assert projects == [] @@ -496,10 +498,11 @@ def test_table_processing_logic(client_mock, data_dictionary_mock): data_dictionary_mock.get_tables_for_dataset.return_value = None source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) + source.bigquery_data_dictionary.set_client(client_mock) _ = list( source.get_tables_for_dataset( - conn=client_mock, project_id="test-project", dataset_name="test-dataset" + project_id="test-project", dataset_name="test-dataset" ) ) @@ -507,7 +510,7 @@ def test_table_processing_logic(client_mock, data_dictionary_mock): # args only available from python 3.8 and that's why call_args_list is sooo ugly tables: Dict[str, TableListItem] = data_dictionary_mock.call_args_list[0][0][ - 3 + 2 ] # alternatively for table in tables.keys(): assert table in ["test-table", "test-sharded-table_20220102"] @@ -568,10 +571,11 @@ def test_table_processing_logic_date_named_tables(client_mock, data_dictionary_m data_dictionary_mock.get_tables_for_dataset.return_value = None source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) + source.bigquery_data_dictionary.set_client(client_mock) _ = list( source.get_tables_for_dataset( - conn=client_mock, project_id="test-project", dataset_name="test-dataset" + project_id="test-project", dataset_name="test-dataset" ) ) @@ -579,7 +583,7 @@ def test_table_processing_logic_date_named_tables(client_mock, data_dictionary_m # args only available from python 3.8 and that's why call_args_list is sooo ugly tables: Dict[str, TableListItem] = data_dictionary_mock.call_args_list[0][0][ - 3 + 2 ] # alternatively for table in tables.keys(): assert tables[table].table_id in ["test-table", "20220103"] @@ -651,9 +655,10 @@ def test_get_views_for_dataset( ) ) query_mock.return_value = [row1, row2] + bigquery_data_dictionary = BigQueryDataDictionary(BigQueryV2Report()) + bigquery_data_dictionary.set_client(client_mock) - views = BigQueryDataDictionary.get_views_for_dataset( - conn=client_mock, + views = bigquery_data_dictionary.get_views_for_dataset( project_id="test-project", dataset_name="test-dataset", has_data_read=False, diff --git a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py index 6ee1f05f0582c..8c50619bee53d 100644 --- a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py +++ b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py @@ -4,13 +4,12 @@ from freezegun import freeze_time from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( - 
BQ_AUDIT_V2, BigqueryTableIdentifier, BigQueryTableRef, ) +from datahub.ingestion.source.bigquery_v2.bigquery_audit_api import BigQueryAuditLogApi from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor FROZEN_TIME = "2021-07-20 00:00:00" @@ -111,10 +110,12 @@ def test_bigqueryv2_filters(): OR protoPayload.metadata.tableDataRead.reason = "JOB" )""" # noqa: W293 - source = BigQueryUsageExtractor( - config, BigQueryV2Report(), dataset_urn_builder=lambda _: "" + api = BigQueryAuditLogApi( + BigQueryV2Report(), config.rate_limit, config.requests_per_min ) - filter: str = source._generate_filter(BQ_AUDIT_V2) + corrected_start_time = config.start_time - config.max_query_duration + corrected_end_time = config.end_time + config.max_query_duration + filter: str = api._generate_filter(corrected_start_time, corrected_end_time) assert filter == expected_filter diff --git a/metadata-ingestion/tests/unit/utilities/test_perf_timer.py b/metadata-ingestion/tests/unit/utilities/test_perf_timer.py new file mode 100644 index 0000000000000..9fbd3a7b5d9cd --- /dev/null +++ b/metadata-ingestion/tests/unit/utilities/test_perf_timer.py @@ -0,0 +1,41 @@ +import time + +from datahub.utilities.perf_timer import PerfTimer + + +def test_perf_timer_simple(): + with PerfTimer() as timer: + time.sleep(1) + assert round(timer.elapsed_seconds()) == 1 + + assert round(timer.elapsed_seconds()) == 1 + + +def test_perf_timer_paused_timer(): + with PerfTimer() as current_timer: + time.sleep(1) + assert round(current_timer.elapsed_seconds()) == 1 + with current_timer.pause_timer(): + time.sleep(2) + assert round(current_timer.elapsed_seconds()) == 1 + assert round(current_timer.elapsed_seconds()) == 1 + time.sleep(1) + + assert round(current_timer.elapsed_seconds()) == 2 + + +def test_generator_with_paused_timer(): + def generator_function(): + with PerfTimer() as inner_timer: + time.sleep(1) + for i in range(10): + time.sleep(0.2) + with inner_timer.pause_timer(): + time.sleep(0.2) + yield i + assert round(inner_timer.elapsed_seconds()) == 1 + 0.2 * 10 + + with PerfTimer() as outer_timer: + seq = generator_function() + list([i for i in seq]) + assert round(outer_timer.elapsed_seconds()) == 1 + 0.2 * 10 + 0.2 * 10 From 95bbcbed84b7e91b52626ecd606953c0b68d6bd8 Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Thu, 10 Aug 2023 18:09:17 +0530 Subject: [PATCH 02/11] wip, timers not added for unused methods - remove these ? 
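Context for reviewers on the timer pattern this series relies on: each API
wrapper is expected to time only the round trips to BigQuery / GCP Logging,
pausing while callers consume the yielded results so that downstream parsing
does not inflate the reported API time. A minimal sketch of that pattern is
below; ExampleApi and its timer field are illustrative stand-ins, and only
PerfTimer and its pause/resume behaviour come from this series.

    from typing import Iterable, Iterator

    from datahub.utilities.perf_timer import PerfTimer


    class ExampleApi:
        def __init__(self) -> None:
            # stands in for a perf-report timer field such as list_log_entries
            self.list_entries_timer = PerfTimer()

        def list_entries(self, rows: Iterable[int]) -> Iterator[int]:
            # the outer context times the API call; the inner pause excludes
            # whatever the consumer of this generator does with each row
            with self.list_entries_timer as timer:
                for row in rows:  # stands in for client.list_entries(...)
                    with timer.pause_timer():
                        yield row


    api = ExampleApi()
    assert list(api.list_entries(range(3))) == [0, 1, 2]
    print(api.list_entries_timer.elapsed_seconds())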
--- .../ingestion/source/bigquery_v2/bigquery.py | 138 +----- .../source/bigquery_v2/bigquery_audit.py | 43 -- ...audit_api.py => bigquery_audit_log_api.py} | 176 ++++++-- .../source/bigquery_v2/bigquery_config.py | 13 +- .../source/bigquery_v2/bigquery_report.py | 21 +- ...query_schema.py => bigquery_schema_api.py} | 45 +- .../ingestion/source/bigquery_v2/lineage.py | 401 +++++++++--------- .../ingestion/source/bigquery_v2/profiler.py | 2 +- .../ingestion/source/bigquery_v2/usage.py | 27 +- .../integration/bigquery_v2/test_bigquery.py | 6 +- .../tests/unit/test_bigquery_lineage.py | 5 +- .../tests/unit/test_bigquery_profiler.py | 2 +- .../tests/unit/test_bigquery_source.py | 20 +- .../unit/test_bigqueryv2_usage_source.py | 10 +- 14 files changed, 460 insertions(+), 449 deletions(-) rename metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/{bigquery_audit_api.py => bigquery_audit_log_api.py} (54%) rename metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/{bigquery_schema.py => bigquery_schema_api.py} (95%) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 442e1e525fdf6..a38117cce346d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -5,7 +5,7 @@ import re import traceback from collections import defaultdict -from datetime import datetime, timedelta, timezone +from datetime import datetime, timedelta from typing import Dict, Iterable, List, Optional, Set, Type, Union, cast from google.cloud import bigquery @@ -43,22 +43,19 @@ ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( +from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( BigqueryColumn, - BigQueryDataDictionary, BigqueryDataset, BigqueryProject, BigqueryTable, + BigQueryTechnicalSchemaApi, BigqueryView, ) from datahub.ingestion.source.bigquery_v2.common import ( BQ_EXTERNAL_DATASET_URL_TEMPLATE, BQ_EXTERNAL_TABLE_URL_TEMPLATE, ) -from datahub.ingestion.source.bigquery_v2.lineage import ( - BigqueryLineageExtractor, - make_lineage_edges_from_parsing_result, -) +from datahub.ingestion.source.bigquery_v2.lineage import BigqueryLineageExtractor from datahub.ingestion.source.bigquery_v2.profiler import BigqueryProfiler from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor from datahub.ingestion.source.common.subtypes import ( @@ -88,7 +85,6 @@ ) from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( DatasetProperties, - UpstreamLineage, ViewProperties, ) from datahub.metadata.com.linkedin.pegasus2avro.schema import ( @@ -107,11 +103,9 @@ ) from datahub.metadata.schema_classes import ( DataPlatformInstanceClass, - DatasetLineageTypeClass, GlobalTagsClass, TagAssociationClass, ) -from datahub.specific.dataset import DatasetPatchBuilder from datahub.utilities.file_backed_collections import FileBackedDict from datahub.utilities.hive_schema_to_avro import ( HiveColumnToAvroConverter, @@ -120,7 +114,7 @@ from datahub.utilities.mapping import Constants from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.registries.domain_registry import DomainRegistry -from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage +from 
datahub.utilities.sqlglot_lineage import SchemaResolver from datahub.utilities.time import datetime_to_ts_millis logger: logging.Logger = logging.getLogger(__name__) @@ -226,10 +220,14 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): set_dataset_urn_to_lower(self.config.convert_urns_to_lowercase) - self.bigquery_data_dictionary = BigQueryDataDictionary(self.report) + self.bigquery_data_dictionary = BigQueryTechnicalSchemaApi(self.report) # For database, schema, tables, views, etc - self.lineage_extractor = BigqueryLineageExtractor(config, self.report) + self.lineage_extractor = BigqueryLineageExtractor( + config, + self.report, + dataset_urn_builder=self.gen_dataset_urn_from_ref, + ) self.usage_extractor = BigQueryUsageExtractor( config, self.report, dataset_urn_builder=self.gen_dataset_urn_from_ref ) @@ -300,7 +298,7 @@ def metadata_read_capability_test( client: bigquery.Client = config.get_bigquery_client() assert client report = BigQueryV2Report() - bigquery_data_dictionary = BigQueryDataDictionary(report) + bigquery_data_dictionary = BigQueryTechnicalSchemaApi(report) bigquery_data_dictionary.set_client(client) result = bigquery_data_dictionary.get_datasets_for_project_id( project_id, 10 @@ -336,7 +334,9 @@ def lineage_capability_test( project_ids: List[str], report: BigQueryV2Report, ) -> CapabilityReport: - lineage_extractor = BigqueryLineageExtractor(connection_conf, report) + lineage_extractor = BigqueryLineageExtractor( + connection_conf, report, lambda ref: "" + ) for project_id in project_ids: try: logger.info(f"Lineage capability test for project {project_id}") @@ -520,9 +520,12 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ) if self._should_ingest_lineage(): - for project in projects: - self.report.set_ingestion_stage(project.id, "Lineage Extraction") - yield from self.generate_lineage(project.id) + yield from self.lineage_extractor.get_lineage_workunits( + projects, + self.sql_parser_schema_resolver, + self.view_definition_ids, + self.table_refs, + ) def _should_ingest_usage(self) -> bool: if not self.config.include_usage_statistics: @@ -671,72 +674,6 @@ def _process_project( tables=db_tables, ) - def generate_lineage(self, project_id: str) -> Iterable[MetadataWorkUnit]: - logger.info(f"Generate lineage for {project_id}") - lineage = self.lineage_extractor.calculate_lineage_for_project( - project_id, - sql_parser_schema_resolver=self.sql_parser_schema_resolver, - ) - - if self.config.lineage_parse_view_ddl: - for view, view_definition_id in self.view_definition_ids[ - project_id - ].items(): - view_definition = self.view_definitions[view_definition_id] - raw_view_lineage = sqlglot_lineage( - view_definition, - schema_resolver=self.sql_parser_schema_resolver, - default_db=project_id, - ) - if raw_view_lineage.debug_info.table_error: - logger.debug( - f"Failed to parse lineage for view {view}: {raw_view_lineage.debug_info.table_error}" - ) - self.report.num_view_definitions_failed_parsing += 1 - self.report.view_definitions_parsing_failures.append( - f"Table-level sql parsing error for view {view}: {raw_view_lineage.debug_info.table_error}" - ) - continue - elif raw_view_lineage.debug_info.column_error: - self.report.num_view_definitions_failed_column_parsing += 1 - self.report.view_definitions_parsing_failures.append( - f"Column-level sql parsing error for view {view}: {raw_view_lineage.debug_info.column_error}" - ) - else: - self.report.num_view_definitions_parsed += 1 - - # For views, we override the upstreams obtained by parsing 
audit logs - # as they may contain indirectly referenced tables. - ts = datetime.now(timezone.utc) - lineage[view] = set( - make_lineage_edges_from_parsing_result( - raw_view_lineage, - audit_stamp=ts, - lineage_type=DatasetLineageTypeClass.VIEW, - ) - ) - - for lineage_key in lineage.keys(): - if lineage_key not in self.table_refs: - continue - - table_ref = BigQueryTableRef.from_string_name(lineage_key) - dataset_urn = self.gen_dataset_urn( - project_id=table_ref.table_identifier.project_id, - dataset_name=table_ref.table_identifier.dataset, - table=table_ref.table_identifier.get_table_display_name(), - ) - - lineage_info = self.lineage_extractor.get_lineage_for_table( - bq_table=table_ref, - bq_table_urn=dataset_urn, - platform=self.platform, - lineage_metadata=lineage, - ) - - if lineage_info: - yield from self.gen_lineage(dataset_urn, lineage_info) - def _process_schema( self, project_id: str, @@ -1069,39 +1006,6 @@ def gen_dataset_workunits( domain_config=self.config.domain, ) - def gen_lineage( - self, - dataset_urn: str, - upstream_lineage: Optional[UpstreamLineage] = None, - ) -> Iterable[MetadataWorkUnit]: - if upstream_lineage is None: - return - - if upstream_lineage is not None: - if self.config.incremental_lineage: - patch_builder: DatasetPatchBuilder = DatasetPatchBuilder( - urn=dataset_urn - ) - for upstream in upstream_lineage.upstreams: - patch_builder.add_upstream_lineage(upstream) - - yield from [ - MetadataWorkUnit( - id=f"upstreamLineage-for-{dataset_urn}", - mcp_raw=mcp, - ) - for mcp in patch_builder.build() - ] - else: - if not self.config.extract_column_lineage: - upstream_lineage.fineGrainedLineages = None - - yield from [ - MetadataChangeProposalWrapper( - entityUrn=dataset_urn, aspect=upstream_lineage - ).as_workunit() - ] - def gen_tags_aspect_workunit( self, dataset_urn: str, tags_to_add: List[str] ) -> MetadataWorkUnit: diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py index 0f9b37c93feaa..b0ac77201b415 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py @@ -13,48 +13,6 @@ get_first_missing_key_any, ) -BQ_FILTER_RULE_TEMPLATE = "BQ_FILTER_RULE_TEMPLATE" - -BQ_AUDIT_V2 = { - BQ_FILTER_RULE_TEMPLATE: """ -resource.type=("bigquery_project" OR "bigquery_dataset") -AND -timestamp >= "{start_time}" -AND -timestamp < "{end_time}" -AND protoPayload.serviceName="bigquery.googleapis.com" -AND -( - ( - protoPayload.methodName= - ( - "google.cloud.bigquery.v2.JobService.Query" - OR - "google.cloud.bigquery.v2.JobService.InsertJob" - ) - AND protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE" - AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:* - AND protoPayload.metadata.jobChange.job.jobConfig.queryConfig:* - AND - ( - ( - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:* - AND NOT protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables =~ "projects/.*/datasets/.*/tables/__TABLES__|__TABLES_SUMMARY__|INFORMATION_SCHEMA.*" - ) - OR - ( - protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable:* - ) - ) - ) - OR - protoPayload.metadata.tableDataRead.reason = "JOB" -) -""".strip( - "\t \n" - ), -} - AuditLogEntry = Any # BigQueryAuditMetadata is the v2 format in which audit logs are exported to BigQuery @@ -606,7 +564,6 @@ def from_query_event( 
query_event: QueryEvent, debug_include_full_payloads: bool = False, ) -> "ReadEvent": - readEvent = ReadEvent( actor_email=query_event.actor_email, timestamp=query_event.timestamp, diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_api.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py similarity index 54% rename from metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_api.py rename to metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py index 09a9098ced338..048f90de7ee0a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py @@ -1,15 +1,13 @@ import logging import textwrap from datetime import datetime -from typing import Iterable, List, Optional +from typing import Callable, Iterable, List, Optional from google.cloud import bigquery from google.cloud.logging_v2.client import Client as GCPLoggingClient from ratelimiter import RateLimiter from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( - BQ_AUDIT_V2, - BQ_FILTER_RULE_TEMPLATE, AuditLogEntry, BigQueryAuditMetadata, ) @@ -24,6 +22,7 @@ logger: logging.Logger = logging.getLogger(__name__) +# TODO: separation of api/extractor classes - client wise, functionality wise ? class BigQueryAuditLogApi: def __init__( self, @@ -31,13 +30,17 @@ def __init__( rate_limit: bool, requests_per_min: int, ) -> None: - self.api_perf_report = report + self.report = report self.rate_limit = rate_limit self.requests_per_min = requests_per_min + # TODO; should we refractor and move this to schema api , as this uses bigquery client ? def get_exported_bigquery_audit_metadata( self, bigquery_client: bigquery.Client, + bigquery_audit_metadata_query_template: Callable[ + [str, bool, Optional[int]], str + ], bigquery_audit_metadata_datasets: Optional[List[str]], use_date_sharded_audit_log_tables: bool, start_time: datetime, @@ -53,7 +56,7 @@ def get_exported_bigquery_audit_metadata( audit_end_time = end_time.strftime(BQ_DATETIME_FORMAT) audit_end_date = end_time.strftime(BQ_DATE_SHARD_FORMAT) - with self.api_perf_report.get_exported_bigquery_audit_metadata as current_timer: + with self.report.get_exported_log_entries as current_timer: for dataset in bigquery_audit_metadata_datasets: logger.info( f"Start loading log entries from BigQueryAuditMetadata in {dataset}" @@ -62,7 +65,7 @@ def get_exported_bigquery_audit_metadata( query = bigquery_audit_metadata_query_template( dataset, use_date_sharded_audit_log_tables, - limit=limit, + limit, ).format( start_time=audit_start_time, end_time=audit_end_time, @@ -74,22 +77,24 @@ def get_exported_bigquery_audit_metadata( logger.info( f"Finished loading log entries from BigQueryAuditMetadata in {dataset}" ) - with current_timer.pause_timer(): - if self.rate_limit: - with RateLimiter(max_calls=self.requests_per_min, period=60): - yield from query_job - else: - yield from query_job + + if self.rate_limit: + with RateLimiter(max_calls=self.requests_per_min, period=60): + for entry in query_job: + with current_timer.pause_timer(): + yield entry + else: + for entry in query_job: + with current_timer.pause_timer(): + yield entry def get_bigquery_log_entries_via_gcp_logging( self, client: GCPLoggingClient, - start_time: datetime, - end_time: datetime, + filter: str, log_page_size: int, limit: Optional[int] = None, ) -> Iterable[AuditLogEntry]: - filter = 
self._generate_filter(start_time, end_time) logger.debug(filter) list_entries: Iterable[AuditLogEntry] @@ -102,7 +107,7 @@ def get_bigquery_log_entries_via_gcp_logging( period=60, ) - with self.api_perf_report.get_bigquery_log_entries_via_gcp_logging as current_timer: + with self.report.list_log_entries as current_timer: list_entries = client.list_entries( filter_=filter, page_size=log_page_size, @@ -114,6 +119,7 @@ def get_bigquery_log_entries_via_gcp_logging( logger.info( f"Starting log load from GCP Logging for {client.project}" ) + if i % 1000 == 0: logger.info( f"Loaded {i} log entries from GCP Log for {client.project}" @@ -126,18 +132,12 @@ def get_bigquery_log_entries_via_gcp_logging( else: yield entry - def _generate_filter(self, start_time: datetime, end_time: datetime) -> str: - audit_start_time = (start_time).strftime(BQ_DATETIME_FORMAT) - - audit_end_time = (end_time).strftime(BQ_DATETIME_FORMAT) - - filter = BQ_AUDIT_V2[BQ_FILTER_RULE_TEMPLATE].format( - start_time=audit_start_time, end_time=audit_end_time - ) - return filter + logger.info( + f"Finished loading log entries from GCP Log for {client.project}" + ) -def bigquery_audit_metadata_query_template( +def bigquery_audit_metadata_query_template_usage( dataset: str, use_date_sharded_tables: bool, limit: Optional[int] = None, @@ -206,3 +206,127 @@ def bigquery_audit_metadata_query_template( """ return textwrap.dedent(query) + + +def bigquery_audit_metadata_query_template_lineage( + dataset: str, use_date_sharded_tables: bool, limit: Optional[int] = None +) -> str: + """ + Receives a dataset (with project specified) and returns a query template that is used to query exported + AuditLogs containing protoPayloads of type BigQueryAuditMetadata. + Include only those that: + - have been completed (jobStatus.jobState = "DONE") + - do not contain errors (jobStatus.errorResults is none) + :param dataset: the dataset to query against in the form of $PROJECT.$DATASET + :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log + tables + :param limit: set a limit for the maximum event to return. 
It is used for connection testing currently + :return: a query template, when supplied start_time and end_time, can be used to query audit logs from BigQuery + """ + limit_text = f"limit {limit}" if limit else "" + + shard_condition = "" + if use_date_sharded_tables: + from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`" + shard_condition = ( + """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """ + ) + else: + from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`" + + query = f""" + SELECT + timestamp, + logName, + insertId, + protopayload_auditlog AS protoPayload, + protopayload_auditlog.metadataJson AS metadata + FROM + {from_table} + WHERE ( + timestamp >= "{{start_time}}" + AND timestamp < "{{end_time}}" + ) + {shard_condition} + AND protopayload_auditlog.serviceName="bigquery.googleapis.com" + AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE" + AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL + AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL + {limit_text}; + """ + + return textwrap.dedent(query) + + +BQ_FILTER_RULE_TEMPLATE_V2_USAGE = """ +resource.type=("bigquery_project" OR "bigquery_dataset") +AND +timestamp >= "{start_time}" +AND +timestamp < "{end_time}" +AND protoPayload.serviceName="bigquery.googleapis.com" +AND +( + ( + protoPayload.methodName= + ( + "google.cloud.bigquery.v2.JobService.Query" + OR + "google.cloud.bigquery.v2.JobService.InsertJob" + ) + AND protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE" + AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:* + AND protoPayload.metadata.jobChange.job.jobConfig.queryConfig:* + AND + ( + ( + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:* + AND NOT protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables =~ "projects/.*/datasets/.*/tables/__TABLES__|__TABLES_SUMMARY__|INFORMATION_SCHEMA.*" + ) + OR + ( + protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable:* + ) + ) + ) + OR + protoPayload.metadata.tableDataRead.reason = "JOB" +) +""".strip( + "\t \n" +) + +BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE = """ +resource.type=("bigquery_project") +AND +( + protoPayload.methodName= + ( + "google.cloud.bigquery.v2.JobService.Query" + OR + "google.cloud.bigquery.v2.JobService.InsertJob" + ) + AND + protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE" + AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:* + AND ( + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:* + OR + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedViews:* + ) + AND ( + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/_.*/tables/anon.*" + AND + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/INFORMATION_SCHEMA.*" + AND + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/__TABLES__" + AND + protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable !~ "projects/.*/datasets/_.*/tables/anon.*" + ) + +) +AND +timestamp >= "{start_time}" +AND +timestamp < "{end_time}" +""".strip() diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py 
b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index fa689d571c8cc..84fdead338ee6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -6,7 +6,7 @@ import pydantic from google.cloud import bigquery from google.cloud.logging_v2.client import Client as GCPLoggingClient -from pydantic import Field, PositiveInt, PrivateAttr, root_validator +from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.validate_field_removal import pydantic_removed_field @@ -266,6 +266,17 @@ def profile_default_settings(cls, values: Dict) -> Dict: return values + @validator("bigquery_audit_metadata_datasets") + def validate_bigquery_audit_metadata_datasets( + cls, v: Optional[List[str]], values: Dict + ) -> Dict: + if values.get("use_exported_bigquery_audit_metadata"): + assert ( + v and len(v) > 0 + ), "`bigquery_audit_metadata_datasets` should be set if using `use_exported_bigquery_audit_metadata` for usage/lineage." + + return values + @root_validator(pre=False) def backward_compatibility_configs_set(cls, values: Dict) -> Dict: project_id = values.get("project_id") diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py index fc725e0cda3c4..62fd23bb1b68a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py @@ -17,7 +17,7 @@ class BigQueryApiPerfReport: list_projects = PerfTimer() - get_datasets_for_project = PerfTimer() + list_datasets = PerfTimer() get_columns_for_dataset = PerfTimer() get_tables_for_dataset = PerfTimer() list_tables = PerfTimer() @@ -25,8 +25,8 @@ class BigQueryApiPerfReport: class BigQueryAuditLogApiPerfReport: - get_exported_bigquery_audit_metadata = PerfTimer() - get_bigquery_log_entries_via_gcp_logging = PerfTimer() + get_exported_log_entries = PerfTimer() + list_log_entries = PerfTimer() @dataclass @@ -46,8 +46,12 @@ class BigQueryV2Report( num_skipped_lineage_entries_other: TopKDict[str, int] = field( default_factory=int_top_k_dict ) - num_total_log_entries: TopKDict[str, int] = field(default_factory=int_top_k_dict) - num_parsed_log_entries: TopKDict[str, int] = field(default_factory=int_top_k_dict) + num_lineage_total_log_entries: TopKDict[str, int] = field( + default_factory=int_top_k_dict + ) + num_lineage_parsed_log_entries: TopKDict[str, int] = field( + default_factory=int_top_k_dict + ) num_lineage_log_parse_failures: TopKDict[str, int] = field( default_factory=int_top_k_dict ) @@ -57,7 +61,14 @@ class BigQueryV2Report( lineage_mem_size: Dict[str, str] = field(default_factory=TopKDict) lineage_extraction_sec: Dict[str, float] = field(default_factory=TopKDict) usage_extraction_sec: Dict[str, float] = field(default_factory=TopKDict) + num_usage_total_log_entries: TopKDict[str, int] = field( + default_factory=int_top_k_dict + ) + num_usage_parsed_log_entries: TopKDict[str, int] = field( + default_factory=int_top_k_dict + ) usage_error_count: Dict[str, int] = field(default_factory=int_top_k_dict) + num_usage_resources_dropped: int = 0 num_usage_operations_dropped: int = 0 operation_dropped: LossyList[str] = field(default_factory=LossyList) diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py similarity index 95% rename from metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py rename to metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py index 68dd0eeefe09b..e7fcd5bd390c0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py @@ -347,11 +347,14 @@ class BigqueryQuery: table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC""" -class BigQueryDataDictionary: +# TODO: remove static methods from here +# TODO: move queries into separate file +class BigQueryTechnicalSchemaApi: def __init__(self, report: BigQueryApiPerfReport) -> None: self.bq_client: Optional[bigquery.Client] = None self.api_perf_report = report + # TODO: remove need to set_client. maybe pass in constructor? def set_client(self, bq_client: bigquery.Client) -> None: self.bq_client = bq_client @@ -359,10 +362,9 @@ def get_client(self) -> bigquery.Client: assert self.bq_client is not None return self.bq_client - @staticmethod - def get_query_result(conn: bigquery.Client, query: str) -> RowIterator: + def get_query_result(self, query: str) -> RowIterator: logger.debug(f"Query : {query}") - resp = conn.query(query) + resp = self.get_client().query(query) return resp.result() def get_projects(self) -> List[BigqueryProject]: @@ -376,7 +378,7 @@ def get_projects(self) -> List[BigqueryProject]: def get_datasets_for_project_id( self, project_id: str, maxResults: Optional[int] = None ) -> List[BigqueryDataset]: - with self.api_perf_report.get_datasets_for_project: + with self.api_perf_report.list_datasets: datasets = self.get_client().list_datasets( project_id, max_results=maxResults ) @@ -384,9 +386,9 @@ def get_datasets_for_project_id( BigqueryDataset(name=d.dataset_id, labels=d.labels) for d in datasets ] - @staticmethod + # This is not used anywhere def get_datasets_for_project_id_with_information_schema( - conn: bigquery.Client, project_id: str + self, project_id: str ) -> List[BigqueryDataset]: """ This method is not used as of now, due to below limitation. @@ -394,8 +396,7 @@ def get_datasets_for_project_id_with_information_schema( We'll need Region wise separate queries to fetch all datasets https://cloud.google.com/bigquery/docs/information-schema-datasets-schemata """ - schemas = BigQueryDataDictionary.get_query_result( - conn, + schemas = self.get_query_result( BigqueryQuery.datasets_for_project_id.format(project_id=project_id), ) return [ @@ -431,8 +432,7 @@ def get_tables_for_dataset( if with_data_read_permission: # Tables are ordered by name and table suffix to make sure we always process the latest sharded table # and skip the others. Sharded tables are tables with suffix _20220102 - cur = BigQueryDataDictionary.get_query_result( - self.get_client(), + cur = self.get_query_result( BigqueryQuery.tables_for_dataset.format( project_id=project_id, dataset_name=dataset_name, @@ -444,8 +444,7 @@ def get_tables_for_dataset( else: # Tables are ordered by name and table suffix to make sure we always process the latest sharded table # and skip the others. 
Sharded tables are tables with suffix _20220102 - cur = BigQueryDataDictionary.get_query_result( - self.get_client(), + cur = self.get_query_result( BigqueryQuery.tables_for_dataset_without_partition_data.format( project_id=project_id, dataset_name=dataset_name, @@ -458,7 +457,7 @@ def get_tables_for_dataset( for table in cur: try: with current_timer.pause_timer(): - yield BigQueryDataDictionary._make_bigquery_table( + yield BigQueryTechnicalSchemaApi._make_bigquery_table( table, tables.get(table.table_name) ) except Exception as e: @@ -520,15 +519,13 @@ def get_views_for_dataset( ) -> Iterator[BigqueryView]: with self.api_perf_report.get_views_for_dataset as current_timer: if has_data_read: - cur = BigQueryDataDictionary.get_query_result( - self.get_client(), + cur = self.get_query_result( BigqueryQuery.views_for_dataset.format( project_id=project_id, dataset_name=dataset_name ), ) else: - cur = BigQueryDataDictionary.get_query_result( - self.get_client(), + cur = self.get_query_result( BigqueryQuery.views_for_dataset_without_data_read.format( project_id=project_id, dataset_name=dataset_name ), @@ -537,7 +534,7 @@ def get_views_for_dataset( for table in cur: try: with current_timer.pause_timer(): - yield BigQueryDataDictionary._make_bigquery_view(table) + yield BigQueryTechnicalSchemaApi._make_bigquery_view(table) except Exception as e: view_name = f"{project_id}.{dataset_name}.{table.table_name}" logger.warning( @@ -575,8 +572,7 @@ def get_columns_for_dataset( columns: Dict[str, List[BigqueryColumn]] = defaultdict(list) with self.api_perf_report.get_columns_for_dataset: try: - cur = BigQueryDataDictionary.get_query_result( - self.get_client(), + cur = self.get_query_result( BigqueryQuery.columns_for_dataset.format( project_id=project_id, dataset_name=dataset_name ) @@ -620,14 +616,13 @@ def get_columns_for_dataset( return columns - @staticmethod + # This is not used anywhere def get_columns_for_table( - conn: bigquery.Client, + self, table_identifier: BigqueryTableIdentifier, column_limit: Optional[int], ) -> List[BigqueryColumn]: - cur = BigQueryDataDictionary.get_query_result( - conn, + cur = self.get_query_result( BigqueryQuery.columns_for_table.format(table_identifier=table_identifier), ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index 210018e55be15..d1bc687ebaca0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -1,7 +1,6 @@ import collections import itertools import logging -import textwrap from dataclasses import dataclass from datetime import datetime, timezone from typing import Any, Callable, Dict, FrozenSet, Iterable, List, Optional, Set, Union @@ -10,9 +9,10 @@ from google.cloud.bigquery import Client as BigQueryClient from google.cloud.datacatalog import lineage_v1 from google.cloud.logging_v2.client import Client as GCPLoggingClient -from ratelimiter import RateLimiter from datahub.emitter import mce_builder +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( AuditLogEntry, BigQueryAuditMetadata, @@ -21,12 +21,18 @@ QueryEvent, ReadEvent, ) +from datahub.ingestion.source.bigquery_v2.bigquery_audit_log_api import ( + BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE, + BigQueryAuditLogApi, + 
bigquery_audit_metadata_query_template_lineage, +) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.common import ( - BQ_DATE_SHARD_FORMAT, - BQ_DATETIME_FORMAT, +from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( + BigqueryProject, + BigQueryTechnicalSchemaApi, ) +from datahub.ingestion.source.bigquery_v2.common import BQ_DATETIME_FORMAT from datahub.metadata.schema_classes import ( AuditStampClass, DatasetLineageTypeClass, @@ -36,6 +42,7 @@ UpstreamClass, UpstreamLineageClass, ) +from datahub.specific.dataset import DatasetPatchBuilder from datahub.utilities import memory_footprint from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.sqlglot_lineage import ( @@ -177,98 +184,132 @@ def make_lineage_edges_from_parsing_result( class BigqueryLineageExtractor: - BQ_FILTER_RULE_TEMPLATE_V2 = """ -resource.type=("bigquery_project") -AND -( - protoPayload.methodName= - ( - "google.cloud.bigquery.v2.JobService.Query" - OR - "google.cloud.bigquery.v2.JobService.InsertJob" - ) - AND - protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE" - AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:* - AND ( - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:* - OR - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedViews:* - ) - AND ( - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/_.*/tables/anon.*" - AND - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/INFORMATION_SCHEMA.*" - AND - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/__TABLES__" - AND - protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable !~ "projects/.*/datasets/_.*/tables/anon.*" - ) - -) -AND -timestamp >= "{start_time}" -AND -timestamp < "{end_time}" -""".strip() - - def __init__(self, config: BigQueryV2Config, report: BigQueryV2Report): + def __init__( + self, + config: BigQueryV2Config, + report: BigQueryV2Report, + dataset_urn_builder: Callable[[BigQueryTableRef], str], + ): self.config = config self.report = report + self.dataset_urn_builder = dataset_urn_builder def error(self, log: logging.Logger, key: str, reason: str) -> None: self.report.report_warning(key, reason) log.error(f"{key} => {reason}") - @staticmethod - def bigquery_audit_metadata_query_template( - dataset: str, use_date_sharded_tables: bool, limit: Optional[int] = None - ) -> str: - """ - Receives a dataset (with project specified) and returns a query template that is used to query exported - AuditLogs containing protoPayloads of type BigQueryAuditMetadata. - Include only those that: - - have been completed (jobStatus.jobState = "DONE") - - do not contain errors (jobStatus.errorResults is none) - :param dataset: the dataset to query against in the form of $PROJECT.$DATASET - :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log - tables - :param limit: set a limit for the maximum event to return. 
It is used for connection testing currently - :return: a query template, when supplied start_time and end_time, can be used to query audit logs from BigQuery - """ - limit_text = f"limit {limit}" if limit else "" - - shard_condition = "" - if use_date_sharded_tables: - from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`" - shard_condition = ( - """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """ + def get_lineage_workunits( + self, + projects: List[BigqueryProject], + sql_parser_schema_resolver: SchemaResolver, + view_definition_ids: Dict[str, Dict[str, str]], + table_refs: Set[str], + ) -> Iterable[MetadataWorkUnit]: + for project in projects: + self.report.set_ingestion_stage(project.id, "Lineage Extraction") + yield from self.generate_lineage( + project.id, + sql_parser_schema_resolver, + view_definition_ids[project.id], + table_refs, ) - else: - from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`" - - query = f""" - SELECT - timestamp, - logName, - insertId, - protopayload_auditlog AS protoPayload, - protopayload_auditlog.metadataJson AS metadata - FROM - {from_table} - WHERE ( - timestamp >= "{{start_time}}" - AND timestamp < "{{end_time}}" + + def generate_lineage( + self, + project_id: str, + sql_parser_schema_resolver: SchemaResolver, + view_definition_ids: Dict[str, str], + table_refs: Set[str], + ) -> Iterable[MetadataWorkUnit]: + logger.info(f"Generate lineage for {project_id}") + lineage = self.calculate_lineage_for_project( + project_id, sql_parser_schema_resolver=sql_parser_schema_resolver + ) + + if self.config.lineage_parse_view_ddl: + for view, view_definition_id in view_definition_ids.items(): + view_definition = view_definition_ids[view_definition_id] + raw_view_lineage = sqlglot_lineage( + view_definition, + schema_resolver=sql_parser_schema_resolver, + default_db=project_id, + ) + if raw_view_lineage.debug_info.table_error: + logger.debug( + f"Failed to parse lineage for view {view}: {raw_view_lineage.debug_info.table_error}" + ) + self.report.num_view_definitions_failed_parsing += 1 + self.report.view_definitions_parsing_failures.append( + f"Table-level sql parsing error for view {view}: {raw_view_lineage.debug_info.table_error}" + ) + continue + elif raw_view_lineage.debug_info.column_error: + self.report.num_view_definitions_failed_column_parsing += 1 + self.report.view_definitions_parsing_failures.append( + f"Column-level sql parsing error for view {view}: {raw_view_lineage.debug_info.column_error}" + ) + else: + self.report.num_view_definitions_parsed += 1 + + # For views, we override the upstreams obtained by parsing audit logs + # as they may contain indirectly referenced tables. 
+ ts = datetime.now(timezone.utc) + lineage[view] = set( + make_lineage_edges_from_parsing_result( + raw_view_lineage, + audit_stamp=ts, + lineage_type=DatasetLineageTypeClass.VIEW, + ) + ) + + for lineage_key in lineage.keys(): + if lineage_key not in table_refs: + continue + + table_ref = BigQueryTableRef.from_string_name(lineage_key) + dataset_urn = self.dataset_urn_builder(table_ref) + + lineage_info = self.get_lineage_for_table( + bq_table=table_ref, + bq_table_urn=dataset_urn, + lineage_metadata=lineage, ) - {shard_condition} - AND protopayload_auditlog.serviceName="bigquery.googleapis.com" - AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE" - AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL - AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL - {limit_text}; - """ - return textwrap.dedent(query) + if lineage_info: + yield from self.gen_lineage(dataset_urn, lineage_info) + + def gen_lineage( + self, + dataset_urn: str, + upstream_lineage: Optional[UpstreamLineageClass] = None, + ) -> Iterable[MetadataWorkUnit]: + if upstream_lineage is None: + return + + if upstream_lineage is not None: + if self.config.incremental_lineage: + patch_builder: DatasetPatchBuilder = DatasetPatchBuilder( + urn=dataset_urn + ) + for upstream in upstream_lineage.upstreams: + patch_builder.add_upstream_lineage(upstream) + + yield from [ + MetadataWorkUnit( + id=f"upstreamLineage-for-{dataset_urn}", + mcp_raw=mcp, + ) + for mcp in patch_builder.build() + ] + else: + if not self.config.extract_column_lineage: + upstream_lineage.fineGrainedLineages = None + + yield from [ + MetadataChangeProposalWrapper( + entityUrn=dataset_urn, aspect=upstream_lineage + ).as_workunit() + ] def lineage_via_catalog_lineage_api( self, project_id: str @@ -292,22 +333,26 @@ def lineage_via_catalog_lineage_api( try: lineage_client: lineage_v1.LineageClient = lineage_v1.LineageClient() - bigquery_client: BigQueryClient = self.config.get_bigquery_client() + + data_dictionary = BigQueryTechnicalSchemaApi(self.report) + data_dictionary.set_client(self.config.get_bigquery_client()) # Filtering datasets - datasets = list(bigquery_client.list_datasets(project_id)) + datasets = list(data_dictionary.get_datasets_for_project_id(project_id)) project_tables = [] for dataset in datasets: # Enables only tables where type is TABLE, VIEW or MATERIALIZED_VIEW (not EXTERNAL) project_tables.extend( [ table - for table in bigquery_client.list_tables(dataset.dataset_id) + for table in data_dictionary.list_tables( + dataset.name, project_id + ) if table.table_type in ["TABLE", "VIEW", "MATERIALIZED_VIEW"] ] ) # Convert project tables to .. 
format - project_tables = list( + project_table_names = list( map( lambda table: "{}.{}.{}".format( table.project, table.dataset_id, table.table_id @@ -318,7 +363,7 @@ def lineage_via_catalog_lineage_api( lineage_map: Dict[str, Set[LineageEdge]] = {} curr_date = datetime.now() - for table in project_tables: + for table in project_table_names: logger.info("Creating lineage map for table %s", table) upstreams = set() downstream_table = lineage_v1.EntityReference() @@ -375,126 +420,61 @@ def lineage_via_catalog_lineage_api( raise e def _get_parsed_audit_log_events(self, project_id: str) -> Iterable[QueryEvent]: + audit_log_api = BigQueryAuditLogApi( + self.report, self.config.rate_limit, self.config.requests_per_min + ) + # We adjust the filter values a bit, since we need to make sure that the join + # between query events and read events is complete. For example, this helps us + # handle the case where the read happens within our time range but the query + # completion event is delayed and happens after the configured end time. + corrected_start_time = self.config.start_time - self.config.max_query_duration + corrected_end_time = self.config.end_time + -self.config.max_query_duration + self.report.audit_start_time = corrected_start_time + self.report.audit_end_time = corrected_end_time + parse_fn: Callable[[Any], Optional[Union[ReadEvent, QueryEvent]]] if self.config.use_exported_bigquery_audit_metadata: logger.info("Populating lineage info via exported GCP audit logs") bq_client = self.config.get_bigquery_client() - entries = self._get_exported_bigquery_audit_metadata(bq_client) + # TODO: make this call simpler + entries = audit_log_api.get_exported_bigquery_audit_metadata( + bigquery_client=bq_client, + bigquery_audit_metadata_query_template=bigquery_audit_metadata_query_template_lineage, + bigquery_audit_metadata_datasets=self.config.bigquery_audit_metadata_datasets, + use_date_sharded_audit_log_tables=self.config.use_date_sharded_audit_log_tables, + start_time=corrected_start_time, + end_time=corrected_end_time, + ) parse_fn = self._parse_exported_bigquery_audit_metadata else: logger.info("Populating lineage info via exported GCP audit logs") + logging_client = self.config.make_gcp_logging_client(project_id) - entries = self._get_bigquery_log_entries(logging_client) + logger.info( + f"Start loading log entries from BigQuery for {project_id} " + f"with start_time={corrected_start_time} and end_time={corrected_end_time}" + ) + entries = audit_log_api.get_bigquery_log_entries_via_gcp_logging( + logging_client, + BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE.format( + corrected_start_time.strftime(BQ_DATETIME_FORMAT), + corrected_end_time.strftime(BQ_DATETIME_FORMAT), + ), + self.config.log_page_size, + ) parse_fn = self._parse_bigquery_log_entries for entry in entries: - self.report.num_total_log_entries[project_id] += 1 + self.report.num_lineage_total_log_entries[project_id] += 1 try: event = parse_fn(entry) if event: - self.report.num_parsed_log_entries[project_id] += 1 + self.report.num_lineage_parsed_log_entries[project_id] += 1 yield event except Exception as e: logger.warning(f"Unable to parse log entry `{entry}`: {e}") self.report.num_lineage_log_parse_failures[project_id] += 1 - def _get_bigquery_log_entries( - self, client: GCPLoggingClient, limit: Optional[int] = None - ) -> Iterable[AuditLogEntry]: - self.report.num_total_log_entries[client.project] = 0 - # Add a buffer to start and end time to account for delays in logging events. 
- corrected_start_time = self.config.start_time - self.config.max_query_duration - start_time = corrected_start_time.strftime(BQ_DATETIME_FORMAT) - self.report.log_entry_start_time = corrected_start_time - - corrected_end_time = self.config.end_time + self.config.max_query_duration - end_time = corrected_end_time.strftime(BQ_DATETIME_FORMAT) - self.report.log_entry_end_time = corrected_end_time - - filter = self.BQ_FILTER_RULE_TEMPLATE_V2.format( - start_time=start_time, - end_time=end_time, - ) - - logger.info( - f"Start loading log entries from BigQuery for {client.project} with start_time={start_time} and end_time={end_time}" - ) - - if self.config.rate_limit: - with RateLimiter(max_calls=self.config.requests_per_min, period=60): - entries = client.list_entries( - filter_=filter, - page_size=self.config.log_page_size, - max_results=limit, - ) - else: - entries = client.list_entries( - filter_=filter, page_size=self.config.log_page_size, max_results=limit - ) - - logger.info( - f"Start iterating over log entries from BigQuery for {client.project}" - ) - for entry in entries: - self.report.num_total_log_entries[client.project] += 1 - if self.report.num_total_log_entries[client.project] % 1000 == 0: - logger.info( - f"{self.report.num_total_log_entries[client.project]} log entries loaded for project {client.project} so far..." - ) - yield entry - - logger.info( - f"Finished loading {self.report.num_total_log_entries[client.project]} log entries from BigQuery project {client.project} so far" - ) - - def _get_exported_bigquery_audit_metadata( - self, bigquery_client: BigQueryClient, limit: Optional[int] = None - ) -> Iterable[BigQueryAuditMetadata]: - if self.config.bigquery_audit_metadata_datasets is None: - self.error( - logger, "audit-metadata", "bigquery_audit_metadata_datasets not set" - ) - self.report.bigquery_audit_metadata_datasets_missing = True - return - - corrected_start_time = self.config.start_time - self.config.max_query_duration - start_time = corrected_start_time.strftime(BQ_DATETIME_FORMAT) - start_date = corrected_start_time.strftime(BQ_DATE_SHARD_FORMAT) - self.report.audit_start_time = corrected_start_time - - corrected_end_time = self.config.end_time + self.config.max_query_duration - end_time = corrected_end_time.strftime(BQ_DATETIME_FORMAT) - end_date = corrected_end_time.strftime(BQ_DATE_SHARD_FORMAT) - self.report.audit_end_time = corrected_end_time - - for dataset in self.config.bigquery_audit_metadata_datasets: - logger.info( - f"Start loading log entries from BigQueryAuditMetadata in {dataset}" - ) - - query: str = self.bigquery_audit_metadata_query_template( - dataset=dataset, - use_date_sharded_tables=self.config.use_date_sharded_audit_log_tables, - limit=limit, - ).format( - start_time=start_time, - end_time=end_time, - start_date=start_date, - end_date=end_date, - ) - - query_job = bigquery_client.query(query) - - logger.info( - f"Finished loading log entries from BigQueryAuditMetadata in {dataset}" - ) - - if self.config.rate_limit: - with RateLimiter(max_calls=self.config.requests_per_min, period=60): - yield from query_job - else: - yield from query_job - # Currently we only parse JobCompleted events but in future we would want to parse other # events to also create field level lineage. 
def _parse_bigquery_log_entries( @@ -749,7 +729,6 @@ def get_lineage_for_table( bq_table: BigQueryTableRef, bq_table_urn: str, lineage_metadata: Dict[str, Set[LineageEdge]], - platform: str, ) -> Optional[UpstreamLineageClass]: upstream_list: List[UpstreamClass] = [] fine_grained_lineages: List[FineGrainedLineageClass] = [] @@ -757,12 +736,7 @@ def get_lineage_for_table( # even if the lineage is same but the order is different. for upstream in sorted(self.get_upstream_tables(bq_table, lineage_metadata)): upstream_table = BigQueryTableRef.from_string_name(upstream.table) - upstream_table_urn = mce_builder.make_dataset_urn_with_platform_instance( - platform, - upstream_table.table_identifier.get_table_name(), - self.config.platform_instance, - self.config.env, - ) + upstream_table_urn = self.dataset_urn_builder(upstream_table) # Generate table-level lineage. upstream_table_class = UpstreamClass( @@ -812,12 +786,21 @@ def get_lineage_for_table( return None def test_capability(self, project_id: str) -> None: + audit_log_api = BigQueryAuditLogApi( + self.report, self.config.rate_limit, self.config.requests_per_min + ) + if self.config.use_exported_bigquery_audit_metadata: bigquery_client: BigQueryClient = BigQueryClient(project=project_id) - entries = self._get_exported_bigquery_audit_metadata( - bigquery_client=bigquery_client, limit=1 - ) - for entry in entries: + for entry in audit_log_api.get_exported_bigquery_audit_metadata( + bigquery_client=bigquery_client, + bigquery_audit_metadata_query_template=bigquery_audit_metadata_query_template_lineage, + bigquery_audit_metadata_datasets=self.config.bigquery_audit_metadata_datasets, + use_date_sharded_audit_log_tables=self.config.use_date_sharded_audit_log_tables, + start_time=self.config.start_time, + end_time=self.config.end_time, + limit=1, + ): logger.debug( f"Connection test got one exported_bigquery_audit_metadata {entry}" ) @@ -825,5 +808,13 @@ def test_capability(self, project_id: str) -> None: gcp_logging_client: GCPLoggingClient = self.config.make_gcp_logging_client( project_id ) - for entry in self._get_bigquery_log_entries(gcp_logging_client, limit=1): + for entry in audit_log_api.get_bigquery_log_entries_via_gcp_logging( + gcp_logging_client, + filter=BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE.format( + self.config.start_time.strftime(BQ_DATETIME_FORMAT), + self.config.end_time.strftime(BQ_DATETIME_FORMAT), + ), + log_page_size=self.config.log_page_size, + limit=1, + ): logger.debug(f"Connection test got one audit metadata entry {entry}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py index c9dcb4fe35c3f..f825bbf666b64 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py @@ -11,7 +11,7 @@ from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( +from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( RANGE_PARTITION_NAME, BigqueryTable, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py index 20014f2f3fac4..51d74168c4970 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py @@ -36,9 +36,14 @@ QueryEvent, ReadEvent, ) -from datahub.ingestion.source.bigquery_v2.bigquery_audit_api import BigQueryAuditLogApi +from datahub.ingestion.source.bigquery_v2.bigquery_audit_log_api import ( + BQ_FILTER_RULE_TEMPLATE_V2_USAGE, + BigQueryAuditLogApi, + bigquery_audit_metadata_query_template_usage, +) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report +from datahub.ingestion.source.bigquery_v2.common import BQ_DATETIME_FORMAT from datahub.ingestion.source.usage.usage_common import ( TOTAL_BUDGET_FOR_QUERY_LIST, make_usage_workunit, @@ -286,7 +291,8 @@ class BigQueryUsageExtractor: * Aggregation of these statistics into buckets, by day or hour granularity :::note - 1. Depending on the compliance policies setup for the bigquery instance, sometimes logging.read permission is not sufficient. In that case, use either admin or private log viewer permission. + 1. Depending on the compliance policies setup for the bigquery instance, sometimes logging.read permission is not sufficient. + In that case, use either admin or private log viewer permission. ::: """ @@ -788,6 +794,7 @@ def _get_parsed_bigquery_log_events( entries = audit_log_api.get_exported_bigquery_audit_metadata( bigquery_client=bq_client, bigquery_audit_metadata_datasets=self.config.bigquery_audit_metadata_datasets, + bigquery_audit_metadata_query_template=bigquery_audit_metadata_query_template_usage, use_date_sharded_audit_log_tables=self.config.use_date_sharded_audit_log_tables, start_time=corrected_start_time, end_time=corrected_end_time, @@ -796,10 +803,13 @@ def _get_parsed_bigquery_log_events( parse_fn = self._parse_exported_bigquery_audit_metadata else: logging_client = self.config.make_gcp_logging_client(project_id) + logger.info( + f"Start loading log entries from BigQuery for {project_id} " + f"with start_time={corrected_start_time} and end_time={corrected_end_time}" + ) entries = audit_log_api.get_bigquery_log_entries_via_gcp_logging( logging_client, - start_time=corrected_start_time, - end_time=corrected_end_time, + filter=self._generate_filter(corrected_start_time, corrected_end_time), log_page_size=self.config.log_page_size, limit=limit, ) @@ -807,9 +817,10 @@ def _get_parsed_bigquery_log_events( for entry in entries: try: - self.report.total_query_log_entries += 1 + self.report.num_usage_total_log_entries[project_id] += 1 event = parse_fn(entry) if event: + self.report.num_usage_parsed_log_entries[project_id] += 1 yield event except Exception as e: logger.warning( @@ -820,6 +831,12 @@ def _get_parsed_bigquery_log_events( f"log-parse-{project_id}", e, group="usage-log-parse" ) + def _generate_filter(self, corrected_start_time, corrected_end_time): + return BQ_FILTER_RULE_TEMPLATE_V2_USAGE.format( + start_time=corrected_start_time.strftime(BQ_DATETIME_FORMAT), + end_time=corrected_end_time.strftime(BQ_DATETIME_FORMAT), + ) + def get_tables_from_query( self, default_project: str, query: str ) -> Optional[List[BigQueryTableRef]]: diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py index 3bda6c5cce84b..ba3ea06b07623 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py +++ 
b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py @@ -4,7 +4,7 @@ from freezegun import freeze_time from google.cloud.bigquery.table import TableListItem -from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( +from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( BigqueryDataset, BigqueryTable, ) @@ -16,13 +16,13 @@ @freeze_time(FROZEN_TIME) @patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema.BigQueryDataDictionary.get_tables_for_dataset" + "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_tables_for_dataset" ) @patch( "datahub.ingestion.source.bigquery_v2.bigquery.BigqueryV2Source.get_core_table_details" ) @patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema.BigQueryDataDictionary.get_datasets_for_project_id" + "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_datasets_for_project_id" ) @patch("google.cloud.bigquery.Client") def test_bigquery_v2_ingest( diff --git a/metadata-ingestion/tests/unit/test_bigquery_lineage.py b/metadata-ingestion/tests/unit/test_bigquery_lineage.py index c9308fd89ef72..aab923585b6fb 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_lineage.py +++ b/metadata-ingestion/tests/unit/test_bigquery_lineage.py @@ -17,7 +17,9 @@ def test_lineage_with_timestamps(): config = BigQueryV2Config() report = BigQueryV2Report() - extractor: BigqueryLineageExtractor = BigqueryLineageExtractor(config, report) + extractor: BigqueryLineageExtractor = BigqueryLineageExtractor( + config, report, lambda x: "" + ) lineage_entries: List[QueryEvent] = [ QueryEvent( timestamp=datetime.datetime.now(tz=datetime.timezone.utc), @@ -86,7 +88,6 @@ def test_lineage_with_timestamps(): bq_table=bq_table, bq_table_urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,my_project.my_dataset.my_table,PROD)", lineage_metadata=lineage_map, - platform="bigquery", ) assert upstream_lineage assert len(upstream_lineage.upstreams) == 4 diff --git a/metadata-ingestion/tests/unit/test_bigquery_profiler.py b/metadata-ingestion/tests/unit/test_bigquery_profiler.py index a2aec8df93d09..a723b6d475ae3 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_profiler.py +++ b/metadata-ingestion/tests/unit/test_bigquery_profiler.py @@ -2,7 +2,7 @@ from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( +from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( BigqueryColumn, BigqueryTable, PartitionInfo, diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py index bc9a3f41a9655..84f218074d99b 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_source.py +++ b/metadata-ingestion/tests/unit/test_bigquery_source.py @@ -19,9 +19,9 @@ ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( - BigQueryDataDictionary, +from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( BigqueryProject, + BigQueryTechnicalSchemaApi, BigqueryView, ) from datahub.ingestion.source.bigquery_v2.lineage import ( @@ -186,7 +186,7 @@ def test_get_projects_by_list(client_mock): assert client_mock.list_projects.call_count == 1 
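The updated lineage test above stubs the extractor's new third constructor argument with a no-op lambda; in the source itself that argument is a dataset-URN builder, replacing the inline make_dataset_urn_with_platform_instance call removed from get_lineage_for_table earlier in this patch. A minimal sketch of such a callable, assuming a BigQueryV2Config instance is available (the factory and builder names below are illustrative, not part of the change; import paths follow the ones already used by the connector):

    import datahub.emitter.mce_builder as mce_builder
    from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigQueryTableRef
    from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config

    def make_dataset_urn_builder(config: BigQueryV2Config):
        # Returns a BigQueryTableRef -> URN callable with the shape the
        # lineage extractor now expects for its dataset_urn_builder argument.
        def builder(table_ref: BigQueryTableRef) -> str:
            return mce_builder.make_dataset_urn_with_platform_instance(
                "bigquery",
                table_ref.table_identifier.get_table_name(),
                config.platform_instance,
                config.env,
            )

        return builder
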
-@patch.object(BigQueryDataDictionary, "get_projects") +@patch.object(BigQueryTechnicalSchemaApi, "get_projects") def test_get_projects_filter_by_pattern(get_projects_mock): get_projects_mock.return_value = [ BigqueryProject("test-project", "Test Project"), @@ -203,7 +203,7 @@ def test_get_projects_filter_by_pattern(get_projects_mock): ] -@patch.object(BigQueryDataDictionary, "get_projects") +@patch.object(BigQueryTechnicalSchemaApi, "get_projects") def test_get_projects_list_empty(get_projects_mock): get_projects_mock.return_value = [] @@ -216,7 +216,7 @@ def test_get_projects_list_empty(get_projects_mock): assert projects == [] -@patch.object(BigQueryDataDictionary, "get_projects") +@patch.object(BigQueryTechnicalSchemaApi, "get_projects") def test_get_projects_list_failure( get_projects_mock: MagicMock, caplog: pytest.LogCaptureFixture ) -> None: @@ -236,7 +236,7 @@ def test_get_projects_list_failure( assert projects == [] -@patch.object(BigQueryDataDictionary, "get_projects") +@patch.object(BigQueryTechnicalSchemaApi, "get_projects") def test_get_projects_list_fully_filtered(get_projects_mock): get_projects_mock.return_value = [BigqueryProject("test-project", "Test Project")] @@ -445,7 +445,7 @@ def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstr @patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema.BigQueryDataDictionary.get_tables_for_dataset" + "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_tables_for_dataset" ) @patch("google.cloud.bigquery.client.Client") def test_table_processing_logic(client_mock, data_dictionary_mock): @@ -517,7 +517,7 @@ def test_table_processing_logic(client_mock, data_dictionary_mock): @patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema.BigQueryDataDictionary.get_tables_for_dataset" + "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_tables_for_dataset" ) @patch("google.cloud.bigquery.client.Client") def test_table_processing_logic_date_named_tables(client_mock, data_dictionary_mock): @@ -625,7 +625,7 @@ def bigquery_view_2() -> BigqueryView: @patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema.BigQueryDataDictionary.get_query_result" + "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_query_result" ) @patch("google.cloud.bigquery.client.Client") def test_get_views_for_dataset( @@ -655,7 +655,7 @@ def test_get_views_for_dataset( ) ) query_mock.return_value = [row1, row2] - bigquery_data_dictionary = BigQueryDataDictionary(BigQueryV2Report()) + bigquery_data_dictionary = BigQueryTechnicalSchemaApi(BigQueryV2Report()) bigquery_data_dictionary.set_client(client_mock) views = bigquery_data_dictionary.get_views_for_dataset( diff --git a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py index 8c50619bee53d..4cf42da4395f9 100644 --- a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py +++ b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py @@ -7,9 +7,9 @@ BigqueryTableIdentifier, BigQueryTableRef, ) -from datahub.ingestion.source.bigquery_v2.bigquery_audit_api import BigQueryAuditLogApi from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report +from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor FROZEN_TIME = "2021-07-20 00:00:00" @@ -110,12 +110,12 @@ def 
test_bigqueryv2_filters(): OR protoPayload.metadata.tableDataRead.reason = "JOB" )""" # noqa: W293 - api = BigQueryAuditLogApi( - BigQueryV2Report(), config.rate_limit, config.requests_per_min - ) + corrected_start_time = config.start_time - config.max_query_duration corrected_end_time = config.end_time + config.max_query_duration - filter: str = api._generate_filter(corrected_start_time, corrected_end_time) + filter: str = BigQueryUsageExtractor( + config, BigQueryV2Report(), lambda x: "" + )._generate_filter(corrected_start_time, corrected_end_time) assert filter == expected_filter From 76ddc3f878f37c0f95e96ec7094d944b977cf80d Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Fri, 11 Aug 2023 18:02:25 +0530 Subject: [PATCH 03/11] refractor in lineage.py --- .../ingestion/source/bigquery_v2/bigquery.py | 12 +- .../bigquery_v2/bigquery_audit_log_api.py | 5 +- .../source/bigquery_v2/bigquery_report.py | 3 - .../source/bigquery_v2/bigquery_schema_api.py | 240 +------------- .../ingestion/source/bigquery_v2/lineage.py | 302 +++++++++--------- .../ingestion/source/bigquery_v2/queries.py | 224 +++++++++++++ .../tests/unit/test_bigquery_source.py | 123 +++++-- 7 files changed, 492 insertions(+), 417 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index a38117cce346d..4ff4648657959 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -220,7 +220,9 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): set_dataset_urn_to_lower(self.config.convert_urns_to_lowercase) - self.bigquery_data_dictionary = BigQueryTechnicalSchemaApi(self.report) + self.bigquery_data_dictionary = BigQueryTechnicalSchemaApi( + self.report, self.config.get_bigquery_client() + ) # For database, schema, tables, views, etc self.lineage_extractor = BigqueryLineageExtractor( @@ -298,8 +300,7 @@ def metadata_read_capability_test( client: bigquery.Client = config.get_bigquery_client() assert client report = BigQueryV2Report() - bigquery_data_dictionary = BigQueryTechnicalSchemaApi(report) - bigquery_data_dictionary.set_client(client) + bigquery_data_dictionary = BigQueryTechnicalSchemaApi(report, client) result = bigquery_data_dictionary.get_datasets_for_project_id( project_id, 10 ) @@ -502,9 +503,6 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: - bq_client: bigquery.Client = self.config.get_bigquery_client() - self.bigquery_data_dictionary.set_client(bq_client) - projects = self._get_projects() if not projects: return @@ -521,7 +519,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: if self._should_ingest_lineage(): yield from self.lineage_extractor.get_lineage_workunits( - projects, + [p.id for p in projects], self.sql_parser_schema_resolver, self.view_definition_ids, self.table_refs, diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py index 048f90de7ee0a..b017b1d08a1ee 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py @@ -22,7 +22,9 @@ logger: logging.Logger = logging.getLogger(__name__) -# TODO: separation of api/extractor classes - client wise, functionality wise ? +# Api interfaces are separated based on functionality they provide +# rather than the underlying bigquery client that is used to +# provide the functionality. class BigQueryAuditLogApi: def __init__( self, @@ -34,7 +36,6 @@ def __init__( self.rate_limit = rate_limit self.requests_per_min = requests_per_min - # TODO; should we refractor and move this to schema api , as this uses bigquery client ? def get_exported_bigquery_audit_metadata( self, bigquery_client: bigquery.Client, diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py index 62fd23bb1b68a..6d5822723ec64 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py @@ -80,9 +80,6 @@ class BigQueryV2Report( log_page_size: Optional[pydantic.PositiveInt] = None use_exported_bigquery_audit_metadata: Optional[bool] = None end_time: Optional[datetime] = None - # TODO: remove one or replace by lineage ones - log_entry_start_time: Optional[datetime] = None - log_entry_end_time: Optional[datetime] = None audit_start_time: Optional[datetime] = None audit_end_time: Optional[datetime] = None upstream_lineage: LossyDict = field(default_factory=LossyDict) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py index e7fcd5bd390c0..b627af15ca213 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py @@ -17,21 +17,15 @@ BigQueryApiPerfReport, BigQueryV2Report, ) +from datahub.ingestion.source.bigquery_v2.queries import ( + BigqueryQuery, + BigqueryTableType, +) from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable, BaseView logger: logging.Logger = logging.getLogger(__name__) -class BigqueryTableType: - # See https://cloud.google.com/bigquery/docs/information-schema-tables#schema - BASE_TABLE = "BASE TABLE" - EXTERNAL = "EXTERNAL" - VIEW = "VIEW" - MATERIALIZED_VIEW = "MATERIALIZED VIEW" - CLONE = "CLONE" - SNAPSHOT = "SNAPSHOT" - - @dataclass class BigqueryColumn(BaseColumn): field_path: str @@ -131,233 +125,11 @@ class BigqueryProject: datasets: List[BigqueryDataset] = field(default_factory=list) -class BigqueryQuery: - show_datasets: str = ( - "select schema_name from `{project_id}`.INFORMATION_SCHEMA.SCHEMATA" - ) - - datasets_for_project_id: str = """ -select - s.CATALOG_NAME as catalog_name, - s.schema_name as table_schema, - s.location as location, - s.CREATION_TIME as created, - s.LAST_MODIFIED_TIME as last_altered, - o.OPTION_VALUE as comment -from - `{project_id}`.INFORMATION_SCHEMA.SCHEMATA as s - left join `{project_id}`.INFORMATION_SCHEMA.SCHEMATA_OPTIONS as o on o.schema_name = s.schema_name - and o.option_name = "description" -order by - s.schema_name -""" - - # https://cloud.google.com/bigquery/docs/information-schema-table-storage?hl=en - tables_for_dataset = f""" -SELECT - t.table_catalog as table_catalog, - t.table_schema as table_schema, - t.table_name as table_name, - t.table_type as 
table_type, - t.creation_time as created, - ts.last_modified_time as last_altered, - tos.OPTION_VALUE as comment, - is_insertable_into, - ddl, - row_count, - size_bytes as bytes, - num_partitions, - max_partition_id, - active_billable_bytes, - long_term_billable_bytes, - REGEXP_EXTRACT(t.table_name, r".*_(\\d+)$") as table_suffix, - REGEXP_REPLACE(t.table_name, r"_(\\d+)$", "") as table_base - -FROM - `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t - join `{{project_id}}`.`{{dataset_name}}`.__TABLES__ as ts on ts.table_id = t.TABLE_NAME - left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema - and t.TABLE_NAME = tos.TABLE_NAME - and tos.OPTION_NAME = "description" - left join ( - select - table_name, - sum(case when partition_id not in ('__NULL__', '__UNPARTITIONED__', '__STREAMING_UNPARTITIONED__') then 1 else 0 END) as num_partitions, - max(case when partition_id not in ('__NULL__', '__UNPARTITIONED__', '__STREAMING_UNPARTITIONED__') then partition_id else NULL END) as max_partition_id, - sum(total_rows) as total_rows, - sum(case when storage_tier = 'LONG_TERM' then total_billable_bytes else 0 end) as long_term_billable_bytes, - sum(case when storage_tier = 'ACTIVE' then total_billable_bytes else 0 end) as active_billable_bytes, - from - `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.PARTITIONS - group by - table_name) as p on - t.table_name = p.table_name -WHERE - table_type in ('{BigqueryTableType.BASE_TABLE}', '{BigqueryTableType.EXTERNAL}') -{{table_filter}} -order by - table_schema ASC, - table_base ASC, - table_suffix DESC -""" - - tables_for_dataset_without_partition_data = f""" -SELECT - t.table_catalog as table_catalog, - t.table_schema as table_schema, - t.table_name as table_name, - t.table_type as table_type, - t.creation_time as created, - tos.OPTION_VALUE as comment, - is_insertable_into, - ddl, - REGEXP_EXTRACT(t.table_name, r".*_(\\d+)$") as table_suffix, - REGEXP_REPLACE(t.table_name, r"_(\\d+)$", "") as table_base - -FROM - `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t - left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema - and t.TABLE_NAME = tos.TABLE_NAME - and tos.OPTION_NAME = "description" -WHERE - table_type in ('{BigqueryTableType.BASE_TABLE}', '{BigqueryTableType.EXTERNAL}') -{{table_filter}} -order by - table_schema ASC, - table_base ASC, - table_suffix DESC -""" - - views_for_dataset: str = f""" -SELECT - t.table_catalog as table_catalog, - t.table_schema as table_schema, - t.table_name as table_name, - t.table_type as table_type, - t.creation_time as created, - ts.last_modified_time as last_altered, - tos.OPTION_VALUE as comment, - is_insertable_into, - ddl as view_definition, - row_count, - size_bytes -FROM - `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t - join `{{project_id}}`.`{{dataset_name}}`.__TABLES__ as ts on ts.table_id = t.TABLE_NAME - left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema - and t.TABLE_NAME = tos.TABLE_NAME - and tos.OPTION_NAME = "description" -WHERE - table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}') -order by - table_schema ASC, - table_name ASC -""" - - views_for_dataset_without_data_read: str = f""" -SELECT - t.table_catalog as table_catalog, - t.table_schema as table_schema, - t.table_name as table_name, - t.table_type as 
table_type, - t.creation_time as created, - tos.OPTION_VALUE as comment, - is_insertable_into, - ddl as view_definition -FROM - `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t - left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema - and t.TABLE_NAME = tos.TABLE_NAME - and tos.OPTION_NAME = "description" -WHERE - table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}') -order by - table_schema ASC, - table_name ASC -""" - - columns_for_dataset: str = """ -select - c.table_catalog as table_catalog, - c.table_schema as table_schema, - c.table_name as table_name, - c.column_name as column_name, - c.ordinal_position as ordinal_position, - cfp.field_path as field_path, - c.is_nullable as is_nullable, - CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, - description as comment, - c.is_hidden as is_hidden, - c.is_partitioning_column as is_partitioning_column -from - `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMNS c - join `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name - and cfp.column_name = c.column_name -ORDER BY - table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC""" - - optimized_columns_for_dataset: str = """ -select * from -(select - c.table_catalog as table_catalog, - c.table_schema as table_schema, - c.table_name as table_name, - c.column_name as column_name, - c.ordinal_position as ordinal_position, - cfp.field_path as field_path, - c.is_nullable as is_nullable, - CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, - description as comment, - c.is_hidden as is_hidden, - c.is_partitioning_column as is_partitioning_column, - -- We count the columns to be able limit it later - row_number() over (partition by c.table_catalog, c.table_schema, c.table_name order by c.ordinal_position asc, c.data_type DESC) as column_num, - -- Getting the maximum shard for each table - row_number() over (partition by c.table_catalog, c.table_schema, ifnull(REGEXP_EXTRACT(c.table_name, r'(.*)_\\d{{8}}$'), c.table_name), cfp.field_path order by c.table_catalog, c.table_schema asc, c.table_name desc) as shard_num -from - `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMNS c - join `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name - and cfp.column_name = c.column_name - ) --- We filter column limit + 1 to make sure we warn about the limit being reached but not reading too much data -where column_num <= {column_limit} and shard_num = 1 -ORDER BY - table_catalog, table_schema, table_name, ordinal_position, column_num ASC, data_type DESC""" - - columns_for_table: str = """ -select - c.table_catalog as table_catalog, - c.table_schema as table_schema, - c.table_name as table_name, - c.column_name as column_name, - c.ordinal_position as ordinal_position, - cfp.field_path as field_path, - c.is_nullable as is_nullable, - CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, - c.is_hidden as is_hidden, - c.is_partitioning_column as is_partitioning_column, - description as comment -from - `{table_identifier.project_id}`.`{table_identifier.dataset}`.INFORMATION_SCHEMA.COLUMNS as c - join `{table_identifier.project_id}`.`{table_identifier.dataset}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name - and cfp.column_name 
= c.column_name -where - c.table_name = '{table_identifier.table}' -ORDER BY - table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC""" - - -# TODO: remove static methods from here -# TODO: move queries into separate file class BigQueryTechnicalSchemaApi: - def __init__(self, report: BigQueryApiPerfReport) -> None: - self.bq_client: Optional[bigquery.Client] = None + def __init__(self, report: BigQueryApiPerfReport, client: bigquery.Client) -> None: + self.bq_client = client self.api_perf_report = report - # TODO: remove need to set_client. maybe pass in constructor? - def set_client(self, bq_client: bigquery.Client) -> None: - self.bq_client = bq_client - def get_client(self) -> bigquery.Client: assert self.bq_client is not None return self.bq_client diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index d1bc687ebaca0..4dc01bb1c7232 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -6,7 +6,6 @@ from typing import Any, Callable, Dict, FrozenSet, Iterable, List, Optional, Set, Union import humanfriendly -from google.cloud.bigquery import Client as BigQueryClient from google.cloud.datacatalog import lineage_v1 from google.cloud.logging_v2.client import Client as GCPLoggingClient @@ -29,7 +28,6 @@ from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( - BigqueryProject, BigQueryTechnicalSchemaApi, ) from datahub.ingestion.source.bigquery_v2.common import BQ_DATETIME_FORMAT @@ -193,6 +191,9 @@ def __init__( self.config = config self.report = report self.dataset_urn_builder = dataset_urn_builder + self.audit_log_api = BigQueryAuditLogApi( + report, self.config.rate_limit, self.config.requests_per_min + ) def error(self, log: logging.Logger, key: str, reason: str) -> None: self.report.report_warning(key, reason) @@ -200,17 +201,37 @@ def error(self, log: logging.Logger, key: str, reason: str) -> None: def get_lineage_workunits( self, - projects: List[BigqueryProject], + projects: List[str], sql_parser_schema_resolver: SchemaResolver, view_definition_ids: Dict[str, Dict[str, str]], table_refs: Set[str], ) -> Iterable[MetadataWorkUnit]: + views_skip_audit_log_lineage: Set[str] = set() + if self.config.lineage_parse_view_ddl: + view_lineage: Dict[str, Set[LineageEdge]] = {} + for project in projects: + self.populate_view_lineage_with_sql_parsing( + view_lineage, + view_definition_ids[project], + sql_parser_schema_resolver, + project, + ) + + views_skip_audit_log_lineage.update(view_lineage.keys()) + for lineage_key in view_lineage.keys(): + yield from self.gen_lineage_workunits_for_table( + view_lineage, BigQueryTableRef.from_string_name(lineage_key) + ) + + if self.config.use_exported_bigquery_audit_metadata: + projects = ["*"] # project_id not used when using exported metadata + for project in projects: - self.report.set_ingestion_stage(project.id, "Lineage Extraction") + self.report.set_ingestion_stage(project, "Lineage Extraction") yield from self.generate_lineage( - project.id, + project, sql_parser_schema_resolver, - view_definition_ids[project.id], + views_skip_audit_log_lineage, table_refs, ) @@ -218,65 +239,104 @@ def generate_lineage( self, project_id: str, 
sql_parser_schema_resolver: SchemaResolver, - view_definition_ids: Dict[str, str], + views_skip_audit_log_lineage: Set[str], table_refs: Set[str], ) -> Iterable[MetadataWorkUnit]: logger.info(f"Generate lineage for {project_id}") - lineage = self.calculate_lineage_for_project( - project_id, sql_parser_schema_resolver=sql_parser_schema_resolver - ) - - if self.config.lineage_parse_view_ddl: - for view, view_definition_id in view_definition_ids.items(): - view_definition = view_definition_ids[view_definition_id] - raw_view_lineage = sqlglot_lineage( - view_definition, - schema_resolver=sql_parser_schema_resolver, - default_db=project_id, - ) - if raw_view_lineage.debug_info.table_error: - logger.debug( - f"Failed to parse lineage for view {view}: {raw_view_lineage.debug_info.table_error}" - ) - self.report.num_view_definitions_failed_parsing += 1 - self.report.view_definitions_parsing_failures.append( - f"Table-level sql parsing error for view {view}: {raw_view_lineage.debug_info.table_error}" - ) - continue - elif raw_view_lineage.debug_info.column_error: - self.report.num_view_definitions_failed_column_parsing += 1 - self.report.view_definitions_parsing_failures.append( - f"Column-level sql parsing error for view {view}: {raw_view_lineage.debug_info.column_error}" - ) + with PerfTimer() as timer: + try: + if self.config.extract_lineage_from_catalog: + lineage = self.lineage_via_catalog_lineage_api(project_id) else: - self.report.num_view_definitions_parsed += 1 - - # For views, we override the upstreams obtained by parsing audit logs - # as they may contain indirectly referenced tables. - ts = datetime.now(timezone.utc) - lineage[view] = set( - make_lineage_edges_from_parsing_result( - raw_view_lineage, - audit_stamp=ts, - lineage_type=DatasetLineageTypeClass.VIEW, + events = self._get_parsed_audit_log_events(project_id) + lineage = self._create_lineage_map( + events, sql_parser_schema_resolver ) + except Exception as e: + if project_id: + self.report.lineage_failed_extraction.append(project_id) + self.error( + logger, + "lineage", + f"{project_id}: {e}", ) + lineage = {} + + self.report.lineage_metadata_entries[project_id] = len(lineage) + logger.info(f"Built lineage map containing {len(lineage)} entries.") + logger.debug(f"lineage metadata is {lineage}") + self.report.lineage_extraction_sec[project_id] = round( + timer.elapsed_seconds(), 2 + ) + self.report.lineage_mem_size[project_id] = humanfriendly.format_size( + memory_footprint.total_size(lineage) + ) for lineage_key in lineage.keys(): - if lineage_key not in table_refs: + # For views, we do not use the upstreams obtained by parsing audit logs + # as they may contain indirectly referenced tables. 
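+            # (View lineage for the keys in views_skip_audit_log_lineage has
+            # already been emitted from the SQL parsing pass in get_lineage_workunits.)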
+ if ( + lineage_key not in table_refs + or lineage_key in views_skip_audit_log_lineage + ): continue - table_ref = BigQueryTableRef.from_string_name(lineage_key) - dataset_urn = self.dataset_urn_builder(table_ref) + yield from self.gen_lineage_workunits_for_table( + lineage, BigQueryTableRef.from_string_name(lineage_key) + ) - lineage_info = self.get_lineage_for_table( - bq_table=table_ref, - bq_table_urn=dataset_urn, - lineage_metadata=lineage, + def populate_view_lineage_with_sql_parsing( + self, + view_lineage: Dict[str, Set[LineageEdge]], + view_definition_ids: Dict[str, str], + sql_parser_schema_resolver: SchemaResolver, + default_project: str, + ) -> None: + for view, view_definition_id in view_definition_ids.items(): + view_definition = view_definition_ids[view_definition_id] + raw_view_lineage = sqlglot_lineage( + view_definition, + schema_resolver=sql_parser_schema_resolver, + default_db=default_project, ) + if raw_view_lineage.debug_info.table_error: + logger.debug( + f"Failed to parse lineage for view {view}: {raw_view_lineage.debug_info.table_error}" + ) + self.report.num_view_definitions_failed_parsing += 1 + self.report.view_definitions_parsing_failures.append( + f"Table-level sql parsing error for view {view}: {raw_view_lineage.debug_info.table_error}" + ) + continue + elif raw_view_lineage.debug_info.column_error: + self.report.num_view_definitions_failed_column_parsing += 1 + self.report.view_definitions_parsing_failures.append( + f"Column-level sql parsing error for view {view}: {raw_view_lineage.debug_info.column_error}" + ) + else: + self.report.num_view_definitions_parsed += 1 + + ts = datetime.now(timezone.utc) + view_lineage[view] = set( + make_lineage_edges_from_parsing_result( + raw_view_lineage, + audit_stamp=ts, + lineage_type=DatasetLineageTypeClass.VIEW, + ) + ) + + def gen_lineage_workunits_for_table( + self, lineage: dict[str, Set[LineageEdge]], table_ref: BigQueryTableRef + ) -> Iterable[MetadataWorkUnit]: + dataset_urn = self.dataset_urn_builder(table_ref) - if lineage_info: - yield from self.gen_lineage(dataset_urn, lineage_info) + lineage_info = self.get_lineage_for_table( + bq_table=table_ref, + bq_table_urn=dataset_urn, + lineage_metadata=lineage, + ) + if lineage_info: + yield from self.gen_lineage(dataset_urn, lineage_info) def gen_lineage( self, @@ -334,8 +394,10 @@ def lineage_via_catalog_lineage_api( try: lineage_client: lineage_v1.LineageClient = lineage_v1.LineageClient() - data_dictionary = BigQueryTechnicalSchemaApi(self.report) - data_dictionary.set_client(self.config.get_bigquery_client()) + data_dictionary = BigQueryTechnicalSchemaApi( + self.report, self.config.get_bigquery_client() + ) + # Filtering datasets datasets = list(data_dictionary.get_datasets_for_project_id(project_id)) project_tables = [] @@ -420,9 +482,6 @@ def lineage_via_catalog_lineage_api( raise e def _get_parsed_audit_log_events(self, project_id: str) -> Iterable[QueryEvent]: - audit_log_api = BigQueryAuditLogApi( - self.report, self.config.rate_limit, self.config.requests_per_min - ) # We adjust the filter values a bit, since we need to make sure that the join # between query events and read events is complete. 
For example, this helps us # handle the case where the read happens within our time range but the query @@ -434,33 +493,11 @@ def _get_parsed_audit_log_events(self, project_id: str) -> Iterable[QueryEvent]: parse_fn: Callable[[Any], Optional[Union[ReadEvent, QueryEvent]]] if self.config.use_exported_bigquery_audit_metadata: - logger.info("Populating lineage info via exported GCP audit logs") - bq_client = self.config.get_bigquery_client() - # TODO: make this call simpler - entries = audit_log_api.get_exported_bigquery_audit_metadata( - bigquery_client=bq_client, - bigquery_audit_metadata_query_template=bigquery_audit_metadata_query_template_lineage, - bigquery_audit_metadata_datasets=self.config.bigquery_audit_metadata_datasets, - use_date_sharded_audit_log_tables=self.config.use_date_sharded_audit_log_tables, - start_time=corrected_start_time, - end_time=corrected_end_time, - ) + self.get_exported_log_entries(corrected_start_time, corrected_end_time) parse_fn = self._parse_exported_bigquery_audit_metadata else: - logger.info("Populating lineage info via exported GCP audit logs") - - logging_client = self.config.make_gcp_logging_client(project_id) - logger.info( - f"Start loading log entries from BigQuery for {project_id} " - f"with start_time={corrected_start_time} and end_time={corrected_end_time}" - ) - entries = audit_log_api.get_bigquery_log_entries_via_gcp_logging( - logging_client, - BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE.format( - corrected_start_time.strftime(BQ_DATETIME_FORMAT), - corrected_end_time.strftime(BQ_DATETIME_FORMAT), - ), - self.config.log_page_size, + entries = self.get_log_entries_via_gcp_logging( + project_id, corrected_start_time, corrected_end_time ) parse_fn = self._parse_bigquery_log_entries @@ -475,6 +512,42 @@ def _get_parsed_audit_log_events(self, project_id: str) -> Iterable[QueryEvent]: logger.warning(f"Unable to parse log entry `{entry}`: {e}") self.report.num_lineage_log_parse_failures[project_id] += 1 + def get_exported_log_entries( + self, corrected_start_time, corrected_end_time, limit=None + ): + logger.info("Populating lineage info via exported GCP audit logs") + bq_client = self.config.get_bigquery_client() + entries = self.audit_log_api.get_exported_bigquery_audit_metadata( + bigquery_client=bq_client, + bigquery_audit_metadata_query_template=bigquery_audit_metadata_query_template_lineage, + bigquery_audit_metadata_datasets=self.config.bigquery_audit_metadata_datasets, + use_date_sharded_audit_log_tables=self.config.use_date_sharded_audit_log_tables, + start_time=corrected_start_time, + end_time=corrected_end_time, + limit=limit, + ) + return entries + + def get_log_entries_via_gcp_logging( + self, project_id, corrected_start_time, corrected_end_time + ): + logger.info("Populating lineage info via exported GCP audit logs") + + logging_client = self.config.make_gcp_logging_client(project_id) + logger.info( + f"Start loading log entries from BigQuery for {project_id} " + f"with start_time={corrected_start_time} and end_time={corrected_end_time}" + ) + entries = self.audit_log_api.get_bigquery_log_entries_via_gcp_logging( + logging_client, + BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE.format( + corrected_start_time.strftime(BQ_DATETIME_FORMAT), + corrected_end_time.strftime(BQ_DATETIME_FORMAT), + ), + self.config.log_page_size, + ) + return entries + # Currently we only parse JobCompleted events but in future we would want to parse other # events to also create field level lineage. 
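The generate_lineage rework earlier in this file wraps the audit-log fetch and lineage-map construction in a PerfTimer and records the elapsed seconds per project, while the schema API hunks below pause the timer with pause_timer() so that per-row handling of yielded results is not counted against the API call. A small self-contained sketch of that timing pattern, assuming PerfTimer is importable from datahub.utilities.perf_timer and exposes the pause_timer() and elapsed_seconds() helpers used in these hunks (fetch_rows and process are hypothetical stand-ins, not part of the change):

    import time

    from datahub.utilities.perf_timer import PerfTimer

    def fetch_rows():
        # Stand-in for a remote API call whose duration should be measured.
        time.sleep(0.1)
        return [1, 2, 3]

    def process(row):
        # Stand-in for local handling that should not count as API time.
        time.sleep(0.05)

    with PerfTimer() as timer:
        for row in fetch_rows():
            with timer.pause_timer():
                process(row)
        # Reports roughly the fetch time only; the paused work is excluded.
        print(round(timer.elapsed_seconds(), 2))
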
def _parse_bigquery_log_entries( @@ -616,38 +689,6 @@ def _create_lineage_map( logger.info("Exiting create lineage map function") return lineage_map - def _compute_bigquery_lineage( - self, - project_id: str, - sql_parser_schema_resolver: SchemaResolver, - ) -> Dict[str, Set[LineageEdge]]: - lineage_metadata: Dict[str, Set[LineageEdge]] - try: - if self.config.extract_lineage_from_catalog: - lineage_metadata = self.lineage_via_catalog_lineage_api(project_id) - else: - events = self._get_parsed_audit_log_events(project_id) - lineage_metadata = self._create_lineage_map( - events, sql_parser_schema_resolver - ) - except Exception as e: - if project_id: - self.report.lineage_failed_extraction.append(project_id) - self.error( - logger, - "lineage", - f"{project_id}: {e}", - ) - lineage_metadata = {} - - self.report.lineage_mem_size[project_id] = humanfriendly.format_size( - memory_footprint.total_size(lineage_metadata) - ) - self.report.lineage_metadata_entries[project_id] = len(lineage_metadata) - logger.info(f"Built lineage map containing {len(lineage_metadata)} entries.") - logger.debug(f"lineage metadata is {lineage_metadata}") - return lineage_metadata - def get_upstream_tables( self, bq_table: BigQueryTableRef, @@ -708,22 +749,6 @@ def get_upstream_tables( return set(upstreams.values()) - def calculate_lineage_for_project( - self, - project_id: str, - sql_parser_schema_resolver: SchemaResolver, - ) -> Dict[str, Set[LineageEdge]]: - with PerfTimer() as timer: - lineage = self._compute_bigquery_lineage( - project_id, sql_parser_schema_resolver - ) - - self.report.lineage_extraction_sec[project_id] = round( - timer.elapsed_seconds(), 2 - ) - - return lineage - def get_lineage_for_table( self, bq_table: BigQueryTableRef, @@ -786,19 +811,10 @@ def get_lineage_for_table( return None def test_capability(self, project_id: str) -> None: - audit_log_api = BigQueryAuditLogApi( - self.report, self.config.rate_limit, self.config.requests_per_min - ) - if self.config.use_exported_bigquery_audit_metadata: - bigquery_client: BigQueryClient = BigQueryClient(project=project_id) - for entry in audit_log_api.get_exported_bigquery_audit_metadata( - bigquery_client=bigquery_client, - bigquery_audit_metadata_query_template=bigquery_audit_metadata_query_template_lineage, - bigquery_audit_metadata_datasets=self.config.bigquery_audit_metadata_datasets, - use_date_sharded_audit_log_tables=self.config.use_date_sharded_audit_log_tables, - start_time=self.config.start_time, - end_time=self.config.end_time, + for entry in self.get_exported_log_entries( + self.config.start_time, + self.config.end_time, limit=1, ): logger.debug( @@ -808,7 +824,7 @@ def test_capability(self, project_id: str) -> None: gcp_logging_client: GCPLoggingClient = self.config.make_gcp_logging_client( project_id ) - for entry in audit_log_api.get_bigquery_log_entries_via_gcp_logging( + for entry in self.audit_log_api.get_bigquery_log_entries_via_gcp_logging( gcp_logging_client, filter=BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE.format( self.config.start_time.strftime(BQ_DATETIME_FORMAT), diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py new file mode 100644 index 0000000000000..e04ea679584dc --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py @@ -0,0 +1,224 @@ +class BigqueryTableType: + # See https://cloud.google.com/bigquery/docs/information-schema-tables#schema + BASE_TABLE = "BASE TABLE" + EXTERNAL = 
"EXTERNAL" + VIEW = "VIEW" + MATERIALIZED_VIEW = "MATERIALIZED VIEW" + CLONE = "CLONE" + SNAPSHOT = "SNAPSHOT" + + +class BigqueryQuery: + show_datasets: str = ( + "select schema_name from `{project_id}`.INFORMATION_SCHEMA.SCHEMATA" + ) + + datasets_for_project_id: str = """ +select + s.CATALOG_NAME as catalog_name, + s.schema_name as table_schema, + s.location as location, + s.CREATION_TIME as created, + s.LAST_MODIFIED_TIME as last_altered, + o.OPTION_VALUE as comment +from + `{project_id}`.INFORMATION_SCHEMA.SCHEMATA as s + left join `{project_id}`.INFORMATION_SCHEMA.SCHEMATA_OPTIONS as o on o.schema_name = s.schema_name + and o.option_name = "description" +order by + s.schema_name +""" + + # https://cloud.google.com/bigquery/docs/information-schema-table-storage?hl=en + tables_for_dataset = f""" +SELECT + t.table_catalog as table_catalog, + t.table_schema as table_schema, + t.table_name as table_name, + t.table_type as table_type, + t.creation_time as created, + ts.last_modified_time as last_altered, + tos.OPTION_VALUE as comment, + is_insertable_into, + ddl, + row_count, + size_bytes as bytes, + num_partitions, + max_partition_id, + active_billable_bytes, + long_term_billable_bytes, + REGEXP_EXTRACT(t.table_name, r".*_(\\d+)$") as table_suffix, + REGEXP_REPLACE(t.table_name, r"_(\\d+)$", "") as table_base + +FROM + `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t + join `{{project_id}}`.`{{dataset_name}}`.__TABLES__ as ts on ts.table_id = t.TABLE_NAME + left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema + and t.TABLE_NAME = tos.TABLE_NAME + and tos.OPTION_NAME = "description" + left join ( + select + table_name, + sum(case when partition_id not in ('__NULL__', '__UNPARTITIONED__', '__STREAMING_UNPARTITIONED__') then 1 else 0 END) as num_partitions, + max(case when partition_id not in ('__NULL__', '__UNPARTITIONED__', '__STREAMING_UNPARTITIONED__') then partition_id else NULL END) as max_partition_id, + sum(total_rows) as total_rows, + sum(case when storage_tier = 'LONG_TERM' then total_billable_bytes else 0 end) as long_term_billable_bytes, + sum(case when storage_tier = 'ACTIVE' then total_billable_bytes else 0 end) as active_billable_bytes, + from + `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.PARTITIONS + group by + table_name) as p on + t.table_name = p.table_name +WHERE + table_type in ('{BigqueryTableType.BASE_TABLE}', '{BigqueryTableType.EXTERNAL}') +{{table_filter}} +order by + table_schema ASC, + table_base ASC, + table_suffix DESC +""" + + tables_for_dataset_without_partition_data = f""" +SELECT + t.table_catalog as table_catalog, + t.table_schema as table_schema, + t.table_name as table_name, + t.table_type as table_type, + t.creation_time as created, + tos.OPTION_VALUE as comment, + is_insertable_into, + ddl, + REGEXP_EXTRACT(t.table_name, r".*_(\\d+)$") as table_suffix, + REGEXP_REPLACE(t.table_name, r"_(\\d+)$", "") as table_base + +FROM + `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t + left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema + and t.TABLE_NAME = tos.TABLE_NAME + and tos.OPTION_NAME = "description" +WHERE + table_type in ('{BigqueryTableType.BASE_TABLE}', '{BigqueryTableType.EXTERNAL}') +{{table_filter}} +order by + table_schema ASC, + table_base ASC, + table_suffix DESC +""" + + views_for_dataset: str = f""" +SELECT + t.table_catalog as table_catalog, + t.table_schema as 
table_schema, + t.table_name as table_name, + t.table_type as table_type, + t.creation_time as created, + ts.last_modified_time as last_altered, + tos.OPTION_VALUE as comment, + is_insertable_into, + ddl as view_definition, + row_count, + size_bytes +FROM + `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t + join `{{project_id}}`.`{{dataset_name}}`.__TABLES__ as ts on ts.table_id = t.TABLE_NAME + left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema + and t.TABLE_NAME = tos.TABLE_NAME + and tos.OPTION_NAME = "description" +WHERE + table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}') +order by + table_schema ASC, + table_name ASC +""" + + views_for_dataset_without_data_read: str = f""" +SELECT + t.table_catalog as table_catalog, + t.table_schema as table_schema, + t.table_name as table_name, + t.table_type as table_type, + t.creation_time as created, + tos.OPTION_VALUE as comment, + is_insertable_into, + ddl as view_definition +FROM + `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t + left join `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLE_OPTIONS as tos on t.table_schema = tos.table_schema + and t.TABLE_NAME = tos.TABLE_NAME + and tos.OPTION_NAME = "description" +WHERE + table_type in ('{BigqueryTableType.VIEW}', '{BigqueryTableType.MATERIALIZED_VIEW}') +order by + table_schema ASC, + table_name ASC +""" + + columns_for_dataset: str = """ +select + c.table_catalog as table_catalog, + c.table_schema as table_schema, + c.table_name as table_name, + c.column_name as column_name, + c.ordinal_position as ordinal_position, + cfp.field_path as field_path, + c.is_nullable as is_nullable, + CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, + description as comment, + c.is_hidden as is_hidden, + c.is_partitioning_column as is_partitioning_column +from + `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMNS c + join `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name + and cfp.column_name = c.column_name +ORDER BY + table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC""" + + optimized_columns_for_dataset: str = """ +select * from +(select + c.table_catalog as table_catalog, + c.table_schema as table_schema, + c.table_name as table_name, + c.column_name as column_name, + c.ordinal_position as ordinal_position, + cfp.field_path as field_path, + c.is_nullable as is_nullable, + CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, + description as comment, + c.is_hidden as is_hidden, + c.is_partitioning_column as is_partitioning_column, + -- We count the columns to be able limit it later + row_number() over (partition by c.table_catalog, c.table_schema, c.table_name order by c.ordinal_position asc, c.data_type DESC) as column_num, + -- Getting the maximum shard for each table + row_number() over (partition by c.table_catalog, c.table_schema, ifnull(REGEXP_EXTRACT(c.table_name, r'(.*)_\\d{{8}}$'), c.table_name), cfp.field_path order by c.table_catalog, c.table_schema asc, c.table_name desc) as shard_num +from + `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMNS c + join `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name + and cfp.column_name = c.column_name + ) +-- We filter column limit + 1 to make sure we warn about the limit being reached 
but not reading too much data +where column_num <= {column_limit} and shard_num = 1 +ORDER BY + table_catalog, table_schema, table_name, ordinal_position, column_num ASC, data_type DESC""" + + columns_for_table: str = """ +select + c.table_catalog as table_catalog, + c.table_schema as table_schema, + c.table_name as table_name, + c.column_name as column_name, + c.ordinal_position as ordinal_position, + cfp.field_path as field_path, + c.is_nullable as is_nullable, + CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, + c.is_hidden as is_hidden, + c.is_partitioning_column as is_partitioning_column, + description as comment +from + `{table_identifier.project_id}`.`{table_identifier.dataset}`.INFORMATION_SCHEMA.COLUMNS as c + join `{table_identifier.project_id}`.`{table_identifier.dataset}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name + and cfp.column_name = c.column_name +where + c.table_name = '{table_identifier.table}' +ORDER BY + table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC""" diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py index 84f218074d99b..4c4996ea59ed2 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_source.py +++ b/metadata-ingestion/tests/unit/test_bigquery_source.py @@ -93,8 +93,12 @@ def test_bigquery_uri_with_credential(): raise e -@patch("google.cloud.bigquery.client.Client") -def test_get_projects_with_project_ids(client_mock): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_get_projects_with_project_ids(get_bigquery_client): + client_mock = MagicMock() + get_bigquery_client.return_value = client_mock config = BigQueryV2Config.parse_obj( { "project_ids": ["test-1", "test-2"], @@ -118,7 +122,12 @@ def test_get_projects_with_project_ids(client_mock): assert client_mock.list_projects.call_count == 0 -def test_get_projects_with_project_ids_overrides_project_id_pattern(): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_get_projects_with_project_ids_overrides_project_id_pattern( + get_bigquery_client, +): config = BigQueryV2Config.parse_obj( { "project_ids": ["test-project", "test-project-2"], @@ -133,7 +142,10 @@ def test_get_projects_with_project_ids_overrides_project_id_pattern(): ] -def test_get_dataplatform_instance_aspect_returns_project_id(): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_get_dataplatform_instance_aspect_returns_project_id(get_bigquery_client): project_id = "project_id" expected_instance = ( f"urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,{project_id})" @@ -153,8 +165,12 @@ def test_get_dataplatform_instance_aspect_returns_project_id(): assert metadata.aspect.instance == expected_instance -@patch("google.cloud.bigquery.client.Client") -def test_get_projects_with_single_project_id(client_mock): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_get_projects_with_single_project_id(get_bigquery_client): + client_mock = MagicMock() + get_bigquery_client.return_value = client_mock config = BigQueryV2Config.parse_obj({"project_id": "test-3"}) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test1")) assert source._get_projects() == [ @@ -163,8 +179,12 @@ def 
test_get_projects_with_single_project_id(client_mock): assert client_mock.list_projects.call_count == 0 -@patch("google.cloud.bigquery.client.Client") -def test_get_projects_by_list(client_mock): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_get_projects_by_list(get_bigquery_client): + client_mock = MagicMock() + get_bigquery_client.return_value = client_mock client_mock.list_projects.return_value = [ SimpleNamespace( project_id="test-1", @@ -178,7 +198,6 @@ def test_get_projects_by_list(client_mock): config = BigQueryV2Config.parse_obj({}) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test1")) - source.bigquery_data_dictionary.set_client(client_mock) assert source._get_projects() == [ BigqueryProject("test-1", "one"), BigqueryProject("test-2", "two"), @@ -187,7 +206,10 @@ def test_get_projects_by_list(client_mock): @patch.object(BigQueryTechnicalSchemaApi, "get_projects") -def test_get_projects_filter_by_pattern(get_projects_mock): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_get_projects_filter_by_pattern(get_bigquery_client, get_projects_mock): get_projects_mock.return_value = [ BigqueryProject("test-project", "Test Project"), BigqueryProject("test-project-2", "Test Project 2"), @@ -204,7 +226,10 @@ def test_get_projects_filter_by_pattern(get_projects_mock): @patch.object(BigQueryTechnicalSchemaApi, "get_projects") -def test_get_projects_list_empty(get_projects_mock): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_get_projects_list_empty(get_bigquery_client, get_projects_mock): get_projects_mock.return_value = [] config = BigQueryV2Config.parse_obj( @@ -217,8 +242,13 @@ def test_get_projects_list_empty(get_projects_mock): @patch.object(BigQueryTechnicalSchemaApi, "get_projects") +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) def test_get_projects_list_failure( - get_projects_mock: MagicMock, caplog: pytest.LogCaptureFixture + get_bigquery_client: MagicMock, + get_projects_mock: MagicMock, + caplog: pytest.LogCaptureFixture, ) -> None: error_str = "my error" get_projects_mock.side_effect = GoogleAPICallError(error_str) @@ -237,7 +267,10 @@ def test_get_projects_list_failure( @patch.object(BigQueryTechnicalSchemaApi, "get_projects") -def test_get_projects_list_fully_filtered(get_projects_mock): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_get_projects_list_fully_filtered(get_projects_mock, get_bigquery_client): get_projects_mock.return_value = [BigqueryProject("test-project", "Test Project")] config = BigQueryV2Config.parse_obj( @@ -249,7 +282,10 @@ def test_get_projects_list_fully_filtered(get_projects_mock): assert projects == [] -def test_simple_upstream_table_generation(): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_simple_upstream_table_generation(get_bigquery_client): a: BigQueryTableRef = BigQueryTableRef( BigqueryTableIdentifier( project_id="test-project", dataset="test-dataset", table="a" @@ -280,7 +316,12 @@ def test_simple_upstream_table_generation(): assert list(upstreams)[0].table == str(b) -def test_upstream_table_generation_with_temporary_table_without_temp_upstream(): +@patch( + 
"datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_upstream_table_generation_with_temporary_table_without_temp_upstream( + get_bigquery_client, +): a: BigQueryTableRef = BigQueryTableRef( BigqueryTableIdentifier( project_id="test-project", dataset="test-dataset", table="a" @@ -310,7 +351,10 @@ def test_upstream_table_generation_with_temporary_table_without_temp_upstream(): assert list(upstreams) == [] -def test_upstream_table_column_lineage_with_temp_table(): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_upstream_table_column_lineage_with_temp_table(get_bigquery_client): from datahub.ingestion.api.common import PipelineContext a: BigQueryTableRef = BigQueryTableRef( @@ -384,7 +428,12 @@ def test_upstream_table_column_lineage_with_temp_table(): assert upstream.column_confidence == 0.7 -def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstream(): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstream( + get_bigquery_client, +): a: BigQueryTableRef = BigQueryTableRef( BigqueryTableIdentifier( project_id="test-project", dataset="test-dataset", table="a" @@ -447,8 +496,12 @@ def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstr @patch( "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_tables_for_dataset" ) -@patch("google.cloud.bigquery.client.Client") -def test_table_processing_logic(client_mock, data_dictionary_mock): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_table_processing_logic(get_bigquery_client, data_dictionary_mock): + client_mock = MagicMock() + get_bigquery_client.return_value = client_mock config = BigQueryV2Config.parse_obj( { "project_id": "test-project", @@ -498,7 +551,6 @@ def test_table_processing_logic(client_mock, data_dictionary_mock): data_dictionary_mock.get_tables_for_dataset.return_value = None source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) - source.bigquery_data_dictionary.set_client(client_mock) _ = list( source.get_tables_for_dataset( @@ -519,8 +571,14 @@ def test_table_processing_logic(client_mock, data_dictionary_mock): @patch( "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_tables_for_dataset" ) -@patch("google.cloud.bigquery.client.Client") -def test_table_processing_logic_date_named_tables(client_mock, data_dictionary_mock): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_table_processing_logic_date_named_tables( + get_bigquery_client, data_dictionary_mock +): + client_mock = MagicMock() + get_bigquery_client.return_value = client_mock # test that tables with date names are processed correctly config = BigQueryV2Config.parse_obj( { @@ -571,7 +629,6 @@ def test_table_processing_logic_date_named_tables(client_mock, data_dictionary_m data_dictionary_mock.get_tables_for_dataset.return_value = None source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) - source.bigquery_data_dictionary.set_client(client_mock) _ = list( source.get_tables_for_dataset( @@ -627,13 +684,17 @@ def bigquery_view_2() -> BigqueryView: @patch( 
"datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_query_result" ) -@patch("google.cloud.bigquery.client.Client") +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) def test_get_views_for_dataset( - client_mock: Mock, + get_bigquery_client: Mock, query_mock: Mock, bigquery_view_1: BigqueryView, bigquery_view_2: BigqueryView, ) -> None: + client_mock = MagicMock() + get_bigquery_client.return_value = client_mock assert bigquery_view_1.last_altered row1 = create_row( dict( @@ -655,8 +716,9 @@ def test_get_views_for_dataset( ) ) query_mock.return_value = [row1, row2] - bigquery_data_dictionary = BigQueryTechnicalSchemaApi(BigQueryV2Report()) - bigquery_data_dictionary.set_client(client_mock) + bigquery_data_dictionary = BigQueryTechnicalSchemaApi( + BigQueryV2Report(), client_mock + ) views = bigquery_data_dictionary.get_views_for_dataset( project_id="test-project", @@ -667,7 +729,12 @@ def test_get_views_for_dataset( @patch.object(BigqueryV2Source, "gen_dataset_workunits", lambda *args, **kwargs: []) -def test_gen_view_dataset_workunits(bigquery_view_1, bigquery_view_2): +@patch( + "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" +) +def test_gen_view_dataset_workunits( + get_bigquery_client, bigquery_view_1, bigquery_view_2 +): project_id = "test-project" dataset_name = "test-dataset" config = BigQueryV2Config.parse_obj( From e57f134e0a7ba880e103928ad8af825befa2c1d8 Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Mon, 14 Aug 2023 16:54:41 +0530 Subject: [PATCH 04/11] report composition vs inheritance --- .../ingestion/source/bigquery_v2/bigquery.py | 10 +- .../source/bigquery_v2/bigquery_report.py | 23 ++- .../source/bigquery_v2/bigquery_schema_api.py | 12 +- .../ingestion/source/bigquery_v2/lineage.py | 12 +- .../ingestion/source/bigquery_v2/usage.py | 4 +- .../tests/unit/test_bigquery_source.py | 140 +++++++----------- 6 files changed, 83 insertions(+), 118 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 4ff4648657959..542e153303257 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -47,8 +47,8 @@ BigqueryColumn, BigqueryDataset, BigqueryProject, + BigQuerySchemaApi, BigqueryTable, - BigQueryTechnicalSchemaApi, BigqueryView, ) from datahub.ingestion.source.bigquery_v2.common import ( @@ -220,8 +220,8 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): set_dataset_urn_to_lower(self.config.convert_urns_to_lowercase) - self.bigquery_data_dictionary = BigQueryTechnicalSchemaApi( - self.report, self.config.get_bigquery_client() + self.bigquery_data_dictionary = BigQuerySchemaApi( + self.report.schema_api_perf, self.config.get_bigquery_client() ) # For database, schema, tables, views, etc @@ -300,7 +300,9 @@ def metadata_read_capability_test( client: bigquery.Client = config.get_bigquery_client() assert client report = BigQueryV2Report() - bigquery_data_dictionary = BigQueryTechnicalSchemaApi(report, client) + bigquery_data_dictionary = BigQuerySchemaApi( + report.schema_api_perf, client + ) result = bigquery_data_dictionary.get_datasets_for_project_id( project_id, 10 ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py 
b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py index 6d5822723ec64..bf11045f24c24 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py @@ -1,5 +1,4 @@ import collections -import dataclasses import logging from dataclasses import dataclass, field from datetime import datetime, timezone @@ -15,7 +14,7 @@ logger: logging.Logger = logging.getLogger(__name__) -class BigQueryApiPerfReport: +class BigQuerySchemaApiPerfReport: list_projects = PerfTimer() list_datasets = PerfTimer() get_columns_for_dataset = PerfTimer() @@ -30,9 +29,7 @@ class BigQueryAuditLogApiPerfReport: @dataclass -class BigQueryV2Report( - ProfilingSqlReport, BigQueryApiPerfReport, BigQueryAuditLogApiPerfReport -): +class BigQueryV2Report(ProfilingSqlReport): num_total_lineage_entries: TopKDict[str, int] = field(default_factory=TopKDict) num_skipped_lineage_entries_missing_data: TopKDict[str, int] = field( default_factory=int_top_k_dict @@ -106,16 +103,18 @@ class BigQueryV2Report( num_view_definitions_failed_column_parsing: int = 0 view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList) - read_reasons_stat: Counter[str] = dataclasses.field( - default_factory=collections.Counter - ) - operation_types_stat: Counter[str] = dataclasses.field( - default_factory=collections.Counter - ) + read_reasons_stat: Counter[str] = field(default_factory=collections.Counter) + operation_types_stat: Counter[str] = field(default_factory=collections.Counter) + usage_state_size: Optional[str] = None ingestion_stage: Optional[str] = None ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict) - + schema_api_perf: BigQuerySchemaApiPerfReport = field( + default_factory=BigQuerySchemaApiPerfReport + ) + audit_log_api_perf: BigQueryAuditLogApiPerfReport = field( + default_factory=BigQueryAuditLogApiPerfReport + ) _timer: Optional[PerfTimer] = field( default=None, init=False, repr=False, compare=False ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py index b627af15ca213..9db6b27aa24e3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py @@ -14,7 +14,7 @@ from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier from datahub.ingestion.source.bigquery_v2.bigquery_report import ( - BigQueryApiPerfReport, + BigQuerySchemaApiPerfReport, BigQueryV2Report, ) from datahub.ingestion.source.bigquery_v2.queries import ( @@ -125,8 +125,10 @@ class BigqueryProject: datasets: List[BigqueryDataset] = field(default_factory=list) -class BigQueryTechnicalSchemaApi: - def __init__(self, report: BigQueryApiPerfReport, client: bigquery.Client) -> None: +class BigQuerySchemaApi: + def __init__( + self, report: BigQuerySchemaApiPerfReport, client: bigquery.Client + ) -> None: self.bq_client = client self.api_perf_report = report @@ -229,7 +231,7 @@ def get_tables_for_dataset( for table in cur: try: with current_timer.pause_timer(): - yield BigQueryTechnicalSchemaApi._make_bigquery_table( + yield BigQuerySchemaApi._make_bigquery_table( table, tables.get(table.table_name) ) except Exception as e: @@ -306,7 +308,7 @@ def get_views_for_dataset( for table in cur: try: with 
current_timer.pause_timer(): - yield BigQueryTechnicalSchemaApi._make_bigquery_view(table) + yield BigQuerySchemaApi._make_bigquery_view(table) except Exception as e: view_name = f"{project_id}.{dataset_name}.{table.table_name}" logger.warning( diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index 4dc01bb1c7232..bb2f0360c13a6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -27,9 +27,7 @@ ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( - BigQueryTechnicalSchemaApi, -) +from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import BigQuerySchemaApi from datahub.ingestion.source.bigquery_v2.common import BQ_DATETIME_FORMAT from datahub.metadata.schema_classes import ( AuditStampClass, @@ -192,7 +190,9 @@ def __init__( self.report = report self.dataset_urn_builder = dataset_urn_builder self.audit_log_api = BigQueryAuditLogApi( - report, self.config.rate_limit, self.config.requests_per_min + report.audit_log_api_perf, + self.config.rate_limit, + self.config.requests_per_min, ) def error(self, log: logging.Logger, key: str, reason: str) -> None: @@ -394,8 +394,8 @@ def lineage_via_catalog_lineage_api( try: lineage_client: lineage_v1.LineageClient = lineage_v1.LineageClient() - data_dictionary = BigQueryTechnicalSchemaApi( - self.report, self.config.get_bigquery_client() + data_dictionary = BigQuerySchemaApi( + self.report.schema_api_perf, self.config.get_bigquery_client() ) # Filtering datasets diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py index 51d74168c4970..f0bfc2e477371 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py @@ -776,7 +776,9 @@ def _get_parsed_bigquery_log_events( self, project_id: str, limit: Optional[int] = None ) -> Iterable[AuditEvent]: audit_log_api = BigQueryAuditLogApi( - self.report, self.config.rate_limit, self.config.requests_per_min + self.report.audit_log_api_perf, + self.config.rate_limit, + self.config.requests_per_min, ) # We adjust the filter values a bit, since we need to make sure that the join # between query events and read events is complete. 
For example, this helps us diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py index 4c4996ea59ed2..4a7d1eef399d6 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_source.py +++ b/metadata-ingestion/tests/unit/test_bigquery_source.py @@ -21,7 +21,7 @@ from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( BigqueryProject, - BigQueryTechnicalSchemaApi, + BigQuerySchemaApi, BigqueryView, ) from datahub.ingestion.source.bigquery_v2.lineage import ( @@ -93,12 +93,10 @@ def test_bigquery_uri_with_credential(): raise e -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) -def test_get_projects_with_project_ids(get_bigquery_client): +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_get_projects_with_project_ids(get_bq_client_mock): client_mock = MagicMock() - get_bigquery_client.return_value = client_mock + get_bq_client_mock.return_value = client_mock config = BigQueryV2Config.parse_obj( { "project_ids": ["test-1", "test-2"], @@ -122,11 +120,9 @@ def test_get_projects_with_project_ids(get_bigquery_client): assert client_mock.list_projects.call_count == 0 -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) +@patch.object(BigQueryV2Config, "get_bigquery_client") def test_get_projects_with_project_ids_overrides_project_id_pattern( - get_bigquery_client, + get_bq_client_mock, ): config = BigQueryV2Config.parse_obj( { @@ -142,10 +138,8 @@ def test_get_projects_with_project_ids_overrides_project_id_pattern( ] -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) -def test_get_dataplatform_instance_aspect_returns_project_id(get_bigquery_client): +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_get_dataplatform_instance_aspect_returns_project_id(get_bq_client_mock): project_id = "project_id" expected_instance = ( f"urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,{project_id})" @@ -165,12 +159,10 @@ def test_get_dataplatform_instance_aspect_returns_project_id(get_bigquery_client assert metadata.aspect.instance == expected_instance -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) -def test_get_projects_with_single_project_id(get_bigquery_client): +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_get_projects_with_single_project_id(get_bq_client_mock): client_mock = MagicMock() - get_bigquery_client.return_value = client_mock + get_bq_client_mock.return_value = client_mock config = BigQueryV2Config.parse_obj({"project_id": "test-3"}) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test1")) assert source._get_projects() == [ @@ -179,12 +171,10 @@ def test_get_projects_with_single_project_id(get_bigquery_client): assert client_mock.list_projects.call_count == 0 -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) -def test_get_projects_by_list(get_bigquery_client): +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_get_projects_by_list(get_bq_client_mock): client_mock = MagicMock() - get_bigquery_client.return_value = client_mock + get_bq_client_mock.return_value = client_mock client_mock.list_projects.return_value = [ SimpleNamespace( project_id="test-1", @@ -205,11 +195,9 @@ 
def test_get_projects_by_list(get_bigquery_client): assert client_mock.list_projects.call_count == 1 -@patch.object(BigQueryTechnicalSchemaApi, "get_projects") -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) -def test_get_projects_filter_by_pattern(get_bigquery_client, get_projects_mock): +@patch.object(BigQuerySchemaApi, "get_projects") +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_get_projects_filter_by_pattern(get_bq_client_mock, get_projects_mock): get_projects_mock.return_value = [ BigqueryProject("test-project", "Test Project"), BigqueryProject("test-project-2", "Test Project 2"), @@ -225,11 +213,9 @@ def test_get_projects_filter_by_pattern(get_bigquery_client, get_projects_mock): ] -@patch.object(BigQueryTechnicalSchemaApi, "get_projects") -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) -def test_get_projects_list_empty(get_bigquery_client, get_projects_mock): +@patch.object(BigQuerySchemaApi, "get_projects") +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_get_projects_list_empty(get_bq_client_mock, get_projects_mock): get_projects_mock.return_value = [] config = BigQueryV2Config.parse_obj( @@ -241,12 +227,10 @@ def test_get_projects_list_empty(get_bigquery_client, get_projects_mock): assert projects == [] -@patch.object(BigQueryTechnicalSchemaApi, "get_projects") -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) +@patch.object(BigQuerySchemaApi, "get_projects") +@patch.object(BigQueryV2Config, "get_bigquery_client") def test_get_projects_list_failure( - get_bigquery_client: MagicMock, + get_bq_client_mock: MagicMock, get_projects_mock: MagicMock, caplog: pytest.LogCaptureFixture, ) -> None: @@ -266,11 +250,9 @@ def test_get_projects_list_failure( assert projects == [] -@patch.object(BigQueryTechnicalSchemaApi, "get_projects") -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) -def test_get_projects_list_fully_filtered(get_projects_mock, get_bigquery_client): +@patch.object(BigQuerySchemaApi, "get_projects") +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_get_projects_list_fully_filtered(get_projects_mock, get_bq_client_mock): get_projects_mock.return_value = [BigqueryProject("test-project", "Test Project")] config = BigQueryV2Config.parse_obj( @@ -282,10 +264,8 @@ def test_get_projects_list_fully_filtered(get_projects_mock, get_bigquery_client assert projects == [] -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) -def test_simple_upstream_table_generation(get_bigquery_client): +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_simple_upstream_table_generation(get_bq_client_mock): a: BigQueryTableRef = BigQueryTableRef( BigqueryTableIdentifier( project_id="test-project", dataset="test-dataset", table="a" @@ -316,11 +296,9 @@ def test_simple_upstream_table_generation(get_bigquery_client): assert list(upstreams)[0].table == str(b) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) +@patch.object(BigQueryV2Config, "get_bigquery_client") def test_upstream_table_generation_with_temporary_table_without_temp_upstream( - get_bigquery_client, + get_bq_client_mock, ): a: BigQueryTableRef = BigQueryTableRef( BigqueryTableIdentifier( @@ -351,10 +329,8 @@ def 
test_upstream_table_generation_with_temporary_table_without_temp_upstream( assert list(upstreams) == [] -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) -def test_upstream_table_column_lineage_with_temp_table(get_bigquery_client): +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_upstream_table_column_lineage_with_temp_table(get_bq_client_mock): from datahub.ingestion.api.common import PipelineContext a: BigQueryTableRef = BigQueryTableRef( @@ -428,11 +404,9 @@ def test_upstream_table_column_lineage_with_temp_table(get_bigquery_client): assert upstream.column_confidence == 0.7 -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) +@patch.object(BigQueryV2Config, "get_bigquery_client") def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstream( - get_bigquery_client, + get_bq_client_mock, ): a: BigQueryTableRef = BigQueryTableRef( BigqueryTableIdentifier( @@ -493,15 +467,11 @@ def test_upstream_table_generation_with_temporary_table_with_multiple_temp_upstr assert sorted_list[1].table == str(e) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_tables_for_dataset" -) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) -def test_table_processing_logic(get_bigquery_client, data_dictionary_mock): +@patch.object(BigQuerySchemaApi, "get_tables_for_dataset") +@patch.object(BigQueryV2Config, "get_bigquery_client") +def test_table_processing_logic(get_bq_client_mock, data_dictionary_mock): client_mock = MagicMock() - get_bigquery_client.return_value = client_mock + get_bq_client_mock.return_value = client_mock config = BigQueryV2Config.parse_obj( { "project_id": "test-project", @@ -568,17 +538,13 @@ def test_table_processing_logic(get_bigquery_client, data_dictionary_mock): assert table in ["test-table", "test-sharded-table_20220102"] -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_tables_for_dataset" -) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) +@patch.object(BigQuerySchemaApi, "get_tables_for_dataset") +@patch.object(BigQueryV2Config, "get_bigquery_client") def test_table_processing_logic_date_named_tables( - get_bigquery_client, data_dictionary_mock + get_bq_client_mock, data_dictionary_mock ): client_mock = MagicMock() - get_bigquery_client.return_value = client_mock + get_bq_client_mock.return_value = client_mock # test that tables with date names are processed correctly config = BigQueryV2Config.parse_obj( { @@ -681,20 +647,16 @@ def bigquery_view_2() -> BigqueryView: ) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_query_result" -) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) +@patch.object(BigQuerySchemaApi, "get_query_result") +@patch.object(BigQueryV2Config, "get_bigquery_client") def test_get_views_for_dataset( - get_bigquery_client: Mock, + get_bq_client_mock: Mock, query_mock: Mock, bigquery_view_1: BigqueryView, bigquery_view_2: BigqueryView, ) -> None: client_mock = MagicMock() - get_bigquery_client.return_value = client_mock + get_bq_client_mock.return_value = client_mock assert bigquery_view_1.last_altered row1 = create_row( dict( @@ -716,8 +678,8 @@ def test_get_views_for_dataset( ) ) query_mock.return_value 
= [row1, row2] - bigquery_data_dictionary = BigQueryTechnicalSchemaApi( - BigQueryV2Report(), client_mock + bigquery_data_dictionary = BigQuerySchemaApi( + BigQueryV2Report().schema_api_perf, client_mock ) views = bigquery_data_dictionary.get_views_for_dataset( @@ -729,11 +691,9 @@ def test_get_views_for_dataset( @patch.object(BigqueryV2Source, "gen_dataset_workunits", lambda *args, **kwargs: []) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_config.BigQueryV2Config.get_bigquery_client" -) +@patch.object(BigQueryV2Config, "get_bigquery_client") def test_gen_view_dataset_workunits( - get_bigquery_client, bigquery_view_1, bigquery_view_2 + get_bq_client_mock, bigquery_view_1, bigquery_view_2 ): project_id = "test-project" dataset_name = "test-dataset" From 38b18bb2c940dc53cd9585b6945e28148145b1ca Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Mon, 14 Aug 2023 20:23:59 +0530 Subject: [PATCH 05/11] more refractor and fixes --- .../ingestion/source/bigquery_v2/bigquery.py | 4 +- .../bigquery_v2/bigquery_audit_log_api.py | 211 +----------------- .../source/bigquery_v2/bigquery_config.py | 6 +- .../source/bigquery_v2/bigquery_schema_api.py | 24 +- .../ingestion/source/bigquery_v2/lineage.py | 20 +- .../ingestion/source/bigquery_v2/queries.py | 199 +++++++++++++++++ .../ingestion/source/bigquery_v2/usage.py | 6 +- 7 files changed, 246 insertions(+), 224 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 542e153303257..67b9d6556b3e8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -299,9 +299,8 @@ def metadata_read_capability_test( logger.info((f"Metadata read capability test for project {project_id}")) client: bigquery.Client = config.get_bigquery_client() assert client - report = BigQueryV2Report() bigquery_data_dictionary = BigQuerySchemaApi( - report.schema_api_perf, client + BigQueryV2Report().schema_api_perf, client ) result = bigquery_data_dictionary.get_datasets_for_project_id( project_id, 10 @@ -524,6 +523,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: [p.id for p in projects], self.sql_parser_schema_resolver, self.view_definition_ids, + self.view_definitions, self.table_refs, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py index b017b1d08a1ee..fcb6200241cd7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py @@ -1,5 +1,4 @@ import logging -import textwrap from datetime import datetime from typing import Callable, Iterable, List, Optional @@ -40,7 +39,12 @@ def get_exported_bigquery_audit_metadata( self, bigquery_client: bigquery.Client, bigquery_audit_metadata_query_template: Callable[ - [str, bool, Optional[int]], str + [ + str, # dataset: str + bool, # use_date_sharded_tables: bool + Optional[int], # limit: Optional[int] = None + ], + str, ], bigquery_audit_metadata_datasets: Optional[List[str]], use_date_sharded_audit_log_tables: bool, @@ -57,6 +61,10 @@ def get_exported_bigquery_audit_metadata( audit_end_time = end_time.strftime(BQ_DATETIME_FORMAT) audit_end_date = end_time.strftime(BQ_DATE_SHARD_FORMAT) + rate_limiter: 
Optional[RateLimiter] = None + if self.rate_limit: + rate_limiter = RateLimiter(max_calls=self.requests_per_min, period=60) + with self.report.get_exported_log_entries as current_timer: for dataset in bigquery_audit_metadata_datasets: logger.info( @@ -79,8 +87,8 @@ def get_exported_bigquery_audit_metadata( f"Finished loading log entries from BigQueryAuditMetadata in {dataset}" ) - if self.rate_limit: - with RateLimiter(max_calls=self.requests_per_min, period=60): + if rate_limiter: + with rate_limiter: for entry in query_job: with current_timer.pause_timer(): yield entry @@ -136,198 +144,3 @@ def get_bigquery_log_entries_via_gcp_logging( logger.info( f"Finished loading log entries from GCP Log for {client.project}" ) - - -def bigquery_audit_metadata_query_template_usage( - dataset: str, - use_date_sharded_tables: bool, - limit: Optional[int] = None, -) -> str: - """ - Receives a dataset (with project specified) and returns a query template that is used to query exported - v2 AuditLogs containing protoPayloads of type BigQueryAuditMetadata. - :param dataset: the dataset to query against in the form of $PROJECT.$DATASET - :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log - tables - :param limit: maximum number of events to query for - :return: a query template, when supplied start_time and end_time, can be used to query audit logs from BigQuery - """ - - limit_text = f"limit {limit}" if limit else "" - - shard_condition = "" - if use_date_sharded_tables: - from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`" - shard_condition = ( - """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """ - ) - else: - from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`" - - # Deduplicates insertId via QUALIFY, see: - # https://cloud.google.com/logging/docs/reference/v2/rest/v2/LogEntry, insertId field - query = f""" - SELECT - timestamp, - logName, - insertId, - protopayload_auditlog AS protoPayload, - protopayload_auditlog.metadataJson AS metadata - FROM - {from_table} - WHERE ( - timestamp >= "{{start_time}}" - AND timestamp < "{{end_time}}" - ) - {shard_condition} - AND protopayload_auditlog.serviceName="bigquery.googleapis.com" - AND - ( - ( - protopayload_auditlog.methodName IN - ( - "google.cloud.bigquery.v2.JobService.Query", - "google.cloud.bigquery.v2.JobService.InsertJob" - ) - AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE" - AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL - AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL - AND ( - JSON_EXTRACT_ARRAY(protopayload_auditlog.metadataJson, - "$.jobChange.job.jobStats.queryStats.referencedTables") IS NOT NULL - OR - JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig.destinationTable") IS NOT NULL - ) - ) - OR - JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.tableDataRead.reason") = "JOB" - ) - QUALIFY ROW_NUMBER() OVER (PARTITION BY insertId, timestamp, logName) = 1 - {limit_text}; - """ - - return textwrap.dedent(query) - - -def bigquery_audit_metadata_query_template_lineage( - dataset: str, use_date_sharded_tables: bool, limit: Optional[int] = None -) -> str: - """ - Receives a dataset (with project specified) and returns a query template that is used to query exported - AuditLogs containing protoPayloads of type 
BigQueryAuditMetadata. - Include only those that: - - have been completed (jobStatus.jobState = "DONE") - - do not contain errors (jobStatus.errorResults is none) - :param dataset: the dataset to query against in the form of $PROJECT.$DATASET - :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log - tables - :param limit: set a limit for the maximum event to return. It is used for connection testing currently - :return: a query template, when supplied start_time and end_time, can be used to query audit logs from BigQuery - """ - limit_text = f"limit {limit}" if limit else "" - - shard_condition = "" - if use_date_sharded_tables: - from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`" - shard_condition = ( - """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """ - ) - else: - from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`" - - query = f""" - SELECT - timestamp, - logName, - insertId, - protopayload_auditlog AS protoPayload, - protopayload_auditlog.metadataJson AS metadata - FROM - {from_table} - WHERE ( - timestamp >= "{{start_time}}" - AND timestamp < "{{end_time}}" - ) - {shard_condition} - AND protopayload_auditlog.serviceName="bigquery.googleapis.com" - AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE" - AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL - AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL - {limit_text}; - """ - - return textwrap.dedent(query) - - -BQ_FILTER_RULE_TEMPLATE_V2_USAGE = """ -resource.type=("bigquery_project" OR "bigquery_dataset") -AND -timestamp >= "{start_time}" -AND -timestamp < "{end_time}" -AND protoPayload.serviceName="bigquery.googleapis.com" -AND -( - ( - protoPayload.methodName= - ( - "google.cloud.bigquery.v2.JobService.Query" - OR - "google.cloud.bigquery.v2.JobService.InsertJob" - ) - AND protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE" - AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:* - AND protoPayload.metadata.jobChange.job.jobConfig.queryConfig:* - AND - ( - ( - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:* - AND NOT protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables =~ "projects/.*/datasets/.*/tables/__TABLES__|__TABLES_SUMMARY__|INFORMATION_SCHEMA.*" - ) - OR - ( - protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable:* - ) - ) - ) - OR - protoPayload.metadata.tableDataRead.reason = "JOB" -) -""".strip( - "\t \n" -) - -BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE = """ -resource.type=("bigquery_project") -AND -( - protoPayload.methodName= - ( - "google.cloud.bigquery.v2.JobService.Query" - OR - "google.cloud.bigquery.v2.JobService.InsertJob" - ) - AND - protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE" - AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:* - AND ( - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:* - OR - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedViews:* - ) - AND ( - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/_.*/tables/anon.*" - AND - protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/INFORMATION_SCHEMA.*" - AND - 
protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/__TABLES__" - AND - protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable !~ "projects/.*/datasets/_.*/tables/anon.*" - ) - -) -AND -timestamp >= "{start_time}" -AND -timestamp < "{end_time}" -""".strip() diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index 84fdead338ee6..6449c6ead1e58 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -269,13 +269,13 @@ def profile_default_settings(cls, values: Dict) -> Dict: @validator("bigquery_audit_metadata_datasets") def validate_bigquery_audit_metadata_datasets( cls, v: Optional[List[str]], values: Dict - ) -> Dict: + ) -> Optional[List[str]]: if values.get("use_exported_bigquery_audit_metadata"): assert ( v and len(v) > 0 - ), "`bigquery_audit_metadata_datasets` should be set if using `use_exported_bigquery_audit_metadata` for usage/lineage." + ), "`bigquery_audit_metadata_datasets` should be set if using `use_exported_bigquery_audit_metadata: True`." - return values + return v @root_validator(pre=False) def backward_compatibility_configs_set(cls, values: Dict) -> Dict: diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py index 9db6b27aa24e3..ca3aae7394469 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py @@ -130,7 +130,7 @@ def __init__( self, report: BigQuerySchemaApiPerfReport, client: bigquery.Client ) -> None: self.bq_client = client - self.api_perf_report = report + self.report = report def get_client(self) -> bigquery.Client: assert self.bq_client is not None @@ -142,7 +142,7 @@ def get_query_result(self, query: str) -> RowIterator: return resp.result() def get_projects(self) -> List[BigqueryProject]: - with self.api_perf_report.list_projects: + with self.report.list_projects: projects = self.get_client().list_projects() return [ @@ -152,7 +152,7 @@ def get_projects(self) -> List[BigqueryProject]: def get_datasets_for_project_id( self, project_id: str, maxResults: Optional[int] = None ) -> List[BigqueryDataset]: - with self.api_perf_report.list_datasets: + with self.report.list_datasets: datasets = self.get_client().list_datasets( project_id, max_results=maxResults ) @@ -187,7 +187,7 @@ def get_datasets_for_project_id_with_information_schema( def list_tables( self, dataset_name: str, project_id: str ) -> Iterator[TableListItem]: - with self.api_perf_report.list_tables as current_timer: + with self.report.list_tables as current_timer: for table in self.get_client().list_tables(f"{project_id}.{dataset_name}"): with current_timer.pause_timer(): yield table @@ -200,8 +200,8 @@ def get_tables_for_dataset( with_data_read_permission: bool = False, report: Optional[BigQueryV2Report] = None, ) -> Iterator[BigqueryTable]: - with self.api_perf_report.get_tables_for_dataset as current_timer: - filter: str = ", ".join(f"'{table}'" for table in tables.keys()) + with self.report.get_tables_for_dataset as current_timer: + filter_clause: str = ", ".join(f"'{table}'" for table in tables.keys()) if with_data_read_permission: 
# Tables are ordered by name and table suffix to make sure we always process the latest sharded table @@ -210,8 +210,8 @@ def get_tables_for_dataset( BigqueryQuery.tables_for_dataset.format( project_id=project_id, dataset_name=dataset_name, - table_filter=f" and t.table_name in ({filter})" - if filter + table_filter=f" and t.table_name in ({filter_clause})" + if filter_clause else "", ), ) @@ -222,8 +222,8 @@ def get_tables_for_dataset( BigqueryQuery.tables_for_dataset_without_partition_data.format( project_id=project_id, dataset_name=dataset_name, - table_filter=f" and t.table_name in ({filter})" - if filter + table_filter=f" and t.table_name in ({filter_clause})" + if filter_clause else "", ), ) @@ -291,7 +291,7 @@ def get_views_for_dataset( has_data_read: bool, report: Optional[BigQueryV2Report] = None, ) -> Iterator[BigqueryView]: - with self.api_perf_report.get_views_for_dataset as current_timer: + with self.report.get_views_for_dataset as current_timer: if has_data_read: cur = self.get_query_result( BigqueryQuery.views_for_dataset.format( @@ -344,7 +344,7 @@ def get_columns_for_dataset( run_optimized_column_query: bool = False, ) -> Optional[Dict[str, List[BigqueryColumn]]]: columns: Dict[str, List[BigqueryColumn]] = defaultdict(list) - with self.api_perf_report.get_columns_for_dataset: + with self.report.get_columns_for_dataset: try: cur = self.get_query_result( BigqueryQuery.columns_for_dataset.format( diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index bb2f0360c13a6..c68fe5ef81745 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -21,14 +21,16 @@ ReadEvent, ) from datahub.ingestion.source.bigquery_v2.bigquery_audit_log_api import ( - BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE, BigQueryAuditLogApi, - bigquery_audit_metadata_query_template_lineage, ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import BigQuerySchemaApi from datahub.ingestion.source.bigquery_v2.common import BQ_DATETIME_FORMAT +from datahub.ingestion.source.bigquery_v2.queries import ( + BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE, + bigquery_audit_metadata_query_template_lineage, +) from datahub.metadata.schema_classes import ( AuditStampClass, DatasetLineageTypeClass, @@ -40,6 +42,7 @@ ) from datahub.specific.dataset import DatasetPatchBuilder from datahub.utilities import memory_footprint +from datahub.utilities.file_backed_collections import FileBackedDict from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.sqlglot_lineage import ( SchemaResolver, @@ -204,6 +207,7 @@ def get_lineage_workunits( projects: List[str], sql_parser_schema_resolver: SchemaResolver, view_definition_ids: Dict[str, Dict[str, str]], + view_definitions: FileBackedDict[str], table_refs: Set[str], ) -> Iterable[MetadataWorkUnit]: views_skip_audit_log_lineage: Set[str] = set() @@ -213,6 +217,7 @@ def get_lineage_workunits( self.populate_view_lineage_with_sql_parsing( view_lineage, view_definition_ids[project], + view_definitions, sql_parser_schema_resolver, project, ) @@ -289,11 +294,12 @@ def populate_view_lineage_with_sql_parsing( self, view_lineage: Dict[str, Set[LineageEdge]], view_definition_ids: Dict[str, str], + view_definitions: 
FileBackedDict[str], sql_parser_schema_resolver: SchemaResolver, default_project: str, ) -> None: for view, view_definition_id in view_definition_ids.items(): - view_definition = view_definition_ids[view_definition_id] + view_definition = view_definitions[view_definition_id] raw_view_lineage = sqlglot_lineage( view_definition, schema_resolver=sql_parser_schema_resolver, @@ -493,7 +499,9 @@ def _get_parsed_audit_log_events(self, project_id: str) -> Iterable[QueryEvent]: parse_fn: Callable[[Any], Optional[Union[ReadEvent, QueryEvent]]] if self.config.use_exported_bigquery_audit_metadata: - self.get_exported_log_entries(corrected_start_time, corrected_end_time) + entries = self.get_exported_log_entries( + corrected_start_time, corrected_end_time + ) parse_fn = self._parse_exported_bigquery_audit_metadata else: entries = self.get_log_entries_via_gcp_logging( @@ -541,8 +549,8 @@ def get_log_entries_via_gcp_logging( entries = self.audit_log_api.get_bigquery_log_entries_via_gcp_logging( logging_client, BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE.format( - corrected_start_time.strftime(BQ_DATETIME_FORMAT), - corrected_end_time.strftime(BQ_DATETIME_FORMAT), + start_time=corrected_start_time.strftime(BQ_DATETIME_FORMAT), + end_time=corrected_end_time.strftime(BQ_DATETIME_FORMAT), ), self.config.log_page_size, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py index e04ea679584dc..86b2b9bd4aab8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py @@ -1,3 +1,7 @@ +import textwrap +from typing import Optional + + class BigqueryTableType: # See https://cloud.google.com/bigquery/docs/information-schema-tables#schema BASE_TABLE = "BASE TABLE" @@ -222,3 +226,198 @@ class BigqueryQuery: c.table_name = '{table_identifier.table}' ORDER BY table_catalog, table_schema, table_name, ordinal_position ASC, data_type DESC""" + + +BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE = """ +resource.type=("bigquery_project") +AND +( + protoPayload.methodName= + ( + "google.cloud.bigquery.v2.JobService.Query" + OR + "google.cloud.bigquery.v2.JobService.InsertJob" + ) + AND + protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE" + AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:* + AND ( + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:* + OR + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedViews:* + ) + AND ( + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/_.*/tables/anon.*" + AND + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/INFORMATION_SCHEMA.*" + AND + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables !~ "projects/.*/datasets/.*/tables/__TABLES__" + AND + protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable !~ "projects/.*/datasets/_.*/tables/anon.*" + ) + +) +AND +timestamp >= "{start_time}" +AND +timestamp < "{end_time}" +""".strip() +BQ_FILTER_RULE_TEMPLATE_V2_USAGE = """ +resource.type=("bigquery_project" OR "bigquery_dataset") +AND +timestamp >= "{start_time}" +AND +timestamp < "{end_time}" +AND protoPayload.serviceName="bigquery.googleapis.com" +AND +( + ( + protoPayload.methodName= + ( + "google.cloud.bigquery.v2.JobService.Query" + OR + "google.cloud.bigquery.v2.JobService.InsertJob" + ) + AND 
protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE" + AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:* + AND protoPayload.metadata.jobChange.job.jobConfig.queryConfig:* + AND + ( + ( + protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:* + AND NOT protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables =~ "projects/.*/datasets/.*/tables/__TABLES__|__TABLES_SUMMARY__|INFORMATION_SCHEMA.*" + ) + OR + ( + protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable:* + ) + ) + ) + OR + protoPayload.metadata.tableDataRead.reason = "JOB" +) +""".strip( + "\t \n" +) + + +def bigquery_audit_metadata_query_template_lineage( + dataset: str, use_date_sharded_tables: bool, limit: Optional[int] = None +) -> str: + """ + Receives a dataset (with project specified) and returns a query template that is used to query exported + AuditLogs containing protoPayloads of type BigQueryAuditMetadata. + Include only those that: + - have been completed (jobStatus.jobState = "DONE") + - do not contain errors (jobStatus.errorResults is none) + :param dataset: the dataset to query against in the form of $PROJECT.$DATASET + :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log + tables + :param limit: set a limit for the maximum event to return. It is used for connection testing currently + :return: a query template, when supplied start_time and end_time, can be used to query audit logs from BigQuery + """ + limit_text = f"limit {limit}" if limit else "" + + shard_condition = "" + if use_date_sharded_tables: + from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`" + shard_condition = ( + """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """ + ) + else: + from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`" + + query = f""" + SELECT + timestamp, + logName, + insertId, + protopayload_auditlog AS protoPayload, + protopayload_auditlog.metadataJson AS metadata + FROM + {from_table} + WHERE ( + timestamp >= "{{start_time}}" + AND timestamp < "{{end_time}}" + ) + {shard_condition} + AND protopayload_auditlog.serviceName="bigquery.googleapis.com" + AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE" + AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL + AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL + QUALIFY ROW_NUMBER() OVER (PARTITION BY insertId, timestamp, logName) = 1 + {limit_text}; + """ + + return textwrap.dedent(query) + + +def bigquery_audit_metadata_query_template_usage( + dataset: str, + use_date_sharded_tables: bool, + limit: Optional[int] = None, +) -> str: + """ + Receives a dataset (with project specified) and returns a query template that is used to query exported + v2 AuditLogs containing protoPayloads of type BigQueryAuditMetadata. 
+ :param dataset: the dataset to query against in the form of $PROJECT.$DATASET + :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log + tables + :param limit: maximum number of events to query for + :return: a query template, when supplied start_time and end_time, can be used to query audit logs from BigQuery + """ + + limit_text = f"limit {limit}" if limit else "" + + shard_condition = "" + if use_date_sharded_tables: + from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access_*`" + shard_condition = ( + """ AND _TABLE_SUFFIX BETWEEN "{start_date}" AND "{end_date}" """ + ) + else: + from_table = f"`{dataset}.cloudaudit_googleapis_com_data_access`" + + # Deduplicates insertId via QUALIFY, see: + # https://cloud.google.com/logging/docs/reference/v2/rest/v2/LogEntry, insertId field + query = f""" + SELECT + timestamp, + logName, + insertId, + protopayload_auditlog AS protoPayload, + protopayload_auditlog.metadataJson AS metadata + FROM + {from_table} + WHERE ( + timestamp >= "{{start_time}}" + AND timestamp < "{{end_time}}" + ) + {shard_condition} + AND protopayload_auditlog.serviceName="bigquery.googleapis.com" + AND + ( + ( + protopayload_auditlog.methodName IN + ( + "google.cloud.bigquery.v2.JobService.Query", + "google.cloud.bigquery.v2.JobService.InsertJob" + ) + AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE" + AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL + AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL + AND ( + JSON_EXTRACT_ARRAY(protopayload_auditlog.metadataJson, + "$.jobChange.job.jobStats.queryStats.referencedTables") IS NOT NULL + OR + JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig.destinationTable") IS NOT NULL + ) + ) + OR + JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.tableDataRead.reason") = "JOB" + ) + QUALIFY ROW_NUMBER() OVER (PARTITION BY insertId, timestamp, logName) = 1 + {limit_text}; + """ + + return textwrap.dedent(query) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py index f0bfc2e477371..3c12fd7216963 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py @@ -37,13 +37,15 @@ ReadEvent, ) from datahub.ingestion.source.bigquery_v2.bigquery_audit_log_api import ( - BQ_FILTER_RULE_TEMPLATE_V2_USAGE, BigQueryAuditLogApi, - bigquery_audit_metadata_query_template_usage, ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.common import BQ_DATETIME_FORMAT +from datahub.ingestion.source.bigquery_v2.queries import ( + BQ_FILTER_RULE_TEMPLATE_V2_USAGE, + bigquery_audit_metadata_query_template_usage, +) from datahub.ingestion.source.usage.usage_common import ( TOTAL_BUDGET_FOR_QUERY_LIST, make_usage_workunit, From 31a3be8ab6be696f2e97826471c3c4d5685daa82 Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Mon, 14 Aug 2023 21:13:50 +0530 Subject: [PATCH 06/11] fix lint, tests --- .../source/bigquery_v2/bigquery_config.py | 2 +- .../ingestion/source/bigquery_v2/lineage.py | 2 +- .../tests/integration/bigquery_v2/test_bigquery.py 
| 14 +++++--------- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index 6449c6ead1e58..6634aacf0426f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -39,7 +39,7 @@ class BigQueryUsageConfig(BaseUsageConfig): class BigQueryConnectionConfig(ConfigModel): credential: Optional[BigQueryCredential] = Field( - description="BigQuery credential informations" + default=None, description="BigQuery credential informations" ) _credentials_path: Optional[str] = PrivateAttr(None) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index c68fe5ef81745..7293443ad811f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -332,7 +332,7 @@ def populate_view_lineage_with_sql_parsing( ) def gen_lineage_workunits_for_table( - self, lineage: dict[str, Set[LineageEdge]], table_ref: BigQueryTableRef + self, lineage: Dict[str, Set[LineageEdge]], table_ref: BigQueryTableRef ) -> Iterable[MetadataWorkUnit]: dataset_urn = self.dataset_urn_builder(table_ref) diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py index ba3ea06b07623..5d5b83f576d31 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py @@ -4,8 +4,10 @@ from freezegun import freeze_time from google.cloud.bigquery.table import TableListItem +from datahub.ingestion.source.bigquery_v2.bigquery import BigqueryV2Source from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( BigqueryDataset, + BigQuerySchemaApi, BigqueryTable, ) from tests.test_helpers import mce_helpers @@ -15,15 +17,9 @@ @freeze_time(FROZEN_TIME) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_tables_for_dataset" -) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery.BigqueryV2Source.get_core_table_details" -) -@patch( - "datahub.ingestion.source.bigquery_v2.bigquery_schema_api.BigQueryTechnicalSchemaApi.get_datasets_for_project_id" -) +@patch.object(BigQuerySchemaApi, "get_tables_for_dataset") +@patch.object(BigqueryV2Source, "get_core_table_details") +@patch.object(BigQuerySchemaApi, "get_datasets_for_project_id") @patch("google.cloud.bigquery.Client") def test_bigquery_v2_ingest( client, From eaa72a3aa0b2bd9ff9b212529d9042ef65b1e39d Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Tue, 22 Aug 2023 12:16:58 +0530 Subject: [PATCH 07/11] revert rename of bigquery_schema.py to bigquery_schema_api.py --- .../src/datahub/ingestion/source/bigquery_v2/bigquery.py | 2 +- .../bigquery_v2/{bigquery_schema_api.py => bigquery_schema.py} | 0 .../src/datahub/ingestion/source/bigquery_v2/lineage.py | 2 +- .../src/datahub/ingestion/source/bigquery_v2/profiler.py | 2 +- .../tests/integration/bigquery_v2/test_bigquery.py | 2 +- metadata-ingestion/tests/unit/test_bigquery_profiler.py | 2 +- metadata-ingestion/tests/unit/test_bigquery_source.py | 2 +- 7 files changed, 6 insertions(+), 6 deletions(-) rename 
metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/{bigquery_schema_api.py => bigquery_schema.py} (100%) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 67b9d6556b3e8..86cca0c45da5a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -43,7 +43,7 @@ ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( +from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( BigqueryColumn, BigqueryDataset, BigqueryProject, diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py similarity index 100% rename from metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_api.py rename to metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index 7293443ad811f..aaf0f0f39134b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -25,7 +25,7 @@ ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import BigQuerySchemaApi +from datahub.ingestion.source.bigquery_v2.bigquery_schema import BigQuerySchemaApi from datahub.ingestion.source.bigquery_v2.common import BQ_DATETIME_FORMAT from datahub.ingestion.source.bigquery_v2.queries import ( BQ_FILTER_RULE_TEMPLATE_V2_LINEAGE, diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py index f825bbf666b64..c9dcb4fe35c3f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py @@ -11,7 +11,7 @@ from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( +from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( RANGE_PARTITION_NAME, BigqueryTable, ) diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py index 5d5b83f576d31..e5a25d32992b2 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py @@ -5,7 +5,7 @@ from google.cloud.bigquery.table import TableListItem from datahub.ingestion.source.bigquery_v2.bigquery import BigqueryV2Source -from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( +from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( BigqueryDataset, 
BigQuerySchemaApi, BigqueryTable, diff --git a/metadata-ingestion/tests/unit/test_bigquery_profiler.py b/metadata-ingestion/tests/unit/test_bigquery_profiler.py index a723b6d475ae3..a2aec8df93d09 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_profiler.py +++ b/metadata-ingestion/tests/unit/test_bigquery_profiler.py @@ -2,7 +2,7 @@ from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( +from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( BigqueryColumn, BigqueryTable, PartitionInfo, diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py index 4a7d1eef399d6..a954c1768d0e1 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_source.py +++ b/metadata-ingestion/tests/unit/test_bigquery_source.py @@ -19,7 +19,7 @@ ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report -from datahub.ingestion.source.bigquery_v2.bigquery_schema_api import ( +from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( BigqueryProject, BigQuerySchemaApi, BigqueryView, From 1b3d5b58bf9b3bc7a18047522b7373b3b7beff52 Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Mon, 28 Aug 2023 19:06:54 +0530 Subject: [PATCH 08/11] move stateful check inside lineage module --- .../ingestion/source/bigquery_v2/bigquery.py | 34 +++---------------- .../ingestion/source/bigquery_v2/lineage.py | 25 ++++++++++++++ 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index f7c95b729dcb2..0f0b1da8c4e82 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -229,11 +229,11 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): self.report.schema_api_perf, self.config.get_bigquery_client() ) - self.redundant_lineage_run_skip_handler: Optional[ + redundant_lineage_run_skip_handler: Optional[ RedundantLineageRunSkipHandler ] = None if self.config.enable_stateful_lineage_ingestion: - self.redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler( + redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler( source=self, config=self.config, pipeline_name=self.ctx.pipeline_name, @@ -245,7 +245,7 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): config, self.report, dataset_urn_builder=self.gen_dataset_urn_from_ref, - redundant_run_skip_handler=self.redundant_lineage_run_skip_handler, + redundant_run_skip_handler=redundant_lineage_run_skip_handler, ) redundant_usage_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = None @@ -543,7 +543,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: [p.id for p in projects], self.table_refs ) - if self._should_ingest_lineage(): + if self.config.include_table_lineage: yield from self.lineage_extractor.get_lineage_workunits( [p.id for p in projects], self.sql_parser_schema_resolver, @@ -552,32 +552,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.table_refs, ) - if self.redundant_lineage_run_skip_handler: - # Update the checkpoint state for this run. 
- self.redundant_lineage_run_skip_handler.update_state( - self.config.start_time, self.config.end_time - ) - - def _should_ingest_lineage(self) -> bool: - if not self.config.include_table_lineage: - return False - - if ( - self.redundant_lineage_run_skip_handler - and self.redundant_lineage_run_skip_handler.should_skip_this_run( - cur_start_time=self.config.start_time, - cur_end_time=self.config.end_time, - ) - ): - # Skip this run - self.report.report_warning( - "lineage-extraction", - "Skip this run as there was already a run for current ingestion window.", - ) - return False - - return True - def _get_projects(self) -> List[BigqueryProject]: logger.info("Getting projects") if self.config.project_ids or self.config.project_id: diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index 6e01750e5ddec..4a853901f2890 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -231,6 +231,23 @@ def error(self, log: logging.Logger, key: str, reason: str) -> None: self.report.report_warning(key, reason) log.error(f"{key} => {reason}") + def _should_ingest_lineage(self) -> bool: + if ( + self.redundant_run_skip_handler + and self.redundant_run_skip_handler.should_skip_this_run( + cur_start_time=self.config.start_time, + cur_end_time=self.config.end_time, + ) + ): + # Skip this run + self.report.report_warning( + "lineage-extraction", + "Skip this run as there was already a run for current ingestion window.", + ) + return False + + return True + def get_lineage_workunits( self, projects: List[str], @@ -239,6 +256,8 @@ def get_lineage_workunits( view_definitions: FileBackedDict[str], table_refs: Set[str], ) -> Iterable[MetadataWorkUnit]: + if not self._should_ingest_lineage(): + return views_skip_audit_log_lineage: Set[str] = set() if self.config.lineage_parse_view_ddl: view_lineage: Dict[str, Set[LineageEdge]] = {} @@ -269,6 +288,12 @@ def get_lineage_workunits( table_refs, ) + if self.redundant_run_skip_handler: + # Update the checkpoint state for this run. 
+ self.redundant_run_skip_handler.update_state( + self.config.start_time, self.config.end_time + ) + def generate_lineage( self, project_id: str, From 6a2a3d452437a42189fd7f93701a6c9c5ec0371d Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Tue, 5 Sep 2023 17:57:49 +0530 Subject: [PATCH 09/11] merge related changes From this https://github.com/datahub-project/datahub/commit/fa0c43c0313f6239f54879819ffc6c6dc04cbef5 --- .../datahub/ingestion/source/bigquery_v2/bigquery.py | 2 +- .../datahub/ingestion/source/bigquery_v2/lineage.py | 10 +++++----- metadata-ingestion/tests/unit/test_bigquery_lineage.py | 10 +++++++--- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 165a0eea106d1..ff7a47924626d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -541,7 +541,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: yield from self.lineage_extractor.get_lineage_workunits( [p.id for p in projects], self.sql_parser_schema_resolver, - self.view_definition_ids, + self.view_refs_by_project, self.view_definitions, self.table_refs, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index 08a3db2bf6503..98c8cbaf85eec 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -253,7 +253,7 @@ def get_lineage_workunits( self, projects: List[str], sql_parser_schema_resolver: SchemaResolver, - view_definition_ids: Dict[str, Dict[str, str]], + view_refs_by_project: Dict[str, Set[str]], view_definitions: FileBackedDict[str], table_refs: Set[str], ) -> Iterable[MetadataWorkUnit]: @@ -265,7 +265,7 @@ def get_lineage_workunits( for project in projects: self.populate_view_lineage_with_sql_parsing( view_lineage, - view_definition_ids[project], + view_refs_by_project[project], view_definitions, sql_parser_schema_resolver, project, @@ -348,13 +348,13 @@ def generate_lineage( def populate_view_lineage_with_sql_parsing( self, view_lineage: Dict[str, Set[LineageEdge]], - view_definition_ids: Dict[str, str], + view_refs: Set[str], view_definitions: FileBackedDict[str], sql_parser_schema_resolver: SchemaResolver, default_project: str, ) -> None: - for view, view_definition_id in view_definition_ids.items(): - view_definition = view_definitions[view_definition_id] + for view in view_refs: + view_definition = view_definitions[view] raw_view_lineage = sqlglot_lineage( view_definition, schema_resolver=sql_parser_schema_resolver, diff --git a/metadata-ingestion/tests/unit/test_bigquery_lineage.py b/metadata-ingestion/tests/unit/test_bigquery_lineage.py index 566d6fc2cb0c3..e23494963e475 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_lineage.py +++ b/metadata-ingestion/tests/unit/test_bigquery_lineage.py @@ -3,6 +3,7 @@ import pytest +import datahub.emitter.mce_builder as builder from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( BigQueryTableRef, QueryEvent, @@ -81,7 +82,9 @@ def lineage_entries() -> List[QueryEvent]: def test_lineage_with_timestamps(lineage_entries: List[QueryEvent]) -> None: config = BigQueryV2Config() report = BigQueryV2Report() - extractor: BigqueryLineageExtractor = 
BigqueryLineageExtractor(config, report, lambda x: "") + extractor: BigqueryLineageExtractor = BigqueryLineageExtractor( + config, report, lambda x: builder.make_dataset_urn("bigquery", str(x)) + ) bq_table = BigQueryTableRef.from_string_name( "projects/my_project/datasets/my_dataset/tables/my_table" @@ -104,7 +107,9 @@ def test_lineage_with_timestamps(lineage_entries: List[QueryEvent]) -> None: def test_column_level_lineage(lineage_entries: List[QueryEvent]) -> None: config = BigQueryV2Config(extract_column_lineage=True, incremental_lineage=False) report = BigQueryV2Report() - extractor: BigqueryLineageExtractor = BigqueryLineageExtractor(config, report, lambda x: "") + extractor: BigqueryLineageExtractor = BigqueryLineageExtractor( + config, report, lambda x: builder.make_dataset_urn("bigquery", str(x)) + ) bq_table = BigQueryTableRef.from_string_name( "projects/my_project/datasets/my_dataset/tables/my_table" @@ -119,7 +124,6 @@ def test_column_level_lineage(lineage_entries: List[QueryEvent]) -> None: bq_table=bq_table, bq_table_urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,my_project.my_dataset.my_table,PROD)", lineage_metadata=lineage_map, - platform="bigquery", ) assert upstream_lineage assert len(upstream_lineage.upstreams) == 2 From 79f84bac1cb8260d958f378cd69c30028509b0ad Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Wed, 13 Sep 2023 17:43:48 +0530 Subject: [PATCH 10/11] address review comments --- .../ingestion/source/bigquery_v2/bigquery.py | 7 +-- .../bigquery_v2/bigquery_audit_log_api.py | 19 +++----- .../source/bigquery_v2/bigquery_schema.py | 31 ++++++------ .../src/datahub/utilities/perf_timer.py | 47 ++++++++++--------- .../tests/unit/utilities/test_perf_timer.py | 25 ++++++---- 5 files changed, 62 insertions(+), 67 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index ff7a47924626d..ae49a4ba17c11 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -558,12 +558,7 @@ def _get_projects(self) -> List[BigqueryProject]: return list(self._query_project_list()) def _query_project_list(self) -> Iterable[BigqueryProject]: - try: - projects = self.bigquery_data_dictionary.get_projects() - except Exception as e: - logger.error(f"Error getting projects. 
{e}", exc_info=True) - projects = [] - + projects = self.bigquery_data_dictionary.get_projects() if not projects: # Report failure on exception and if empty list is returned self.report.report_failure( "metadata-extraction", diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py index fcb6200241cd7..03b12c61ee5c6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py @@ -87,14 +87,12 @@ def get_exported_bigquery_audit_metadata( f"Finished loading log entries from BigQueryAuditMetadata in {dataset}" ) - if rate_limiter: - with rate_limiter: - for entry in query_job: - with current_timer.pause_timer(): + for entry in query_job: + with current_timer.pause(): + if rate_limiter: + with rate_limiter: yield entry - else: - for entry in query_job: - with current_timer.pause_timer(): + else: yield entry def get_bigquery_log_entries_via_gcp_logging( @@ -124,17 +122,12 @@ def get_bigquery_log_entries_via_gcp_logging( ) for i, entry in enumerate(list_entries): - if i == 0: - logger.info( - f"Starting log load from GCP Logging for {client.project}" - ) - if i % 1000 == 0: logger.info( f"Loaded {i} log entries from GCP Log for {client.project}" ) - with current_timer.pause_timer(): + with current_timer.pause(): if rate_limiter: with rate_limiter: yield entry diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py index 6fd3482b68921..7edc8656360bb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py @@ -133,30 +133,29 @@ def __init__( self.bq_client = client self.report = report - def get_client(self) -> bigquery.Client: - assert self.bq_client is not None - return self.bq_client - def get_query_result(self, query: str) -> RowIterator: logger.debug(f"Query : {query}") - resp = self.get_client().query(query) + resp = self.bq_client.query(query) return resp.result() def get_projects(self) -> List[BigqueryProject]: with self.report.list_projects: - projects = self.get_client().list_projects() + try: + projects = self.bq_client.list_projects() - return [ - BigqueryProject(id=p.project_id, name=p.friendly_name) for p in projects - ] + return [ + BigqueryProject(id=p.project_id, name=p.friendly_name) + for p in projects + ] + except Exception as e: + logger.error(f"Error getting projects. 
{e}", exc_info=True) + return [] def get_datasets_for_project_id( self, project_id: str, maxResults: Optional[int] = None ) -> List[BigqueryDataset]: with self.report.list_datasets: - datasets = self.get_client().list_datasets( - project_id, max_results=maxResults - ) + datasets = self.bq_client.list_datasets(project_id, max_results=maxResults) return [ BigqueryDataset(name=d.dataset_id, labels=d.labels) for d in datasets ] @@ -189,8 +188,8 @@ def list_tables( self, dataset_name: str, project_id: str ) -> Iterator[TableListItem]: with self.report.list_tables as current_timer: - for table in self.get_client().list_tables(f"{project_id}.{dataset_name}"): - with current_timer.pause_timer(): + for table in self.bq_client.list_tables(f"{project_id}.{dataset_name}"): + with current_timer.pause(): yield table def get_tables_for_dataset( @@ -231,7 +230,7 @@ def get_tables_for_dataset( for table in cur: try: - with current_timer.pause_timer(): + with current_timer.pause(): yield BigQuerySchemaApi._make_bigquery_table( table, tables.get(table.table_name) ) @@ -308,7 +307,7 @@ def get_views_for_dataset( for table in cur: try: - with current_timer.pause_timer(): + with current_timer.pause(): yield BigQuerySchemaApi._make_bigquery_view(table) except Exception as e: view_name = f"{project_id}.{dataset_name}.{table.table_name}" diff --git a/metadata-ingestion/src/datahub/utilities/perf_timer.py b/metadata-ingestion/src/datahub/utilities/perf_timer.py index 46eb0e25e4fbf..18384420bfefb 100644 --- a/metadata-ingestion/src/datahub/utilities/perf_timer.py +++ b/metadata-ingestion/src/datahub/utilities/perf_timer.py @@ -1,7 +1,10 @@ +import logging import time from contextlib import AbstractContextManager from typing import Any, Optional +logger: logging.Logger = logging.getLogger(__name__) + class PerfTimer(AbstractContextManager): """ @@ -13,29 +16,19 @@ def __init__(self) -> None: self.start_time: Optional[float] = None self.end_time: Optional[float] = None self._past_active_time: float = 0 - self.paused: Optional[bool] = None + self.paused: bool = False + self._error_state = False def start(self) -> None: - # TODO - # assert ( - # self.end_time is None - # ), "Can not start a finished timer. Did you accidentally re-use this timer ?" - if self.end_time is not None: self._past_active_time = self.elapsed_seconds() self.start_time = time.perf_counter() self.end_time = None - if self.paused: - self.paused = False - - def pause_timer(self) -> "PerfTimer": - assert ( - not self.paused and not self.end_time - ), "Can not pause a paused/stopped timer" - assert ( - self.start_time is not None - ), "Can not pause a timer that hasn't started. Did you forget to start the timer ?" + self.paused = False + + def pause(self) -> "PerfTimer": + self.assert_timer_is_running() self._past_active_time = self.elapsed_seconds() self.start_time = None self.end_time = None @@ -43,9 +36,7 @@ def pause_timer(self) -> "PerfTimer": return self def finish(self) -> None: - assert ( - self.start_time is not None - ), "Can not stop a timer that hasn't started. Did you forget to start the timer ?" + self.assert_timer_is_running() self.end_time = time.perf_counter() def __enter__(self) -> "PerfTimer": @@ -71,15 +62,26 @@ def elapsed_seconds(self) -> float: """ Returns the elapsed time in seconds. """ - if self.paused: + if self.paused or not self.start_time: return self._past_active_time - assert self.start_time is not None, "Did you forget to start the timer ?" 
if self.end_time is None: return (time.perf_counter() - self.start_time) + (self._past_active_time) else: return (self.end_time - self.start_time) + self._past_active_time + def assert_timer_is_running(self) -> None: + """ + Returns true if timer is in running state. + Timer is in NOT in running state if + 1. it has never been started. + 2. it is in paused state. + 3. it had been started and finished in the past but not started again. + """ + if self.start_time is None or self.paused or self.end_time: + self._error_state = True + logger.warning("Did you forget to start the timer ?") + def __repr__(self) -> str: return repr(self.as_obj()) @@ -91,4 +93,5 @@ def as_obj(self) -> Optional[str]: return None else: time_taken = self.elapsed_seconds() - return f"{time_taken:.3f} seconds" + state = " (error)" if self._error_state else "" + return f"{time_taken:.3f} seconds{state}" diff --git a/metadata-ingestion/tests/unit/utilities/test_perf_timer.py b/metadata-ingestion/tests/unit/utilities/test_perf_timer.py index 9fbd3a7b5d9cd..d5fde314c2b57 100644 --- a/metadata-ingestion/tests/unit/utilities/test_perf_timer.py +++ b/metadata-ingestion/tests/unit/utilities/test_perf_timer.py @@ -1,27 +1,32 @@ import time +from functools import partial + +import pytest from datahub.utilities.perf_timer import PerfTimer +approx = partial(pytest.approx, rel=1e-2) + def test_perf_timer_simple(): with PerfTimer() as timer: time.sleep(1) - assert round(timer.elapsed_seconds()) == 1 + assert approx(timer.elapsed_seconds()) == 1 - assert round(timer.elapsed_seconds()) == 1 + assert approx(timer.elapsed_seconds()) == 1 def test_perf_timer_paused_timer(): with PerfTimer() as current_timer: time.sleep(1) - assert round(current_timer.elapsed_seconds()) == 1 - with current_timer.pause_timer(): + assert approx(current_timer.elapsed_seconds()) == 1 + with current_timer.pause(): time.sleep(2) - assert round(current_timer.elapsed_seconds()) == 1 - assert round(current_timer.elapsed_seconds()) == 1 + assert approx(current_timer.elapsed_seconds()) == 1 + assert approx(current_timer.elapsed_seconds()) == 1 time.sleep(1) - assert round(current_timer.elapsed_seconds()) == 2 + assert approx(current_timer.elapsed_seconds()) == 2 def test_generator_with_paused_timer(): @@ -30,12 +35,12 @@ def generator_function(): time.sleep(1) for i in range(10): time.sleep(0.2) - with inner_timer.pause_timer(): + with inner_timer.pause(): time.sleep(0.2) yield i - assert round(inner_timer.elapsed_seconds()) == 1 + 0.2 * 10 + assert approx(inner_timer.elapsed_seconds()) == 1 + 0.2 * 10 with PerfTimer() as outer_timer: seq = generator_function() list([i for i in seq]) - assert round(outer_timer.elapsed_seconds()) == 1 + 0.2 * 10 + 0.2 * 10 + assert approx(outer_timer.elapsed_seconds()) == 1 + 0.2 * 10 + 0.2 * 10 From 0ba4efcf6fac33f8e6f346345aa01770bfeb4cd5 Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Thu, 14 Sep 2023 10:56:25 +0530 Subject: [PATCH 11/11] fix tests --- metadata-ingestion/tests/unit/test_bigquery_source.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py index 4b30478873ae7..4fc6c31626ba8 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_source.py +++ b/metadata-ingestion/tests/unit/test_bigquery_source.py @@ -252,15 +252,15 @@ def test_get_projects_list_empty(get_bq_client_mock, get_projects_mock): assert projects == [] -@patch.object(BigQuerySchemaApi, "get_projects") 
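
    # A minimal usage sketch (not part of the diff) of the reworked PerfTimer,
    # mirroring the audit-log/schema API hunks and the generator test above: the
    # timer brackets the API iteration but is paused while each row is handed to
    # the caller, so elapsed_seconds() reflects only time spent in the API itself.
    # timed_fetch() and its `rows` argument are hypothetical stand-ins.
    from typing import Iterable, Iterator

    from datahub.utilities.perf_timer import PerfTimer


    def timed_fetch(rows: Iterable[int]) -> Iterator[int]:
        with PerfTimer() as timer:
            for row in rows:
                with timer.pause():
                    # Whatever the consumer does with the yielded row is
                    # excluded from the recorded time.
                    yield row
        # The outer with-block finishes the timer; its value can then be
        # attached to a report field such as report.list_tables.
        print(f"API time: {timer.elapsed_seconds():.3f} seconds")
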
@patch.object(BigQueryV2Config, "get_bigquery_client") def test_get_projects_list_failure( get_bq_client_mock: MagicMock, - get_projects_mock: MagicMock, caplog: pytest.LogCaptureFixture, ) -> None: error_str = "my error" - get_projects_mock.side_effect = GoogleAPICallError(error_str) + bq_client_mock = MagicMock() + get_bq_client_mock.return_value = bq_client_mock + bq_client_mock.list_projects.side_effect = GoogleAPICallError(error_str) config = BigQueryV2Config.parse_obj( {"project_id_pattern": {"deny": ["^test-project$"]}}
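
For context, a minimal sketch of the error path this reworked test exercises, under simplified, assumed names (the real methods are BigQuerySchemaApi.get_projects and BigqueryV2Source._query_project_list from the earlier hunks; the failure message below is a placeholder): client errors from list_projects are now caught and logged inside the schema API, which returns an empty list, and the source reports a failure instead of raising.

    import logging

    logger = logging.getLogger(__name__)


    def get_projects_sketch(bq_client):
        # Mirrors the new BigQuerySchemaApi.get_projects behaviour: API errors
        # such as GoogleAPICallError are logged and an empty list is returned.
        try:
            return [p.project_id for p in bq_client.list_projects()]
        except Exception as e:
            logger.error(f"Error getting projects. {e}", exc_info=True)
            return []


    def query_project_list_sketch(bq_client, report):
        # Mirrors BigqueryV2Source._query_project_list: an empty result, whether
        # from an error or a genuinely empty account, is reported as a failure.
        projects = get_projects_sketch(bq_client)
        if not projects:
            report.report_failure("metadata-extraction", "No projects found")  # placeholder message
        return projects
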