feat(bigquery): add better timers around every API call #8626

Merged · 20 commits · Sep 15, 2023
Commits
b84601c
feat(bigquery): add better timers around every API call
mayurinehate Aug 9, 2023
95bbcbe
wip, timers not added for unused methods - remove these ?
mayurinehate Aug 10, 2023
76ddc3f
refractor in lineage.py
mayurinehate Aug 11, 2023
e57f134
report composition vs inheritance
mayurinehate Aug 14, 2023
38b18bb
more refractor and fixes
mayurinehate Aug 14, 2023
8d049a7
Merge branch 'master' into bq_timers_api_refractor
mayurinehate Aug 14, 2023
31a3be8
fix lint, tests
mayurinehate Aug 14, 2023
eaa72a3
revert rename of bigquery_schema.py to bigquery_schema_api.py
mayurinehate Aug 22, 2023
ac2ab3b
Merge remote-tracking branch 'datahub-oss/master' into bq_timers_api_…
mayurinehate Aug 22, 2023
94077db
Merge branch 'master' into bq_timers_api_refractor
mayurinehate Aug 28, 2023
e500275
Merge remote-tracking branch 'datahub-oss/master' into bq_timers_api_…
mayurinehate Aug 28, 2023
1b3d5b5
move stateful check inside lineage module
mayurinehate Aug 28, 2023
c77a9ab
Merge branch 'master' into bq_timers_api_refractor
mayurinehate Sep 5, 2023
6a2a3d4
merge related changes
mayurinehate Sep 5, 2023
32fedfc
Merge branch 'master' into bq_timers_api_refractor
mayurinehate Sep 8, 2023
9dca7e5
Merge branch 'master' into bq_timers_api_refractor
mayurinehate Sep 11, 2023
79f84ba
address review comments
mayurinehate Sep 13, 2023
25f4f0b
Merge branch 'master' into bq_timers_api_refractor
mayurinehate Sep 13, 2023
0ba4efc
fix tests
mayurinehate Sep 14, 2023
ea2c4a1
Merge branch 'master' into bq_timers_api_refractor
mayurinehate Sep 15, 2023
241 changes: 59 additions & 182 deletions metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py

Large diffs are not rendered by default.

@@ -13,48 +13,6 @@
get_first_missing_key_any,
)

BQ_FILTER_RULE_TEMPLATE = "BQ_FILTER_RULE_TEMPLATE"

BQ_AUDIT_V2 = {
BQ_FILTER_RULE_TEMPLATE: """
resource.type=("bigquery_project" OR "bigquery_dataset")
AND
timestamp >= "{start_time}"
AND
timestamp < "{end_time}"
AND protoPayload.serviceName="bigquery.googleapis.com"
AND
(
(
protoPayload.methodName=
(
"google.cloud.bigquery.v2.JobService.Query"
OR
"google.cloud.bigquery.v2.JobService.InsertJob"
)
AND protoPayload.metadata.jobChange.job.jobStatus.jobState="DONE"
AND NOT protoPayload.metadata.jobChange.job.jobStatus.errorResult:*
AND protoPayload.metadata.jobChange.job.jobConfig.queryConfig:*
AND
(
(
protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables:*
AND NOT protoPayload.metadata.jobChange.job.jobStats.queryStats.referencedTables =~ "projects/.*/datasets/.*/tables/__TABLES__|__TABLES_SUMMARY__|INFORMATION_SCHEMA.*"
)
OR
(
protoPayload.metadata.jobChange.job.jobConfig.queryConfig.destinationTable:*
)
)
)
OR
protoPayload.metadata.tableDataRead.reason = "JOB"
)
""".strip(
"\t \n"
),
}

AuditLogEntry = Any

# BigQueryAuditMetadata is the v2 format in which audit logs are exported to BigQuery
@@ -606,7 +564,6 @@ def from_query_event(
query_event: QueryEvent,
debug_include_full_payloads: bool = False,
) -> "ReadEvent":

readEvent = ReadEvent(
actor_email=query_event.actor_email,
timestamp=query_event.timestamp,
(diff truncated)
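The filter removed here is a GCP Logging filter template with {start_time}/{end_time} placeholders. For context, a hypothetical sketch of how such a template gets rendered for a concrete window before being handed to the audit-log API added later in this PR; the timestamp format string is an assumption standing in for BQ_DATETIME_FORMAT from bigquery_v2.common.

# Hypothetical sketch: render a GCP Logging filter of the same shape as the
# removed BQ_FILTER_RULE_TEMPLATE for a concrete time window. The timestamp
# format below is an assumption standing in for common.BQ_DATETIME_FORMAT.
from datetime import datetime, timezone

ASSUMED_BQ_DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

filter_template = """
resource.type=("bigquery_project" OR "bigquery_dataset")
AND timestamp >= "{start_time}"
AND timestamp < "{end_time}"
AND protoPayload.serviceName="bigquery.googleapis.com"
""".strip()

start_time = datetime(2023, 9, 1, tzinfo=timezone.utc)
end_time = datetime(2023, 9, 2, tzinfo=timezone.utc)

rendered_filter = filter_template.format(
    start_time=start_time.strftime(ASSUMED_BQ_DATETIME_FORMAT),
    end_time=end_time.strftime(ASSUMED_BQ_DATETIME_FORMAT),
)
# rendered_filter would be passed as the `filter` argument of the new
# BigQueryAuditLogApi.get_bigquery_log_entries_via_gcp_logging shown below.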
@@ -0,0 +1,139 @@ (new file)
import logging
from datetime import datetime
from typing import Callable, Iterable, List, Optional

from google.cloud import bigquery
from google.cloud.logging_v2.client import Client as GCPLoggingClient
from ratelimiter import RateLimiter

from datahub.ingestion.source.bigquery_v2.bigquery_audit import (
AuditLogEntry,
BigQueryAuditMetadata,
)
from datahub.ingestion.source.bigquery_v2.bigquery_report import (
BigQueryAuditLogApiPerfReport,
)
from datahub.ingestion.source.bigquery_v2.common import (
BQ_DATE_SHARD_FORMAT,
BQ_DATETIME_FORMAT,
)

logger: logging.Logger = logging.getLogger(__name__)


# Api interfaces are separated based on functionality they provide
# rather than the underlying bigquery client that is used to
# provide the functionality.
class BigQueryAuditLogApi:
def __init__(
self,
report: BigQueryAuditLogApiPerfReport,
rate_limit: bool,
requests_per_min: int,
) -> None:
self.report = report
self.rate_limit = rate_limit
self.requests_per_min = requests_per_min

def get_exported_bigquery_audit_metadata(
self,
bigquery_client: bigquery.Client,
bigquery_audit_metadata_query_template: Callable[
[
str, # dataset: str
bool, # use_date_sharded_tables: bool
Optional[int], # limit: Optional[int] = None
],
str,
],
Comment on lines +41 to +48

Collaborator:
This is fine here, but in general would like to avoid passing complex functions because it's kinda ugly and hard to extend

Collaborator (Author):
agree

bigquery_audit_metadata_datasets: Optional[List[str]],
use_date_sharded_audit_log_tables: bool,
start_time: datetime,
end_time: datetime,
limit: Optional[int] = None,
) -> Iterable[BigQueryAuditMetadata]:
if bigquery_audit_metadata_datasets is None:
return

audit_start_time = start_time.strftime(BQ_DATETIME_FORMAT)
audit_start_date = start_time.strftime(BQ_DATE_SHARD_FORMAT)

audit_end_time = end_time.strftime(BQ_DATETIME_FORMAT)
audit_end_date = end_time.strftime(BQ_DATE_SHARD_FORMAT)

rate_limiter: Optional[RateLimiter] = None
if self.rate_limit:
rate_limiter = RateLimiter(max_calls=self.requests_per_min, period=60)

with self.report.get_exported_log_entries as current_timer:
for dataset in bigquery_audit_metadata_datasets:
logger.info(
f"Start loading log entries from BigQueryAuditMetadata in {dataset}"
)

query = bigquery_audit_metadata_query_template(
dataset,
use_date_sharded_audit_log_tables,
limit,
).format(
start_time=audit_start_time,
end_time=audit_end_time,
start_date=audit_start_date,
end_date=audit_end_date,
)

query_job = bigquery_client.query(query)
logger.info(
f"Finished loading log entries from BigQueryAuditMetadata in {dataset}"
)

for entry in query_job:
with current_timer.pause():
if rate_limiter:
with rate_limiter:
yield entry
else:
yield entry

def get_bigquery_log_entries_via_gcp_logging(
self,
client: GCPLoggingClient,
filter: str,
log_page_size: int,
limit: Optional[int] = None,
) -> Iterable[AuditLogEntry]:
logger.debug(filter)

list_entries: Iterable[AuditLogEntry]
rate_limiter: Optional[RateLimiter] = None
if self.rate_limit:
# client.list_entries is a generator, does api calls to GCP Logging when it runs out of entries and needs to fetch more from GCP Logging
# to properly ratelimit we multiply the page size by the number of requests per minute
rate_limiter = RateLimiter(
max_calls=self.requests_per_min * log_page_size,
period=60,
)

with self.report.list_log_entries as current_timer:
list_entries = client.list_entries(
filter_=filter,
page_size=log_page_size,
max_results=limit,
)

for i, entry in enumerate(list_entries):
if i % 1000 == 0:
logger.info(
f"Loaded {i} log entries from GCP Log for {client.project}"
)

with current_timer.pause():
if rate_limiter:
with rate_limiter:
yield entry
else:
yield entry

logger.info(
f"Finished loading log entries from GCP Log for {client.project}"
)
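The heart of this PR is the timing pattern visible above: each report attribute (for example self.report.get_exported_log_entries) is used as a reusable `with` context, and the generator calls current_timer.pause() while yielding so that time the consumer spends processing each entry is not charged to the API call. The snippet below is a hypothetical, self-contained stand-in for that timer, not DataHub's actual PerfTimer; it only illustrates the pause/resume mechanics.

# Minimal illustrative sketch (not DataHub's actual PerfTimer) of a timer that
# can be paused while a generator yields, so only time spent inside the API
# calls is attributed to the report.
import time
from contextlib import contextmanager


class PausableTimer:
    def __init__(self) -> None:
        self.elapsed = 0.0
        self._start = None

    def __enter__(self):
        self._start = time.monotonic()
        return self

    def __exit__(self, *exc):
        if self._start is not None:
            self.elapsed += time.monotonic() - self._start
            self._start = None

    @contextmanager
    def pause(self):
        # Stop the clock while the caller processes a yielded entry...
        self.__exit__(None, None, None)
        try:
            yield
        finally:
            # ...and resume it before the next API page is fetched.
            self.__enter__()


# Usage mirroring the pattern in get_exported_bigquery_audit_metadata above:
timer = PausableTimer()
with timer as current_timer:
    for _entry in range(3):  # stands in for query_job / list_entries iteration
        with current_timer.pause():
            pass  # downstream processing time is excluded from timer.elapsed
print(f"time attributed to API calls: {timer.elapsed:.6f}s")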
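And a consumer-side sketch of how this class might be driven, under stated assumptions. Note the rate-limit arithmetic from the inline comment in get_bigquery_log_entries_via_gcp_logging: because client.list_entries only hits the API when a page is exhausted, a limiter of requests_per_min * log_page_size yielded entries per minute (for example 60 * 1000 = 60,000) corresponds to roughly requests_per_min page fetches per minute.

# Hypothetical driver for the new BigQueryAuditLogApi. The module path for the
# new file, the report construction, and the GCP project id are assumptions,
# not taken verbatim from this diff.
from google.cloud.logging_v2.client import Client as GCPLoggingClient

from datahub.ingestion.source.bigquery_v2.bigquery_audit_log_api import (  # assumed path
    BigQueryAuditLogApi,
)
from datahub.ingestion.source.bigquery_v2.bigquery_report import (
    BigQueryAuditLogApiPerfReport,
)

report = BigQueryAuditLogApiPerfReport()  # assumed to be default-constructible
api = BigQueryAuditLogApi(report, rate_limit=True, requests_per_min=60)

# gRPC disabled, matching make_gcp_logging_client in the config change below.
client = GCPLoggingClient(project="my-gcp-project", _use_grpc=False)
log_filter = 'protoPayload.serviceName="bigquery.googleapis.com"'  # simplified filter

for entry in api.get_bigquery_log_entries_via_gcp_logging(
    client,
    filter=log_filter,
    log_page_size=1000,
    limit=10_000,
):
    ...  # downstream parsing into usage/lineage events happens elsewhere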
@@ -4,9 +4,11 @@
from typing import Any, Dict, List, Optional

import pydantic
from pydantic import Field, PositiveInt, PrivateAttr, root_validator
from google.cloud import bigquery
from google.cloud.logging_v2.client import Client as GCPLoggingClient
from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator

from datahub.configuration.common import AllowDenyPattern
from datahub.configuration.common import AllowDenyPattern, ConfigModel
from datahub.configuration.validate_field_removal import pydantic_removed_field
from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
from datahub.ingestion.source.state.stateful_ingestion_base import (
@@ -35,7 +37,52 @@ class BigQueryUsageConfig(BaseUsageConfig):
)


class BigQueryConnectionConfig(ConfigModel):
credential: Optional[BigQueryCredential] = Field(
default=None, description="BigQuery credential informations"
)

_credentials_path: Optional[str] = PrivateAttr(None)

extra_client_options: Dict[str, Any] = Field(
default={},
description="Additional options to pass to google.cloud.logging_v2.client.Client.",
)

project_on_behalf: Optional[str] = Field(
default=None,
description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. If not passed, falls back to the project associated with the service account.",
)

def __init__(self, **data: Any):
super().__init__(**data)

if self.credential:
self._credentials_path = self.credential.create_credential_temp_file()
logger.debug(
f"Creating temporary credential file at {self._credentials_path}"
)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path

def get_bigquery_client(config) -> bigquery.Client:
client_options = config.extra_client_options
return bigquery.Client(config.project_on_behalf, **client_options)

def make_gcp_logging_client(
self, project_id: Optional[str] = None
) -> GCPLoggingClient:
# See https://github.com/googleapis/google-cloud-python/issues/2674 for
# why we disable gRPC here.
client_options = self.extra_client_options.copy()
client_options["_use_grpc"] = False
if project_id is not None:
return GCPLoggingClient(**client_options, project=project_id)
else:
return GCPLoggingClient(**client_options)


class BigQueryV2Config(
BigQueryConnectionConfig,
BigQueryBaseConfig,
SQLCommonConfig,
StatefulUsageConfigMixin,
@@ -122,11 +169,6 @@ class BigQueryV2Config(
),
)

project_on_behalf: Optional[str] = Field(
default=None,
description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. If not passed, falls back to the project associated with the service account.",
)

storage_project_id: None = Field(default=None, hidden_from_docs=True)

lineage_use_sql_parser: bool = Field(
@@ -180,14 +222,8 @@ def validate_column_lineage(cls, v: bool, values: Dict[str, Any]) -> bool:
default=1000,
description="The number of log item will be queried per page for lineage collection",
)
credential: Optional[BigQueryCredential] = Field(
description="BigQuery credential informations"
)

# extra_client_options, include_table_lineage and max_query_duration are relevant only when computing the lineage.
extra_client_options: Dict[str, Any] = Field(
default={},
description="Additional options to pass to google.cloud.logging_v2.client.Client.",
)
include_table_lineage: Optional[bool] = Field(
default=True,
description="Option to enable/disable lineage generation. Is enabled by default.",
@@ -209,7 +245,6 @@ def validate_column_lineage(cls, v: bool, values: Dict[str, Any]) -> bool:
default=False,
description="Whether to read date sharded tables or time partitioned tables when extracting usage from exported audit logs.",
)
_credentials_path: Optional[str] = PrivateAttr(None)

_cache_path: Optional[str] = PrivateAttr(None)

@@ -230,16 +265,6 @@ def validate_column_lineage(cls, v: bool, values: Dict[str, Any]) -> bool:
description="Maximum number of entries for the in-memory caches of FileBacked data structures.",
)

def __init__(self, **data: Any):
super().__init__(**data)

if self.credential:
self._credentials_path = self.credential.create_credential_temp_file()
logger.debug(
f"Creating temporary credential file at {self._credentials_path}"
)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path

@root_validator(pre=False)
def profile_default_settings(cls, values: Dict) -> Dict:
# Extra default SQLAlchemy option for better connection pooling and threading.
@@ -248,6 +273,17 @@ def profile_default_settings(cls, values: Dict) -> Dict:

return values

@validator("bigquery_audit_metadata_datasets")
def validate_bigquery_audit_metadata_datasets(
cls, v: Optional[List[str]], values: Dict
) -> Optional[List[str]]:
if values.get("use_exported_bigquery_audit_metadata"):
assert (
v and len(v) > 0
), "`bigquery_audit_metadata_datasets` should be set if using `use_exported_bigquery_audit_metadata: True`."

return v

@root_validator(pre=False)
def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
project_id = values.get("project_id")
(diff truncated)
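The config change above splits connection concerns (credentials, client factories, project_on_behalf) into BigQueryConnectionConfig, so the new API wrapper classes can be handed bare clients instead of the whole source config. A minimal usage sketch, under stated assumptions:

# Hypothetical usage of the connection/config split shown above. The import
# path and project ids are placeholders; real GCP credentials are needed for
# the returned clients to do anything.
from datahub.ingestion.source.bigquery_v2.bigquery_config import (  # assumed path
    BigQueryConnectionConfig,
    BigQueryV2Config,
)

conn = BigQueryConnectionConfig(project_on_behalf="my-billing-project")
bq_client = conn.get_bigquery_client()                   # bigquery.Client for queries/metadata
log_client = conn.make_gcp_logging_client("my-project")  # GCPLoggingClient with gRPC disabled

# BigQueryV2Config now inherits these factories from BigQueryConnectionConfig,
# and the new validator rejects use_exported_bigquery_audit_metadata=True
# unless bigquery_audit_metadata_datasets is also set (assuming the remaining
# fields keep their defaults):
config = BigQueryV2Config.parse_obj(
    {
        "use_exported_bigquery_audit_metadata": True,
        "bigquery_audit_metadata_datasets": ["my-project.my_audit_dataset"],
    }
)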