From f9cf0755d36f40ef3a7dfd20b96a82cf3c63fd1f Mon Sep 17 00:00:00 2001
From: Andrew Sikowitz <andrew.sikowitz@acryl.io>
Date: Wed, 25 Oct 2023 14:11:17 -0400
Subject: [PATCH] feat(ingest/bigquery): Attempt to support raw dataset pattern

---
 docs/how/updating-datahub.md                  |  8 +--
 .../source/bigquery_v2/bigquery_config.py     | 18 ++++++-
 .../tests/unit/test_bigquery_source.py        | 53 +++++++++++++++++++
 3 files changed, 74 insertions(+), 5 deletions(-)

diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md
index 8813afee65be9..4d1535f28fa0a 100644
--- a/docs/how/updating-datahub.md
+++ b/docs/how/updating-datahub.md
@@ -53,10 +53,10 @@ into
 for example, using `datahub put` command. Policies can be also removed and re-created via UI.
 - #9077 - The BigQuery ingestion source by default sets `match_fully_qualified_names: true`.
 This means that any `dataset_pattern` or `schema_pattern` specified will be matched on the fully
-qualified dataset name, i.e. `<project_name>.<dataset_name>`. If this is not the case, please
-update your pattern (e.g. prepend your old dataset pattern with `.*\.` which matches the project part), 
-or set `match_fully_qualified_names: false` in your recipe. However, note that
-setting this to `false` is deprecated and this flag will be removed entirely in a future release.
+qualified dataset name, i.e. `<project_name>.<dataset_name>`. We attempt to support the old
+pattern format by prepending `.*\.` to dataset patterns lacking a period, so in most cases this
+should not cause any issues. However, if you have a complex dataset pattern, we recommend you
+manually convert it to the fully qualified format to avoid any potential issues.
 
 ### Potential Downtime
 
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
index a6a740385cf5c..6203192769750 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -299,7 +299,7 @@ def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
                 "use project_id_pattern whenever possible. project_id will be deprecated, please use project_id_pattern only if possible."
             )
 
-        dataset_pattern = values.get("dataset_pattern")
+        dataset_pattern: Optional[AllowDenyPattern] = values.get("dataset_pattern")
         schema_pattern = values.get("schema_pattern")
         if (
             dataset_pattern == AllowDenyPattern.allow_all()
@@ -329,6 +329,22 @@ def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
                 "Please update `dataset_pattern` to match against fully qualified schema name `<project_id>.<dataset_name>` and set config `match_fully_qualified_names : True`."
                 "The config option `match_fully_qualified_names` is deprecated and will be removed in a future release."
             )
+        elif match_fully_qualified_names and dataset_pattern is not None:
+            adjusted = False
+            for lst in [dataset_pattern.allow, dataset_pattern.deny]:
+                for i, pattern in enumerate(lst):
+                    if "." not in pattern:
+                        if pattern.startswith("^"):
+                            lst[i] = r"^.*\." + pattern[1:]
+                        else:
+                            lst[i] = r".*\." + pattern
+                        adjusted = True
+            if adjusted:
+                logger.warning(
+                    "`dataset_pattern` was adjusted to match against fully qualified schema names,"
+                    " of the form `<project_id>.<dataset_name>`."
+                )
+
         return values
 
     def get_table_pattern(self, pattern: List[str]) -> str:
diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py
index 5a11a933c8595..4cfa5c48d2377 100644
--- a/metadata-ingestion/tests/unit/test_bigquery_source.py
+++ b/metadata-ingestion/tests/unit/test_bigquery_source.py
@@ -53,6 +53,59 @@ def test_bigquery_uri_on_behalf():
     assert config.get_sql_alchemy_url() == "bigquery://test-project-on-behalf"
 
 
+def test_bigquery_dataset_pattern():
+    config = BigQueryV2Config.parse_obj(  # flag not set: patterns get rewritten
+        {
+            "dataset_pattern": {
+                "allow": [
+                    "test-dataset",  # no ".": gets the `.*\.` prefix
+                    "test-project.test-dataset",  # has ".": left as-is
+                    ".*test-dataset",  # a regex "." also counts: left as-is
+                ],
+                "deny": [
+                    "^test-dataset-2$",  # prefix is inserted after the "^"
+                    "project\\.second_dataset",  # has ".": left as-is
+                ],
+            },
+        }
+    )
+    assert config.dataset_pattern.allow == [
+        r".*\.test-dataset",
+        r"test-project.test-dataset",
+        r".*test-dataset",
+    ]
+    assert config.dataset_pattern.deny == [
+        r"^.*\.test-dataset-2$",
+        r"project\.second_dataset",
+    ]
+
+    config = BigQueryV2Config.parse_obj(  # same patterns, flag explicitly off
+        {
+            "dataset_pattern": {
+                "allow": [
+                    "test-dataset",
+                    "test-project.test-dataset",
+                    ".*test-dataset",
+                ],
+                "deny": [
+                    "^test-dataset-2$",
+                    "project\\.second_dataset",
+                ],
+            },
+            "match_fully_qualified_names": False,
+        }
+    )
+    assert config.dataset_pattern.allow == [  # nothing may be rewritten
+        r"test-dataset",
+        r"test-project.test-dataset",
+        r".*test-dataset",
+    ]
+    assert config.dataset_pattern.deny == [  # nothing may be rewritten
+        r"^test-dataset-2$",
+        r"project\.second_dataset",
+    ]
+
+
 def test_bigquery_uri_with_credential():
     expected_credential_json = {
         "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",