From f9cf0755d36f40ef3a7dfd20b96a82cf3c63fd1f Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Wed, 25 Oct 2023 14:11:17 -0400 Subject: [PATCH] feat(ingest/bigquery): Attempt to support raw dataset pattern --- docs/how/updating-datahub.md | 8 +-- .../source/bigquery_v2/bigquery_config.py | 18 ++++++- .../tests/unit/test_bigquery_source.py | 53 +++++++++++++++++++ 3 files changed, 74 insertions(+), 5 deletions(-) diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index 8813afee65be9..4d1535f28fa0a 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -53,10 +53,10 @@ into for example, using `datahub put` command. Policies can be also removed and re-created via UI. - #9077 - The BigQuery ingestion source by default sets `match_fully_qualified_names: true`. This means that any `dataset_pattern` or `schema_pattern` specified will be matched on the fully -qualified dataset name, i.e. `.`. If this is not the case, please -update your pattern (e.g. prepend your old dataset pattern with `.*\.` which matches the project part), -or set `match_fully_qualified_names: false` in your recipe. However, note that -setting this to `false` is deprecated and this flag will be removed entirely in a future release. +qualified dataset name, i.e. `.`. We attempt to support the old +pattern format by prepending `.*\\.` to dataset patterns lacking a period, so in most cases this +should not cause any issues. However, if you have a complex dataset pattern, we recommend you +manually convert it to the fully qualified format to avoid any potential issues. ### Potential Downtime diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index a6a740385cf5c..6203192769750 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -299,7 +299,7 @@ def backward_compatibility_configs_set(cls, values: Dict) -> Dict: "use project_id_pattern whenever possible. project_id will be deprecated, please use project_id_pattern only if possible." ) - dataset_pattern = values.get("dataset_pattern") + dataset_pattern: Optional[AllowDenyPattern] = values.get("dataset_pattern") schema_pattern = values.get("schema_pattern") if ( dataset_pattern == AllowDenyPattern.allow_all() @@ -329,6 +329,22 @@ def backward_compatibility_configs_set(cls, values: Dict) -> Dict: "Please update `dataset_pattern` to match against fully qualified schema name `.` and set config `match_fully_qualified_names : True`." "The config option `match_fully_qualified_names` is deprecated and will be removed in a future release." ) + elif match_fully_qualified_names and dataset_pattern is not None: + adjusted = False + for lst in [dataset_pattern.allow, dataset_pattern.deny]: + for i, pattern in enumerate(lst): + if "." not in pattern: + if pattern.startswith("^"): + lst[i] = r"^.*\." + pattern[1:] + else: + lst[i] = r".*\." + pattern + adjusted = True + if adjusted: + logger.warning( + "`dataset_pattern` was adjusted to match against fully qualified schema names," + " of the form `.`." + ) + return values def get_table_pattern(self, pattern: List[str]) -> str: diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py index 5a11a933c8595..4cfa5c48d2377 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_source.py +++ b/metadata-ingestion/tests/unit/test_bigquery_source.py @@ -53,6 +53,59 @@ def test_bigquery_uri_on_behalf(): assert config.get_sql_alchemy_url() == "bigquery://test-project-on-behalf" +def test_bigquery_dataset_pattern(): + config = BigQueryV2Config.parse_obj( + { + "dataset_pattern": { + "allow": [ + "test-dataset", + "test-project.test-dataset", + ".*test-dataset", + ], + "deny": [ + "^test-dataset-2$", + "project\\.second_dataset", + ], + }, + } + ) + assert config.dataset_pattern.allow == [ + r".*\.test-dataset", + r"test-project.test-dataset", + r".*test-dataset", + ] + assert config.dataset_pattern.deny == [ + r"^.*\.test-dataset-2$", + r"project\.second_dataset", + ] + + config = BigQueryV2Config.parse_obj( + { + "dataset_pattern": { + "allow": [ + "test-dataset", + "test-project.test-dataset", + ".*test-dataset", + ], + "deny": [ + "^test-dataset-2$", + "project\\.second_dataset", + ], + }, + "match_fully_qualified_names": False, + } + ) + assert config.dataset_pattern.allow == [ + r"test-dataset", + r"test-project.test-dataset", + r".*test-dataset", + ] + assert config.dataset_pattern.deny == [ + r"^test-dataset-2$", + r"project\.second_dataset", + ] + + def test_bigquery_uri_with_credential(): expected_credential_json = { "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",