Skip to content

Commit

Permalink
feat(ingest/bigquery): Attempt to support raw dataset pattern
Browse files Browse the repository at this point in the history
  • Loading branch information
asikowitz committed Oct 25, 2023
1 parent 8a80e85 commit f9cf075
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 5 deletions.
8 changes: 4 additions & 4 deletions docs/how/updating-datahub.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,10 @@ into
for example, using `datahub put` command. Policies can be also removed and re-created via UI.
- #9077 - The BigQuery ingestion source by default sets `match_fully_qualified_names: true`.
This means that any `dataset_pattern` or `schema_pattern` specified will be matched on the fully
qualified dataset name, i.e. `<project_name>.<dataset_name>`. If this is not the case, please
update your pattern (e.g. prepend your old dataset pattern with `.*\.` which matches the project part),
or set `match_fully_qualified_names: false` in your recipe. However, note that
setting this to `false` is deprecated and this flag will be removed entirely in a future release.
qualified dataset name, i.e. `<project_name>.<dataset_name>`. We attempt to support the old
pattern format by prepending `.*\.` to dataset patterns lacking a period, so in most cases this
should not cause any issues. However, if you have a complex dataset pattern, we recommend you
manually convert it to the fully qualified format to avoid any potential issues.

### Potential Downtime

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
"use project_id_pattern whenever possible. project_id will be deprecated, please use project_id_pattern only if possible."
)

dataset_pattern = values.get("dataset_pattern")
dataset_pattern: Optional[AllowDenyPattern] = values.get("dataset_pattern")
schema_pattern = values.get("schema_pattern")
if (
dataset_pattern == AllowDenyPattern.allow_all()
Expand Down Expand Up @@ -329,6 +329,22 @@ def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
"Please update `dataset_pattern` to match against fully qualified schema name `<project_id>.<dataset_name>` and set config `match_fully_qualified_names : True`."
"The config option `match_fully_qualified_names` is deprecated and will be removed in a future release."
)
elif match_fully_qualified_names and dataset_pattern is not None:
adjusted = False
for lst in [dataset_pattern.allow, dataset_pattern.deny]:
for i, pattern in enumerate(lst):
if "." not in pattern:
if pattern.startswith("^"):
lst[i] = r"^.*\." + pattern[1:]
else:
lst[i] = r".*\." + pattern
adjusted = True
if adjusted:
logger.warning(
"`dataset_pattern` was adjusted to match against fully qualified schema names,"
" of the form `<project_id>.<dataset_name>`."
)

return values

def get_table_pattern(self, pattern: List[str]) -> str:
Expand Down
53 changes: 53 additions & 0 deletions metadata-ingestion/tests/unit/test_bigquery_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,59 @@ def test_bigquery_uri_on_behalf():
assert config.get_sql_alchemy_url() == "bigquery://test-project-on-behalf"


def test_bigquery_dataset_pattern():
    """Verify backward-compatible rewriting of `dataset_pattern`.

    With the default `match_fully_qualified_names: true`, patterns that
    contain no literal ``.`` are prefixed with ``.*\\.`` so they match the
    fully qualified `<project_id>.<dataset_name>` form (a leading ``^``
    anchor is preserved ahead of the prefix). With
    `match_fully_qualified_names: false`, patterns pass through unchanged.
    """

    def pattern_config() -> dict:
        # Build a fresh dict per parse_obj call so the two config objects
        # cannot share mutable state through pydantic validation.
        return {
            "allow": [
                "test-dataset",
                "test-project.test-dataset",
                ".*test-dataset",
            ],
            "deny": [
                "^test-dataset-2$",
                "project\\.second_dataset",
            ],
        }

    # Default mode: only the period-free patterns are rewritten.
    config = BigQueryV2Config.parse_obj({"dataset_pattern": pattern_config()})
    assert config.dataset_pattern.allow == [
        r".*\.test-dataset",
        r"test-project.test-dataset",
        r".*test-dataset",
    ]
    assert config.dataset_pattern.deny == [
        r"^.*\.test-dataset-2$",
        r"project\.second_dataset",
    ]

    # Legacy mode: every pattern is left exactly as supplied.
    config = BigQueryV2Config.parse_obj(
        {
            "dataset_pattern": pattern_config(),
            "match_fully_qualified_names": False,
        }
    )
    assert config.dataset_pattern.allow == [
        r"test-dataset",
        r"test-project.test-dataset",
        r".*test-dataset",
    ]
    assert config.dataset_pattern.deny == [
        r"^test-dataset-2$",
        r"project\.second_dataset",
    ]


def test_bigquery_uri_with_credential():
expected_credential_json = {
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
Expand Down

0 comments on commit f9cf075

Please sign in to comment.