Merge branch 'datahub-project:master' into master
anshbansal authored Aug 3, 2023
2 parents 2ab7926 + 6a36118 commit 565cffe
Showing 15 changed files with 2,195 additions and 42 deletions.
8 changes: 6 additions & 2 deletions .github/actions/docker-custom-build-and-push/action.yml
@@ -49,7 +49,7 @@ runs:
# add git short SHA as Docker tag
tag-custom: ${{ inputs.tags }}
tag-custom-only: true

# Code for testing the build when not pushing to Docker Hub.
- name: Build and Load image for testing (if not publishing)
uses: docker/build-push-action@v3
@@ -64,12 +64,14 @@ runs:
tags: ${{ steps.docker_meta.outputs.tags }}
load: true
push: false
+ cache-from: type=registry,ref=${{ steps.docker_meta.outputs.tags }}
+ cache-to: type=inline
- name: Upload image locally for testing (if not publishing)
uses: ishworkh/docker-image-artifact-upload@v1
if: ${{ inputs.publish != 'true' }}
with:
image: ${{ steps.docker_meta.outputs.tags }}

# Code for building multi-platform images and pushing to Docker Hub.
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
@@ -93,5 +95,7 @@ runs:
build-args: ${{ inputs.build-args }}
tags: ${{ steps.docker_meta.outputs.tags }}
push: true
+ cache-from: type=registry,ref=${{ steps.docker_meta.outputs.tags }}
+ cache-to: type=inline

# TODO add code for vuln scanning?
@@ -102,7 +102,7 @@ export const NameSourceStep = ({ state, updateState, prev, submit }: StepProps)
<Input
data-testid="cli-version-input"
className="cli-version-input"
placeholder="(e.g. 0.10.4)"
placeholder="(e.g. 0.10.5)"
value={state.config?.version || ''}
onChange={(event) => setVersion(event.target.value)}
/>
16 changes: 15 additions & 1 deletion docs/how/updating-datahub.md
@@ -6,6 +6,16 @@ This file documents any backwards-incompatible changes in DataHub and assists pe

### Breaking Changes

### Potential Downtime

### Deprecations

### Other notable Changes

## 0.10.5

### Breaking Changes

- #8201: Python SDK: In the DataFlow class, the `cluster` argument is deprecated in favor of `env` (see the sketch after this list).
- #8263: Okta source config option `okta_profile_to_username_attr` default changed from `login` to `email`.
This determines which Okta profile attribute is used for the corresponding DataHub user
@@ -16,9 +26,13 @@ certain column-level metrics. Instead, set `profile_table_level_only` to `false`
individually enable / disable desired field metrics.
- #8451: The `bigquery-beta` and `snowflake-beta` source aliases have been dropped. Use `bigquery` and `snowflake` as the source type instead.
- #8472: Ingestion runs created with Pipeline.create will show up in the DataHub ingestion tab as CLI-based runs. To revert to the previous behavior of not showing these runs in DataHub, pass `no_default_report=True`.
- #8513: The `snowflake` connector will now use the user's `email` attribute as-is in the urn. To revert to the previous behavior, disable `email_as_user_identifier` in the recipe.
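
A minimal sketch of the #8201 migration, assuming the `DataFlow` class from the DataHub Python SDK (the import path and surrounding arguments are illustrative, not taken verbatim from this commit):

```python
from datahub.api.entities.datajob import DataFlow

# Before (deprecated): the environment was passed via `cluster`.
# flow = DataFlow(orchestrator="airflow", id="my_etl_flow", cluster="PROD")

# After (#8201): pass the environment via `env` instead.
flow = DataFlow(orchestrator="airflow", id="my_etl_flow", env="PROD")
```
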
### Potential Downtime

- The BrowsePathsV2 upgrade will now be handled by the `system-update` job in non-blocking mode. This process generates the data needed for the new search and browse feature. It must complete before the new search and browse UI is enabled; while the upgrade is running, entities will be missing from the UI. If you are not using the new search and browse UI, there is no impact and the update completes in the background.

### Deprecations

- #8198: In the Python SDK, the `PlatformKey` class has been renamed to `ContainerKey` (see the sketch below).
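
A minimal sketch of the #8198 rename, assuming `ContainerKey` is importable from `datahub.emitter.mcp_builder` and keeps the old `PlatformKey` fields (both assumptions, not verified against this commit):

```python
from datahub.emitter.mcp_builder import ContainerKey  # formerly PlatformKey

# Construct the key exactly as PlatformKey was constructed before the rename.
key = ContainerKey(platform="snowflake", instance="PROD")
```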
2 changes: 1 addition & 1 deletion gradle/versioning/versioning.gradle
@@ -21,7 +21,7 @@ Produces the following variables and supports token replacement
import org.apache.tools.ant.filters.ReplaceTokens

def detailedVersionString = "0.0.0-unknown-SNAPSHOT"
def cliMajorVersion = "0.10.4" // base default cli major version
def cliMajorVersion = "0.10.5" // base default cli major version
def snapshotVersion = false
if (project.hasProperty("releaseVersion")) {
version = releaseVersion
Expand Down
@@ -120,6 +120,11 @@ class SnowflakeV2Config(
"upstreams_deny_pattern", "temporary_tables_pattern"
)

+ email_as_user_identifier: bool = Field(
+     default=True,
+     description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is provided, generates email addresses for snowflake users with unset emails, based on their username.",
+ )

@validator("include_column_lineage")
def validate_include_column_lineage(cls, v, values):
if not values.get("include_table_lineage") and v:
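A minimal sketch of the behavior described by the new `email_as_user_identifier` option (assumptions: this is a standalone re-implementation, not the connector's code; `make_user_urn` is from the DataHub SDK; Snowflake's identifier lowercasing is simplified):

```python
from typing import Optional

from datahub.emitter.mce_builder import make_user_urn

def user_urn(user_name: str, user_email: Optional[str], email_as_user_identifier: bool = True) -> str:
    # With the flag on (the new default), the email is used as-is; with it
    # off, only the part before the "@" is kept (the previous behavior).
    if user_email:
        identifier = user_email if email_as_user_identifier else user_email.split("@")[0]
    else:
        identifier = user_name  # email is optional in Snowflake; user_name is always present
    return make_user_urn(identifier.lower())

print(user_urn("ABC", "abc@xyz.com"))         # urn:li:corpuser:abc@xyz.com
print(user_urn("ABC", "abc@xyz.com", False))  # urn:li:corpuser:abc
```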
@@ -218,7 +218,9 @@ def build_usage_statistics_for_dataset(self, dataset_identifier, row):
)
if self.config.include_top_n_queries
else None,
- userCounts=self._map_user_counts(json.loads(row["USER_COUNTS"])),
+ userCounts=self._map_user_counts(
+     json.loads(row["USER_COUNTS"]),
+ ),
fieldCounts=self._map_field_counts(json.loads(row["FIELD_COUNTS"])),
)

@@ -247,7 +249,10 @@ def _map_top_sql_queries(self, top_sql_queries: Dict) -> List[str]:
]
)

- def _map_user_counts(self, user_counts: Dict) -> List[DatasetUserUsageCounts]:
+ def _map_user_counts(
+     self,
+     user_counts: Dict,
+ ) -> List[DatasetUserUsageCounts]:
filtered_user_counts = []
for user_count in user_counts:
user_email = user_count.get("email")
Expand All @@ -261,7 +266,11 @@ def _map_user_counts(self, user_counts: Dict) -> List[DatasetUserUsageCounts]:
filtered_user_counts.append(
DatasetUserUsageCounts(
user=make_user_urn(
- self.get_user_identifier(user_count["user_name"], user_email)
+ self.get_user_identifier(
+     user_count["user_name"],
+     user_email,
+     self.config.email_as_user_identifier,
+ )
),
count=user_count["total"],
# NOTE: Generated emails may be incorrect, as email may be different than
@@ -347,6 +356,7 @@ def _check_usage_date_ranges(self) -> Any:
def _get_operation_aspect_work_unit(
self, event: SnowflakeJoinedAccessEvent, discovered_datasets: List[str]
) -> Iterable[MetadataWorkUnit]:

if event.query_start_time and event.query_type:
start_time = event.query_start_time
query_type = event.query_type
Expand All @@ -357,7 +367,11 @@ def _get_operation_aspect_work_unit(
)
reported_time: int = int(time.time() * 1000)
last_updated_timestamp: int = int(start_time.timestamp() * 1000)
- user_urn = make_user_urn(self.get_user_identifier(user_name, user_email))
+ user_urn = make_user_urn(
+     self.get_user_identifier(
+         user_name, user_email, self.config.email_as_user_identifier
+     )
+ )

# NOTE: In earlier `snowflake-usage` connector this was base_objects_accessed, which is incorrect
for obj in event.objects_modified:
@@ -199,10 +199,17 @@ def get_dataset_identifier_from_qualified_name(
# Users without email were skipped from both user entries as well as aggregates.
# However email is not mandatory field in snowflake user, user_name is always present.
def get_user_identifier(
- self: SnowflakeCommonProtocol, user_name: str, user_email: Optional[str]
+ self: SnowflakeCommonProtocol,
+ user_name: str,
+ user_email: Optional[str],
+ email_as_user_identifier: bool,
) -> str:
if user_email:
-     return self.snowflake_identifier(user_email.split("@")[0])
+     return self.snowflake_identifier(
+         user_email
+         if email_as_user_identifier is True
+         else user_email.split("@")[0]
+     )
return self.snowflake_identifier(user_name)

# TODO: Revisit this after stateful ingestion can commit checkpoint
22 changes: 18 additions & 4 deletions metadata-ingestion/src/datahub/ingestion/source/sql/mssql.py
@@ -58,6 +58,10 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
default=None,
description="database (catalog). If set to Null, all databases will be considered for ingestion.",
)
+ convert_urns_to_lowercase: bool = Field(
+     default=False,
+     description="Enable to convert the SQL Server assets urns to lowercase",
+ )

@pydantic.validator("uri_args")
def passwords_match(cls, v, values, **kwargs):
@@ -255,10 +259,20 @@ def get_identifier(
self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any
) -> str:
regular = f"{schema}.{entity}"

qualified_table_name = regular

if self.config.database:
if self.config.database_alias:
return f"{self.config.database_alias}.{regular}"
return f"{self.config.database}.{regular}"
qualified_table_name = f"{self.config.database_alias}.{regular}"
else:
qualified_table_name = f"{self.config.database}.{regular}"

if self.current_database:
return f"{self.current_database}.{regular}"
return regular
qualified_table_name = f"{self.current_database}.{regular}"

return (
qualified_table_name.lower()
if self.config.convert_urns_to_lowercase
else qualified_table_name
)
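A minimal standalone sketch (an assumption-laden re-implementation, not the connector class itself) of the identifier logic above, showing the effect of the new `convert_urns_to_lowercase` option:

```python
from typing import Optional

def get_identifier(
    schema: str,
    entity: str,
    database: Optional[str] = None,
    database_alias: Optional[str] = None,
    current_database: Optional[str] = None,
    convert_urns_to_lowercase: bool = False,
) -> str:
    regular = f"{schema}.{entity}"
    qualified = regular
    if database:
        # The configured alias, when set, wins over the database name.
        qualified = f"{database_alias or database}.{regular}"
    if current_database:
        # The database currently being ingested overrides the configured one.
        qualified = f"{current_database}.{regular}"
    return qualified.lower() if convert_urns_to_lowercase else qualified

print(get_identifier("dbo", "Orders", database="Sales",
                     convert_urns_to_lowercase=True))  # sales.dbo.orders
```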
@@ -5779,7 +5779,7 @@
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
},
"actor": "urn:li:corpuser:abc",
"actor": "urn:li:corpuser:abc@xyz.com",
"operationType": "CREATE",
"lastUpdatedTimestamp": 1654144861367
}
@@ -5801,7 +5801,7 @@
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
},
"actor": "urn:li:corpuser:abc",
"actor": "urn:li:corpuser:abc@xyz.com",
"operationType": "CREATE",
"lastUpdatedTimestamp": 1654144861367
}
@@ -5823,7 +5823,7 @@
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
},
"actor": "urn:li:corpuser:abc",
"actor": "urn:li:corpuser:abc@xyz.com",
"operationType": "CREATE",
"lastUpdatedTimestamp": 1654144861367
}
@@ -5845,7 +5845,7 @@
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
},
"actor": "urn:li:corpuser:abc",
"actor": "urn:li:corpuser:abc@xyz.com",
"operationType": "CREATE",
"lastUpdatedTimestamp": 1654144861367
}
@@ -5867,7 +5867,7 @@
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
},
"actor": "urn:li:corpuser:abc",
"actor": "urn:li:corpuser:abc@xyz.com",
"operationType": "CREATE",
"lastUpdatedTimestamp": 1654144861367
}
@@ -5889,7 +5889,7 @@
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
},
"actor": "urn:li:corpuser:abc",
"actor": "urn:li:corpuser:abc@xyz.com",
"operationType": "CREATE",
"lastUpdatedTimestamp": 1654144861367
}
@@ -5911,7 +5911,7 @@
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
},
"actor": "urn:li:corpuser:abc",
"actor": "urn:li:corpuser:abc@xyz.com",
"operationType": "CREATE",
"lastUpdatedTimestamp": 1654144861367
}
@@ -5933,7 +5933,7 @@
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
},
"actor": "urn:li:corpuser:abc",
"actor": "urn:li:corpuser:abc@xyz.com",
"operationType": "CREATE",
"lastUpdatedTimestamp": 1654144861367
}
@@ -5955,7 +5955,7 @@
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
},
"actor": "urn:li:corpuser:abc",
"actor": "urn:li:corpuser:abc@xyz.com",
"operationType": "CREATE",
"lastUpdatedTimestamp": 1654144861367
}
@@ -5977,7 +5977,7 @@
"type": "FULL_TABLE",
"partition": "FULL_TABLE_SNAPSHOT"
},
"actor": "urn:li:corpuser:abc",
"actor": "urn:li:corpuser:abc@xyz.com",
"operationType": "CREATE",
"lastUpdatedTimestamp": 1654144861367
}
@@ -124,6 +124,7 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph):
use_legacy_lineage_method=False,
validate_upstreams_against_patterns=False,
include_operational_stats=True,
+ email_as_user_identifier=True,
start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace(
tzinfo=timezone.utc
),
