Skip to content

Commit

Permalink
fix(ingest/Glue): column upstream lineage between S3 and Glue (datahu…
Browse files Browse the repository at this point in the history
  • Loading branch information
sagar-salvi-apptware authored Jul 19, 2024
1 parent 3733a40 commit 348d449
Show file tree
Hide file tree
Showing 5 changed files with 2,172 additions and 6 deletions.
89 changes: 86 additions & 3 deletions metadata-ingestion/src/datahub/ingestion/source/aws/glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from pydantic import validator
from pydantic.fields import Field

from datahub.api.entities.dataset.dataset import Dataset
from datahub.configuration.common import AllowDenyPattern
from datahub.configuration.source_common import DatasetSourceConfigMixin
from datahub.emitter import mce_builder
Expand Down Expand Up @@ -55,7 +56,11 @@
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.aws import s3_util
from datahub.ingestion.source.aws.aws_common import AwsSourceConfig
from datahub.ingestion.source.aws.s3_util import is_s3_uri, make_s3_urn
from datahub.ingestion.source.aws.s3_util import (
is_s3_uri,
make_s3_urn,
make_s3_urn_for_lineage,
)
from datahub.ingestion.source.common.subtypes import (
DatasetContainerSubTypes,
DatasetSubTypes,
Expand Down Expand Up @@ -90,13 +95,17 @@
DatasetLineageTypeClass,
DatasetProfileClass,
DatasetPropertiesClass,
FineGrainedLineageClass,
FineGrainedLineageDownstreamTypeClass,
FineGrainedLineageUpstreamTypeClass,
GlobalTagsClass,
MetadataChangeEventClass,
OwnerClass,
OwnershipClass,
OwnershipTypeClass,
PartitionSpecClass,
PartitionTypeClass,
SchemaMetadataClass,
TagAssociationClass,
UpstreamClass,
UpstreamLineageClass,
Expand Down Expand Up @@ -171,6 +180,11 @@ class GlueSourceConfig(
description="If enabled, delta schemas can be alternatively fetched from table parameters.",
)

include_column_lineage: bool = Field(
default=True,
description="When enabled, column-level lineage will be extracted from the s3.",
)

def is_profiling_enabled(self) -> bool:
return self.profiling is not None and is_profiling_enabled(
self.profiling.operation_config
Expand Down Expand Up @@ -283,6 +297,7 @@ class GlueSource(StatefulIngestionSourceBase):

def __init__(self, config: GlueSourceConfig, ctx: PipelineContext):
super().__init__(config, ctx)
self.ctx = ctx
self.extract_owners = config.extract_owners
self.source_config = config
self.report = GlueSourceReport()
Expand Down Expand Up @@ -714,18 +729,43 @@ def get_lineage_if_enabled(
dataset_properties: Optional[
DatasetPropertiesClass
] = mce_builder.get_aspect_if_available(mce, DatasetPropertiesClass)
# extract dataset schema aspect
schema_metadata: Optional[
SchemaMetadataClass
] = mce_builder.get_aspect_if_available(mce, SchemaMetadataClass)

if dataset_properties and "Location" in dataset_properties.customProperties:
location = dataset_properties.customProperties["Location"]
if is_s3_uri(location):
s3_dataset_urn = make_s3_urn(location, self.source_config.env)
s3_dataset_urn = make_s3_urn_for_lineage(
location, self.source_config.env
)
assert self.ctx.graph
schema_metadata_for_s3: Optional[
SchemaMetadataClass
] = self.ctx.graph.get_schema_metadata(s3_dataset_urn)

if self.source_config.glue_s3_lineage_direction == "upstream":
fine_grained_lineages = None
if (
self.source_config.include_column_lineage
and schema_metadata
and schema_metadata_for_s3
):
fine_grained_lineages = self.get_fine_grained_lineages(
mce.proposedSnapshot.urn,
s3_dataset_urn,
schema_metadata,
schema_metadata_for_s3,
)
upstream_lineage = UpstreamLineageClass(
upstreams=[
UpstreamClass(
dataset=s3_dataset_urn,
type=DatasetLineageTypeClass.COPY,
)
]
],
fineGrainedLineages=fine_grained_lineages or None,
)
return MetadataChangeProposalWrapper(
entityUrn=mce.proposedSnapshot.urn,
Expand All @@ -747,6 +787,49 @@ def get_lineage_if_enabled(
).as_workunit()
return None

def get_fine_grained_lineages(
self,
dataset_urn: str,
s3_dataset_urn: str,
schema_metadata: SchemaMetadata,
schema_metadata_for_s3: SchemaMetadata,
) -> Optional[List[FineGrainedLineageClass]]:
def simplify_field_path(field_path):
return Dataset._simplify_field_path(field_path)

if schema_metadata and schema_metadata_for_s3:
fine_grained_lineages: List[FineGrainedLineageClass] = []
for field in schema_metadata.fields:
field_path_v1 = simplify_field_path(field.fieldPath)
matching_s3_field = next(
(
f
for f in schema_metadata_for_s3.fields
if simplify_field_path(f.fieldPath) == field_path_v1
),
None,
)
if matching_s3_field:
fine_grained_lineages.append(
FineGrainedLineageClass(
downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
downstreams=[
mce_builder.make_schema_field_urn(
dataset_urn, field_path_v1
)
],
upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
upstreams=[
mce_builder.make_schema_field_urn(
s3_dataset_urn,
simplify_field_path(matching_s3_field.fieldPath),
)
],
)
)
return fine_grained_lineages
return None

def _create_profile_mcp(
self,
mce: MetadataChangeEventClass,
Expand Down
Loading

0 comments on commit 348d449

Please sign in to comment.