diff --git a/.gitignore b/.gitignore index 1b691ad0aaca4..631630d64c7fa 100644 --- a/.gitignore +++ b/.gitignore @@ -40,3 +40,6 @@ MANIFEST **/.DS_Store .vscode + +# Metadata Ingestion Generated +metadata-ingestion/generated/** diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/group/EntityCountsResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/group/EntityCountsResolver.java index 8edce498219af..9fba4b8ca7712 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/group/EntityCountsResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/group/EntityCountsResolver.java @@ -8,6 +8,7 @@ import com.linkedin.entity.client.EntityClient; import graphql.schema.DataFetcher; import graphql.schema.DataFetchingEnvironment; +import io.opentelemetry.extension.annotations.WithSpan; import java.util.List; import java.util.Map; import java.util.concurrent.CompletableFuture; @@ -25,6 +26,7 @@ public EntityCountsResolver(final EntityClient entityClient) { } @Override + @WithSpan public CompletableFuture get(final DataFetchingEnvironment environment) throws Exception { final QueryContext context = environment.getContext(); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/recommendation/ListRecommendationsResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/recommendation/ListRecommendationsResolver.java index 123ac2a3acc53..2c00eee5f9106 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/recommendation/ListRecommendationsResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/recommendation/ListRecommendationsResolver.java @@ -21,6 +21,7 @@ import com.linkedin.metadata.recommendation.SearchRequestContext; import graphql.schema.DataFetcher; import graphql.schema.DataFetchingEnvironment; +import 
io.opentelemetry.extension.annotations.WithSpan; import java.net.URISyntaxException; import java.util.Collections; import java.util.List; @@ -41,6 +42,7 @@ public class ListRecommendationsResolver implements DataFetcher get(DataFetchingEnvironment environment) { final ListRecommendationsInput input = diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchResolver.java index b91e98a6ac007..78e39ae346efc 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchResolver.java @@ -8,6 +8,7 @@ import com.linkedin.entity.client.EntityClient; import graphql.schema.DataFetcher; import graphql.schema.DataFetchingEnvironment; +import io.opentelemetry.extension.annotations.WithSpan; import java.util.concurrent.CompletableFuture; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -28,6 +29,7 @@ public class SearchResolver implements DataFetcher get(DataFetchingEnvironment environment) { final SearchInput input = bindArgument(environment.getArgument("input"), SearchInput.class); final String entityName = EntityTypeMapper.getName(input.getType()); diff --git a/docs-website/build.gradle b/docs-website/build.gradle index 0d591ca615264..b5edf446a6c6e 100644 --- a/docs-website/build.gradle +++ b/docs-website/build.gradle @@ -12,7 +12,7 @@ node { } // Version of node to use. - version = '14.15.3' + version = '16.8.0' // Version of Yarn to use. 
yarnVersion = '1.22.0' diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index ab265311733f9..36c178058c889 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -221,6 +221,8 @@ module.exports = { // WIP "docs/advanced/entity-hierarchy", // WIP "docs/advanced/partial-update", // WIP "docs/advanced/pdl-best-practices", + // WIP "docs/introducing-metadata-service-authentication" + // WIP "metadata-models-custom/README" ], }, ], diff --git a/docs/modeling/metadata-model.md b/docs/modeling/metadata-model.md index 585b2378552e3..34649ef99d80a 100644 --- a/docs/modeling/metadata-model.md +++ b/docs/modeling/metadata-model.md @@ -39,6 +39,17 @@ For example, here are helpful links to the most popular entities in DataHub's me * Feature Table (a.k.a. MLFeatureTable): [Profile](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:datahub,MlFeatureTable,PROD)/Schema?is_lineage_mode=false) [Documentation](https://demo.datahubproject.io/dataset/urn:li:dataset:(urn:li:dataPlatform:datahub,MlFeatureTable,PROD)/Documentation?is_lineage_mode=false) * For the full list of entities in the metadata model, browse them [here](https://demo.datahubproject.io/browse/dataset/prod/datahub/entities) +### Generating documentation for the Metadata Model + +The metadata model documentation can be generated and uploaded into a running DataHub instance using the following command. + +```console +./gradlew :metadata-ingestion:modelDocUpload +``` + +**_NOTE_**: This will upload the model documentation to the DataHub instance whose address is given by the `$DATAHUB_SERVER` environment variable (http://localhost:8080 by default). + +It will also generate a few files under `metadata-ingestion/generated/docs` such as a dot file called `metadata_graph.dot` that you can use to visualize the relationships among the entities. 
## Querying the Metadata Graph diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/registry/config/Entity.java b/entity-registry/src/main/java/com/linkedin/metadata/models/registry/config/Entity.java index 0bd1582fb6349..075098c80d79b 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/models/registry/config/Entity.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/models/registry/config/Entity.java @@ -12,6 +12,7 @@ @AllArgsConstructor public class Entity { String name; + String doc; String keyAspect; List aspects; } diff --git a/metadata-events/mxe-registration/src/main/java/com/linkedin/mxe/Configs.java b/metadata-events/mxe-registration/src/main/java/com/linkedin/mxe/Configs.java index 1364edd89a9b3..df06d1bae28e0 100644 --- a/metadata-events/mxe-registration/src/main/java/com/linkedin/mxe/Configs.java +++ b/metadata-events/mxe-registration/src/main/java/com/linkedin/mxe/Configs.java @@ -3,8 +3,6 @@ import com.linkedin.pegasus2avro.mxe.FailedMetadataChangeEvent; import com.linkedin.pegasus2avro.mxe.MetadataAuditEvent; import com.linkedin.pegasus2avro.mxe.MetadataChangeEvent; -import com.linkedin.pegasus2avro.mxe.MetadataGraphEvent; -import com.linkedin.pegasus2avro.mxe.MetadataSearchEvent; import java.util.Collections; import java.util.HashMap; import java.util.Map; @@ -26,8 +24,6 @@ public class Configs { put(Topics.METADATA_AUDIT_EVENT, MetadataAuditEvent.SCHEMA$); put(Topics.METADATA_CHANGE_EVENT, MetadataChangeEvent.SCHEMA$); put(Topics.FAILED_METADATA_CHANGE_EVENT, FailedMetadataChangeEvent.SCHEMA$); - put(Topics.METADATA_GRAPH_EVENT, MetadataGraphEvent.SCHEMA$); - put(Topics.METADATA_SEARCH_EVENT, MetadataSearchEvent.SCHEMA$); put(Topics.DEV_METADATA_AUDIT_EVENT, MetadataAuditEvent.SCHEMA$); put(Topics.DEV_METADATA_CHANGE_EVENT, MetadataChangeEvent.SCHEMA$); diff --git a/metadata-ingestion/build.gradle b/metadata-ingestion/build.gradle index 454a29c71248f..f809e89b00066 100644 --- 
a/metadata-ingestion/build.gradle +++ b/metadata-ingestion/build.gradle @@ -42,6 +42,18 @@ task installDev(type: Exec, dependsOn: [install]) { commandLine 'bash', '-x', '-c', "${venv_name}/bin/pip install -e .[dev] && touch ${venv_name}/.build_install_dev_sentinel" } + +task modelDocGen(type: Exec, dependsOn: [codegen, installDev]) { + inputs.files(project.fileTree(dir: "../metadata-events/mxe-schemas/src/", include: "**/*.avsc")) + outputs.dir('generated/docs') + commandLine 'bash', '-c', "source ${venv_name}/bin/activate && ./scripts/modeldocgen.sh" +} + +task modelDocUpload(type: Exec, dependsOn: [modelDocGen]) { + commandLine 'bash', '-c', "source ${venv_name}/bin/activate && ./scripts/modeldocupload.sh" +} + + task lint(type: Exec, dependsOn: installDev) { /* The find/sed combo below is a temporary work-around for the following mypy issue with airflow 2.2.0: @@ -79,7 +91,6 @@ task installDevTest(type: Exec, dependsOn: [installDev]) { def testFile = hasProperty('testFile') ? testFile : 'unknown' task testSingle(dependsOn: [installDevTest]) { - println "$testFile" doLast { if (testFile != 'unknown') { exec { diff --git a/metadata-ingestion/scripts/modeldocgen.py b/metadata-ingestion/scripts/modeldocgen.py new file mode 100644 index 0000000000000..3426ca5375976 --- /dev/null +++ b/metadata-ingestion/scripts/modeldocgen.py @@ -0,0 +1,582 @@ +import json +import logging +import unittest.mock +from dataclasses import Field, dataclass, field +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + +import avro.schema +import click + +from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.emitter.rest_emitter import DatahubRestEmitter +from datahub.ingestion.api.common import PipelineContext, RecordEnvelope +from datahub.ingestion.api.sink import NoopWriteCallback +from datahub.ingestion.extractor.schema_util import avro_schema_to_mce_fields 
+from datahub.ingestion.sink.file import FileSink, FileSinkConfig +from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField +from datahub.metadata.schema_classes import ( + BrowsePathsClass, + ChangeTypeClass, + DatasetPropertiesClass, + DatasetSnapshotClass, + ForeignKeyConstraintClass, + GlobalTagsClass, + MetadataChangeEventClass, + OtherSchemaClass, + SchemaFieldDataTypeClass, + SchemaMetadataClass, + StringTypeClass, + SubTypesClass, + SystemMetadataClass, + TagAssociationClass, +) + +logger = logging.getLogger(__name__) + + +def capitalize_first(something: str) -> str: + return something[0:1].upper() + something[1:] + + +@dataclass +class EntityDefinition: + name: str + keyAspect: str + aspects: List[str] = field(default_factory=list) + aspect_map: Optional[Dict[str, Any]] = None + relationship_map: Optional[Dict[str, str]] = None + doc: Optional[str] = None + # schema: Optional[avro.schema.Schema] = None + # logical_schema: Optional[avro.schema.Schema] = None + + # @validator("name") + # def lower_everything(cls, v: str) -> str: + # return v.lower() + + @property + def display_name(self): + return capitalize_first(self.name) + + +@dataclass +class AspectDefinition: + name: str + EntityUrns: Optional[List[str]] = None + schema: Optional[avro.schema.Schema] = None + type: Optional[str] = None + + +entity_registry: Dict[str, EntityDefinition] = {} + + +def get_aspects_from_snapshot( + snapshot_schema: avro.schema.RecordSchema, +) -> Dict[str, AspectDefinition]: + union_schema: avro.schema.UnionSchema = snapshot_schema.fields[1].type.items + aspect_map = {} + for aspect_schema in union_schema.schemas: + if "Aspect" in aspect_schema.props: + aspectDef = AspectDefinition( + schema=aspect_schema, + name=aspect_schema.props["Aspect"].get("name"), + ) + aspect_map[aspectDef.name] = aspectDef + + return aspect_map + + +aspect_registry: Dict[str, AspectDefinition] = {} + + +# Patch add_name method to NOT complain about duplicate names +def 
add_name(self, name_attr, space_attr, new_schema): + to_add = avro.schema.Name(name_attr, space_attr, self.default_namespace) + + if self.names: + self.names[to_add.fullname] = new_schema + return to_add + + +def load_schema_file(schema_file: str) -> None: + + with open(schema_file) as f: + raw_schema_text = f.read() + + avro_schema = avro.schema.parse(raw_schema_text) + + if ( + isinstance(avro_schema, avro.schema.RecordSchema) + and "Aspect" in avro_schema.other_props + ): + # probably an aspect schema + record_schema: avro.schema.RecordSchema = avro_schema + aspect_def = record_schema.get_prop("Aspect") + try: + aspect_definition = AspectDefinition(**aspect_def) + except Exception as e: + import pdb + + breakpoint() + + aspect_definition.schema = record_schema + aspect_registry[aspect_definition.name] = aspect_definition + elif avro_schema.name == "MetadataChangeEvent": + # probably an MCE schema + field: Field = avro_schema.fields[1] + assert isinstance(field.type, avro.schema.UnionSchema) + for member_schema in field.type.schemas: + if "Entity" in member_schema.props: + entity_def = member_schema.props["Entity"] + entity_name = entity_def["name"] + entity_definition = entity_registry.get( + entity_name, EntityDefinition(**entity_def) + ) + entity_definition.aspect_map = get_aspects_from_snapshot(member_schema) + all_aspects = [a for a in entity_definition.aspect_map.keys()] + # in terms of order, we prefer the aspects from snapshot over the aspects from the config registry + # so we flip the aspect list here + for aspect_name in entity_definition.aspects: + if aspect_name not in all_aspects: + all_aspects.append(aspect_name) + entity_definition.aspects = all_aspects + entity_registry[entity_name] = entity_definition + else: + print(f"Ignoring schema {schema_file}") + + +@dataclass +class Relationship: + name: str + src: str + dst: str + doc: Optional[str] = None + id: Optional[str] = None + + +@dataclass +class RelationshipAdjacency: + self_loop: 
List[Relationship] = field(default_factory=list) + incoming: List[Relationship] = field(default_factory=list) + outgoing: List[Relationship] = field(default_factory=list) + + +@dataclass +class RelationshipGraph: + map: Dict[str, RelationshipAdjacency] = field(default_factory=dict) + + def add_edge( + self, src: str, dst: str, label: str, reason: str, edge_id: Optional[str] = None + ) -> None: + relnship = Relationship( + label, src, dst, reason, id=edge_id or f"{src}:{label}:{dst}:{reason}" + ) + + if src == dst: + adjacency = self.map.get(src, RelationshipAdjacency()) + for reln in adjacency.self_loop: + if relnship.id == reln.id: + print(f"Skipping adding edge since ids match {reln.id}") + return + adjacency.self_loop.append(relnship) + self.map[src] = adjacency + else: + adjacency = self.map.get(src, RelationshipAdjacency()) + for reln in adjacency.outgoing: + if relnship.id == reln.id: + logger.info(f"Skipping adding edge since ids match {reln.id}") + return + + adjacency.outgoing.append(relnship) + self.map[src] = adjacency + + adjacency = self.map.get(dst, RelationshipAdjacency()) + for reln in adjacency.incoming: + if relnship.id == reln.id: + logger.info(f"Skipping adding edge since ids match {reln.id}") + return + + adjacency.incoming.append(relnship) + self.map[dst] = adjacency + + def get_adjacency(self, node: str) -> RelationshipAdjacency: + return self.map.get(node, RelationshipAdjacency()) + + +def make_relnship_docs(relationships: List[Relationship], direction: str) -> str: + doc = "" + map: Dict[str, List[Relationship]] = {} + for relnship in relationships: + map[relnship.name] = map.get(relnship.name, []) + map[relnship.name].append(relnship) + for rel_name, relnships in map.items(): + doc += f"\n- {rel_name}\n" + for relnship in relnships: + doc += f"\n - {relnship.dst if direction == 'outgoing' else relnship.src}{relnship.doc or ''}" + return doc + + +def make_entity_docs(entity_display_name: str, graph: RelationshipGraph) -> str: + entity_name 
= entity_display_name[0:1].lower() + entity_display_name[1:] + entity_def: Optional[EntityDefinition] = entity_registry.get(entity_name, None) + if entity_def: + import pdb + + # breakpoint() + doc = entity_def.doc or f"This is the {entity_def.display_name} entity." + # create relationships section + relationships_section = f"\n## Relationships\n" + adjacency = graph.get_adjacency(entity_def.display_name) + if adjacency.self_loop: + relationships_section += f"\n### Self\nThese are the relationships to itself, stored in this entity's aspects" + for relnship in adjacency.self_loop: + relationships_section += f"\n- {relnship.name} ({relnship.doc[1:] if relnship.doc else ''})" + + if adjacency.outgoing: + relationships_section += f"\n### Outgoing\nThese are the relationships stored in this entity's aspects" + relationships_section += make_relnship_docs( + adjacency.outgoing, direction="outgoing" + ) + + if adjacency.incoming: + relationships_section += f"\n### Incoming\nThese are the relationships stored in other entity's aspects" + relationships_section += make_relnship_docs( + adjacency.incoming, direction="incoming" + ) + + # create global metadata graph + global_graph_url = "https://github.com/linkedin/datahub/raw/master/docs/imgs/datahub-metadata-model.png" + global_graph_section = ( + f"\n## [Global Metadata Model]({global_graph_url})" + + f"\n![Global Graph]({global_graph_url})" + ) + return doc + relationships_section + global_graph_section + else: + raise Exception(f"Failed to find information for entity: {entity_name}") + + +def generate_stitched_record(relnships_graph: RelationshipGraph) -> List[Any]: + def strip_types(field_path: str) -> str: + import re + + final_path = field_path + final_path = re.sub(r"(\[type=[a-zA-Z]+\]\.)", "", final_path) + final_path = re.sub(r"^\[version=2.0\]\.", "", final_path) + return final_path + + datasets: List[DatasetSnapshotClass] = [] + + for entity_name, entity_def in entity_registry.items(): + entity_display_name = 
entity_def.display_name + entity_fields = [] + for aspect_name in entity_def.aspects: + if aspect_name not in aspect_registry: + print(f"Did not find aspect name: {aspect_name} in aspect_registry") + continue + import pdb + + # breakpoint() + # all aspects should have a schema + aspect_schema = aspect_registry[aspect_name].schema + assert aspect_schema + entity_fields.append( + { + "type": aspect_schema.to_json(), + "name": aspect_name, + } + ) + + if entity_fields: + names = avro.schema.Names() + field_objects = [] + for f in entity_fields: + field = avro.schema.Field( + type_=f["type"], + name=f["name"], + has_default=False, + ) + field_objects.append(field) + + with unittest.mock.patch("avro.schema.Names.add_name", add_name): + entity_avro_schema = avro.schema.RecordSchema( + name=entity_name, + namespace="datahub.metadata.model", + names=names, + fields=[], + ) + entity_avro_schema.set_prop("fields", field_objects) + rawSchema = json.dumps(entity_avro_schema.to_json()) + # always add the URN which is the primary key + urn_field = SchemaField( + fieldPath="urn", + type=SchemaFieldDataTypeClass(type=StringTypeClass()), + nativeDataType="string", + nullable=False, + isPartOfKey=True, + description=f"The primary identifier for the {entity_name} entity. 
See the {entity_def.keyAspect} field to understand the structure of this urn.", + ) + schema_fields: List[SchemaField] = [urn_field] + avro_schema_to_mce_fields( + rawSchema + ) + foreign_keys: List[ForeignKeyConstraintClass] = [] + source_dataset_urn = make_dataset_urn( + platform=make_data_platform_urn("datahub"), + name=f"{entity_display_name}", + ) + for f_field in schema_fields: + if f_field.jsonProps: + import pdb + #breakpoint() + json_dict = json.loads(f_field.jsonProps) + if "Aspect" in json_dict: + aspect_info = json_dict["Aspect"] + f_field.globalTags = f_field.globalTags or GlobalTagsClass( + tags=[] + ) + f_field.globalTags.tags.append( + TagAssociationClass(tag="urn:li:tag:Aspect") + ) + # if this is the key aspect, also add primary-key + if entity_def.keyAspect == aspect_info.get("name"): + f_field.isPartOfKey = True + + if "timeseries" == aspect_info.get("type", ""): + # f_field.globalTags = f_field.globalTags or GlobalTagsClass( + # tags=[] + # ) + f_field.globalTags.tags.append( + TagAssociationClass(tag="urn:li:tag:Temporal") + ) + import pdb + + # breakpoint() + if "Searchable" in json_dict: + f_field.globalTags = f_field.globalTags or GlobalTagsClass( + tags=[] + ) + f_field.globalTags.tags.append( + TagAssociationClass(tag="urn:li:tag:Searchable") + ) + if "Relationship" in json_dict: + relationship_info = json_dict["Relationship"] + # detect if we have relationship specified at leaf level or thru path specs + if "entityTypes" not in relationship_info: + # path spec + assert ( + len(relationship_info.keys()) == 1 + ), "We should never have more than one path spec assigned to a relationship annotation" + final_info = None + for k, v in relationship_info.items(): + final_info = v + relationship_info = final_info + + assert "entityTypes" in relationship_info + + entity_types: List[str] = relationship_info.get( + "entityTypes", [] + ) + relnship_name = relationship_info.get("name", None) + for entity_type in entity_types: + destination_entity_name 
= capitalize_first(entity_type) + + foreign_dataset_urn = make_dataset_urn( + platform=make_data_platform_urn("datahub"), + name=destination_entity_name, + ) + fkey = ForeignKeyConstraintClass( + name=relnship_name, + foreignDataset=foreign_dataset_urn, + foreignFields=[ + f"urn:li:schemaField:({foreign_dataset_urn}, urn)" + ], + sourceFields=[ + f"urn:li:schemaField:({source_dataset_urn},{f_field.fieldPath})" + ], + ) + foreign_keys.append(fkey) + relnships_graph.add_edge( + entity_display_name, + destination_entity_name, + fkey.name, + f" via `{strip_types(f_field.fieldPath)}`", + edge_id=f"{entity_display_name}:{fkey.name}:{destination_entity_name}:{strip_types(f_field.fieldPath)}", + ) + + schemaMetadata = SchemaMetadataClass( + schemaName=f"{entity_name}", + platform=make_data_platform_urn("datahub"), + platformSchema=OtherSchemaClass(rawSchema=rawSchema), + fields=schema_fields, + version=0, + hash="", + foreignKeys=foreign_keys if foreign_keys else None, + ) + + dataset = DatasetSnapshotClass( + urn=make_dataset_urn( + platform=make_data_platform_urn("datahub"), + name=f"{entity_display_name}", + ), + aspects=[ + schemaMetadata, + GlobalTagsClass( + tags=[TagAssociationClass(tag="urn:li:tag:Entity")] + ), + BrowsePathsClass([f"/prod/datahub/entities/{entity_display_name}"]), + ], + ) + datasets.append(dataset) + + events: List[Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]] = [] + + for d in datasets: + entity_name = d.urn.split(":")[-1].split(",")[1] + d.aspects.append( + DatasetPropertiesClass( + description=make_entity_docs(entity_name, relnships_graph) + ) + ) + + mce = MetadataChangeEventClass( + proposedSnapshot=d, + systemMetadata=SystemMetadataClass(runId="test-metamodel"), + ) + events.append(mce) + + mcp = MetadataChangeProposalWrapper( + entityType="dataset", + changeType=ChangeTypeClass.UPSERT, + entityUrn=d.urn, + aspectName="subTypes", + aspect=SubTypesClass(typeNames=["entity"]), + ) + events.append(mcp) + return events + + 
+from datahub.configuration.common import ConfigModel + + +class EntityRegistry(ConfigModel): + entities: List[EntityDefinition] + + +def load_registry_file(registry_file: str) -> Dict[str, EntityDefinition]: + import yaml + + with open(registry_file, "r") as f: + registry = EntityRegistry.parse_obj(yaml.safe_load(f)) + for entity_def in registry.entities: + entity_registry[entity_def.name] = entity_def + + return entity_registry + + +@click.command() +@click.argument("schema_files", type=click.Path(exists=True), nargs=-1, required=True) +@click.option("--server", type=str, required=False) +@click.option("--file", type=str, required=False) +@click.option("--dot", type=str, required=False) +@click.option("--png", type=str, required=False) +def generate( + schema_files: List[str], + server: Optional[str], + file: Optional[str], + dot: Optional[str], + png: Optional[str] +) -> None: + logger.info(f"server = {server}") + logger.info(f"file = {file}") + logger.info(f"dot = {dot}") + logger.info(f"png = {png}") + + for schema_file in schema_files: + if schema_file.endswith(".yml") or schema_file.endswith(".yaml"): + # registry file + load_registry_file(schema_file) + else: + # schema file + load_schema_file(schema_file) + + relationship_graph = RelationshipGraph() + events = generate_stitched_record(relationship_graph) + + if file: + logger.info(f"Will write events to {file}") + Path(file).parent.mkdir(parents=True, exist_ok=True) + fileSink = FileSink( + PipelineContext(run_id="generated-metaModel"), + FileSinkConfig(filename=file), + ) + for e in events: + fileSink.write_record_async( + RecordEnvelope(e, metadata={}), write_callback=NoopWriteCallback() + ) + fileSink.close() + pipeline_config = { + "source": { + "type": "file", + "config": {"filename": file}, + }, + "sink": { + "type": "datahub-rest", + "config": { + "server": "${DATAHUB_SERVER:-http://localhost:8080}", + "token": "${DATAHUB_TOKEN:-}", + }, + }, + "run_id": "modeldoc-generated", + } + pipeline_file = 
Path(file).parent.absolute() / "pipeline.yml" + with open(pipeline_file, "w") as f: + json.dump(pipeline_config, f, indent=2) + logger.info(f"Wrote pipeline to {pipeline_file}") + + if server: + logger.info(f"Will send events to {server}") + assert server.startswith("http://"), "server address must start with http://" + emitter = DatahubRestEmitter(gms_server=server) + emitter.test_connection() + for e in events: + emitter.emit(e) + + if dot: + logger.info(f"Will write dot file to {dot}") + + import pydot + + graph = pydot.Dot("my_graph", graph_type="graph") + for node, adjacency in relationship_graph.map.items(): + my_node = pydot.Node( + node, + label=node, + shape="box", + ) + graph.add_node(my_node) + if adjacency.self_loop: + for relnship in adjacency.self_loop: + graph.add_edge( + pydot.Edge( + src=relnship.src, dst=relnship.dst, label=relnship.name + ) + ) + if adjacency.outgoing: + for relnship in adjacency.outgoing: + graph.add_edge( + pydot.Edge( + src=relnship.src, dst=relnship.dst, label=relnship.name + ) + ) + Path(dot).parent.mkdir(parents=True, exist_ok=True) + graph.write_raw(dot) + if png: + try: + graph.write_png(png) + except Exception as e: + logger.error("Failed to create png file. Do you have graphviz installed?") + raise e + +if __name__ == "__main__": + logger.setLevel("INFO") + generate() diff --git a/metadata-ingestion/scripts/modeldocgen.sh b/metadata-ingestion/scripts/modeldocgen.sh new file mode 100755 index 0000000000000..c55be8bbda0bf --- /dev/null +++ b/metadata-ingestion/scripts/modeldocgen.sh @@ -0,0 +1,31 @@ +#!/bin/bash +set -euo pipefail + +OUTDIR=./generated/docs + +# Note: this assumes that datahub has already been built with `./gradlew build`. +DATAHUB_ROOT=.. 
+REGISTRY_ROOT="$DATAHUB_ROOT/metadata-models/src/main/resources" +SCHEMAS_ROOT="$DATAHUB_ROOT/metadata-models/src/mainGeneratedAvroSchema/avro/com/linkedin" +FILES="$REGISTRY_ROOT/entity-registry.yml $SCHEMAS_ROOT/mxe/MetadataChangeEvent.avsc" +# Since we depend on jq, check if jq is installed +if ! which jq > /dev/null; then + echo "jq is not installed. Please install jq and rerun (https://stedolan.github.io/jq/)" + exit 1 +fi + +find $SCHEMAS_ROOT -name "*.avsc" | sort | while read file +do +# Add all other files that are aspects but not included in the above + if (jq '.Aspect' -e $file > /dev/null) + then + FILES="${FILES} ${file}" + fi + echo $FILES > /tmp/docgen_files.txt +done + +FILES=$(cat /tmp/docgen_files.txt) + +rm -r $OUTDIR || true +#echo $FILES +python scripts/modeldocgen.py $FILES --dot generated/docs/metadata_graph.dot --file generated/docs/metadata_model_mces.json $@ diff --git a/metadata-ingestion/scripts/modeldocupload.sh b/metadata-ingestion/scripts/modeldocupload.sh new file mode 100755 index 0000000000000..e04cf82c25fb4 --- /dev/null +++ b/metadata-ingestion/scripts/modeldocupload.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -euo pipefail + +datahub ingest -c generated/docs/pipeline.yml diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 0d89d127a6572..58fa876030c2f 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -190,6 +190,7 @@ def get_long_description(): "jsonpickle", "build", "twine", + "pydot", *list( dependency for plugin in [ diff --git a/metadata-ingestion/src/datahub/cli/ingest_cli.py b/metadata-ingestion/src/datahub/cli/ingest_cli.py index 3a55899c30256..2c4317013ef5a 100644 --- a/metadata-ingestion/src/datahub/cli/ingest_cli.py +++ b/metadata-ingestion/src/datahub/cli/ingest_cli.py @@ -56,7 +56,12 @@ def ingest() -> None: default=False, help="Perform limited ingestion from the source to the sink to get a quick preview.", ) -def run(config: str, dry_run: bool, preview: bool) -> None: 
+@click.option( + "--strict-warnings/--no-strict-warnings", + default=False, + help="If enabled, ingestion runs with warnings will yield a non-zero error code", +) +def run(config: str, dry_run: bool, preview: bool, strict_warnings: bool) -> None: """Ingest metadata into DataHub.""" logger.debug("DataHub CLI version: %s", datahub_package.nice_version_name()) @@ -73,7 +78,7 @@ def run(config: str, dry_run: bool, preview: bool) -> None: logger.info("Starting metadata ingestion") pipeline.run() logger.info("Finished metadata ingestion") - ret = pipeline.pretty_print_summary() + ret = pipeline.pretty_print_summary(warnings_as_failure=strict_warnings) sys.exit(ret) diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py b/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py index a32f15a812949..01bd0bf82cab2 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py @@ -1,3 +1,4 @@ +import json import logging from typing import Any, Callable, Dict, Generator, List, Optional, Union @@ -20,6 +21,7 @@ TimeTypeClass, UnionTypeClass, ) +from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass """A helper file for Avro schema -> MCE schema transformations""" @@ -235,6 +237,7 @@ def emit(self) -> Generator[SchemaField, None, None]: schema = self._schema actual_schema = self._actual_schema + if isinstance(schema, avro.schema.Field): # Field's schema is actually it's type. 
schema = schema.type @@ -259,8 +262,24 @@ def emit(self) -> Generator[SchemaField, None, None]: native_data_type = actual_schema.props.get( "native_data_type", native_data_type ) + + field_path = self._converter._get_cur_field_path() + merged_props = {} + merged_props.update(self._schema.other_props) + merged_props.update(schema.other_props) + + tags = None + if "deprecated" in merged_props: + description = ( + f"DEPRECATED: {self._schema.other_props['deprecated']}\n" + + description + ) + tags = GlobalTagsClass( + tags=[TagAssociationClass(tag="urn:li:tag:Deprecated")] + ) + field = SchemaField( - fieldPath=self._converter._get_cur_field_path(), + fieldPath=field_path, # Populate it with the simple native type for now. nativeDataType=native_data_type, type=self._converter._get_column_type( @@ -270,6 +289,8 @@ def emit(self) -> Generator[SchemaField, None, None]: recursive=False, nullable=self._converter._is_nullable(schema), isPartOfKey=self._converter._is_key_schema, + globalTags=tags, + jsonProps=json.dumps(merged_props) if merged_props else None, ) yield field diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py index ab9dfaba2b078..213cd09a965ef 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py @@ -189,7 +189,7 @@ def raise_from_status(self, raise_warnings: bool = False) -> None: "Source reported warnings", self.source.get_report() ) - def pretty_print_summary(self) -> int: + def pretty_print_summary(self, warnings_as_failure: bool = False) -> int: click.echo() click.secho(f"Source ({self.config.source.type}) report:", bold=True) click.echo(self.source.get_report().as_string()) @@ -201,7 +201,7 @@ def pretty_print_summary(self) -> int: return 1 elif self.source.get_report().warnings or self.sink.get_report().warnings: click.secho("Pipeline finished with warnings", fg="yellow", bold=True) - return 0 + 
return 1 if warnings_as_failure else 0 else: click.secho("Pipeline finished successfully", fg="green", bold=True) return 0 diff --git a/metadata-ingestion/tests/integration/hive/hive_mces_golden.json b/metadata-ingestion/tests/integration/hive/hive_mces_golden.json index 52f83f5941304..e7719534e7b00 100644 --- a/metadata-ingestion/tests/integration/hive/hive_mces_golden.json +++ b/metadata-ingestion/tests/integration/hive/hive_mces_golden.json @@ -10,7 +10,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Fri Oct 01 05:17:59 UTC 2021", + "CreateTime:": "Sun Dec 05 17:50:27 UTC 2021", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore", @@ -20,7 +20,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1633065479", + "Table Parameters: transient_lastDdlTime": "1638726627", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -75,7 +75,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": null }, { "fieldPath": "bar", @@ -91,7 +92,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": null } ], "primaryKeys": null, @@ -106,6 +108,8 @@ "systemMetadata": { "lastObserved": 1586847600000, "runId": "hive-test", + "registryName": null, + "registryVersion": null, "properties": null } }, @@ -120,7 +124,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Fri Oct 01 05:17:59 UTC 2021", + "CreateTime:": "Sun Dec 05 17:50:28 UTC 2021", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": 
"hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test", @@ -130,7 +134,7 @@ "Table Parameters: numRows": "1", "Table Parameters: rawDataSize": "32", "Table Parameters: totalSize": "33", - "Table Parameters: transient_lastDdlTime": "1633065482", + "Table Parameters: transient_lastDdlTime": "1638726632", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -185,7 +189,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": null }, { "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service", @@ -203,7 +208,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array>>\"}" }, { "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service.[type=string].type", @@ -219,7 +225,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service.[type=array].[type=int].provider", @@ -237,7 +244,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" } ], "primaryKeys": null, @@ -252,6 +260,8 @@ "systemMetadata": { "lastObserved": 1586847600000, "runId": "hive-test", + "registryName": null, + "registryVersion": null, "properties": null } }, @@ -266,7 +276,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Tue Oct 12 10:51:00 UTC 2021", + "CreateTime:": "Sun Dec 05 17:50:33 UTC 2021", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": 
"hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test", @@ -276,7 +286,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1634035860", + "Table Parameters: transient_lastDdlTime": "1638726633", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -331,7 +341,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": null }, { "fieldPath": "[version=2.0].[type=struct].[type=map].[type=string].recordid", @@ -350,7 +361,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"map\", \"key_type\": {\"type\": \"int\", \"native_data_type\": \"int\", \"_nullable\": true}, \"key_native_data_type\": \"int\"}" } ], "primaryKeys": null, @@ -365,6 +377,8 @@ "systemMetadata": { "lastObserved": 1586847600000, "runId": "hive-test", + "registryName": null, + "registryVersion": null, "properties": null } }, @@ -379,7 +393,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Mon Oct 11 12:54:49 UTC 2021", + "CreateTime:": "Sun Dec 05 17:50:33 UTC 2021", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test", @@ -389,7 +403,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1633956889", + "Table Parameters: transient_lastDdlTime": "1638726633", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": 
"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -444,7 +458,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": null }, { "fieldPath": "[version=2.0].[type=struct].[type=struct].service", @@ -460,7 +475,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"struct>\"}" }, { "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=string].type", @@ -476,7 +492,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=struct].provider", @@ -492,7 +509,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"struct\"}" }, { "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=struct].provider.[type=string].name", @@ -508,7 +526,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"varchar(50)\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=struct].provider.[type=int].id", @@ -524,7 +543,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"tinyint\", \"_nullable\": true}" } ], "primaryKeys": null, @@ -539,6 +559,8 @@ "systemMetadata": { "lastObserved": 1586847600000, "runId": "hive-test", + "registryName": null, + "registryVersion": null, "properties": null } }, @@ -553,7 +575,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Fri Oct 01 05:17:56 UTC 2021", + 
"CreateTime:": "Sun Dec 05 17:50:23 UTC 2021", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes", @@ -562,7 +584,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "5812", - "Table Parameters: transient_lastDdlTime": "1633065477", + "Table Parameters: transient_lastDdlTime": "1638726625", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -617,7 +639,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": null }, { "fieldPath": "bar", @@ -633,7 +656,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": null } ], "primaryKeys": null, @@ -648,6 +672,8 @@ "systemMetadata": { "lastObserved": 1586847600000, "runId": "hive-test", + "registryName": null, + "registryVersion": null, "properties": null } }, @@ -662,7 +688,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Fri Oct 01 05:17:59 UTC 2021", + "CreateTime:": "Sun Dec 05 17:50:28 UTC 2021", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test", @@ -672,7 +698,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1633065479", + "Table Parameters: transient_lastDdlTime": "1638726628", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -727,7 +753,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - 
"isPartOfKey": false + "isPartOfKey": false, + "jsonProps": null }, { "fieldPath": "[version=2.0].[type=struct].[type=struct].service", @@ -743,7 +770,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"struct>\"}" }, { "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=string].type", @@ -759,7 +787,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=array].[type=int].provider", @@ -777,7 +806,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" } ], "primaryKeys": null, @@ -792,6 +822,8 @@ "systemMetadata": { "lastObserved": 1586847600000, "runId": "hive-test", + "registryName": null, + "registryVersion": null, "properties": null } }, @@ -806,7 +838,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Tue Oct 12 10:29:17 UTC 2021", + "CreateTime:": "Sun Dec 05 17:50:33 UTC 2021", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test", @@ -816,7 +848,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1634034557", + "Table Parameters: transient_lastDdlTime": "1638726633", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -869,11 +901,12 @@ } } }, - "nativeDataType": "uniontype<, struct>>", + "nativeDataType": "union", "recursive": false, "globalTags": null, 
"glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": null }, { "fieldPath": "[version=2.0].[type=struct].[type=union].[type=int].foo", @@ -891,7 +924,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": null }, { "fieldPath": "[version=2.0].[type=struct].[type=union].[type=double].foo", @@ -909,7 +943,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": null }, { "fieldPath": "[version=2.0].[type=struct].[type=union].[type=array].[type=string].foo", @@ -923,11 +958,12 @@ } } }, - "nativeDataType": "array", + "nativeDataType": "string", "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": null }, { "fieldPath": "[version=2.0].[type=struct].[type=union].[type=struct0].foo", @@ -941,11 +977,12 @@ } } }, - "nativeDataType": "struct", + "nativeDataType": "struct0", "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": null }, { "fieldPath": "[version=2.0].[type=struct].[type=union].[type=struct0].foo.[type=int].a", @@ -961,7 +998,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=struct].[type=union].[type=struct0].foo.[type=string].b", @@ -977,7 +1015,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=struct].[type=union].[type=struct1].foo", @@ -991,11 +1030,12 @@ } } }, - "nativeDataType": "struct", + "nativeDataType": "struct1", "recursive": false, "globalTags": null, "glossaryTerms": null, - 
"isPartOfKey": false + "isPartOfKey": false, + "jsonProps": null }, { "fieldPath": "[version=2.0].[type=struct].[type=union].[type=struct1].foo.[type=int].c", @@ -1011,7 +1051,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, { "fieldPath": "[version=2.0].[type=struct].[type=union].[type=struct1].foo.[type=double].d", @@ -1027,7 +1068,8 @@ "recursive": false, "globalTags": null, "glossaryTerms": null, - "isPartOfKey": false + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" } ], "primaryKeys": null, @@ -1042,6 +1084,8 @@ "systemMetadata": { "lastObserved": 1586847600000, "runId": "hive-test", + "registryName": null, + "registryVersion": null, "properties": null } } diff --git a/metadata-ingestion/tests/integration/mysql/test_mysql.py b/metadata-ingestion/tests/integration/mysql/test_mysql.py index 2a548ee6c0abe..65b241b51a059 100644 --- a/metadata-ingestion/tests/integration/mysql/test_mysql.py +++ b/metadata-ingestion/tests/integration/mysql/test_mysql.py @@ -24,7 +24,9 @@ def test_mysql_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time): runner = CliRunner() with fs_helpers.isolated_filesystem(tmp_path): config_file = (test_resources_dir / "mysql_to_file.yml").resolve() - result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"]) + result = runner.invoke( + datahub, ["ingest", "--strict-warnings", "-c", f"{config_file}"] + ) assert_result_ok(result) # Verify the output. 
diff --git a/metadata-ingestion/tests/integration/trino/trino_hive_mces_golden.json b/metadata-ingestion/tests/integration/trino/trino_hive_mces_golden.json index aeaba2a9da8e9..45f0da917fbe7 100644 --- a/metadata-ingestion/tests/integration/trino/trino_hive_mces_golden.json +++ b/metadata-ingestion/tests/integration/trino/trino_hive_mces_golden.json @@ -1,1169 +1,1224 @@ [ - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "column_stats_accurate": "{\"BASIC_STATS\":\"true\"}", - "comment": "This table has array of structs", - "another.comment": "This table has no partitions", - "numfiles": "4", - "numrows": "1", - "rawdatasize": "32", - "totalsize": "138", - "transient_lastddltime": "1633434492" - }, - "externalUrl": null, - "description": "This table has array of structs", - "uri": null, - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "hivedb.db1.array_struct_test", - "platform": "urn:li:dataPlatform:trino", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "deleted": null, - "dataset": null, - "cluster": null, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "property_id", - "jsonPath": null, - "nullable": true, - "description": "id of property", +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test,PROD)", + "aspects": [ + { + 
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "column_stats_accurate": "{\"BASIC_STATS\":\"true\"}", + "another.comment": "This table has no partitions", + "comment": "This table has array of structs", + "numfiles": "1", + "numrows": "1", + "rawdatasize": "32", + "totalsize": "33", + "transient_lastddltime": "1638688532" + }, + "externalUrl": null, + "description": "This table has array of structs", + "uri": null, + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "hivedb.db1.array_struct_test", + "platform": "urn:li:dataPlatform:trino", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "deleted": null, + "dataset": null, + "cluster": null, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "property_id", + "jsonPath": null, + "nullable": true, + "description": "id of property", + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service", - "jsonPath": null, - "nullable": true, - "description": "service types and providers", + "nativeDataType": "INTEGER()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service", + "jsonPath": null, + "nullable": true, + "description": "service types and providers", + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.ArrayType": { - 
"nestedType": null - } + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": null } - }, - "nativeDataType": "ARRAY(ROW([('type', VARCHAR()), ('provider', ARRAY(INTEGER()))]))", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service.[type=string].type", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "ARRAY(ROW([('type', VARCHAR()), ('provider', ARRAY(INTEGER()))]))", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"ARRAY(ROW([('type', VARCHAR()), ('provider', ARRAY(INTEGER()))]))\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service.[type=string].type", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.StringType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service.[type=array].[type=int].provider", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "VARCHAR()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"VARCHAR()\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service.[type=array].[type=int].provider", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.ArrayType": { - "nestedType": null - } + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": null } - }, - "nativeDataType": "ARRAY(INTEGER())", - 
"recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false - } - ], - "primaryKeys": null, - "foreignKeysSpecs": null, - "foreignKeys": null - } + } + }, + "nativeDataType": "ARRAY(INTEGER())", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"ARRAY(INTEGER())\"}" + } + ], + "primaryKeys": null, + "foreignKeysSpecs": null, + "foreignKeys": null } - ] - } - }, - "proposedDelta": null, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-test", - "properties": null + } + ] } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.map_test,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "column_stats_accurate": "{\"BASIC_STATS\":\"true\"}", - "numfiles": "0", - "numrows": "0", - "rawdatasize": "0", - "totalsize": "0", - "transient_lastddltime": "1634127353" - }, - "externalUrl": null, - "description": null, - "uri": null, - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "hivedb.db1.map_test", - "platform": "urn:li:dataPlatform:trino", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "deleted": null, - "dataset": null, - "cluster": null, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "keyvalue", - "jsonPath": null, - "nullable": true, - "description": null, + "proposedDelta": null, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-test", + "registryName": null, + "registryVersion": null, + "properties": null + } 
+}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.map_test,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "column_stats_accurate": "{\"BASIC_STATS\":\"true\"}", + "numfiles": "0", + "numrows": "0", + "rawdatasize": "0", + "totalsize": "0", + "transient_lastddltime": "1638688536" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "hivedb.db1.map_test", + "platform": "urn:li:dataPlatform:trino", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "deleted": null, + "dataset": null, + "cluster": null, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "keyvalue", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.StringType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=map].[type=string].recordid", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "VARCHAR()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=map].[type=string].recordid", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.MapType": 
{ - "keyType": null, - "valueType": null - } + "com.linkedin.pegasus2avro.schema.MapType": { + "keyType": null, + "valueType": null } - }, - "nativeDataType": "MAP(INTEGER(), VARCHAR())", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false - } - ], - "primaryKeys": null, - "foreignKeysSpecs": null, - "foreignKeys": null - } + } + }, + "nativeDataType": "MAP(INTEGER(), VARCHAR())", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"MAP(INTEGER(), VARCHAR())\", \"key_type\": {\"type\": \"int\", \"native_data_type\": \"INTEGER()\", \"_nullable\": true}, \"key_native_data_type\": \"INTEGER()\"}" + } + ], + "primaryKeys": null, + "foreignKeysSpecs": null, + "foreignKeys": null } - ] - } - }, - "proposedDelta": null, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-test", - "properties": null + } + ] } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.nested_struct_test,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "column_stats_accurate": "{\"BASIC_STATS\":\"true\"}", - "numfiles": "0", - "numrows": "0", - "rawdatasize": "0", - "totalsize": "0", - "transient_lastddltime": "1634127353" - }, - "externalUrl": null, - "description": null, - "uri": null, - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "hivedb.db1.nested_struct_test", - "platform": "urn:li:dataPlatform:trino", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "deleted": null, - "dataset": null, - "cluster": null, - "hash": "", - "platformSchema": { - 
"com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "property_id", - "jsonPath": null, - "nullable": true, - "description": null, + "proposedDelta": null, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-test", + "registryName": null, + "registryVersion": null, + "properties": null + } +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.nested_struct_test,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "column_stats_accurate": "{\"BASIC_STATS\":\"true\"}", + "numfiles": "0", + "numrows": "0", + "rawdatasize": "0", + "totalsize": "0", + "transient_lastddltime": "1638688535" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "hivedb.db1.nested_struct_test", + "platform": "urn:li:dataPlatform:trino", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "deleted": null, + "dataset": null, + "cluster": null, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "property_id", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=struct].service", - "jsonPath": null, - "nullable": true, - "description": 
null, + "nativeDataType": "INTEGER()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.RecordType": {} - } - }, - "nativeDataType": "ROW([('type', VARCHAR()), ('provider', ROW([('name', VARCHAR(length=50)), ('id', SMALLINT())]))])", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.RecordType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=string].type", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "ROW([('type', VARCHAR()), ('provider', ROW([('name', VARCHAR(length=50)), ('id', SMALLINT())]))])", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"ROW([('type', VARCHAR()), ('provider', ROW([('name', VARCHAR(length=50)), ('id', SMALLINT())]))])\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=string].type", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.StringType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=struct].provider", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "VARCHAR()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"VARCHAR()\", \"_nullable\": true}" + }, + { + "fieldPath": 
"[version=2.0].[type=struct].[type=struct].service.[type=struct].provider", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.RecordType": {} - } - }, - "nativeDataType": "ROW([('name', VARCHAR(length=50)), ('id', SMALLINT())])", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.RecordType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=struct].provider.[type=string].name", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "ROW([('name', VARCHAR(length=50)), ('id', SMALLINT())])", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"ROW([('name', VARCHAR(length=50)), ('id', SMALLINT())])\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=struct].provider.[type=string].name", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=50)", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.StringType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=struct].provider.[type=int].id", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "VARCHAR(length=50)", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"VARCHAR(length=50)\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=struct].provider.[type=int].id", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - 
"com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "SMALLINT()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false - } - ], - "primaryKeys": null, - "foreignKeysSpecs": null, - "foreignKeys": null - } + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "SMALLINT()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"SMALLINT()\", \"_nullable\": true}" + } + ], + "primaryKeys": null, + "foreignKeysSpecs": null, + "foreignKeys": null } - ] - } - }, - "proposedDelta": null, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-test", - "properties": null + } + ] } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.pokes,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "transient_lastddltime": "1633435441" - }, - "externalUrl": null, - "description": null, - "uri": null, - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "hivedb.db1.pokes", - "platform": "urn:li:dataPlatform:trino", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "deleted": null, - "dataset": null, - "cluster": null, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "foo", - "jsonPath": null, - "nullable": true, - "description": null, + "proposedDelta": null, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-test", + "registryName": null, + "registryVersion": null, + "properties": 
null + } +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.pokes,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "transient_lastddltime": "1638688524" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "hivedb.db1.pokes", + "platform": "urn:li:dataPlatform:trino", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "deleted": null, + "dataset": null, + "cluster": null, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "foo", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "bar", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "INTEGER()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "bar", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.StringType": {} + } }, - { - "fieldPath": "baz", - "jsonPath": null, - 
"nullable": true, - "description": null, + "nativeDataType": "VARCHAR()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "baz", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false - } - ], - "primaryKeys": null, - "foreignKeysSpecs": null, - "foreignKeys": null - } + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + } + ], + "primaryKeys": null, + "foreignKeysSpecs": null, + "foreignKeys": null } - ] - } - }, - "proposedDelta": null, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-test", - "properties": null + } + ] } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.struct_test,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "column_stats_accurate": "{\"BASIC_STATS\":\"true\"}", - "numfiles": "0", - "numrows": "0", - "rawdatasize": "0", - "totalsize": "0", - "transient_lastddltime": "1633434486" - }, - "externalUrl": null, - "description": null, - "uri": null, - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "hivedb.db1.struct_test", - "platform": "urn:li:dataPlatform:trino", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "deleted": null, - "dataset": null, - 
"cluster": null, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "property_id", - "jsonPath": null, - "nullable": true, - "description": null, + "proposedDelta": null, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-test", + "registryName": null, + "registryVersion": null, + "properties": null + } +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.struct_test,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "column_stats_accurate": "{\"BASIC_STATS\":\"true\"}", + "numfiles": "0", + "numrows": "0", + "rawdatasize": "0", + "totalsize": "0", + "transient_lastddltime": "1638688528" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "hivedb.db1.struct_test", + "platform": "urn:li:dataPlatform:trino", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "deleted": null, + "dataset": null, + "cluster": null, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "property_id", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=struct].service", - "jsonPath": 
null, - "nullable": true, - "description": null, + "nativeDataType": "INTEGER()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.RecordType": {} - } - }, - "nativeDataType": "ROW([('type', VARCHAR()), ('provider', ARRAY(INTEGER()))])", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.RecordType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=string].type", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "ROW([('type', VARCHAR()), ('provider', ARRAY(INTEGER()))])", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"ROW([('type', VARCHAR()), ('provider', ARRAY(INTEGER()))])\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=string].type", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.StringType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=array].[type=int].provider", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "VARCHAR()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"VARCHAR()\", \"_nullable\": true}" + }, + { + "fieldPath": 
"[version=2.0].[type=struct].[type=struct].service.[type=array].[type=int].provider", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.ArrayType": { - "nestedType": null - } + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": null } - }, - "nativeDataType": "ARRAY(INTEGER())", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false - } - ], - "primaryKeys": null, - "foreignKeysSpecs": null, - "foreignKeys": null - } + } + }, + "nativeDataType": "ARRAY(INTEGER())", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"ARRAY(INTEGER())\"}" + } + ], + "primaryKeys": null, + "foreignKeysSpecs": null, + "foreignKeys": null } - ] - } - }, - "proposedDelta": null, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-test", - "properties": null + } + ] } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.struct_test_view_materialized,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "numfiles": "0", - "totalsize": "0", - "transient_lastddltime": "1633434491" - }, - "externalUrl": null, - "description": null, - "uri": null, - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "hivedb.db1.struct_test_view_materialized", - "platform": "urn:li:dataPlatform:trino", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "deleted": null, - "dataset": null, - "cluster": null, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { 
- "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "property_id", - "jsonPath": null, - "nullable": true, - "description": null, + "proposedDelta": null, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-test", + "registryName": null, + "registryVersion": null, + "properties": null + } +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.struct_test_view_materialized,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "numfiles": "0", + "totalsize": "0", + "transient_lastddltime": "1638688535" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "hivedb.db1.struct_test_view_materialized", + "platform": "urn:li:dataPlatform:trino", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "deleted": null, + "dataset": null, + "cluster": null, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "property_id", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=struct].service", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "INTEGER()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + 
"isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.RecordType": {} - } - }, - "nativeDataType": "ROW([('type', VARCHAR()), ('provider', ARRAY(INTEGER()))])", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.RecordType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=string].type", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "ROW([('type', VARCHAR()), ('provider', ARRAY(INTEGER()))])", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"ROW([('type', VARCHAR()), ('provider', ARRAY(INTEGER()))])\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=string].type", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.StringType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=array].[type=int].provider", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "VARCHAR()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"VARCHAR()\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=array].[type=int].provider", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - 
"com.linkedin.pegasus2avro.schema.ArrayType": { - "nestedType": null - } + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": null } - }, - "nativeDataType": "ARRAY(INTEGER())", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false - } - ], - "primaryKeys": null, - "foreignKeysSpecs": null, - "foreignKeys": null - } + } + }, + "nativeDataType": "ARRAY(INTEGER())", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"ARRAY(INTEGER())\"}" + } + ], + "primaryKeys": null, + "foreignKeysSpecs": null, + "foreignKeys": null } - ] - } - }, - "proposedDelta": null, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-test", - "properties": null + } + ] } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1._test_table_underscore,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "column_stats_accurate": "{\"BASIC_STATS\":\"true\"}", - "numfiles": "0", - "numrows": "0", - "rawdatasize": "0", - "totalsize": "0", - "transient_lastddltime": "1633434486" - }, - "externalUrl": null, - "description": null, - "uri": null, - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "hivedb.db1._test_table_underscore", - "platform": "urn:li:dataPlatform:trino", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "deleted": null, - "dataset": null, - "cluster": null, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "foo", - "jsonPath": null, - 
"nullable": true, - "description": null, + "proposedDelta": null, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-test", + "registryName": null, + "registryVersion": null, + "properties": null + } +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1._test_table_underscore,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "column_stats_accurate": "{\"BASIC_STATS\":\"true\"}", + "numfiles": "0", + "numrows": "0", + "rawdatasize": "0", + "totalsize": "0", + "transient_lastddltime": "1638688527" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "hivedb.db1._test_table_underscore", + "platform": "urn:li:dataPlatform:trino", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "deleted": null, + "dataset": null, + "cluster": null, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "foo", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "bar", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "INTEGER()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "bar", + 
"jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false - } - ], - "primaryKeys": null, - "foreignKeysSpecs": null, - "foreignKeys": null - } + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + } + ], + "primaryKeys": null, + "foreignKeysSpecs": null, + "foreignKeys": null } - ] - } - }, - "proposedDelta": null, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-test", - "properties": null + } + ] } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.union_test,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "column_stats_accurate": "{\"BASIC_STATS\":\"true\"}", - "numfiles": "0", - "numrows": "0", - "rawdatasize": "0", - "totalsize": "0", - "transient_lastddltime": "1634127353" - }, - "externalUrl": null, - "description": null, - "uri": null, - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "hivedb.db1.union_test", - "platform": "urn:li:dataPlatform:trino", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "deleted": null, - "dataset": null, - "cluster": null, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "[version=2.0].[type=struct].[type=struct].foo", - "jsonPath": 
null, - "nullable": true, - "description": null, + "proposedDelta": null, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-test", + "registryName": null, + "registryVersion": null, + "properties": null + } +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.union_test,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "column_stats_accurate": "{\"BASIC_STATS\":\"true\"}", + "numfiles": "0", + "numrows": "0", + "rawdatasize": "0", + "totalsize": "0", + "transient_lastddltime": "1638688536" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "hivedb.db1.union_test", + "platform": "urn:li:dataPlatform:trino", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "deleted": null, + "dataset": null, + "cluster": null, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].foo", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.RecordType": {} - } - }, - "nativeDataType": "ROW([('tag', SMALLINT()), ('field0', INTEGER()), ('field1', DOUBLE()), ('field2', ARRAY(VARCHAR())), ('field3', ROW([('a', INTEGER()), ('b', VARCHAR())]))])", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.RecordType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=struct].foo.[type=int].tag", - "jsonPath": 
null, - "nullable": true, - "description": null, + "nativeDataType": "ROW([('tag', SMALLINT()), ('field0', INTEGER()), ('field1', DOUBLE()), ('field2', ARRAY(VARCHAR())), ('field3', ROW([('a', INTEGER()), ('b', VARCHAR())]))])", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"ROW([('tag', SMALLINT()), ('field0', INTEGER()), ('field1', DOUBLE()), ('field2', ARRAY(VARCHAR())), ('field3', ROW([('a', INTEGER()), ('b', VARCHAR())]))])\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].foo.[type=int].tag", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "SMALLINT()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=struct].foo.[type=int].field0", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "SMALLINT()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"SMALLINT()\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].foo.[type=int].field0", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=struct].foo.[type=double].field1", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "INTEGER()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + 
"isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"INTEGER()\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].foo.[type=double].field1", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "DOUBLE()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=struct].foo.[type=array].[type=string].field2", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "DOUBLE()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"DOUBLE()\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].foo.[type=array].[type=string].field2", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.ArrayType": { - "nestedType": null - } + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": null } - }, - "nativeDataType": "ARRAY(VARCHAR())", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=struct].foo.[type=struct].field3", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "ARRAY(VARCHAR())", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"ARRAY(VARCHAR())\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].foo.[type=struct].field3", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.RecordType": {} - } - }, - 
"nativeDataType": "ROW([('a', INTEGER()), ('b', VARCHAR())])", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.RecordType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=struct].foo.[type=struct].field3.[type=int].a", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "ROW([('a', INTEGER()), ('b', VARCHAR())])", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"ROW([('a', INTEGER()), ('b', VARCHAR())])\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].foo.[type=struct].field3.[type=int].a", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=struct].foo.[type=struct].field3.[type=string].b", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "INTEGER()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"INTEGER()\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].foo.[type=struct].field3.[type=string].b", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false - } - ], - "primaryKeys": null, - "foreignKeysSpecs": null, - "foreignKeys": null - } + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + 
"nativeDataType": "VARCHAR()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"VARCHAR()\", \"_nullable\": true}" + } + ], + "primaryKeys": null, + "foreignKeysSpecs": null, + "foreignKeys": null } - ] - } - }, - "proposedDelta": null, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-test", - "properties": null + } + ] } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test_view,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "transient_lastddltime": "1633434492", - "view_definition": "SELECT \"property_id\", \"service\"\nFROM \"db1\".\"array_struct_test\"", - "is_view": "True" - }, - "externalUrl": null, - "description": null, - "uri": null, - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "hivedb.db1.array_struct_test_view", - "platform": "urn:li:dataPlatform:trino", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "deleted": null, - "dataset": null, - "cluster": null, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "property_id", - "jsonPath": null, - "nullable": true, - "description": null, + "proposedDelta": null, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-test", + "registryName": null, + "registryVersion": null, + "properties": null + } +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": 
"urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test_view,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "transient_lastddltime": "1638688535", + "view_definition": "SELECT \"property_id\", \"service\"\nFROM \"db1\".\"array_struct_test\"", + "is_view": "True" + }, + "externalUrl": null, + "description": null, + "uri": null, + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "hivedb.db1.array_struct_test_view", + "platform": "urn:li:dataPlatform:trino", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "deleted": null, + "dataset": null, + "cluster": null, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "property_id", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "INTEGER()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.ArrayType": { - "nestedType": null - } + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": null } - }, - 
"nativeDataType": "ARRAY(ROW([('type', VARCHAR()), ('provider', ARRAY(INTEGER()))]))", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service.[type=string].type", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "ARRAY(ROW([('type', VARCHAR()), ('provider', ARRAY(INTEGER()))]))", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"ARRAY(ROW([('type', VARCHAR()), ('provider', ARRAY(INTEGER()))]))\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service.[type=string].type", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.StringType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service.[type=array].[type=int].provider", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "VARCHAR()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"VARCHAR()\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service.[type=array].[type=int].provider", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.ArrayType": { - "nestedType": null - } + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": null } - }, - "nativeDataType": "ARRAY(INTEGER())", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false - } - ], - 
"primaryKeys": null, - "foreignKeysSpecs": null, - "foreignKeys": null - } + } + }, + "nativeDataType": "ARRAY(INTEGER())", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"ARRAY(INTEGER())\"}" + } + ], + "primaryKeys": null, + "foreignKeysSpecs": null, + "foreignKeys": null } - ] - } - }, - "proposedDelta": null, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-hive-test", - "properties": null + } + ] } + }, + "proposedDelta": null, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-test", + "registryName": null, + "registryVersion": null, + "properties": null } +} ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/trino/trino_mces_golden.json b/metadata-ingestion/tests/integration/trino/trino_mces_golden.json index 9de96147ac5c4..2f5130fb996ce 100644 --- a/metadata-ingestion/tests/integration/trino/trino_mces_golden.json +++ b/metadata-ingestion/tests/integration/trino/trino_mces_golden.json @@ -1,532 +1,558 @@ [ - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.book,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "library_catalog.librarydb.book", - "platform": "urn:li:dataPlatform:trino", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "deleted": null, - "dataset": null, - "cluster": null, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "id", - "jsonPath": null, - "nullable": true, - "description": null, +{ + "auditHeader": null, + "proposedSnapshot": 
{ + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.book,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "library_catalog.librarydb.book", + "platform": "urn:li:dataPlatform:trino", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "deleted": null, + "dataset": null, + "cluster": null, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "id", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "name", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "INTEGER()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "name", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=50)", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.StringType": {} + } }, - { - "fieldPath": "author", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "VARCHAR(length=50)", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "author", + 
"jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=50)", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.StringType": {} + } }, - { - "fieldPath": "publisher", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "VARCHAR(length=50)", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "publisher", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=50)", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.StringType": {} + } }, - { - "fieldPath": "tags", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "VARCHAR(length=50)", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "tags", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.RecordType": {} - } - }, - "nativeDataType": "JSON()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.RecordType": {} + } }, - { - "fieldPath": "[version=2.0].[type=struct].[type=array].[type=int].genre_ids", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "JSON()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=int].genre_ids", + "jsonPath": 
null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.ArrayType": { - "nestedType" : null - } + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": null } - }, - "nativeDataType": "ARRAY(INTEGER())", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false - } - ], - "primaryKeys": null, - "foreignKeysSpecs": null, - "foreignKeys": null - } + } + }, + "nativeDataType": "ARRAY(INTEGER())", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"ARRAY(INTEGER())\"}" + } + ], + "primaryKeys": null, + "foreignKeysSpecs": null, + "foreignKeys": null } - ] - } - }, - "proposedDelta": null, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-test", - "properties": null + } + ] } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.issue_history,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "library_catalog.librarydb.issue_history", - "platform": "urn:li:dataPlatform:trino", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "deleted": null, - "dataset": null, - "cluster": null, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "book_id", - "jsonPath": null, - "nullable": true, - "description": null, + "proposedDelta": null, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-test", + "registryName": null, + "registryVersion": null, + "properties": null + } +}, +{ + "auditHeader": null, + 
"proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.issue_history,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "library_catalog.librarydb.issue_history", + "platform": "urn:li:dataPlatform:trino", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "deleted": null, + "dataset": null, + "cluster": null, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "book_id", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "member_id", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "INTEGER()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "member_id", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "issue_date", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "INTEGER()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + 
{ + "fieldPath": "issue_date", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.DateType": {} - } - }, - "nativeDataType": "DATE()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.DateType": {} + } }, - { - "fieldPath": "return_date", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "DATE()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "return_date", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.DateType": {} - } - }, - "nativeDataType": "DATE()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false - } - ], - "primaryKeys": null, - "foreignKeysSpecs": null, - "foreignKeys": null - } + "com.linkedin.pegasus2avro.schema.DateType": {} + } + }, + "nativeDataType": "DATE()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + } + ], + "primaryKeys": null, + "foreignKeysSpecs": null, + "foreignKeys": null } - ] - } - }, - "proposedDelta": null, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-test", - "properties": null + } + ] } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.member,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "library_catalog.librarydb.member", - "platform": "urn:li:dataPlatform:trino", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "lastModified": { - "time": 0, - "actor": 
"urn:li:corpuser:unknown", - "impersonator": null - }, - "deleted": null, - "dataset": null, - "cluster": null, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "id", - "jsonPath": null, - "nullable": true, - "description": null, + "proposedDelta": null, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-test", + "registryName": null, + "registryVersion": null, + "properties": null + } +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.member,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "library_catalog.librarydb.member", + "platform": "urn:li:dataPlatform:trino", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "deleted": null, + "dataset": null, + "cluster": null, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "id", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "name", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "INTEGER()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "name", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { 
"type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=50)", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false - } - ], - "primaryKeys": null, - "foreignKeysSpecs": null, - "foreignKeys": null - } + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=50)", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + } + ], + "primaryKeys": null, + "foreignKeysSpecs": null, + "foreignKeys": null } - ] - } - }, - "proposedDelta": null, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-test", - "properties": null + } + ] } }, - { - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.book_in_circulation,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "library_catalog.librarydb.book_in_circulation", - "platform": "urn:li:dataPlatform:trino", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "deleted": null, - "dataset": null, - "cluster": null, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "id", - "jsonPath": null, - "nullable": true, - "description": null, + "proposedDelta": null, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-test", + "registryName": null, + "registryVersion": null, + "properties": null + } +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": 
"urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.book_in_circulation,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "library_catalog.librarydb.book_in_circulation", + "platform": "urn:li:dataPlatform:trino", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "deleted": null, + "dataset": null, + "cluster": null, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "id", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "name", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "INTEGER()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "name", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=50)", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.StringType": {} + } }, - { - "fieldPath": "author", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "VARCHAR(length=50)", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "author", + "jsonPath": null, + "nullable": true, + "description": 
null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=50)", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.StringType": {} + } }, - { - "fieldPath": "publisher", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "VARCHAR(length=50)", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "publisher", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=50)", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.StringType": {} + } }, - { - "fieldPath": "member_id", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "VARCHAR(length=50)", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "member_id", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false + "com.linkedin.pegasus2avro.schema.NumberType": {} + } }, - { - "fieldPath": "issue_date", - "jsonPath": null, - "nullable": true, - "description": null, + "nativeDataType": "INTEGER()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "issue_date", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { "type": { - "type": { - "com.linkedin.pegasus2avro.schema.DateType": {} - } - }, - 
"nativeDataType": "DATE()", - "recursive": false, - "globalTags": null, - "glossaryTerms": null, - "isPartOfKey": false - } - ], - "primaryKeys": null, - "foreignKeysSpecs": null, - "foreignKeys": null - } + "com.linkedin.pegasus2avro.schema.DateType": {} + } + }, + "nativeDataType": "DATE()", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + } + ], + "primaryKeys": null, + "foreignKeysSpecs": null, + "foreignKeys": null } - ] - } - }, - "proposedDelta": null, - "systemMetadata": { - "lastObserved": 1632398400000, - "runId": "trino-test", - "properties": null + } + ] } }, - { - "auditHeader": null, - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.book,PROD)", - "entityKeyAspect": null, - "changeType": "UPSERT", - "aspectName": "datasetProfile", - "aspect": { - "value": "{\"timestampMillis\": 1632398400000, \"rowCount\": 3, \"columnCount\": 6, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 3, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\", \"3\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 3, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Book 1\", \"Book 2\", \"Book 3\"]}, {\"fieldPath\": \"author\", \"uniqueCount\": 3, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"ABC\", \"PQR\", \"XYZ\"]}, {\"fieldPath\": \"publisher\", \"uniqueCount\": 0, \"nullCount\": 3, \"nullProportion\": 1.0, \"sampleValues\": []}, {\"fieldPath\": \"tags\", \"nullCount\": 3, \"nullProportion\": 1.0, \"sampleValues\": []}, {\"fieldPath\": \"genre_ids\", \"uniqueCount\": 0, \"nullCount\": 3, \"nullProportion\": 1.0, \"sampleValues\": []}]}", - "contentType": "application/json" - }, - "systemMetadata": null + "proposedDelta": null, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-test", + 
"registryName": null, + "registryVersion": null, + "properties": null + } +}, +{ + "auditHeader": null, + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.book,PROD)", + "entityKeyAspect": null, + "changeType": "UPSERT", + "aspectName": "datasetProfile", + "aspect": { + "value": "{\"timestampMillis\": 1632398400000, \"rowCount\": 3, \"columnCount\": 6, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 3, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\", \"3\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 3, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Book 1\", \"Book 2\", \"Book 3\"]}, {\"fieldPath\": \"author\", \"uniqueCount\": 3, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"ABC\", \"PQR\", \"XYZ\"]}, {\"fieldPath\": \"publisher\", \"uniqueCount\": 0, \"nullCount\": 3, \"nullProportion\": 1.0, \"sampleValues\": []}, {\"fieldPath\": \"tags\", \"nullCount\": 3, \"nullProportion\": 1.0, \"sampleValues\": []}, {\"fieldPath\": \"genre_ids\", \"uniqueCount\": 0, \"nullCount\": 3, \"nullProportion\": 1.0, \"sampleValues\": []}]}", + "contentType": "application/json" }, - { - "auditHeader": null, - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.issue_history,PROD)", - "entityKeyAspect": null, - "changeType": "UPSERT", - "aspectName": "datasetProfile", - "aspect": { - "value": "{\"timestampMillis\": 1632398400000, \"rowCount\": 2, \"columnCount\": 4, \"fieldProfiles\": [{\"fieldPath\": \"book_id\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": \"member_id\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": 
\"issue_date\", \"uniqueCount\": 1, \"uniqueProportion\": 0.5, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"sampleValues\": [\"2021-09-27\", \"2021-09-27\"]}, {\"fieldPath\": \"return_date\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 1, \"nullProportion\": 0.5, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"sampleValues\": [\"2021-09-27\"]}]}", - "contentType": "application/json" - }, - "systemMetadata": null + "systemMetadata": null +}, +{ + "auditHeader": null, + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.issue_history,PROD)", + "entityKeyAspect": null, + "changeType": "UPSERT", + "aspectName": "datasetProfile", + "aspect": { + "value": "{\"timestampMillis\": 1632398400000, \"rowCount\": 2, \"columnCount\": 4, \"fieldProfiles\": [{\"fieldPath\": \"book_id\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": \"member_id\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": \"issue_date\", \"uniqueCount\": 1, \"uniqueProportion\": 0.5, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"sampleValues\": [\"2021-09-27\", \"2021-09-27\"]}, {\"fieldPath\": \"return_date\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 1, \"nullProportion\": 0.5, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"sampleValues\": [\"2021-09-27\"]}]}", + "contentType": "application/json" }, - { - "auditHeader": null, - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.member,PROD)", - "entityKeyAspect": null, - "changeType": "UPSERT", - "aspectName": "datasetProfile", - "aspect": { - "value": "{\"timestampMillis\": 1632398400000, \"rowCount\": 2, 
\"columnCount\": 2, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Member 1\", \"Member 2\"]}]}", - "contentType": "application/json" - }, - "systemMetadata": null + "systemMetadata": null +}, +{ + "auditHeader": null, + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.member,PROD)", + "entityKeyAspect": null, + "changeType": "UPSERT", + "aspectName": "datasetProfile", + "aspect": { + "value": "{\"timestampMillis\": 1632398400000, \"rowCount\": 2, \"columnCount\": 2, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 2, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Member 1\", \"Member 2\"]}]}", + "contentType": "application/json" }, - { - "auditHeader": null, - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.book_in_circulation,PROD)", - "entityKeyAspect": null, - "changeType": "UPSERT", - "aspectName": "datasetProfile", - "aspect": { - "value": "{\"timestampMillis\": 1632398400000, \"rowCount\": 1, \"columnCount\": 6, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"2\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Book 2\"]}, {\"fieldPath\": \"author\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"PQR\"]}, {\"fieldPath\": 
\"publisher\", \"uniqueCount\": 0, \"nullCount\": 1, \"nullProportion\": 1.0, \"sampleValues\": []}, {\"fieldPath\": \"member_id\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"2\"]}, {\"fieldPath\": \"issue_date\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"sampleValues\": [\"2021-09-27\"]}]}", - "contentType": "application/json" - }, - "systemMetadata": null - } + "systemMetadata": null +}, +{ + "auditHeader": null, + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,library_catalog.librarydb.book_in_circulation,PROD)", + "entityKeyAspect": null, + "changeType": "UPSERT", + "aspectName": "datasetProfile", + "aspect": { + "value": "{\"timestampMillis\": 1632398400000, \"rowCount\": 1, \"columnCount\": 6, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"2\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Book 2\"]}, {\"fieldPath\": \"author\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"PQR\"]}, {\"fieldPath\": \"publisher\", \"uniqueCount\": 0, \"nullCount\": 1, \"nullProportion\": 1.0, \"sampleValues\": []}, {\"fieldPath\": \"member_id\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"2\"]}, {\"fieldPath\": \"issue_date\", \"uniqueCount\": 1, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"sampleValues\": [\"2021-09-27\"]}]}", + "contentType": "application/json" + }, + "systemMetadata": null +} ] \ No newline at end of file diff --git 
a/metadata-io/src/main/java/com/linkedin/metadata/datahubusage/DataHubUsageEventType.java b/metadata-io/src/main/java/com/linkedin/metadata/datahubusage/DataHubUsageEventType.java index c4beceaf24a35..37ea2bf29bac3 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/datahubusage/DataHubUsageEventType.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/datahubusage/DataHubUsageEventType.java @@ -14,7 +14,9 @@ public enum DataHubUsageEventType { BROWSE_RESULT_CLICK_EVENT("BrowseResultClickEvent"), ENTITY_VIEW_EVENT("EntityViewEvent"), ENTITY_SECTION_VIEW_EVENT("EntitySectionViewEvent"), - ENTITY_ACTION_EVENT("EntityActionEvent"); + ENTITY_ACTION_EVENT("EntityActionEvent"), + RECOMMENDATION_IMPRESSION_EVENT("RecommendationImpressionEvent"), + RECOMMENDATION_CLICK_EVENT("RecommendationClickEvent"); private final String type; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphWriteDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphWriteDAO.java index 765467f2da629..607c3425026b3 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphWriteDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphWriteDAO.java @@ -2,50 +2,33 @@ import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.query.filter.RelationshipFilter; -import com.linkedin.metadata.search.elasticsearch.update.BulkListener; - import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import java.io.IOException; import java.util.List; import javax.annotation.Nonnull; import javax.annotation.Nullable; +import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.bulk.BackoffPolicy; import org.elasticsearch.action.bulk.BulkProcessor; import org.elasticsearch.action.index.IndexRequest; import org.elasticsearch.action.update.UpdateRequest; import org.elasticsearch.client.RequestOptions; import 
org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.common.xcontent.XContentType; import org.elasticsearch.index.query.BoolQueryBuilder; import org.elasticsearch.index.reindex.BulkByScrollResponse; import org.elasticsearch.index.reindex.DeleteByQueryRequest; -import static com.linkedin.metadata.graph.elastic.ESGraphQueryDAO.*; -import static com.linkedin.metadata.graph.elastic.ElasticSearchGraphService.*; +import static com.linkedin.metadata.graph.elastic.ESGraphQueryDAO.buildQuery; +import static com.linkedin.metadata.graph.elastic.ElasticSearchGraphService.INDEX_NAME; @Slf4j +@RequiredArgsConstructor public class ESGraphWriteDAO { - private final BulkProcessor bulkProcessor; - private final IndexConvention indexConvention; private final RestHighLevelClient client; - - public ESGraphWriteDAO(RestHighLevelClient searchClient, IndexConvention indexConvention, int bulkRequestsLimit, int bulkFlushPeriod, int numRetries, - long retryInterval) { - this.client = searchClient; - this.indexConvention = indexConvention; - this.bulkProcessor = BulkProcessor.builder( - (request, bulkListener) -> { - searchClient.bulkAsync(request, RequestOptions.DEFAULT, bulkListener); - }, - BulkListener.getInstance()) - .setBulkActions(bulkRequestsLimit) - .setFlushInterval(TimeValue.timeValueSeconds(bulkFlushPeriod)) - .setBackoffPolicy(BackoffPolicy.constantBackoff(TimeValue.timeValueSeconds(retryInterval), numRetries)) - .build(); - } + private final IndexConvention indexConvention; + private final BulkProcessor bulkProcessor; /** * Updates or inserts the given search document. 
@@ -54,28 +37,21 @@ public ESGraphWriteDAO(RestHighLevelClient searchClient, IndexConvention indexCo * @param docId the ID of the document */ public void upsertDocument(@Nonnull String docId, @Nonnull String document) { - final IndexRequest indexRequest = new IndexRequest(indexConvention.getIndexName(INDEX_NAME)).id(docId).source(document, XContentType.JSON); - final UpdateRequest updateRequest = new UpdateRequest(indexConvention.getIndexName(INDEX_NAME), docId).doc(document, XContentType.JSON) - .detectNoop(false) - .upsert(indexRequest); + final IndexRequest indexRequest = + new IndexRequest(indexConvention.getIndexName(INDEX_NAME)).id(docId).source(document, XContentType.JSON); + final UpdateRequest updateRequest = + new UpdateRequest(indexConvention.getIndexName(INDEX_NAME), docId).doc(document, XContentType.JSON) + .detectNoop(false) + .upsert(indexRequest); bulkProcessor.add(updateRequest); } - public BulkByScrollResponse deleteByQuery( - @Nullable final String sourceType, - @Nonnull final Filter sourceEntityFilter, - @Nullable final String destinationType, - @Nonnull final Filter destinationEntityFilter, - @Nonnull final List relationshipTypes, - @Nonnull final RelationshipFilter relationshipFilter) { - BoolQueryBuilder finalQuery = buildQuery( - sourceType, - sourceEntityFilter, - destinationType, - destinationEntityFilter, - relationshipTypes, - relationshipFilter - ); + public BulkByScrollResponse deleteByQuery(@Nullable final String sourceType, @Nonnull final Filter sourceEntityFilter, + @Nullable final String destinationType, @Nonnull final Filter destinationEntityFilter, + @Nonnull final List relationshipTypes, @Nonnull final RelationshipFilter relationshipFilter) { + BoolQueryBuilder finalQuery = + buildQuery(sourceType, sourceEntityFilter, destinationType, destinationEntityFilter, relationshipTypes, + relationshipFilter); DeleteByQueryRequest deleteByQueryRequest = new DeleteByQueryRequest(); diff --git 
a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java index d38a243a403c6..d2b3a1a260230 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java @@ -16,7 +16,7 @@ import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.query.filter.RelationshipDirection; import com.linkedin.metadata.query.filter.RelationshipFilter; -import com.linkedin.metadata.search.elasticsearch.indexbuilder.IndexBuilder; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import java.io.IOException; import java.nio.charset.StandardCharsets; @@ -51,6 +51,7 @@ public class ElasticSearchGraphService implements GraphService { private final IndexConvention _indexConvention; private final ESGraphWriteDAO _graphWriteDAO; private final ESGraphQueryDAO _graphReadDAO; + private final ESIndexBuilder _indexBuilder; private static final String DOC_DELIMETER = "--"; public static final String INDEX_NAME = "graph_service_v1"; @@ -206,8 +207,8 @@ public void removeEdgesFromNode( public void configure() { log.info("Setting up elastic graph index"); try { - new IndexBuilder(searchClient, _indexConvention.getIndexName(INDEX_NAME), - GraphRelationshipMappingsBuilder.getMappings(), Collections.emptyMap()).buildIndex(); + _indexBuilder.buildIndex(_indexConvention.getIndexName(INDEX_NAME), + GraphRelationshipMappingsBuilder.getMappings(), Collections.emptyMap()); } catch (IOException e) { e.printStackTrace(); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/recommendation/RecommendationsService.java b/metadata-io/src/main/java/com/linkedin/metadata/recommendation/RecommendationsService.java index 
ca801e0e29219..27cb7fdec22d3 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/recommendation/RecommendationsService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/recommendation/RecommendationsService.java @@ -4,6 +4,7 @@ import com.linkedin.metadata.recommendation.candidatesource.RecommendationSource; import com.linkedin.metadata.recommendation.ranker.RecommendationModuleRanker; import com.linkedin.metadata.utils.ConcurrencyUtils; +import io.opentelemetry.extension.annotations.WithSpan; import java.util.List; import java.util.Map; import java.util.Optional; @@ -49,6 +50,7 @@ private void validateRecommendationSources(final List cand * @return List of recommendation modules */ @Nonnull + @WithSpan public List listRecommendations( @Nonnull Urn userUrn, @Nonnull RecommendationRequestContext requestContext, diff --git a/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/EntitySearchAggregationSource.java b/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/EntitySearchAggregationSource.java index 7470fd0f9024e..d228bd7a1f281 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/EntitySearchAggregationSource.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/EntitySearchAggregationSource.java @@ -12,7 +12,7 @@ import com.linkedin.metadata.recommendation.SearchParams; import com.linkedin.metadata.search.EntitySearchService; import com.linkedin.metadata.search.cache.NonEmptyEntitiesCache; -import com.linkedin.metadata.utils.ConcurrencyUtils; +import io.opentelemetry.extension.annotations.WithSpan; import java.net.URISyntaxException; import java.util.Collections; import java.util.Comparator; @@ -41,9 +41,7 @@ public abstract class EntitySearchAggregationSource implements RecommendationSou private final EntitySearchService _entitySearchService; private final NonEmptyEntitiesCache _nonEmptyEntitiesCache; - protected 
EntitySearchAggregationSource( - EntitySearchService entitySearchService, - EntityRegistry entityRegistry, + protected EntitySearchAggregationSource(EntitySearchService entitySearchService, EntityRegistry entityRegistry, CacheManager cacheManager) { _entitySearchService = entitySearchService; _nonEmptyEntitiesCache = new NonEmptyEntitiesCache(entityRegistry, entitySearchService, cacheManager); @@ -90,30 +88,25 @@ protected boolean isValidCandidate(T candidate) { } @Override - public List getRecommendations( - @Nonnull Urn userUrn, + @WithSpan + public List getRecommendations(@Nonnull Urn userUrn, @Nullable RecommendationRequestContext requestContext) { - // Fetch number of documents per platform for each entity type - List> resultPerEntity = - ConcurrencyUtils.transformAndCollectAsync(_nonEmptyEntitiesCache.getNonEmptyEntities(), - entity -> _entitySearchService.aggregateByValue(entity, getSearchFieldName(), null, getMaxContent() * 10)); + Map aggregationResult = + _entitySearchService.aggregateByValue(null, getSearchFieldName(), null, getMaxContent()); - // Merge the aggregated result into one - Map mergedResult = resultPerEntity.stream().reduce(this::mergeAggregation).orElse(Collections.emptyMap()); - - if (mergedResult.isEmpty()) { + if (aggregationResult.isEmpty()) { return Collections.emptyList(); } // If the aggregated values are not urn, simply get top k values with the most counts if (!isValueUrn()) { - return getTopKValues(mergedResult).stream() + return getTopKValues(aggregationResult).stream() .map(entry -> buildRecommendationContent(entry.getKey(), entry.getValue())) .collect(Collectors.toList()); } // If the aggregated values are urns, convert key into urns - Map urnCounts = mergedResult.entrySet().stream().map(entry -> { + Map urnCounts = aggregationResult.entrySet().stream().map(entry -> { try { Urn tagUrn = Urn.createFromString(entry.getKey()); return Optional.of(Pair.of(tagUrn, entry.getValue())); @@ -135,7 +128,8 @@ public List getRecommendations( 
// Get top K entries with the most count private List> getTopKValues(Map countMap) { - final PriorityQueue> queue = new PriorityQueue<>(getMaxContent(), Map.Entry.comparingByValue(Comparator.naturalOrder())); + final PriorityQueue> queue = + new PriorityQueue<>(getMaxContent(), Map.Entry.comparingByValue(Comparator.naturalOrder())); for (Map.Entry entry : countMap.entrySet()) { if (queue.size() < getMaxContent() && isValidCandidate(entry.getKey())) { queue.add(entry); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/MostPopularSource.java b/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/MostPopularSource.java index 43c0a77c95678..5bc306c05a487 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/MostPopularSource.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/MostPopularSource.java @@ -14,6 +14,7 @@ import com.linkedin.metadata.search.utils.ESUtils; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.metrics.MetricUtils; +import io.opentelemetry.extension.annotations.WithSpan; import java.io.IOException; import java.net.URISyntaxException; import java.util.List; @@ -73,6 +74,7 @@ public boolean isEligible(@Nonnull Urn userUrn, @Nonnull RecommendationRequestCo } @Override + @WithSpan public List getRecommendations(@Nonnull Urn userUrn, @Nonnull RecommendationRequestContext requestContext) { SearchRequest searchRequest = buildSearchRequest(userUrn); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlyViewedSource.java b/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlyViewedSource.java index b1e0ef6ad6e59..b6f744ab3a660 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlyViewedSource.java +++ 
b/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecentlyViewedSource.java @@ -14,6 +14,7 @@ import com.linkedin.metadata.search.utils.ESUtils; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.metrics.MetricUtils; +import io.opentelemetry.extension.annotations.WithSpan; import java.io.IOException; import java.net.URISyntaxException; import java.util.List; @@ -74,6 +75,7 @@ public boolean isEligible(@Nonnull Urn userUrn, @Nonnull RecommendationRequestCo } @Override + @WithSpan public List getRecommendations(@Nonnull Urn userUrn, @Nonnull RecommendationRequestContext requestContext) { SearchRequest searchRequest = buildSearchRequest(userUrn); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecommendationSource.java b/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecommendationSource.java index adb7e8177f592..7d43e3652b492 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecommendationSource.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/recommendation/candidatesource/RecommendationSource.java @@ -6,6 +6,7 @@ import com.linkedin.metadata.recommendation.RecommendationModule; import com.linkedin.metadata.recommendation.RecommendationRenderType; import com.linkedin.metadata.recommendation.RecommendationRequestContext; +import io.opentelemetry.extension.annotations.WithSpan; import java.util.List; import java.util.Optional; import javax.annotation.Nonnull; @@ -47,6 +48,7 @@ public interface RecommendationSource { * @param requestContext Context of where the recommendations are being requested * @return list of recommendation candidates */ + @WithSpan List getRecommendations(@Nonnull Urn userUrn, @Nonnull RecommendationRequestContext requestContext); /** diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/EntitySearchService.java 
b/metadata-io/src/main/java/com/linkedin/metadata/search/EntitySearchService.java index 8a6ab9c68a4b2..94f1fd965df8c 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/EntitySearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/EntitySearchService.java @@ -93,14 +93,14 @@ AutoCompleteResult autoComplete(@Nonnull String entityName, @Nonnull String quer /** * Returns number of documents per field value given the field and filters * - * @param entityName name of the entity + * @param entityName name of the entity, if empty aggregate over all entities * @param field the field name for aggregate * @param requestParams filters to apply before aggregating * @param limit the number of aggregations to return * @return */ @Nonnull - Map aggregateByValue(@Nonnull String entityName, @Nonnull String field, @Nullable Filter requestParams, + Map aggregateByValue(@Nullable String entityName, @Nonnull String field, @Nullable Filter requestParams, int limit); /** diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java index da22ecd561139..243f958bf2aed 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java @@ -7,7 +7,7 @@ import com.linkedin.metadata.query.filter.SortCriterion; import com.linkedin.metadata.search.EntitySearchService; import com.linkedin.metadata.search.SearchResult; -import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilders; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.EntityIndexBuilders; import com.linkedin.metadata.search.elasticsearch.query.ESBrowseDAO; import com.linkedin.metadata.search.elasticsearch.query.ESSearchDAO; import 
com.linkedin.metadata.search.elasticsearch.update.ESWriteDAO; @@ -23,7 +23,7 @@ @RequiredArgsConstructor public class ElasticSearchService implements EntitySearchService { - private final ESIndexBuilders indexBuilders; + private final EntityIndexBuilders indexBuilders; private final ESSearchDAO esSearchDAO; private final ESBrowseDAO esBrowseDAO; private final ESWriteDAO esWriteDAO; @@ -87,7 +87,7 @@ public AutoCompleteResult autoComplete(@Nonnull String entityName, @Nonnull Stri @Nonnull @Override - public Map aggregateByValue(@Nonnull String entityName, @Nonnull String field, + public Map aggregateByValue(@Nullable String entityName, @Nonnull String field, @Nullable Filter requestParams, int limit) { log.debug("Aggregating by value: {}, field: {}, requestParams: {}, limit: {}", entityName, field, requestParams, limit); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/IndexBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java similarity index 80% rename from metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/IndexBuilder.java rename to metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java index e808db22c2875..8777d592379b1 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/IndexBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java @@ -1,10 +1,14 @@ package com.linkedin.metadata.search.elasticsearch.indexbuilder; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; import com.google.common.collect.MapDifference; import com.google.common.collect.Maps; import java.io.IOException; +import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.concurrent.TimeUnit; import 
javax.annotation.Nonnull; import lombok.RequiredArgsConstructor; @@ -28,22 +32,28 @@ @Slf4j @RequiredArgsConstructor -public class IndexBuilder { +public class ESIndexBuilder { private final RestHighLevelClient searchClient; - private final String indexName; - private final Map mappings; - private final Map settings; + private final int numShards; + private final int numReplicas; private static final int NUM_RETRIES = 3; + private static final List SETTINGS_TO_COMPARE = ImmutableList.of("number_of_shards", "number_of_replicas"); - public void buildIndex() throws IOException { + public void buildIndex(String indexName, Map mappings, Map settings) + throws IOException { // Check if index exists boolean exists = searchClient.indices().exists(new GetIndexRequest(indexName), RequestOptions.DEFAULT); + Map baseSettings = new HashMap<>(settings); + baseSettings.put("number_of_shards", numShards); + baseSettings.put("number_of_replicas", numReplicas); + Map finalSettings = ImmutableMap.of("index", baseSettings); + // If index doesn't exist, create index if (!exists) { - createIndex(indexName, mappings, settings); + createIndex(indexName, mappings, finalSettings); return; } @@ -65,7 +75,7 @@ public void buildIndex() throws IOException { .next(); // If there are no updates to mappings, return - if (mappingsDiff.areEqual() && equals(settings, oldSettings)) { + if (mappingsDiff.areEqual() && equals(finalSettings, oldSettings)) { log.info("No updates to index {}", indexName); return; } @@ -77,9 +87,10 @@ public void buildIndex() throws IOException { } String tempIndexName = indexName + "_" + System.currentTimeMillis(); - createIndex(tempIndexName, mappings, settings); + createIndex(tempIndexName, mappings, finalSettings); try { - searchClient.reindex(new ReindexRequest().setSourceIndices(indexName).setDestIndex(tempIndexName), + searchClient.reindex( + new ReindexRequest().setSourceIndices(indexName).setDestIndex(tempIndexName), RequestOptions.DEFAULT); } catch (Exception e) { 
log.info("Failed to reindex {} to {}: Exception {}", indexName, tempIndexName, e.toString()); @@ -151,14 +162,22 @@ private void createIndex(String indexName, Map mappings, Map newSettings, Settings oldSettings) { - if (!newSettings.containsKey("index") || !((Map) newSettings.get("index")).containsKey( - "analysis")) { + if (!newSettings.containsKey("index")) { + return true; + } + Map indexSettings = (Map) newSettings.get("index"); + if (!indexSettings.containsKey("analysis")) { return true; } - Map newAnalysis = - (Map) ((Map) newSettings.get("index")).get("analysis"); + // Compare analysis section + Map newAnalysis = (Map) indexSettings.get("analysis"); Settings oldAnalysis = oldSettings.getByPrefix("index.analysis."); - return equalsGroup(newAnalysis, oldAnalysis); + if (!equalsGroup(newAnalysis, oldAnalysis)) { + return false; + } + // Compare remaining settings + return SETTINGS_TO_COMPARE.stream() + .noneMatch(settingKey -> Objects.equals(indexSettings.get(settingKey), oldSettings.get("index." 
+ settingKey))); } private boolean equalsGroup(Map newSettings, Settings oldSettings) { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/EntityIndexBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/EntityIndexBuilder.java index 56eb2b354efa2..6709a1160c03c 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/EntityIndexBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/EntityIndexBuilder.java @@ -5,13 +5,12 @@ import java.util.Map; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.client.RestHighLevelClient; @Slf4j @RequiredArgsConstructor public class EntityIndexBuilder { - private final RestHighLevelClient searchClient; + private final ESIndexBuilder indexBuilder; private final EntitySpec entitySpec; private final SettingsBuilder settingsBuilder; private final String indexName; @@ -21,6 +20,6 @@ public void buildIndex() throws IOException { Map mappings = MappingsBuilder.getMappings(entitySpec); Map settings = settingsBuilder.getSettings(); - new IndexBuilder(searchClient, indexName, mappings, settings).buildIndex(); + indexBuilder.buildIndex(indexName, mappings, settings); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilders.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/EntityIndexBuilders.java similarity index 79% rename from metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilders.java rename to metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/EntityIndexBuilders.java index 37bcd875e953d..349187bd347a3 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilders.java +++ 
b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/EntityIndexBuilders.java @@ -5,20 +5,19 @@ import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import java.io.IOException; import lombok.RequiredArgsConstructor; -import org.elasticsearch.client.RestHighLevelClient; @RequiredArgsConstructor -public class ESIndexBuilders { +public class EntityIndexBuilders { + private final ESIndexBuilder indexBuilder; private final EntityRegistry entityRegistry; - private final RestHighLevelClient searchClient; private final IndexConvention indexConvention; private final SettingsBuilder settingsBuilder; public void buildAll() { for (EntitySpec entitySpec : entityRegistry.getEntitySpecs().values()) { try { - new EntityIndexBuilder(searchClient, entitySpec, settingsBuilder, + new EntityIndexBuilder(indexBuilder, entitySpec, settingsBuilder, indexConvention.getIndexName(entitySpec)).buildIndex(); } catch (IOException e) { e.printStackTrace(); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java index 9643a51fe221a..16118fcecd3c9 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java @@ -28,7 +28,7 @@ private static Map buildSettings(List urnStopWords) { .put("normalizer", buildNormalizers()) .put("analyzer", buildAnalyzers()) .build()); - return ImmutableMap.of("index", settings.build()); + return settings.build(); } private static Map buildFilters(List urnStopWords) { @@ -88,7 +88,7 @@ private static Map buildAnalyzers() { // Analyzer for text tokenized into words (split by spaces, periods, and slashes) analyzers.put("word_delimited", ImmutableMap.builder().put("tokenizer", "main_tokenizer") - 
.put("filter", ImmutableList.of("custom_delimiter", "lowercase")) + .put("filter", ImmutableList.of("custom_delimiter", "lowercase", "stop")) .build()); // Analyzer for splitting by slashes (used to get depth of browsePath) diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java index def29046b4eef..23089d0199efe 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java @@ -14,8 +14,6 @@ import com.linkedin.metadata.utils.metrics.MetricUtils; import io.opentelemetry.extension.annotations.WithSpan; import java.io.IOException; -import java.util.Collections; -import java.util.HashMap; import java.util.Map; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -61,7 +59,7 @@ private SearchResult executeAndExtract(@Nonnull EntitySpec entitySpec, @Nonnull // extract results, validated against document model as well return SearchRequestHandler.getBuilder(entitySpec).extractResult(searchResponse, from, size); } catch (Exception e) { - log.error("Search query failed:" + e.getMessage()); + log.error("Search query failed", e); throw new ESQueryException("Search query failed:", e); } } @@ -84,8 +82,8 @@ public SearchResult search(@Nonnull String entityName, @Nonnull String input, @N Timer.Context searchRequestTimer = MetricUtils.timer(this.getClass(), "searchRequest").time(); EntitySpec entitySpec = entityRegistry.getEntitySpec(entityName); // Step 1: construct the query - final SearchRequest searchRequest = - SearchRequestHandler.getBuilder(entitySpec).getSearchRequest(finalInput, postFilters, sortCriterion, from, size); + final SearchRequest searchRequest = SearchRequestHandler.getBuilder(entitySpec) + .getSearchRequest(finalInput, postFilters, sortCriterion, from, size); 
searchRequest.indices(indexConvention.getIndexName(entitySpec)); searchRequestTimer.stop(); // Step 2: execute the query and extract results, validated against document model as well @@ -141,23 +139,32 @@ public AutoCompleteResult autoComplete(@Nonnull String entityName, @Nonnull Stri /** * Returns number of documents per field value given the field and filters * - * @param entityName name of the entity + * @param entityName name of the entity, if null, aggregates over all entities * @param field the field name for aggregate * @param requestParams filters to apply before aggregating * @param limit the number of aggregations to return * @return */ @Nonnull - public Map aggregateByValue(@Nonnull String entityName, @Nonnull String field, + public Map aggregateByValue(@Nullable String entityName, @Nonnull String field, @Nullable Filter requestParams, int limit) { - EntitySpec entitySpec = entityRegistry.getEntitySpec(entityName); - final SearchRequest searchRequest = - SearchRequestHandler.getBuilder(entitySpec).getAggregationRequest(field, requestParams, limit); - searchRequest.indices(indexConvention.getIndexName(entitySpec)); - return executeAndExtract(entitySpec, searchRequest, 0, 0).getMetadata() - .getAggregations() - .stream() - .findFirst().>map(aggregationMetadata -> new HashMap<>(aggregationMetadata.getAggregations())) - .orElse(Collections.emptyMap()); + final SearchRequest searchRequest = SearchRequestHandler.getAggregationRequest(field, requestParams, limit); + String indexName; + if (entityName == null) { + indexName = indexConvention.getAllEntityIndicesPattern(); + } else { + EntitySpec entitySpec = entityRegistry.getEntitySpec(entityName); + indexName = indexConvention.getIndexName(entitySpec); + } + searchRequest.indices(indexName); + + try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "esSearch").time()) { + final SearchResponse searchResponse = client.search(searchRequest, RequestOptions.DEFAULT); + // extract results, validated against 
document model as well + return SearchRequestHandler.extractTermAggregations(searchResponse, field); + } catch (Exception e) { + log.error("Aggregation query failed", e); + throw new ESQueryException("Aggregation query failed:", e); + } } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java index b5093292d3e3f..548c20057e1fa 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java @@ -4,7 +4,6 @@ import com.linkedin.common.urn.Urn; import com.linkedin.data.template.DoubleMap; import com.linkedin.data.template.LongMap; -import com.linkedin.metadata.search.utils.ESUtils; import com.linkedin.metadata.models.EntitySpec; import com.linkedin.metadata.models.SearchableFieldSpec; import com.linkedin.metadata.models.annotation.SearchableAnnotation; @@ -20,11 +19,13 @@ import com.linkedin.metadata.search.SearchResult; import com.linkedin.metadata.search.SearchResultMetadata; import com.linkedin.metadata.search.features.Features; +import com.linkedin.metadata.search.utils.ESUtils; import com.linkedin.metadata.utils.SearchUtil; import io.opentelemetry.extension.annotations.WithSpan; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -52,6 +53,7 @@ import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder; import org.elasticsearch.search.fetch.subphase.highlight.HighlightField; + @Slf4j public class SearchRequestHandler { @@ -96,7 +98,7 @@ private Set getDefaultQueryFieldNames() { .collect(Collectors.toSet()); } - private BoolQueryBuilder getFilterQuery(@Nullable 
Filter filter) { + private static BoolQueryBuilder getFilterQuery(@Nullable Filter filter) { BoolQueryBuilder filterQuery = ESUtils.buildFilterQuery(filter); // Filter out entities that are marked "removed" filterQuery.mustNot(QueryBuilders.matchQuery("removed", true)); @@ -123,8 +125,7 @@ public SearchRequest getSearchRequest(@Nonnull String input, @Nullable Filter fi searchSourceBuilder.from(from); searchSourceBuilder.size(size); - - searchSourceBuilder.query(getQuery(input)); + searchSourceBuilder.fetchSource("urn", null); BoolQueryBuilder filterQuery = getFilterQuery(filter); searchSourceBuilder.query(QueryBuilders.boolQuery().must(getQuery(input)).must(filterQuery)); @@ -171,7 +172,7 @@ public SearchRequest getFilterRequest(@Nullable Filter filters, @Nullable SortCr * @return {@link SearchRequest} that contains the aggregation query */ @Nonnull - public SearchRequest getAggregationRequest(@Nonnull String field, @Nullable Filter filter, int limit) { + public static SearchRequest getAggregationRequest(@Nonnull String field, @Nullable Filter filter, int limit) { SearchRequest searchRequest = new SearchRequest(); BoolQueryBuilder filterQuery = getFilterQuery(filter); @@ -292,7 +293,7 @@ private SearchResultMetadata extractSearchResultMetadata(@Nonnull SearchResponse final SearchResultMetadata searchResultMetadata = new SearchResultMetadata().setAggregations(new AggregationMetadataArray()); - final List aggregationMetadataList = extractAggregation(searchResponse); + final List aggregationMetadataList = extractAggregationMetadata(searchResponse); if (!aggregationMetadataList.isEmpty()) { searchResultMetadata.setAggregations(new AggregationMetadataArray(aggregationMetadataList)); } @@ -300,7 +301,7 @@ private SearchResultMetadata extractSearchResultMetadata(@Nonnull SearchResponse return searchResultMetadata; } - private List extractAggregation(@Nonnull SearchResponse searchResponse) { + private List extractAggregationMetadata(@Nonnull SearchResponse searchResponse) 
{ final List aggregationMetadataList = new ArrayList<>(); if (searchResponse.getAggregations() == null) { @@ -322,6 +323,20 @@ private List extractAggregation(@Nonnull SearchResponse sea return aggregationMetadataList; } + @WithSpan + public static Map extractTermAggregations(@Nonnull SearchResponse searchResponse, + @Nonnull String aggregationName) { + if (searchResponse.getAggregations() == null) { + return Collections.emptyMap(); + } + + Aggregation aggregation = searchResponse.getAggregations().get(aggregationName); + if (aggregation == null) { + return Collections.emptyMap(); + } + return extractTermAggregations((ParsedTerms) aggregation); + } + /** * Extracts term aggregations give a parsed term. * @@ -329,7 +344,7 @@ private List extractAggregation(@Nonnull SearchResponse sea * @return a map with aggregation key and corresponding doc counts */ @Nonnull - private Map extractTermAggregations(@Nonnull ParsedTerms terms) { + private static Map extractTermAggregations(@Nonnull ParsedTerms terms) { final Map aggResult = new HashMap<>(); List bucketList = terms.getBuckets(); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/ESWriteDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/ESWriteDAO.java index 12ba9a84dca10..d557f3227e58d 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/ESWriteDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/ESWriteDAO.java @@ -4,8 +4,8 @@ import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import java.io.IOException; import javax.annotation.Nonnull; +import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.bulk.BackoffPolicy; import org.elasticsearch.action.bulk.BulkProcessor; import org.elasticsearch.action.delete.DeleteRequest; import org.elasticsearch.action.index.IndexRequest; @@ -14,33 +14,19 @@ import 
org.elasticsearch.client.RestHighLevelClient; import org.elasticsearch.client.indices.GetIndexRequest; import org.elasticsearch.client.indices.GetIndexResponse; -import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.common.xcontent.XContentType; import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.index.reindex.DeleteByQueryRequest; @Slf4j +@RequiredArgsConstructor public class ESWriteDAO { private final EntityRegistry entityRegistry; private final RestHighLevelClient searchClient; - private final BulkProcessor bulkProcessor; private final IndexConvention indexConvention; - - public ESWriteDAO(EntityRegistry entityRegistry, RestHighLevelClient searchClient, IndexConvention indexConvention, - int bulkRequestsLimit, int bulkFlushPeriod, int numRetries, long retryInterval) { - this.entityRegistry = entityRegistry; - this.indexConvention = indexConvention; - this.searchClient = searchClient; - this.bulkProcessor = BulkProcessor.builder( - (request, bulkListener) -> searchClient.bulkAsync(request, RequestOptions.DEFAULT, bulkListener), - BulkListener.getInstance()) - .setBulkActions(bulkRequestsLimit) - .setFlushInterval(TimeValue.timeValueSeconds(bulkFlushPeriod)) - .setBackoffPolicy(BackoffPolicy.constantBackoff(TimeValue.timeValueSeconds(retryInterval), numRetries)) - .build(); - } + private final BulkProcessor bulkProcessor; /** * Updates or inserts the given search document. 
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java b/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java index 5d2c1d7891da1..a61d5622e300b 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java @@ -1,7 +1,6 @@ package com.linkedin.metadata.search.transformer; import com.fasterxml.jackson.databind.JsonNode; - import com.fasterxml.jackson.databind.node.ArrayNode; import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; @@ -16,6 +15,7 @@ import java.util.List; import java.util.Map; import java.util.Optional; +import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -23,12 +23,14 @@ * Class that provides a utility function that transforms the snapshot object into a search document */ @Slf4j +@RequiredArgsConstructor public class SearchDocumentTransformer { - private SearchDocumentTransformer() { - } + // Number of elements to index for a given array. 
+ // The cap improves search speed when having fields with a large number of elements + private final int maxArrayLength; - public static Optional transformSnapshot( + public Optional transformSnapshot( final RecordTemplate snapshot, final EntitySpec entitySpec, final Boolean forDelete @@ -44,7 +46,7 @@ public static Optional transformSnapshot( return Optional.of(searchDocument.toString()); } - public static Optional transformAspect( + public Optional transformAspect( final Urn urn, final RecordTemplate aspect, final AspectSpec aspectSpec, @@ -61,7 +63,7 @@ public static Optional transformAspect( return Optional.of(searchDocument.toString()); } - public static void setValue(final SearchableFieldSpec fieldSpec, final List fieldValues, + public void setValue(final SearchableFieldSpec fieldSpec, final List fieldValues, final ObjectNode searchDocument, final Boolean forDelete) { DataSchema.Type valueType = fieldSpec.getPegasusSchema().getType(); Optional firstValue = fieldValues.stream().findFirst(); @@ -109,14 +111,15 @@ public static void setValue(final SearchableFieldSpec fieldSpec, final List getNodeForValue(valueType, value, fieldType).ifPresent(arrayNode::add)); + fieldValues.subList(0, Math.min(fieldValues.size(), maxArrayLength)) + .forEach(value -> getNodeForValue(valueType, value, fieldType).ifPresent(arrayNode::add)); searchDocument.set(fieldName, arrayNode); } else if (!fieldValues.isEmpty()) { getNodeForValue(valueType, fieldValues.get(0), fieldType).ifPresent(node -> searchDocument.set(fieldName, node)); } } - private static Optional getNodeForValue(final DataSchema.Type schemaFieldType, final Object fieldValue, + private Optional getNodeForValue(final DataSchema.Type schemaFieldType, final Object fieldValue, final FieldType fieldType) { switch (schemaFieldType) { case BOOLEAN: diff --git a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ESSystemMetadataDAO.java 
b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ESSystemMetadataDAO.java index ac8f7d970c5fe..799cb7583c8af 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ESSystemMetadataDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ESSystemMetadataDAO.java @@ -1,15 +1,14 @@ package com.linkedin.metadata.systemmetadata; import com.google.common.collect.ImmutableList; -import com.linkedin.metadata.search.elasticsearch.update.BulkListener; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import java.io.IOException; import java.util.Collections; import java.util.HashMap; import java.util.Map; import javax.annotation.Nonnull; +import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.bulk.BackoffPolicy; import org.elasticsearch.action.bulk.BulkProcessor; import org.elasticsearch.action.delete.DeleteRequest; import org.elasticsearch.action.delete.DeleteResponse; @@ -19,7 +18,6 @@ import org.elasticsearch.action.update.UpdateRequest; import org.elasticsearch.client.RequestOptions; import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.common.xcontent.XContentType; import org.elasticsearch.index.query.BoolQueryBuilder; import org.elasticsearch.index.query.QueryBuilders; @@ -33,29 +31,15 @@ import org.elasticsearch.search.sort.FieldSortBuilder; import org.elasticsearch.search.sort.SortOrder; -import static com.linkedin.metadata.systemmetadata.ElasticSearchSystemMetadataService.*; +import static com.linkedin.metadata.systemmetadata.ElasticSearchSystemMetadataService.INDEX_NAME; @Slf4j +@RequiredArgsConstructor public class ESSystemMetadataDAO { - private final BulkProcessor bulkProcessor; - private final IndexConvention indexConvention; private final RestHighLevelClient client; - - public ESSystemMetadataDAO(RestHighLevelClient searchClient, IndexConvention indexConvention, int 
bulkRequestsLimit, int bulkFlushPeriod, int numRetries, - long retryInterval) { - this.client = searchClient; - this.indexConvention = indexConvention; - this.bulkProcessor = BulkProcessor.builder( - (request, bulkListener) -> { - searchClient.bulkAsync(request, RequestOptions.DEFAULT, bulkListener); - }, - BulkListener.getInstance()) - .setBulkActions(bulkRequestsLimit) - .setFlushInterval(TimeValue.timeValueSeconds(bulkFlushPeriod)) - .setBackoffPolicy(BackoffPolicy.constantBackoff(TimeValue.timeValueSeconds(retryInterval), numRetries)) - .build(); - } + private final IndexConvention indexConvention; + private final BulkProcessor bulkProcessor; /** * Updates or inserts the given search document. @@ -64,16 +48,16 @@ public ESSystemMetadataDAO(RestHighLevelClient searchClient, IndexConvention ind * @param docId the ID of the document */ public void upsertDocument(@Nonnull String docId, @Nonnull String document) { - final IndexRequest indexRequest = new IndexRequest(indexConvention.getIndexName(INDEX_NAME)).id(docId).source(document, XContentType.JSON); - final UpdateRequest updateRequest = new UpdateRequest(indexConvention.getIndexName(INDEX_NAME), docId).doc(document, XContentType.JSON) - .detectNoop(false) - .upsert(indexRequest); + final IndexRequest indexRequest = + new IndexRequest(indexConvention.getIndexName(INDEX_NAME)).id(docId).source(document, XContentType.JSON); + final UpdateRequest updateRequest = + new UpdateRequest(indexConvention.getIndexName(INDEX_NAME), docId).doc(document, XContentType.JSON) + .detectNoop(false) + .upsert(indexRequest); bulkProcessor.add(updateRequest); } - public DeleteResponse deleteByDocId( - @Nonnull final String docId - ) { + public DeleteResponse deleteByDocId(@Nonnull final String docId) { DeleteRequest deleteRequest = new DeleteRequest(indexConvention.getIndexName(INDEX_NAME), docId); try { @@ -86,9 +70,7 @@ public DeleteResponse deleteByDocId( return null; } - public BulkByScrollResponse deleteByUrn( - @Nonnull final 
String urn - ) { + public BulkByScrollResponse deleteByUrn(@Nonnull final String urn) { BoolQueryBuilder finalQuery = QueryBuilders.boolQuery(); finalQuery.must(QueryBuilders.termQuery("urn", urn)); @@ -114,7 +96,8 @@ public SearchResponse findByParams(Map searchParams) { SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder(); BoolQueryBuilder finalQuery = QueryBuilders.boolQuery(); - searchParams.entrySet().forEach(entry -> finalQuery.must(QueryBuilders.termQuery(entry.getKey(), entry.getValue()))); + searchParams.entrySet() + .forEach(entry -> finalQuery.must(QueryBuilders.termQuery(entry.getKey(), entry.getValue()))); searchSourceBuilder.query(finalQuery); // this is the max page size elastic will return @@ -160,12 +143,10 @@ public SearchResponse findRuns(Integer pageOffset, Integer pageSize) { bucketSort.size(pageSize); bucketSort.from(pageOffset); - TermsAggregationBuilder aggregation = - AggregationBuilders.terms("runId") - .field("runId") - .subAggregation(AggregationBuilders.max("maxTimestamp").field("lastUpdated")) - .subAggregation(bucketSort); - + TermsAggregationBuilder aggregation = AggregationBuilders.terms("runId") + .field("runId") + .subAggregation(AggregationBuilders.max("maxTimestamp").field("lastUpdated")) + .subAggregation(bucketSort); searchSourceBuilder.aggregation(aggregation); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java index ce7956481787a..50e5c9c6e5a5a 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java @@ -4,7 +4,7 @@ import com.fasterxml.jackson.databind.node.ObjectNode; import com.linkedin.metadata.run.AspectRowSummary; import com.linkedin.metadata.run.IngestionRunSummary; 
-import com.linkedin.metadata.search.elasticsearch.indexbuilder.IndexBuilder; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.mxe.SystemMetadata; import java.io.IOException; @@ -40,9 +40,10 @@ @RequiredArgsConstructor public class ElasticSearchSystemMetadataService implements SystemMetadataService { - private final RestHighLevelClient searchClient; + private final RestHighLevelClient _searchClient; private final IndexConvention _indexConvention; private final ESSystemMetadataDAO _esDAO; + private final ESIndexBuilder _indexBuilder; private static final String DOC_DELIMETER = "--"; public static final String INDEX_NAME = "system_metadata_service_v1"; @@ -161,10 +162,9 @@ public List listRuns(Integer pageOffset, Integer pageSize) @Override public void configure() { log.info("Setting up system metadata index"); - IndexBuilder ib = new IndexBuilder(this.searchClient, _indexConvention.getIndexName(INDEX_NAME), - SystemMetadataMappingsBuilder.getMappings(), Collections.emptyMap()); try { - ib.buildIndex(); + _indexBuilder.buildIndex(_indexConvention.getIndexName(INDEX_NAME), SystemMetadataMappingsBuilder.getMappings(), + Collections.emptyMap()); } catch (IOException ie) { throw new RuntimeException("Could not configure system metadata index", ie); } @@ -175,7 +175,7 @@ public void clear() { DeleteByQueryRequest deleteRequest = new DeleteByQueryRequest(_indexConvention.getIndexName(INDEX_NAME)).setQuery(QueryBuilders.matchAllQuery()); try { - searchClient.deleteByQuery(deleteRequest, RequestOptions.DEFAULT); + _searchClient.deleteByQuery(deleteRequest, RequestOptions.DEFAULT); } catch (Exception e) { log.error("Failed to clear system metadata service: {}", e.toString()); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java 
b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java index dd7076d23e8f0..fa1606a28e90c 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java @@ -8,13 +8,12 @@ import com.linkedin.data.ByteString; import com.linkedin.metadata.aspect.EnvelopedAspect; import com.linkedin.metadata.dao.exception.ESQueryException; -import com.linkedin.metadata.search.utils.ESUtils; import com.linkedin.metadata.dao.utils.RecordUtils; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.query.filter.Condition; import com.linkedin.metadata.query.filter.Criterion; import com.linkedin.metadata.query.filter.Filter; -import com.linkedin.metadata.search.elasticsearch.update.BulkListener; +import com.linkedin.metadata.search.utils.ESUtils; import com.linkedin.metadata.timeseries.TimeseriesAspectService; import com.linkedin.metadata.timeseries.elastic.indexbuilder.MappingsBuilder; import com.linkedin.metadata.timeseries.elastic.indexbuilder.TimeseriesAspectIndexBuilders; @@ -34,7 +33,6 @@ import javax.annotation.Nonnull; import javax.annotation.Nullable; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.bulk.BackoffPolicy; import org.elasticsearch.action.bulk.BulkProcessor; import org.elasticsearch.action.index.IndexRequest; import org.elasticsearch.action.search.SearchRequest; @@ -42,7 +40,6 @@ import org.elasticsearch.action.update.UpdateRequest; import org.elasticsearch.client.RequestOptions; import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.common.xcontent.XContentType; import org.elasticsearch.index.query.BoolQueryBuilder; import org.elasticsearch.index.query.QueryBuilders; @@ -68,18 +65,11 @@ public class 
ElasticSearchTimeseriesAspectService implements TimeseriesAspectSer public ElasticSearchTimeseriesAspectService(@Nonnull RestHighLevelClient searchClient, @Nonnull IndexConvention indexConvention, @Nonnull TimeseriesAspectIndexBuilders indexBuilders, - @Nonnull EntityRegistry entityRegistry, int bulkRequestsLimit, int bulkFlushPeriod, int numRetries, - long retryInterval) { + @Nonnull EntityRegistry entityRegistry, @Nonnull BulkProcessor bulkProcessor) { _indexConvention = indexConvention; _indexBuilders = indexBuilders; _searchClient = searchClient; - _bulkProcessor = BulkProcessor.builder( - (request, bulkListener) -> searchClient.bulkAsync(request, RequestOptions.DEFAULT, bulkListener), - BulkListener.getInstance()) - .setBulkActions(bulkRequestsLimit) - .setFlushInterval(TimeValue.timeValueSeconds(bulkFlushPeriod)) - .setBackoffPolicy(BackoffPolicy.constantBackoff(TimeValue.timeValueSeconds(retryInterval), numRetries)) - .build(); + _bulkProcessor = bulkProcessor; _esAggregatedStatsDAO = new ESAggregatedStatsDAO(indexConvention, searchClient, entityRegistry); } @@ -128,12 +118,8 @@ public void upsertDocument(@Nonnull String entityName, @Nonnull String aspectNam } @Override - public List getAspectValues( - @Nonnull final Urn urn, - @Nonnull String entityName, - @Nonnull String aspectName, - @Nullable Long startTimeMillis, - @Nullable Long endTimeMillis, + public List getAspectValues(@Nonnull final Urn urn, @Nonnull String entityName, + @Nonnull String aspectName, @Nullable Long startTimeMillis, @Nullable Long endTimeMillis, @Nullable Integer limit) { final BoolQueryBuilder filterQueryBuilder = ESUtils.buildFilterQuery(null); filterQueryBuilder.must(QueryBuilders.matchQuery("urn", urn.toString())); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/indexbuilder/TimeseriesAspectIndexBuilders.java b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/indexbuilder/TimeseriesAspectIndexBuilders.java index 
f8c48bfbc8baa..d0fd26d737cf0 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/indexbuilder/TimeseriesAspectIndexBuilders.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/indexbuilder/TimeseriesAspectIndexBuilders.java @@ -3,20 +3,19 @@ import com.linkedin.metadata.models.AspectSpec; import com.linkedin.metadata.models.EntitySpec; import com.linkedin.metadata.models.registry.EntityRegistry; -import com.linkedin.metadata.search.elasticsearch.indexbuilder.IndexBuilder; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import java.io.IOException; import java.util.Collections; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.client.RestHighLevelClient; @Slf4j @RequiredArgsConstructor public class TimeseriesAspectIndexBuilders { + private final ESIndexBuilder _indexBuilder; private final EntityRegistry _entityRegistry; - private final RestHighLevelClient _searchClient; private final IndexConvention _indexConvention; public void buildAll() { @@ -24,9 +23,9 @@ public void buildAll() { for (AspectSpec aspectSpec : entitySpec.getAspectSpecs()) { if (aspectSpec.isTimeseries()) { try { - new IndexBuilder(_searchClient, + _indexBuilder.buildIndex( _indexConvention.getTimeseriesAspectIndexName(entitySpec.getName(), aspectSpec.getName()), - MappingsBuilder.getMappings(aspectSpec), Collections.emptyMap()).buildIndex(); + MappingsBuilder.getMappings(aspectSpec), Collections.emptyMap()); } catch (IOException e) { log.error("Issue while building timeseries field index for entity {} aspect {}", entitySpec.getName(), aspectSpec.getName()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/DockerTestUtils.java b/metadata-io/src/test/java/com/linkedin/metadata/DockerTestUtils.java new file mode 100644 index 0000000000000..364ccd86d45fd --- /dev/null +++ 
b/metadata-io/src/test/java/com/linkedin/metadata/DockerTestUtils.java @@ -0,0 +1,21 @@ +package com.linkedin.metadata; + +import com.github.dockerjava.api.DockerClient; + +public class DockerTestUtils { + + final private static int MIN_MEMORY_NEEDED_GB = 7; + + public static void checkContainerEngine(DockerClient dockerClient) { + final long dockerEngineMemoryBytes = dockerClient.infoCmd().exec().getMemTotal(); + final long dockerEngineMemoryGB = dockerEngineMemoryBytes / 1000 / 1000 / 1000; + if (dockerEngineMemoryGB < MIN_MEMORY_NEEDED_GB) { + final String error = String.format("Total Docker memory configured: %s GB (%d bytes) is below the minimum threshold " + + "of %d GB", dockerEngineMemoryGB, dockerEngineMemoryBytes, MIN_MEMORY_NEEDED_GB); + throw new IllegalStateException(error); + } + } + + private DockerTestUtils() { + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/DgraphGraphServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/DgraphGraphServiceTest.java index 894cb00a103ce..249a3b31b1857 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/graph/DgraphGraphServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/DgraphGraphServiceTest.java @@ -27,6 +27,7 @@ import java.util.Set; import java.util.concurrent.TimeUnit; +import static com.linkedin.metadata.DockerTestUtils.checkContainerEngine; import static com.linkedin.metadata.search.utils.QueryUtils.EMPTY_FILTER; import static com.linkedin.metadata.search.utils.QueryUtils.newFilter; import static com.linkedin.metadata.search.utils.QueryUtils.newRelationshipFilter; @@ -52,8 +53,8 @@ public void setup() { .withTmpFs(Collections.singletonMap("/dgraph", "rw,noexec,nosuid,size=1g")) .withStartupTimeout(Duration.ofMinutes(1)) .withStartupAttempts(3); + checkContainerEngine(_container.getDockerClient()); _container.start(); - Slf4jLogConsumer logConsumer = new Slf4jLogConsumer(log); _container.followOutput(logConsumer); } diff --git 
a/metadata-io/src/test/java/com/linkedin/metadata/graph/ElasticSearchGraphServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/ElasticSearchGraphServiceTest.java index af76422f3a98e..3d270662014ac 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/graph/ElasticSearchGraphServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/ElasticSearchGraphServiceTest.java @@ -8,8 +8,13 @@ import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.query.filter.RelationshipDirection; import com.linkedin.metadata.query.filter.RelationshipFilter; +import com.linkedin.metadata.search.elasticsearch.ElasticSearchServiceTest; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import javax.annotation.Nonnull; import org.apache.http.HttpHost; import org.apache.http.impl.nio.reactor.IOReactorConfig; import org.elasticsearch.client.RestClient; @@ -22,14 +27,10 @@ import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; -import javax.annotation.Nonnull; -import java.util.Comparator; -import java.util.HashSet; -import java.util.List; - +import static com.linkedin.metadata.DockerTestUtils.checkContainerEngine; +import static com.linkedin.metadata.graph.elastic.ElasticSearchGraphService.INDEX_NAME; import static org.testng.Assert.assertEquals; -import static com.linkedin.metadata.graph.elastic.ElasticSearchGraphService.INDEX_NAME; public class ElasticSearchGraphServiceTest extends GraphServiceTestBase { @@ -45,6 +46,7 @@ public class ElasticSearchGraphServiceTest extends GraphServiceTestBase { @BeforeTest public void setup() { _elasticsearchContainer = new ElasticsearchContainer(IMAGE_NAME); + checkContainerEngine(_elasticsearchContainer.getDockerClient()); _elasticsearchContainer.start(); _searchClient = buildRestClient(); _client = 
buildService(); @@ -73,8 +75,10 @@ private RestHighLevelClient buildRestClient() { @Nonnull private ElasticSearchGraphService buildService() { ESGraphQueryDAO readDAO = new ESGraphQueryDAO(_searchClient, _indexConvention); - ESGraphWriteDAO writeDAO = new ESGraphWriteDAO(_searchClient, _indexConvention, 1, 1, 1, 1); - return new ElasticSearchGraphService(_searchClient, _indexConvention, writeDAO, readDAO); + ESGraphWriteDAO writeDAO = + new ESGraphWriteDAO(_searchClient, _indexConvention, ElasticSearchServiceTest.getBulkProcessor(_searchClient)); + return new ElasticSearchGraphService(_searchClient, _indexConvention, writeDAO, readDAO, + ElasticSearchServiceTest.getIndexBuilder(_searchClient)); } @AfterTest @@ -83,7 +87,8 @@ public void tearDown() { } @Override - protected @Nonnull GraphService getGraphService() { + @Nonnull + protected GraphService getGraphService() { return _client; } @@ -105,41 +110,35 @@ protected void assertEqualsAnyOrder(RelatedEntitiesResult actual, RelatedEntitie protected void assertEqualsAnyOrder(List actual, List expected, Comparator comparator) { // https://github.com/linkedin/datahub/issues/3115 // ElasticSearchGraphService produces duplicates, which is here ignored until fixed - assertEquals( - new HashSet<>(actual), - new HashSet<>(expected) - ); + assertEquals(new HashSet<>(actual), new HashSet<>(expected)); } @Override - public void testFindRelatedEntitiesSourceEntityFilter(Filter sourceEntityFilter, - List relationshipTypes, - RelationshipFilter relationships, - List expectedRelatedEntities) throws Exception { + public void testFindRelatedEntitiesSourceEntityFilter(Filter sourceEntityFilter, List relationshipTypes, + RelationshipFilter relationships, List expectedRelatedEntities) throws Exception { if (relationships.getDirection() == RelationshipDirection.UNDIRECTED) { // https://github.com/linkedin/datahub/issues/3114 throw new SkipException("ElasticSearchGraphService does not implement UNDIRECTED relationship filter"); } - 
super.testFindRelatedEntitiesSourceEntityFilter(sourceEntityFilter, relationshipTypes, relationships, expectedRelatedEntities); + super.testFindRelatedEntitiesSourceEntityFilter(sourceEntityFilter, relationshipTypes, relationships, + expectedRelatedEntities); } @Override public void testFindRelatedEntitiesDestinationEntityFilter(Filter destinationEntityFilter, - List relationshipTypes, - RelationshipFilter relationships, - List expectedRelatedEntities) throws Exception { + List relationshipTypes, RelationshipFilter relationships, List expectedRelatedEntities) + throws Exception { if (relationships.getDirection() == RelationshipDirection.UNDIRECTED) { // https://github.com/linkedin/datahub/issues/3114 throw new SkipException("ElasticSearchGraphService does not implement UNDIRECTED relationship filter"); } - super.testFindRelatedEntitiesDestinationEntityFilter(destinationEntityFilter, relationshipTypes, relationships, expectedRelatedEntities); + super.testFindRelatedEntitiesDestinationEntityFilter(destinationEntityFilter, relationshipTypes, relationships, + expectedRelatedEntities); } @Override - public void testFindRelatedEntitiesSourceType(String datasetType, - List relationshipTypes, - RelationshipFilter relationships, - List expectedRelatedEntities) throws Exception { + public void testFindRelatedEntitiesSourceType(String datasetType, List relationshipTypes, + RelationshipFilter relationships, List expectedRelatedEntities) throws Exception { if (relationships.getDirection() == RelationshipDirection.UNDIRECTED) { // https://github.com/linkedin/datahub/issues/3114 throw new SkipException("ElasticSearchGraphService does not implement UNDIRECTED relationship filter"); @@ -152,10 +151,8 @@ public void testFindRelatedEntitiesSourceType(String datasetType, } @Override - public void testFindRelatedEntitiesDestinationType(String datasetType, - List relationshipTypes, - RelationshipFilter relationships, - List expectedRelatedEntities) throws Exception { + public void 
testFindRelatedEntitiesDestinationType(String datasetType, List relationshipTypes, + RelationshipFilter relationships, List expectedRelatedEntities) throws Exception { if (relationships.getDirection() == RelationshipDirection.UNDIRECTED) { // https://github.com/linkedin/datahub/issues/3114 throw new SkipException("ElasticSearchGraphService does not implement UNDIRECTED relationship filter"); @@ -164,7 +161,8 @@ public void testFindRelatedEntitiesDestinationType(String datasetType, // https://github.com/linkedin/datahub/issues/3116 throw new SkipException("ElasticSearchGraphService does not support empty destination type"); } - super.testFindRelatedEntitiesDestinationType(datasetType, relationshipTypes, relationships, expectedRelatedEntities); + super.testFindRelatedEntitiesDestinationType(datasetType, relationshipTypes, relationships, + expectedRelatedEntities); } @Test @@ -175,23 +173,18 @@ public void testFindRelatedEntitiesNoRelationshipTypes() { } @Override - public void testRemoveEdgesFromNode(@Nonnull Urn nodeToRemoveFrom, - @Nonnull List relationTypes, - @Nonnull RelationshipFilter relationshipFilter, - List expectedOutgoingRelatedUrnsBeforeRemove, - List expectedIncomingRelatedUrnsBeforeRemove, - List expectedOutgoingRelatedUrnsAfterRemove, - List expectedIncomingRelatedUrnsAfterRemove) throws Exception { + public void testRemoveEdgesFromNode(@Nonnull Urn nodeToRemoveFrom, @Nonnull List relationTypes, + @Nonnull RelationshipFilter relationshipFilter, List expectedOutgoingRelatedUrnsBeforeRemove, + List expectedIncomingRelatedUrnsBeforeRemove, + List expectedOutgoingRelatedUrnsAfterRemove, + List expectedIncomingRelatedUrnsAfterRemove) throws Exception { if (relationshipFilter.getDirection() == RelationshipDirection.UNDIRECTED) { // https://github.com/linkedin/datahub/issues/3114 throw new SkipException("ElasticSearchGraphService does not implement UNDIRECTED relationship filter"); } - super.testRemoveEdgesFromNode( - nodeToRemoveFrom, - relationTypes, 
relationshipFilter, - expectedOutgoingRelatedUrnsBeforeRemove, expectedIncomingRelatedUrnsBeforeRemove, - expectedOutgoingRelatedUrnsAfterRemove, expectedIncomingRelatedUrnsAfterRemove - ); + super.testRemoveEdgesFromNode(nodeToRemoveFrom, relationTypes, relationshipFilter, + expectedOutgoingRelatedUrnsBeforeRemove, expectedIncomingRelatedUrnsBeforeRemove, + expectedOutgoingRelatedUrnsAfterRemove, expectedIncomingRelatedUrnsAfterRemove); } @Test @@ -205,7 +198,8 @@ public void testRemoveEdgesFromNodeNoRelationshipTypes() { @Override public void testConcurrentAddEdge() { // https://github.com/linkedin/datahub/issues/3124 - throw new SkipException("This test is flaky for ElasticSearchGraphService, ~5% of the runs fail on a race condition"); + throw new SkipException( + "This test is flaky for ElasticSearchGraphService, ~5% of the runs fail on a race condition"); } @Test @@ -221,5 +215,4 @@ public void testConcurrentRemoveNodes() { // https://github.com/linkedin/datahub/issues/3118 throw new SkipException("ElasticSearchGraphService produces duplicates"); } - } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/recommendation/candidatesource/EntitySearchAggregationCandidateSourceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/recommendation/candidatesource/EntitySearchAggregationCandidateSourceTest.java index 7083b03580634..2d3fac8b608e1 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/recommendation/candidatesource/EntitySearchAggregationCandidateSourceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/recommendation/candidatesource/EntitySearchAggregationCandidateSourceTest.java @@ -1,6 +1,5 @@ package com.linkedin.metadata.recommendation.candidatesource; -import com.google.common.collect.ImmutableList; import com.linkedin.common.urn.CorpuserUrn; import com.linkedin.common.urn.TestEntityUrn; import com.linkedin.common.urn.Urn; @@ -86,19 +85,9 @@ public boolean isEligible(@Nonnull Urn userUrn, @Nonnull 
RecommendationRequestCo }; } - @Test - public void testWhenNonEmptyCacheReturnsEmpty() { - Mockito.when(_nonEmptyEntitiesCache.getNonEmptyEntities()).thenReturn(Collections.emptyList()); - List candidates = _valueBasedCandidateSource.getRecommendations(USER, CONTEXT); - assertTrue(candidates.isEmpty()); - Mockito.verifyZeroInteractions(_entitySearchService); - assertFalse(_valueBasedCandidateSource.getRecommendationModule(USER, CONTEXT).isPresent()); - } - @Test public void testWhenSearchServiceReturnsEmpty() { - Mockito.when(_nonEmptyEntitiesCache.getNonEmptyEntities()).thenReturn(ImmutableList.of("testEntity")); - Mockito.when(_entitySearchService.aggregateByValue(eq("testEntity"), eq("testValue"), eq(null), anyInt())) + Mockito.when(_entitySearchService.aggregateByValue(eq(null), eq("testValue"), eq(null), anyInt())) .thenReturn(Collections.emptyMap()); List candidates = _valueBasedCandidateSource.getRecommendations(USER, CONTEXT); assertTrue(candidates.isEmpty()); @@ -107,9 +96,8 @@ public void testWhenSearchServiceReturnsEmpty() { @Test public void testWhenSearchServiceReturnsValueResults() { - // One entity type, one result - Mockito.when(_nonEmptyEntitiesCache.getNonEmptyEntities()).thenReturn(ImmutableList.of("testEntity")); - Mockito.when(_entitySearchService.aggregateByValue(eq("testEntity"), eq("testValue"), eq(null), anyInt())) + // One result + Mockito.when(_entitySearchService.aggregateByValue(eq(null), eq("testValue"), eq(null), anyInt())) .thenReturn(ImmutableMap.of("value1", 1L)); List candidates = _valueBasedCandidateSource.getRecommendations(USER, CONTEXT); assertEquals(candidates.size(), 1); @@ -127,8 +115,8 @@ public void testWhenSearchServiceReturnsValueResults() { assertEquals(params.getContentParams().getCount().longValue(), 1L); assertTrue(_valueBasedCandidateSource.getRecommendationModule(USER, CONTEXT).isPresent()); - // One entity type, multiple result - Mockito.when(_entitySearchService.aggregateByValue(eq("testEntity"), 
eq("testValue"), eq(null), anyInt())) + // Multiple result + Mockito.when(_entitySearchService.aggregateByValue(eq(null), eq("testValue"), eq(null), anyInt())) .thenReturn(ImmutableMap.of("value1", 1L, "value2", 2L, "value3", 3L)); candidates = _valueBasedCandidateSource.getRecommendations(USER, CONTEXT); assertEquals(candidates.size(), 2); @@ -157,54 +145,15 @@ public void testWhenSearchServiceReturnsValueResults() { assertNotNull(params.getContentParams()); assertEquals(params.getContentParams().getCount().longValue(), 2L); assertTrue(_valueBasedCandidateSource.getRecommendationModule(USER, CONTEXT).isPresent()); - - // Multiple entity type, multiple result - Mockito.when(_nonEmptyEntitiesCache.getNonEmptyEntities()) - .thenReturn(ImmutableList.of("testEntity", "testEntity2")); - Mockito.when(_entitySearchService.aggregateByValue(eq("testEntity"), eq("testValue"), eq(null), anyInt())) - .thenReturn(ImmutableMap.of("value1", 1L, "value3", 3L)); - Mockito.when(_entitySearchService.aggregateByValue(eq("testEntity2"), eq("testValue"), eq(null), anyInt())) - .thenReturn(ImmutableMap.of("value1", 3L, "value2", 2L)); - candidates = _valueBasedCandidateSource.getRecommendations(USER, CONTEXT); - assertEquals(candidates.size(), 2); - content = candidates.get(0); - assertEquals(content.getValue(), "value1"); - assertNull(content.getEntity()); - params = content.getParams(); - assertNotNull(params); - assertNotNull(params.getSearchParams()); - assertTrue(StringUtils.isEmpty(params.getSearchParams().getQuery())); - assertEquals(params.getSearchParams().getFilters().size(), 1); - assertEquals(params.getSearchParams().getFilters().get(0), - new Criterion().setField("testValue").setValue("value1")); - assertNotNull(params.getContentParams()); - assertEquals(params.getContentParams().getCount().longValue(), 4L); - content = candidates.get(1); - assertEquals(content.getValue(), "value3"); - assertNull(content.getEntity()); - params = content.getParams(); - assertNotNull(params); 
- assertNotNull(params.getSearchParams()); - assertTrue(StringUtils.isEmpty(params.getSearchParams().getQuery())); - assertEquals(params.getSearchParams().getFilters().size(), 1); - assertEquals(params.getSearchParams().getFilters().get(0), - new Criterion().setField("testValue").setValue("value3")); - assertNotNull(params.getContentParams()); - assertEquals(params.getContentParams().getCount().longValue(), 3L); - assertTrue(_valueBasedCandidateSource.getRecommendationModule(USER, CONTEXT).isPresent()); } @Test public void testWhenSearchServiceReturnsUrnResults() { - // One entity type, one result - Mockito.when(_nonEmptyEntitiesCache.getNonEmptyEntities()).thenReturn(ImmutableList.of("testEntity")); + // One result Urn testUrn1 = new TestEntityUrn("testUrn1", "testUrn1", "testUrn1"); Urn testUrn2 = new TestEntityUrn("testUrn2", "testUrn2", "testUrn2"); Urn testUrn3 = new TestEntityUrn("testUrn3", "testUrn3", "testUrn3"); -// Urn testUrn1 = new TestEntityUrn("testUrn1", TestEntityUtil.getTestEntityUrn().toString(), "VALUE_1"); -// Urn testUrn2 = new TestEntityUrn("testUrn2", TestEntityUtil.getTestEntityUrn().toString(), "VALUE_1"); -// Urn testUrn3 = new TestEntityUrn("testUrn3", TestEntityUtil.getTestEntityUrn().toString(), "VALUE_1"); - Mockito.when(_entitySearchService.aggregateByValue(eq("testEntity"), eq("testUrn"), eq(null), anyInt())) + Mockito.when(_entitySearchService.aggregateByValue(eq(null), eq("testUrn"), eq(null), anyInt())) .thenReturn(ImmutableMap.of(testUrn1.toString(), 1L)); List candidates = _urnBasedCandidateSource.getRecommendations(USER, CONTEXT); assertEquals(candidates.size(), 1); @@ -222,8 +171,8 @@ public void testWhenSearchServiceReturnsUrnResults() { assertEquals(params.getContentParams().getCount().longValue(), 1L); assertTrue(_urnBasedCandidateSource.getRecommendationModule(USER, CONTEXT).isPresent()); - // One entity type, multiple result - Mockito.when(_entitySearchService.aggregateByValue(eq("testEntity"), eq("testUrn"), eq(null), 
anyInt())) + // Multiple result + Mockito.when(_entitySearchService.aggregateByValue(eq(null), eq("testUrn"), eq(null), anyInt())) .thenReturn(ImmutableMap.of(testUrn1.toString(), 1L, testUrn2.toString(), 2L, testUrn3.toString(), 3L)); candidates = _urnBasedCandidateSource.getRecommendations(USER, CONTEXT); assertEquals(candidates.size(), 2); @@ -252,40 +201,5 @@ public void testWhenSearchServiceReturnsUrnResults() { assertNotNull(params.getContentParams()); assertEquals(params.getContentParams().getCount().longValue(), 2L); assertTrue(_urnBasedCandidateSource.getRecommendationModule(USER, CONTEXT).isPresent()); - - // Multiple entity type, multiple result - Mockito.when(_nonEmptyEntitiesCache.getNonEmptyEntities()) - .thenReturn(ImmutableList.of("testEntity", "testEntity2")); - Mockito.when(_entitySearchService.aggregateByValue(eq("testEntity"), eq("testUrn"), eq(null), anyInt())) - .thenReturn(ImmutableMap.of(testUrn1.toString(), 1L, testUrn3.toString(), 3L)); - Mockito.when(_entitySearchService.aggregateByValue(eq("testEntity2"), eq("testUrn"), eq(null), anyInt())) - .thenReturn(ImmutableMap.of(testUrn1.toString(), 3L, testUrn2.toString(), 2L)); - candidates = _urnBasedCandidateSource.getRecommendations(USER, CONTEXT); - assertEquals(candidates.size(), 2); - content = candidates.get(0); - assertEquals(content.getValue(), testUrn1.toString()); - assertEquals(content.getEntity(), testUrn1); - params = content.getParams(); - assertNotNull(params); - assertNotNull(params.getSearchParams()); - assertTrue(StringUtils.isEmpty(params.getSearchParams().getQuery())); - assertEquals(params.getSearchParams().getFilters().size(), 1); - assertEquals(params.getSearchParams().getFilters().get(0), - new Criterion().setField("testUrn").setValue(testUrn1.toString())); - assertNotNull(params.getContentParams()); - assertEquals(params.getContentParams().getCount().longValue(), 4L); - content = candidates.get(1); - assertEquals(content.getValue(), testUrn3.toString()); - 
assertEquals(content.getEntity(), testUrn3); - params = content.getParams(); - assertNotNull(params); - assertNotNull(params.getSearchParams()); - assertTrue(StringUtils.isEmpty(params.getSearchParams().getQuery())); - assertEquals(params.getSearchParams().getFilters().size(), 1); - assertEquals(params.getSearchParams().getFilters().get(0), - new Criterion().setField("testUrn").setValue(testUrn3.toString())); - assertNotNull(params.getContentParams()); - assertEquals(params.getContentParams().getCount().longValue(), 3L); - assertTrue(_urnBasedCandidateSource.getRecommendationModule(USER, CONTEXT).isPresent()); } } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTest.java index f99d1a6ade0a0..8d7a18313793a 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTest.java @@ -9,7 +9,8 @@ import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.models.registry.SnapshotEntityRegistry; import com.linkedin.metadata.search.elasticsearch.ElasticSearchService; -import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilders; +import com.linkedin.metadata.search.elasticsearch.ElasticSearchServiceTest; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.EntityIndexBuilders; import com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder; import com.linkedin.metadata.search.elasticsearch.query.ESBrowseDAO; import com.linkedin.metadata.search.elasticsearch.query.ESSearchDAO; @@ -32,6 +33,7 @@ import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; +import static com.linkedin.metadata.DockerTestUtils.checkContainerEngine; import static com.linkedin.metadata.ElasticSearchTestUtils.syncAfterWrite; import static org.testng.Assert.assertEquals; @@ -57,6 +59,7 @@ public 
void setup() { _indexConvention = new IndexConventionImpl(null); _elasticsearchContainer = new ElasticsearchContainer(IMAGE_NAME); _settingsBuilder = new SettingsBuilder(Collections.emptyList()); + checkContainerEngine(_elasticsearchContainer.getDockerClient()); _elasticsearchContainer.start(); _searchClient = buildRestClient(); _elasticSearchService = buildEntitySearchService(); @@ -86,11 +89,13 @@ private RestHighLevelClient buildRestClient() { @Nonnull private ElasticSearchService buildEntitySearchService() { - ESIndexBuilders indexBuilders = - new ESIndexBuilders(_entityRegistry, _searchClient, _indexConvention, _settingsBuilder); + EntityIndexBuilders indexBuilders = + new EntityIndexBuilders(ElasticSearchServiceTest.getIndexBuilder(_searchClient), _entityRegistry, + _indexConvention, _settingsBuilder); ESSearchDAO searchDAO = new ESSearchDAO(_entityRegistry, _searchClient, _indexConvention); ESBrowseDAO browseDAO = new ESBrowseDAO(_entityRegistry, _searchClient, _indexConvention); - ESWriteDAO writeDAO = new ESWriteDAO(_entityRegistry, _searchClient, _indexConvention, 1, 1, 1, 1); + ESWriteDAO writeDAO = new ESWriteDAO(_entityRegistry, _searchClient, _indexConvention, + ElasticSearchServiceTest.getBulkProcessor(_searchClient)); return new ElasticSearchService(indexBuilders, searchDAO, browseDAO, writeDAO); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchServiceTest.java index a6ce63fb5c401..f5813fd10cf02 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchServiceTest.java @@ -9,10 +9,12 @@ import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.models.registry.SnapshotEntityRegistry; import com.linkedin.metadata.search.SearchResult; 
-import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilders; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.EntityIndexBuilders; import com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder; import com.linkedin.metadata.search.elasticsearch.query.ESBrowseDAO; import com.linkedin.metadata.search.elasticsearch.query.ESSearchDAO; +import com.linkedin.metadata.search.elasticsearch.update.BulkListener; import com.linkedin.metadata.search.elasticsearch.update.ESWriteDAO; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; @@ -20,9 +22,13 @@ import javax.annotation.Nonnull; import org.apache.http.HttpHost; import org.apache.http.impl.nio.reactor.IOReactorConfig; +import org.elasticsearch.action.bulk.BackoffPolicy; +import org.elasticsearch.action.bulk.BulkProcessor; +import org.elasticsearch.client.RequestOptions; import org.elasticsearch.client.RestClient; import org.elasticsearch.client.RestClientBuilder; import org.elasticsearch.client.RestHighLevelClient; +import org.elasticsearch.common.unit.TimeValue; import org.testcontainers.elasticsearch.ElasticsearchContainer; import org.testcontainers.shaded.com.google.common.collect.ImmutableMap; import org.testng.annotations.AfterTest; @@ -30,6 +36,7 @@ import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; +import static com.linkedin.metadata.DockerTestUtils.checkContainerEngine; import static com.linkedin.metadata.ElasticSearchTestUtils.syncAfterWrite; import static org.testng.Assert.assertEquals; @@ -53,6 +60,7 @@ public void setup() { _indexConvention = new IndexConventionImpl(null); _elasticsearchContainer = new ElasticsearchContainer(IMAGE_NAME); _settingsBuilder = new SettingsBuilder(Collections.emptyList()); + checkContainerEngine(_elasticsearchContainer.getDockerClient()); 
_elasticsearchContainer.start(); _searchClient = buildRestClient(); _elasticSearchService = buildService(); @@ -78,13 +86,28 @@ private RestHighLevelClient buildRestClient() { return new RestHighLevelClient(builder); } + public static BulkProcessor getBulkProcessor(RestHighLevelClient searchClient) { + return BulkProcessor.builder((request, bulkListener) -> { + searchClient.bulkAsync(request, RequestOptions.DEFAULT, bulkListener); + }, BulkListener.getInstance()) + .setBulkActions(1) + .setFlushInterval(TimeValue.timeValueSeconds(1)) + .setBackoffPolicy(BackoffPolicy.constantBackoff(TimeValue.timeValueSeconds(1000), 1)) + .build(); + } + + public static ESIndexBuilder getIndexBuilder(RestHighLevelClient searchClient) { + return new ESIndexBuilder(searchClient, 1, 1); + } + @Nonnull private ElasticSearchService buildService() { - ESIndexBuilders indexBuilders = - new ESIndexBuilders(_entityRegistry, _searchClient, _indexConvention, _settingsBuilder); + EntityIndexBuilders indexBuilders = + new EntityIndexBuilders(getIndexBuilder(_searchClient), _entityRegistry, _indexConvention, _settingsBuilder); ESSearchDAO searchDAO = new ESSearchDAO(_entityRegistry, _searchClient, _indexConvention); ESBrowseDAO browseDAO = new ESBrowseDAO(_entityRegistry, _searchClient, _indexConvention); - ESWriteDAO writeDAO = new ESWriteDAO(_entityRegistry, _searchClient, _indexConvention, 1, 1, 1, 1); + ESWriteDAO writeDAO = + new ESWriteDAO(_entityRegistry, _searchClient, _indexConvention, getBulkProcessor(_searchClient)); return new ElasticSearchService(indexBuilders, searchDAO, browseDAO, writeDAO); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java index c9cbc83ebdbea..45c5b9183588f 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java +++ 
b/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java @@ -22,9 +22,10 @@ public class SearchDocumentTransformerTest { @Test public void testTransform() throws IOException { + SearchDocumentTransformer searchDocumentTransformer = new SearchDocumentTransformer(1000); TestEntitySnapshot snapshot = TestEntityUtil.getSnapshot(); EntitySpec testEntitySpec = TestEntitySpecBuilder.getSpec(); - Optional result = SearchDocumentTransformer.transformSnapshot(snapshot, testEntitySpec, false); + Optional result = searchDocumentTransformer.transformSnapshot(snapshot, testEntitySpec, false); assertTrue(result.isPresent()); ObjectNode parsedJson = (ObjectNode) OBJECT_MAPPER.readTree(result.get()); assertEquals(parsedJson.get("urn").asText(), snapshot.getUrn().toString()); @@ -51,9 +52,10 @@ public void testTransform() throws IOException { @Test public void testTransformForDelete() throws IOException { + SearchDocumentTransformer searchDocumentTransformer = new SearchDocumentTransformer(1000); TestEntitySnapshot snapshot = TestEntityUtil.getSnapshot(); EntitySpec testEntitySpec = TestEntitySpecBuilder.getSpec(); - Optional result = SearchDocumentTransformer.transformSnapshot(snapshot, testEntitySpec, true); + Optional result = searchDocumentTransformer.transformSnapshot(snapshot, testEntitySpec, true); assertTrue(result.isPresent()); ObjectNode parsedJson = (ObjectNode) OBJECT_MAPPER.readTree(result.get()); assertEquals(parsedJson.get("urn").asText(), snapshot.getUrn().toString()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataServiceTest.java index 9b0154d4d7d56..ffe9b366dd170 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataServiceTest.java +++ 
b/metadata-io/src/test/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataServiceTest.java @@ -2,6 +2,7 @@ import com.linkedin.metadata.run.AspectRowSummary; import com.linkedin.metadata.run.IngestionRunSummary; +import com.linkedin.metadata.search.elasticsearch.ElasticSearchServiceTest; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; import com.linkedin.mxe.SystemMetadata; @@ -18,9 +19,11 @@ import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; +import static com.linkedin.metadata.DockerTestUtils.checkContainerEngine; import static com.linkedin.metadata.ElasticSearchTestUtils.syncAfterWrite; import static com.linkedin.metadata.systemmetadata.ElasticSearchSystemMetadataService.INDEX_NAME; -import static org.testng.Assert.*; +import static org.testng.Assert.assertEquals; + public class ElasticSearchSystemMetadataServiceTest { @@ -36,6 +39,7 @@ public class ElasticSearchSystemMetadataServiceTest { @BeforeTest public void setup() { _elasticsearchContainer = new ElasticsearchContainer(IMAGE_NAME); + checkContainerEngine(_elasticsearchContainer.getDockerClient()); _elasticsearchContainer.start(); _searchClient = buildRestClient(); _client = buildService(); @@ -63,8 +67,10 @@ private RestHighLevelClient buildRestClient() { @Nonnull private ElasticSearchSystemMetadataService buildService() { - ESSystemMetadataDAO dao = new ESSystemMetadataDAO(_searchClient, _indexConvention, 1, 1, 1, 1); - return new ElasticSearchSystemMetadataService(_searchClient, _indexConvention, dao); + ESSystemMetadataDAO dao = new ESSystemMetadataDAO(_searchClient, _indexConvention, + ElasticSearchServiceTest.getBulkProcessor(_searchClient)); + return new ElasticSearchSystemMetadataService(_searchClient, _indexConvention, dao, + ElasticSearchServiceTest.getIndexBuilder(_searchClient)); } @AfterTest diff --git 
a/metadata-io/src/test/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectServiceTest.java index 445bcaaa66935..77d81c4f84a76 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectServiceTest.java @@ -20,6 +20,7 @@ import com.linkedin.metadata.query.filter.Condition; import com.linkedin.metadata.query.filter.Criterion; import com.linkedin.metadata.query.filter.Filter; +import com.linkedin.metadata.search.elasticsearch.ElasticSearchServiceTest; import com.linkedin.metadata.search.utils.QueryUtils; import com.linkedin.metadata.timeseries.elastic.indexbuilder.TimeseriesAspectIndexBuilders; import com.linkedin.metadata.timeseries.transformer.TimeseriesAspectTransformer; @@ -50,8 +51,10 @@ import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; -import static com.linkedin.metadata.ElasticSearchTestUtils.*; -import static org.testng.Assert.*; +import static com.linkedin.metadata.DockerTestUtils.checkContainerEngine; +import static com.linkedin.metadata.ElasticSearchTestUtils.syncAfterWrite; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotNull; public class ElasticSearchTimeseriesAspectServiceTest { @@ -89,6 +92,7 @@ public void setup() { TestEntityProfile.class.getClassLoader().getResourceAsStream("test-entity-registry.yml")); _indexConvention = new IndexConventionImpl(null); _elasticsearchContainer = new ElasticsearchContainer(IMAGE_NAME); + checkContainerEngine(_elasticsearchContainer.getDockerClient()); _elasticsearchContainer.start(); _searchClient = buildRestClient(); _elasticSearchTimeseriesAspectService = buildService(); @@ -112,8 +116,8 @@ private RestHighLevelClient buildRestClient() { 
@Nonnull private ElasticSearchTimeseriesAspectService buildService() { return new ElasticSearchTimeseriesAspectService(_searchClient, _indexConvention, - new TimeseriesAspectIndexBuilders(_entityRegistry, _searchClient, _indexConvention), _entityRegistry, 1, 1, 3, - 1); + new TimeseriesAspectIndexBuilders(ElasticSearchServiceTest.getIndexBuilder(_searchClient), _entityRegistry, + _indexConvention), _entityRegistry, ElasticSearchServiceTest.getBulkProcessor(_searchClient)); } @AfterTest @@ -263,7 +267,8 @@ public void testGetAggregatedStatsLatestStatForDay1() { .setCondition(Condition.LESS_THAN_OR_EQUAL_TO) .setValue(String.valueOf(_startTime + 23 * TIME_INCREMENT)); - Filter filter = QueryUtils.getFilterFromCriteria(ImmutableList.of(hasUrnCriterion, startTimeCriterion, endTimeCriterion)); + Filter filter = + QueryUtils.getFilterFromCriteria(ImmutableList.of(hasUrnCriterion, startTimeCriterion, endTimeCriterion)); // Aggregate on latest stat value AggregationSpec latestStatAggregationSpec = @@ -299,7 +304,8 @@ public void testGetAggregatedStatsLatestStrArrayDay1() { .setCondition(Condition.LESS_THAN_OR_EQUAL_TO) .setValue(String.valueOf(_startTime + 23 * TIME_INCREMENT)); - Filter filter = QueryUtils.getFilterFromCriteria(ImmutableList.of(hasUrnCriterion, startTimeCriterion, endTimeCriterion)); + Filter filter = + QueryUtils.getFilterFromCriteria(ImmutableList.of(hasUrnCriterion, startTimeCriterion, endTimeCriterion)); // Aggregate on latest stat value AggregationSpec latestStatAggregationSpec = @@ -343,7 +349,8 @@ public void testGetAggregatedStatsLatestStatForTwoDays() { .setCondition(Condition.LESS_THAN_OR_EQUAL_TO) .setValue(String.valueOf(_startTime + 47 * TIME_INCREMENT)); - Filter filter = QueryUtils.getFilterFromCriteria(ImmutableList.of(hasUrnCriterion, startTimeCriterion, endTimeCriterion)); + Filter filter = + QueryUtils.getFilterFromCriteria(ImmutableList.of(hasUrnCriterion, startTimeCriterion, endTimeCriterion)); // Aggregate on latest stat value 
AggregationSpec latestStatAggregationSpec = @@ -382,7 +389,8 @@ public void testGetAggregatedStatsLatestStatForFirst10HoursOfDay1() { .setCondition(Condition.LESS_THAN_OR_EQUAL_TO) .setValue(String.valueOf(_startTime + 9 * TIME_INCREMENT)); - Filter filter = QueryUtils.getFilterFromCriteria(ImmutableList.of(hasUrnCriterion, startTimeCriterion, endTimeCriterion)); + Filter filter = + QueryUtils.getFilterFromCriteria(ImmutableList.of(hasUrnCriterion, startTimeCriterion, endTimeCriterion)); // Aggregate on latest stat value AggregationSpec latestStatAggregationSpec = @@ -420,7 +428,8 @@ public void testGetAggregatedStatsLatestStatForCol1Day1() { Criterion hasCol1 = new Criterion().setField("componentProfiles.key").setCondition(Condition.EQUAL).setValue("col1"); - Filter filter = QueryUtils.getFilterFromCriteria(ImmutableList.of(hasUrnCriterion, hasCol1, startTimeCriterion, endTimeCriterion)); + Filter filter = QueryUtils.getFilterFromCriteria( + ImmutableList.of(hasUrnCriterion, hasCol1, startTimeCriterion, endTimeCriterion)); // Aggregate on latest stat value AggregationSpec latestStatAggregationSpec = @@ -461,7 +470,8 @@ public void testGetAggregatedStatsLatestStatForAllColumnsDay1() { .setCondition(Condition.LESS_THAN_OR_EQUAL_TO) .setValue(String.valueOf(lastEntryTimeStamp)); - Filter filter = QueryUtils.getFilterFromCriteria(ImmutableList.of(hasUrnCriterion, startTimeCriterion, endTimeCriterion)); + Filter filter = + QueryUtils.getFilterFromCriteria(ImmutableList.of(hasUrnCriterion, startTimeCriterion, endTimeCriterion)); // Aggregate on latest stat value AggregationSpec latestStatAggregationSpec = @@ -506,7 +516,8 @@ public void testGetAggregatedStatsSumStatForFirst10HoursOfDay1() { .setCondition(Condition.LESS_THAN_OR_EQUAL_TO) .setValue(String.valueOf(_startTime + 9 * TIME_INCREMENT)); - Filter filter = QueryUtils.getFilterFromCriteria(ImmutableList.of(hasUrnCriterion, startTimeCriterion, endTimeCriterion)); + Filter filter = + 
QueryUtils.getFilterFromCriteria(ImmutableList.of(hasUrnCriterion, startTimeCriterion, endTimeCriterion)); // Aggregate the sum of stat value AggregationSpec sumAggregationSpec = @@ -546,7 +557,8 @@ public void testGetAggregatedStatsSumStatForCol2Day1() { Criterion hasCol2 = new Criterion().setField("componentProfiles.key").setCondition(Condition.EQUAL).setValue("col2"); - Filter filter = QueryUtils.getFilterFromCriteria(ImmutableList.of(hasUrnCriterion, hasCol2, startTimeCriterion, endTimeCriterion)); + Filter filter = QueryUtils.getFilterFromCriteria( + ImmutableList.of(hasUrnCriterion, hasCol2, startTimeCriterion, endTimeCriterion)); // Aggregate the sum of stat value AggregationSpec sumStatAggregationSpec = @@ -589,7 +601,8 @@ public void testGetAggregatedStatsCardinalityAggStrStatDay1() { .setCondition(Condition.LESS_THAN_OR_EQUAL_TO) .setValue(String.valueOf(_startTime + 23 * TIME_INCREMENT)); - Filter filter = QueryUtils.getFilterFromCriteria(ImmutableList.of(hasUrnCriterion, startTimeCriterion, endTimeCriterion)); + Filter filter = + QueryUtils.getFilterFromCriteria(ImmutableList.of(hasUrnCriterion, startTimeCriterion, endTimeCriterion)); // Aggregate on latest stat value AggregationSpec cardinalityStatAggregationSpec = @@ -624,7 +637,8 @@ public void testGetAggregatedStatsSumStatsCollectionDay1() { .setCondition(Condition.LESS_THAN_OR_EQUAL_TO) .setValue(String.valueOf(_startTime + 23 * TIME_INCREMENT)); - Filter filter = QueryUtils.getFilterFromCriteria(ImmutableList.of(hasUrnCriterion, startTimeCriterion, endTimeCriterion)); + Filter filter = + QueryUtils.getFilterFromCriteria(ImmutableList.of(hasUrnCriterion, startTimeCriterion, endTimeCriterion)); // Aggregate on latest stat value AggregationSpec cardinalityStatAggregationSpec = diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataAuditEventsProcessor.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataAuditEventsProcessor.java index 
9e1dcdad8c60e..26b4e37067643 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataAuditEventsProcessor.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataAuditEventsProcessor.java @@ -8,6 +8,7 @@ import com.linkedin.gms.factory.common.GraphServiceFactory; import com.linkedin.gms.factory.common.SystemMetadataServiceFactory; import com.linkedin.gms.factory.search.EntitySearchServiceFactory; +import com.linkedin.gms.factory.search.SearchDocumentTransformerFactory; import com.linkedin.metadata.EventUtils; import com.linkedin.metadata.dao.utils.RecordUtils; import com.linkedin.metadata.extractor.AspectExtractor; @@ -57,22 +58,25 @@ @Slf4j @Component @Conditional(MetadataChangeLogProcessorCondition.class) -@Import({GraphServiceFactory.class, EntitySearchServiceFactory.class, SystemMetadataServiceFactory.class}) +@Import({GraphServiceFactory.class, EntitySearchServiceFactory.class, SystemMetadataServiceFactory.class, + SearchDocumentTransformerFactory.class}) @EnableKafka public class MetadataAuditEventsProcessor { private final GraphService _graphService; private final EntitySearchService _entitySearchService; private final SystemMetadataService _systemMetadataService; + private final SearchDocumentTransformer _searchDocumentTransformer; private final Histogram kafkaLagStats = MetricUtils.get().histogram(MetricRegistry.name(this.getClass(), "kafkaLag")); @Autowired public MetadataAuditEventsProcessor(GraphService graphService, EntitySearchService entitySearchService, - SystemMetadataService systemMetadataService) { + SystemMetadataService systemMetadataService, SearchDocumentTransformer searchDocumentTransformer) { _graphService = graphService; _entitySearchService = entitySearchService; _systemMetadataService = systemMetadataService; + _searchDocumentTransformer = searchDocumentTransformer; _graphService.configure(); _entitySearchService.configure(); @@ -176,7 +180,7 @@ private void 
updateSearchService(final RecordTemplate snapshot, final EntitySpec Optional searchDocument; try { - searchDocument = SearchDocumentTransformer.transformSnapshot(snapshot, entitySpec, false); + searchDocument = _searchDocumentTransformer.transformSnapshot(snapshot, entitySpec, false); } catch (Exception e) { log.error("Error in getting documents from snapshot: {} for snapshot {}", e, snapshot); return; diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeLogProcessor.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeLogProcessor.java index bff5bdcfdf838..bf5de456008de 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeLogProcessor.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeLogProcessor.java @@ -11,6 +11,7 @@ import com.linkedin.gms.factory.common.SystemMetadataServiceFactory; import com.linkedin.gms.factory.entityregistry.EntityRegistryFactory; import com.linkedin.gms.factory.search.EntitySearchServiceFactory; +import com.linkedin.gms.factory.search.SearchDocumentTransformerFactory; import com.linkedin.gms.factory.timeseries.TimeseriesAspectServiceFactory; import com.linkedin.metadata.EventUtils; import com.linkedin.metadata.extractor.FieldExtractor; @@ -55,14 +56,15 @@ import org.springframework.kafka.annotation.KafkaListener; import org.springframework.stereotype.Component; -import static com.linkedin.metadata.search.utils.QueryUtils.*; +import static com.linkedin.metadata.search.utils.QueryUtils.createRelationshipFilter; +import static com.linkedin.metadata.search.utils.QueryUtils.newRelationshipFilter; @Slf4j @Component @Conditional(MetadataChangeLogProcessorCondition.class) @Import({GraphServiceFactory.class, EntitySearchServiceFactory.class, TimeseriesAspectServiceFactory.class, - EntityRegistryFactory.class, SystemMetadataServiceFactory.class}) + EntityRegistryFactory.class, 
SystemMetadataServiceFactory.class, SearchDocumentTransformerFactory.class}) @EnableKafka public class MetadataChangeLogProcessor { @@ -71,17 +73,20 @@ public class MetadataChangeLogProcessor { private final TimeseriesAspectService _timeseriesAspectService; private final SystemMetadataService _systemMetadataService; private final EntityRegistry _entityRegistry; + private final SearchDocumentTransformer _searchDocumentTransformer; private final Histogram kafkaLagStats = MetricUtils.get().histogram(MetricRegistry.name(this.getClass(), "kafkaLag")); @Autowired public MetadataChangeLogProcessor(GraphService graphService, EntitySearchService entitySearchService, - TimeseriesAspectService timeseriesAspectService, SystemMetadataService systemMetadataService, EntityRegistry entityRegistry) { + TimeseriesAspectService timeseriesAspectService, SystemMetadataService systemMetadataService, + EntityRegistry entityRegistry, SearchDocumentTransformer searchDocumentTransformer) { _graphService = graphService; _entitySearchService = entitySearchService; _timeseriesAspectService = timeseriesAspectService; _systemMetadataService = systemMetadataService; _entityRegistry = entityRegistry; + _searchDocumentTransformer = searchDocumentTransformer; _timeseriesAspectService.configure(); } @@ -162,7 +167,8 @@ public void consume(final ConsumerRecord consumerRecord) } } - private Pair, Set> getEdgesAndRelationshipTypesFromAspect(Urn urn, AspectSpec aspectSpec, RecordTemplate aspect) { + private Pair, Set> getEdgesAndRelationshipTypesFromAspect(Urn urn, AspectSpec aspectSpec, + RecordTemplate aspect) { final Set relationshipTypesBeingAdded = new HashSet<>(); final List edgesToAdd = new ArrayList<>(); @@ -209,7 +215,7 @@ private void updateGraphService(Urn urn, AspectSpec aspectSpec, RecordTemplate a private void updateSearchService(String entityName, Urn urn, AspectSpec aspectSpec, RecordTemplate aspect) { Optional searchDocument; try { - searchDocument = 
SearchDocumentTransformer.transformAspect(urn, aspect, aspectSpec, false); + searchDocument = _searchDocumentTransformer.transformAspect(urn, aspect, aspectSpec, false); } catch (Exception e) { log.error("Error in getting documents from aspect: {} for aspect {}", e, aspectSpec.getName()); return; @@ -270,11 +276,13 @@ private void deleteGraphData(Urn urn, AspectSpec aspectSpec, RecordTemplate aspe final Set relationshipTypesBeingAdded = edgeAndRelationTypes.getSecond(); if (relationshipTypesBeingAdded.size() > 0) { _graphService.removeEdgesFromNode(urn, new ArrayList<>(relationshipTypesBeingAdded), - createRelationshipFilter(new Filter().setOr(new ConjunctiveCriterionArray()), RelationshipDirection.OUTGOING)); + createRelationshipFilter(new Filter().setOr(new ConjunctiveCriterionArray()), + RelationshipDirection.OUTGOING)); } } - private void deleteSearchData(Urn urn, String entityName, AspectSpec aspectSpec, RecordTemplate aspect, Boolean isKeyAspect) { + private void deleteSearchData(Urn urn, String entityName, AspectSpec aspectSpec, RecordTemplate aspect, + Boolean isKeyAspect) { String docId; try { docId = URLEncoder.encode(urn.toString(), "UTF-8"); @@ -290,7 +298,7 @@ private void deleteSearchData(Urn urn, String entityName, AspectSpec aspectSpec, Optional searchDocument; try { - searchDocument = SearchDocumentTransformer.transformAspect(urn, aspect, aspectSpec, true); + searchDocument = _searchDocumentTransformer.transformAspect(urn, aspect, aspectSpec, true); } catch (Exception e) { log.error("Error in getting documents from aspect: {} for aspect {}", e, aspectSpec.getName()); return; diff --git a/metadata-models/src/main/pegasus/com/linkedin/common/GlossaryTermAssociation.pdl b/metadata-models/src/main/pegasus/com/linkedin/common/GlossaryTermAssociation.pdl index bcd859f8b5898..e10507b52534e 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/common/GlossaryTermAssociation.pdl +++ 
b/metadata-models/src/main/pegasus/com/linkedin/common/GlossaryTermAssociation.pdl @@ -9,7 +9,7 @@ record GlossaryTermAssociation { */ @Searchable = { "fieldName": "glossaryTerms", - "fieldType": "URN_PARTIAL", + "fieldType": "URN", "addToFilters": true, "filterNameOverride": "Glossary Term" } diff --git a/metadata-models/src/main/pegasus/com/linkedin/common/TagAssociation.pdl b/metadata-models/src/main/pegasus/com/linkedin/common/TagAssociation.pdl index 165879eba3998..b04eb0811ec98 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/common/TagAssociation.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/common/TagAssociation.pdl @@ -10,7 +10,7 @@ record TagAssociation { */ @Searchable = { "fieldName": "tags", - "fieldType": "URN_PARTIAL", + "fieldType": "URN", "hasValuesFieldName": "hasTags", "addToFilters": true, "filterNameOverride": "Tag" diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/BaseEntity.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/BaseEntity.pdl deleted file mode 100644 index 50f6eac371459..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/BaseEntity.pdl +++ /dev/null @@ -1,12 +0,0 @@ -namespace com.linkedin.metadata.entity - -/** - * Common fields that apply to all entities - */ -record BaseEntity { - - /** - * Whether the entity has been removed or not - */ - removed: optional boolean = false -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/ChartEntity.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/ChartEntity.pdl deleted file mode 100644 index 977097ed564ac..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/ChartEntity.pdl +++ /dev/null @@ -1,24 +0,0 @@ -namespace com.linkedin.metadata.entity - -import com.linkedin.common.ChartUrn - -/** - * Data model for a Chart entity - */ -record ChartEntity includes BaseEntity { - - /** - * Urn for the chart 
- */ - urn: ChartUrn - - /** - * Dashboard tool - */ - dashboardTool: optional string - - /** - * Chart Id - */ - chartId: optional string -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/CorpGroupEntity.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/CorpGroupEntity.pdl deleted file mode 100644 index 7833b210850f4..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/CorpGroupEntity.pdl +++ /dev/null @@ -1,19 +0,0 @@ -namespace com.linkedin.metadata.entity - -import com.linkedin.common.CorpGroupUrn - -/** - * Data model for a CorpGroup entity(go/groupId) - */ -record CorpGroupEntity includes BaseEntity { - - /** - * Urn for the LDAP Group - */ - urn: CorpGroupUrn - - /** - * name of the group, e.g. wherehows-dev, ask_metadata - */ - name: optional string -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/CorpUserEntity.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/CorpUserEntity.pdl deleted file mode 100644 index 8a01900b29525..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/CorpUserEntity.pdl +++ /dev/null @@ -1,19 +0,0 @@ -namespace com.linkedin.metadata.entity - -import com.linkedin.common.CorpuserUrn - -/** - * Data model for a CorpUser entity - */ -record CorpUserEntity includes BaseEntity { - - /** - * Urn for the LDAP User - */ - urn: CorpuserUrn - - /** - * LDAP name(id) : e.g. hzhang2, ywang5 .. 
- */ - name: optional string -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/DashboardEntity.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/DashboardEntity.pdl deleted file mode 100644 index 655cf0341fbec..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/DashboardEntity.pdl +++ /dev/null @@ -1,24 +0,0 @@ -namespace com.linkedin.metadata.entity - -import com.linkedin.common.DashboardUrn - -/** - * Data model for a Dashboard entity - */ -record DashboardEntity includes BaseEntity { - - /** - * Urn for the dashboard - */ - urn: DashboardUrn - - /** - * Dashboard tool - */ - dashboardTool: optional string - - /** - * Dashboard Id - */ - dashboardId: optional string -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/DataFlowEntity.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/DataFlowEntity.pdl deleted file mode 100644 index 1c9e777b52307..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/DataFlowEntity.pdl +++ /dev/null @@ -1,30 +0,0 @@ -namespace com.linkedin.metadata.entity - -import com.linkedin.common.DataFlowUrn - -/** - * Data model for a DataFlow entity - */ -record DataFlowEntity includes BaseEntity { - - /** - * Urn for the DataFlow - */ - urn: DataFlowUrn - - /** - * Workflow orchestrator ex: Azkaban, Airflow - */ - orchestrator: optional string - - /** - * Id of the flow - */ - flowId: optional string - - /** - * Cluster of the flow - */ - cluster: optional string - -} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/DataJobEntity.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/DataJobEntity.pdl deleted file mode 100644 index 91dfdeacecbf7..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/DataJobEntity.pdl +++ /dev/null @@ -1,27 +0,0 @@ -namespace 
com.linkedin.metadata.entity - -import com.linkedin.common.DataJobUrn -import com.linkedin.common.DataFlowUrn - - -/** - * Data model for a DataJob entity - */ -record DataJobEntity includes BaseEntity { - - /** - * Urn for the DataJob - */ - urn: DataJobUrn - - /** - * Urn of the associated DataFlow - */ - flow: optional DataFlowUrn - - /** - * Id of the job - */ - jobId: optional string - -} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/DataProcessEntity.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/DataProcessEntity.pdl deleted file mode 100644 index 346e2665a3874..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/DataProcessEntity.pdl +++ /dev/null @@ -1,31 +0,0 @@ -namespace com.linkedin.metadata.entity - -import com.linkedin.common.DataPlatformUrn -import com.linkedin.common.FabricType -import com.linkedin.common.DataProcessUrn - -/** - * Data model for a Data Process entity - */ -record DataProcessEntity { - - /** - * Urn for the Data Process - */ - urn: DataProcessUrn - - /** - * Data Process name(id) - */ - name: optional string - - /** - * Process Orchestrator for this process in the form. Options can be Airflow, Azkaban, Azure Data Factory - */ - orchestrator: optional string - - /** - * Fabric type where dataset belongs to or where it was generated. 
- */ - origin: optional FabricType -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/DatasetEntity.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/DatasetEntity.pdl deleted file mode 100644 index 23c1ab0aa3514..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/DatasetEntity.pdl +++ /dev/null @@ -1,31 +0,0 @@ -namespace com.linkedin.metadata.entity - -import com.linkedin.common.DataPlatformUrn -import com.linkedin.common.DatasetUrn -import com.linkedin.common.FabricType - -/** - * Data model for a dataset entity - */ -record DatasetEntity includes BaseEntity { - - /** - * Urn for the dataset - */ - urn: DatasetUrn - - /** - * Dataset native name e.g. {db}.{table}, /dir/subdir/{name}, or {name} - */ - name: optional string - - /** - * Platform urn for the dataset in the form of urn:li:platform:{platform_name} - */ - platform: optional DataPlatformUrn - - /** - * Fabric type where dataset belongs to or where it was generated. - */ - origin: optional FabricType -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/Entity.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/Entity.pdl deleted file mode 100644 index 6ba41ac73940c..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/Entity.pdl +++ /dev/null @@ -1,6 +0,0 @@ -namespace com.linkedin.metadata.entity - -/** - * A union of all supported entity types. 
- */ -typeref Entity = union[CorpUserEntity, DatasetEntity, DataProcessEntity, MLModelEntity, DataFlowEntity, DataJobEntity] \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/GlossaryNodeEntity.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/GlossaryNodeEntity.pdl deleted file mode 100644 index e0ea997bf75ab..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/GlossaryNodeEntity.pdl +++ /dev/null @@ -1,19 +0,0 @@ -namespace com.linkedin.metadata.entity - -import com.linkedin.common.GlossaryNodeUrn - -/** - * Data model for a GlossaryNode entity - */ -record GlossaryNodeEntity includes BaseEntity { - - /** - * Urn for the GlossaryNode - */ - urn: GlossaryNodeUrn - - /** - * Business node name - */ - name: optional string -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/GlossaryTermEntity.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/GlossaryTermEntity.pdl deleted file mode 100644 index 4e3dd82a5f8a9..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/GlossaryTermEntity.pdl +++ /dev/null @@ -1,19 +0,0 @@ -namespace com.linkedin.metadata.entity - -import com.linkedin.common.GlossaryTermUrn - -/** - * Data model for a GlossaryTerm entity - */ -record GlossaryTermEntity includes BaseEntity { - - /** - * Urn for the GlossaryTerm - */ - urn: GlossaryTermUrn - - /** - * Business term name - */ - name: optional string -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/MLModelEntity.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/MLModelEntity.pdl deleted file mode 100644 index 5b5e25814567d..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/MLModelEntity.pdl +++ /dev/null @@ -1,31 +0,0 @@ -namespace com.linkedin.metadata.entity - -import 
com.linkedin.common.DataPlatformUrn -import com.linkedin.common.FabricType -import com.linkedin.common.MLModelUrn - -/** - * Data model for a ML Model entity - */ -record MLModelEntity includes BaseEntity { - - /** - * Urn for the ML Model - */ - urn: MLModelUrn - - /** - * ML Model native name - */ - name: optional string - - /** - * Platform urn for the ML Model in the form of urn:li:platform:{platform_name} - */ - platform: optional DataPlatformUrn - - /** - * Fabric type where ML Model belongs to or where it was generated. - */ - origin: optional FabricType -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/TagEntity.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/TagEntity.pdl deleted file mode 100644 index 154d623dded69..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/entity/TagEntity.pdl +++ /dev/null @@ -1,19 +0,0 @@ -namespace com.linkedin.metadata.entity - -import com.linkedin.common.TagUrn - -/** - * Data model for a tag entity - */ -record TagEntity includes BaseEntity { - - /** - * Urn for the tag - */ - urn: TagUrn - - /** - * Name of the tag - */ - name: optional string -} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/BaseRelationship.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/BaseRelationship.pdl deleted file mode 100644 index 9a787f150176c..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/BaseRelationship.pdl +++ /dev/null @@ -1,19 +0,0 @@ -namespace com.linkedin.metadata.relationship - -import com.linkedin.common.Urn - -/** - * Common fields that apply to all relationships - */ -record BaseRelationship { - - /** - * Urn for the source of the relationship - */ - source: Urn - - /** - * Urn for the destination of the relationship - */ - destination: Urn -} \ No newline at end of file diff --git 
a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/Consumes.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/Consumes.pdl deleted file mode 100644 index 1386f0a42c3d5..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/Consumes.pdl +++ /dev/null @@ -1,11 +0,0 @@ -namespace com.linkedin.metadata.relationship - -/** - * A generic model for the - */ -@pairings = [ { - "destination" : "com.linkedin.common.urn.DatasetUrn", - "source" : "com.linkedin.common.urn.DataJobUrn" -} ] -record Consumes includes BaseRelationship { -} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/Contains.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/Contains.pdl deleted file mode 100644 index 8de23b7ed02d9..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/Contains.pdl +++ /dev/null @@ -1,17 +0,0 @@ -namespace com.linkedin.metadata.relationship - -/** - * A generic model for the Has-A relationship - */ -@pairings = [ { - "destination" : "com.linkedin.common.urn.ChartUrn", - "source" : "com.linkedin.common.urn.DashboardUrn" -},{ - "destination" : "com.linkedin.common.urn.GlossaryTermUrn", - "source" : "com.linkedin.common.urn.GlossaryNodeUrn" -},{ - "destination" : "com.linkedin.common.urn.GlossaryNodeUrn", - "source" : "com.linkedin.common.urn.GlossaryNodeUrn" -} ] -record Contains includes BaseRelationship { -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/DownstreamOf.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/DownstreamOf.pdl deleted file mode 100644 index 0036b3cb1d720..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/DownstreamOf.pdl +++ /dev/null @@ -1,21 +0,0 @@ -namespace com.linkedin.metadata.relationship - -import com.linkedin.dataset.DatasetLineageType - -/** - * A generic 
model for the DownstreamOf relationship - */ -@pairings = [ { - "destination" : "com.linkedin.common.urn.DatasetUrn", - "source" : "com.linkedin.common.urn.DatasetUrn" -}, { - "destination" : "com.linkedin.common.urn.DatasetUrn", - "source" : "com.linkedin.common.urn.ChartUrn" -} ] -record DownstreamOf includes BaseRelationship { - - /** - * The type of the lineage - */ - type: optional DatasetLineageType -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/EvaluatedOn.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/EvaluatedOn.pdl deleted file mode 100644 index a8ffb5755b814..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/EvaluatedOn.pdl +++ /dev/null @@ -1,12 +0,0 @@ -namespace com.linkedin.metadata.relationship - -/** - * A generic model for the Evaluated-On relationship - */ -@pairings = [ { - "destination" : "com.linkedin.common.urn.DatasetUrn", - "source" : "com.linkedin.common.urn.MLModelUrn" -}] -record EvaluatedOn includes BaseRelationship { - -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/IsPartOf.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/IsPartOf.pdl deleted file mode 100644 index b60b06e655a7a..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/IsPartOf.pdl +++ /dev/null @@ -1,12 +0,0 @@ -namespace com.linkedin.metadata.relationship - -/** - * A generic model for the Is-Part-Of relationship - */ -@pairings = [ -{ - "destination" : "com.linkedin.common.urn.DataFlowUrn", - "source" : "com.linkedin.common.urn.DataJobUrn" -} ] -record IsPartOf includes BaseRelationship { -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/OwnedBy.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/OwnedBy.pdl deleted file mode 100644 
index 22d9e1ffbab20..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/OwnedBy.pdl +++ /dev/null @@ -1,36 +0,0 @@ -namespace com.linkedin.metadata.relationship - -import com.linkedin.common.OwnershipType - -/** - * A generic model for the Owned-By relationship - */ -@pairings = [ { - "destination" : "com.linkedin.common.urn.CorpuserUrn", - "source" : "com.linkedin.common.urn.DatasetUrn" -}, { - "destination" : "com.linkedin.common.urn.CorpuserUrn", - "source" : "com.linkedin.common.urn.DataProcessUrn" -}, { - "destination" : "com.linkedin.common.urn.CorpuserUrn", - "source" : "com.linkedin.common.urn.MLModelUrn" - }, { - "destination" : "com.linkedin.common.urn.CorpuserUrn", - "source" : "com.linkedin.common.urn.DataJobUrn" - }, { - "destination" : "com.linkedin.common.urn.CorpuserUrn", - "source" : "com.linkedin.common.urn.DataFlowUrn" -}, { - "destination" : "com.linkedin.common.urn.CorpuserUrn", - "source" : "com.linkedin.common.urn.GlossaryTermUrn" -}, { - "destination" : "com.linkedin.common.urn.CorpuserUrn", - "source" : "com.linkedin.common.urn.GlossaryNodeUrn" - } ] -record OwnedBy includes BaseRelationship { - - /** - * The type of the ownership - */ - type: OwnershipType -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/Produces.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/Produces.pdl deleted file mode 100644 index 7f7001e3e6c43..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/Produces.pdl +++ /dev/null @@ -1,11 +0,0 @@ -namespace com.linkedin.metadata.relationship - -/** - * A generic model for the - */ -@pairings = [ { - "destination" : "com.linkedin.common.urn.DatasetUrn", - "source" : "com.linkedin.common.urn.DataJobUrn" -} ] -record Produces includes BaseRelationship { -} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/Relationship.pdl 
b/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/Relationship.pdl deleted file mode 100644 index 982c3e888e9b9..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/Relationship.pdl +++ /dev/null @@ -1,6 +0,0 @@ -namespace com.linkedin.metadata.relationship - -/** - * A union of all supported relationship types. - */ -typeref Relationship = union[Contains, IsPartOf, OwnedBy, Consumes, Produces] \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/ReportsTo.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/ReportsTo.pdl deleted file mode 100644 index 6c9f3b9df458d..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/ReportsTo.pdl +++ /dev/null @@ -1,11 +0,0 @@ -namespace com.linkedin.metadata.relationship - -/** - * A generic model for the Reports-To relationship - */ -@pairings = [ { - "destination" : "com.linkedin.common.urn.CorpuserUrn", - "source" : "com.linkedin.common.urn.CorpuserUrn" -} ] -record ReportsTo includes BaseRelationship { -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/RunsBefore.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/RunsBefore.pdl deleted file mode 100644 index e00d39e11d603..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/RunsBefore.pdl +++ /dev/null @@ -1,11 +0,0 @@ -namespace com.linkedin.metadata.relationship - -/** - * A generic model for the - */ -@pairings = [ { - "destination" : "com.linkedin.common.urn.DataJobUrn", - "source" : "com.linkedin.common.urn.DataJobUrn" -} ] -record RunsBefore includes BaseRelationship { -} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/TrainedOn.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/TrainedOn.pdl deleted file mode 100644 index 
0b52b11600462..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/relationship/TrainedOn.pdl +++ /dev/null @@ -1,12 +0,0 @@ -namespace com.linkedin.metadata.relationship - -/** - * A generic model for the Trained-On relationship - */ -@pairings = [ { - "destination" : "com.linkedin.common.urn.DatasetUrn", - "source" : "com.linkedin.common.urn.MLModelUrn" -}] -record TrainedOn includes BaseRelationship { - -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/BaseDocument.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/BaseDocument.pdl deleted file mode 100644 index 48535aaac2e94..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/BaseDocument.pdl +++ /dev/null @@ -1,17 +0,0 @@ -namespace com.linkedin.metadata.search - -/** - * Common fields that may apply to all documents - */ -record BaseDocument { - - /** - * Whether the entity has been removed or not - */ - removed: optional boolean = false - - /** - * All paths representing the hierarchy of this entity. This is essential for browsing various paths leading to this entity. 
- */ - browsePaths: optional array[string] -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/ChartDocument.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/ChartDocument.pdl deleted file mode 100644 index 8365755a70b05..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/ChartDocument.pdl +++ /dev/null @@ -1,57 +0,0 @@ -namespace com.linkedin.metadata.search - -import com.linkedin.chart.ChartQueryType -import com.linkedin.chart.ChartType -import com.linkedin.common.AccessLevel -import com.linkedin.common.ChartUrn - -/** - * Data model for Chart entity search - */ -record ChartDocument includes BaseDocument { - - /** - * Urn for the Chart - */ - urn: ChartUrn - - /** - * Title of the chart - */ - title: optional string - - /** - * Detailed description about the chart - */ - description: optional string - - /** - * Dashboard tool ex: Looker, Redash - */ - tool: optional string - - /** - * Chart query type - */ - queryType: optional ChartQueryType - - /** - * LDAP usernames of corp users who are the owners of this chart - */ - owners: optional array[string] - - /** - * Type of the chart - */ - type: optional ChartType - - /** - * Access level for the chart - */ - access: optional AccessLevel - - /** - * List of tags for this dataset - */ - tags: optional array[string] -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/CorpGroupDocument.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/CorpGroupDocument.pdl deleted file mode 100644 index 75acb19f4a811..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/CorpGroupDocument.pdl +++ /dev/null @@ -1,34 +0,0 @@ -namespace com.linkedin.metadata.search - -import com.linkedin.common.CorpGroupUrn - -/** - * Data model for Corp Group entity search - */ -record CorpGroupDocument includes BaseDocument { - - /** - * Urn for the 
Corp group. - */ - urn: CorpGroupUrn - - /** - * Email of the corp group - */ - email: optional string - - /** - * ldap usernames of corp users who are direct members of this group - */ - members: optional array[string] - - /** - * ldap usernames of corp users who are direct admins of this group - */ - admins: optional array[string] - - /** - * List of group names who are part of this group - */ - groups: optional array[string] -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/CorpUserInfoDocument.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/CorpUserInfoDocument.pdl deleted file mode 100644 index 9ab24cf19a486..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/CorpUserInfoDocument.pdl +++ /dev/null @@ -1,59 +0,0 @@ -namespace com.linkedin.metadata.search - -import com.linkedin.common.CorpuserUrn - -/** - * Data model for CorpUserInfo entity search - */ -record CorpUserInfoDocument includes BaseDocument { - - /** - * Urn for the CorpUser. - */ - urn: CorpuserUrn - - /** - * ldap of the CorpUser - */ - ldap: optional string - - /** - * title of the CorpUser - */ - title: optional string - - /** - * direct manager's ldap of the CorpUser - */ - managerLdap: optional string - - /** - * Common name of the CorpUser, format is firstName + lastName (split by a whitespace) - */ - fullName: optional string - - /** - * About me section of the user - */ - aboutMe: optional string - - /** - * Teams that the user belongs to e.g. Metadata - */ - teams: optional array[string] - - /** - * Skills that the user possesses e.g. Machine Learning - */ - skills: optional array[string] - - /** - * Whether the corpUser is active, ref: https://iwww.corp.linkedin.com/wiki/cf/display/GTSD/Accessing+Active+Directory+via+LDAP+tools - */ - active: optional boolean - - /** - * The user's full email(s). 
- */ - emails: optional array[string] -} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/DashboardDocument.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/DashboardDocument.pdl deleted file mode 100644 index a18adda7c998f..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/DashboardDocument.pdl +++ /dev/null @@ -1,45 +0,0 @@ -namespace com.linkedin.metadata.search - -import com.linkedin.common.AccessLevel -import com.linkedin.common.DashboardUrn - -/** - * Data model for Chart entity search - */ -record DashboardDocument includes BaseDocument { - - /** - * Urn for the Dashboard - */ - urn: DashboardUrn - - /** - * Title of the dashboard - */ - title: optional string - - /** - * Detailed description about the dashboard - */ - description: optional string - - /** - * Dashboard tool ex: Looker, Redash - */ - tool: optional string - - /** - * LDAP usernames of corp users who are the owners of this dashboard - */ - owners: optional array[string] - - /** - * Access level for the dashboard - */ - access: optional AccessLevel - - /** - * List of tags for this dataset - */ - tags: optional array[string] -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/DataFlowDocument.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/DataFlowDocument.pdl deleted file mode 100644 index fec19a88c3b8c..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/DataFlowDocument.pdl +++ /dev/null @@ -1,60 +0,0 @@ -namespace com.linkedin.metadata.search - -import com.linkedin.common.AccessLevel -import com.linkedin.common.DataFlowUrn - -/** - * Data model for DataFlow entity search - */ -record DataFlowDocument includes BaseDocument { - - /** - * Urn for the DataFlow - */ - urn: DataFlowUrn - - /** - * Id of the flow - */ - flowId: optional string - - /** - * Name of the flow - */ - name: optional string - - /** - * 
Description of the flow - */ - description: optional string - - /** - * Workflow orchestrator ex: Azkaban, Airflow - */ - orchestrator: optional string - - /** - * Cluster of the flow - */ - cluster: optional string - - /** - * Project of the flow - */ - project: optional string - - /** - * LDAP usernames of corp users who are the owners of this flow - */ - owners: optional array[string] - - /** - * Flag to indicate if the flow has non empty corp users as owners or not. - */ - hasOwners: optional boolean - - /** - * List of tags for this dataset - */ - tags: optional array[string] -} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/DataJobDocument.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/DataJobDocument.pdl deleted file mode 100644 index d0b1d3a0c1c3b..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/DataJobDocument.pdl +++ /dev/null @@ -1,77 +0,0 @@ -namespace com.linkedin.metadata.search - -import com.linkedin.common.AccessLevel -import com.linkedin.common.DataJobUrn -import com.linkedin.common.DatasetUrn - - -/** - * Data model for DataJob entity search - */ -record DataJobDocument includes BaseDocument { - - /* - * Urn for the DataJob - */ - urn: DataJobUrn - - /** - * Optional description of the job - */ - description: optional string - - /** - * Optional name of the job - */ - name: optional string - - /** - * Name of the associated data flow - */ - dataFlow: optional string - - /** - * Id of the job - */ - jobId: optional string - - /** - * LDAP usernames of corp users who are the owners of this job - */ - owners: optional array[string] - - /** - * Flag to indicate if the job has non empty corp users as owners or not. - */ - hasOwners: optional boolean - - /** - * Lineage information represented by the number of immediate input datasets of this job. 
- */ - numInputDatasets: optional long - - /** - * Lineage information represented by the number of immediate output datasets of this job. - */ - numOutputDatasets: optional long - - /** - * List of inputs for this job - */ - inputs: optional array[DatasetUrn] - - /** - * List of outputs for this job - */ - outputs: optional array[DatasetUrn] - - /** - * Workflow orchestrator ex: Azkaban, Airflow - */ - orchestrator: optional string - - /** - * List of tags for this dataset - */ - tags: optional array[string] -} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/DataProcessDocument.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/DataProcessDocument.pdl deleted file mode 100644 index b4179129a5fb5..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/DataProcessDocument.pdl +++ /dev/null @@ -1,61 +0,0 @@ -namespace com.linkedin.metadata.search - -import com.linkedin.common.DataProcessUrn -import com.linkedin.common.DatasetUrn -import com.linkedin.common.FabricType - -/** - * Data model for data process entity search - */ -record DataProcessDocument includes BaseDocument { - - /** - * Urn for the data process - */ - urn: DataProcessUrn - - /** - * Process native name e.g. a ETL script name - */ - name: optional string - - /** - * Orchestrator name for this process, such as Azure Data Factory - */ - orchestrator: optional string - - /** - * Fabric type where data process belongs to or where it was generated - */ - origin: optional FabricType - - /** - * LDAP usernames of corp users who are the owners of this process - */ - owners: optional array[string] - - /** - * Flag to indicate if the process has non empty corp users as owners or not. - */ - hasOwners: optional boolean - - /** - * Lineage information represented by the number of immediate input datasets of this process. 
- */ - numInputDatasets: optional long - - /** - * Lineage information represented by the number of immediate output datasets of this process. - */ - numOutputDatasets: optional long - - /** - * List of inputs for this process - */ - inputs: optional array[DatasetUrn] - - /** - * List of outputs for this process - */ - outputs: optional array[DatasetUrn] -} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/DatasetDocument.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/DatasetDocument.pdl deleted file mode 100644 index b5cc13939ab7e..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/DatasetDocument.pdl +++ /dev/null @@ -1,100 +0,0 @@ -namespace com.linkedin.metadata.search - -import com.linkedin.common.DatasetUrn -import com.linkedin.common.FabricType - -/** - * Data model for dataset entity search - */ -record DatasetDocument includes BaseDocument { - - /** - * Urn for the dataset - */ - urn: DatasetUrn - - /** - * Dataset native name e.g. {db}.{table}, /dir/subdir/{name}, or {name} - */ - name: optional string - - /** - * Platform name for the dataset - */ - platform: optional string - - /** - * Fabric type where dataset belongs to or where it was generated - */ - origin: optional FabricType - - /** - * LDAP usernames of corp users who are the owners of this dataset - */ - owners: optional array[string] - - /** - * Flag to indicate if the dataset is deprecated. - */ - deprecated: optional boolean - - /** - * Documentation of the dataset. - */ - description: optional string - - /** - * Field paths of the dataset - */ - fieldPaths: optional array[string] - - /** - * Flag to indicate if the dataset has non empty corp users as owners or not. - */ - hasOwners: optional boolean - - /** - * Flag to indicate if the dataset has non-empty schema or not. - */ - hasSchema: optional boolean - - /** - * Lineage information represented by the number of immediate downstream datasets of this dataset. 
- */ - numDownstreamDatasets: optional long - - /** - * List of upstreams for this dataset - */ - upstreams: optional array[DatasetUrn] - - /** - * List of tags for this dataset - */ - tags: optional array[string] - - /** - * List of field descriptions - */ - fieldDescriptions: optional array[string] - - /** - * List of tags applied to fields - */ - fieldTags: optional array[string] - - /** - * List of field descriptions - */ - editedFieldDescriptions: optional array[string] - - /** - * List of tags applied to fields - */ - editedFieldTags: optional array[string] - - /** - * List of terms for this dataset - */ - glossaryTerms: optional array[string] -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/Document.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/Document.pdl deleted file mode 100644 index d4482e953d36a..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/Document.pdl +++ /dev/null @@ -1,6 +0,0 @@ -namespace com.linkedin.metadata.search - -/** - * A union of all supported document types. - */ -typeref Document = union[CorpUserInfoDocument, DatasetDocument] \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/GlossaryNodeInfoDocument.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/GlossaryNodeInfoDocument.pdl deleted file mode 100644 index 5205caf45220f..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/GlossaryNodeInfoDocument.pdl +++ /dev/null @@ -1,29 +0,0 @@ -namespace com.linkedin.metadata.search - -import com.linkedin.common.GlossaryNodeUrn - -/** - * Data model for GlossaryNodeInfo entity search - */ -record GlossaryNodeInfoDocument includes BaseDocument { - - /** - * Urn for the GlossaryNode. 
- */ - urn: GlossaryNodeUrn - - /** - * Name of business node - */ - name: optional string - - /** - * Definition of business node - */ - definition: optional string - - /** - * LDAP usernames of corp users who are the owners of this business node - */ - owners: optional array[string] -} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/GlossaryTermInfoDocument.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/GlossaryTermInfoDocument.pdl deleted file mode 100644 index c21ebaab0b99e..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/GlossaryTermInfoDocument.pdl +++ /dev/null @@ -1,39 +0,0 @@ -namespace com.linkedin.metadata.search - -import com.linkedin.common.GlossaryTermUrn - -/** - * Data model for GlossaryTermInfo entity search - */ -record GlossaryTermInfoDocument includes BaseDocument { - - /** - * Urn for the GlossaryTerm. - */ - urn: GlossaryTermUrn - - /** - * Name of business term - */ - name: optional string - - /** - * Definition of business term - */ - definition: optional string - - /** - * LDAP usernames of corp users who are the owners of this dataset - */ - owners: optional array[string] - - /** - * Source of the Business Term (INTERNAL or EXTERNAL) with default value as INTERNAL - */ - termSource: optional string - - /** - * External Reference to the business-term (URL) - */ - sourceRef: optional string -} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/MLModelDocument.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/MLModelDocument.pdl deleted file mode 100644 index c3a7fcfed223e..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/MLModelDocument.pdl +++ /dev/null @@ -1,71 +0,0 @@ -namespace com.linkedin.metadata.search - -import com.linkedin.common.MLModelUrn -import com.linkedin.common.DatasetUrn -import com.linkedin.common.FabricType - -/** - * Data model for ML Model entity search - */ -record 
MLModelDocument includes BaseDocument { - - /** - * Urn for the model - */ - urn: MLModelUrn - - /** - * Name of model - */ - name: optional string - - /** - * Platform name for the model - */ - platform: optional string - - /** - * Fabric type where model belongs to or where it was generated - */ - origin: optional FabricType - - /** - * Description of the model - */ - description: optional string - - /** - * Timestamp model was created - */ - createdTimestamp: optional long - - /** - * Whether or not the Model has owners - */ - hasOwners: optional boolean - - /** - * LDAP usernames of corp users who are the owners of this model - */ - owners: optional array[string] - - /** - * Type of Algorithm or Model such as whether it is a Naive Bayes classifier, Convolutional Neural Network, etc - */ - type: optional string - - /** - * What datasets were used to train the model? - */ - trainingDatasets: optional array[DatasetUrn] - - /** - * What datasets were used to evaluate the model? - */ - evaluationDatasets: optional array[DatasetUrn] - - /** - * Whether or not the model is currently active - */ - active: optional boolean -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/TagDocument.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/TagDocument.pdl deleted file mode 100644 index abab05f22d303..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/TagDocument.pdl +++ /dev/null @@ -1,19 +0,0 @@ -namespace com.linkedin.metadata.search - -import com.linkedin.common.TagUrn - -/** - * Data model for tag entity search - */ -record TagDocument includes BaseDocument { - - /** - * Urn for the dataset - */ - urn: TagUrn - - /** - * Tag name e.g. 
`Legacy` - */ - name: optional string -} diff --git a/metadata-models/src/main/pegasus/com/linkedin/mxe/MetadataGraphEvent.pdl b/metadata-models/src/main/pegasus/com/linkedin/mxe/MetadataGraphEvent.pdl deleted file mode 100644 index 66993993a1ec4..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/mxe/MetadataGraphEvent.pdl +++ /dev/null @@ -1,31 +0,0 @@ -namespace com.linkedin.mxe - -import com.linkedin.avro2pegasus.events.KafkaAuditHeader -import com.linkedin.metadata.entity.Entity -import com.linkedin.metadata.relationship.Relationship - -/** - * Kafka event for capturing update made to a list of entities and relationships. - */ -record MetadataGraphEvent { - - /** - * Kafka audit header. See go/kafkaauditheader for more info. - */ - auditHeader: optional KafkaAuditHeader - - /** - * A list of entity updates-or-inserts. Only fields updated are set in the case of partial update. - */ - upsertedEntities: array[Entity] - - /** - * A list of removed relationships. Only fields used to identify the relationships to remove are set. - */ - removedRelationships: array[Relationship] - - /** - * A list of relationship updates-or-inserts. - */ - upsertedRelationships: array[Relationship] -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/mxe/MetadataSearchEvent.pdl b/metadata-models/src/main/pegasus/com/linkedin/mxe/MetadataSearchEvent.pdl deleted file mode 100644 index 79d434ae044bf..0000000000000 --- a/metadata-models/src/main/pegasus/com/linkedin/mxe/MetadataSearchEvent.pdl +++ /dev/null @@ -1,20 +0,0 @@ -namespace com.linkedin.mxe - -import com.linkedin.avro2pegasus.events.KafkaAuditHeader -import com.linkedin.metadata.search.Document - -/** - * Kafka event for capturing update made to a list of search documents. - */ -record MetadataSearchEvent { - - /** - * Kafka audit header. See go/kafkaauditheader for more info. 
- */ - auditHeader: optional KafkaAuditHeader - - /** - * A list of search document updates-or-inserts. Only fields updated are set in the case of partial update. - */ - upsertedDocuments: array[Document] -} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/schema/EditableSchemaFieldInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/schema/EditableSchemaFieldInfo.pdl index e870fd1ba9cc0..9594281280e21 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/schema/EditableSchemaFieldInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/schema/EditableSchemaFieldInfo.pdl @@ -28,7 +28,7 @@ record EditableSchemaFieldInfo { @Searchable = { "/tags/*/tag": { "fieldName": "editedFieldTags", - "fieldType": "URN_PARTIAL", + "fieldType": "URN", "boostScore": 0.5 } } @@ -40,7 +40,7 @@ record EditableSchemaFieldInfo { @Searchable = { "/terms/*/urn": { "fieldName": "editedFieldGlossaryTerms", - "fieldType": "URN_PARTIAL", + "fieldType": "URN", "boostScore": 0.5 } } diff --git a/metadata-models/src/main/pegasus/com/linkedin/schema/SchemaField.pdl b/metadata-models/src/main/pegasus/com/linkedin/schema/SchemaField.pdl index ce8edb560a72f..98a6fc3cff024 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/schema/SchemaField.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/schema/SchemaField.pdl @@ -5,22 +5,23 @@ import com.linkedin.common.GlobalTags import com.linkedin.common.GlossaryTerms /** - * SchemaField to describe metadata related to dataset schema. Schema normalization rules: http://go/tms-schema + * SchemaField to describe metadata related to dataset schema. */ record SchemaField { /** - * Flattened name of the field. Field is computed from jsonPath field. For data translation rules refer to wiki page above. + * Flattened name of the field. Field is computed from jsonPath field. 
*/ @Searchable = { "fieldName": "fieldPaths", - "fieldType": "TEXT_PARTIAL" + "fieldType": "TEXT" } fieldPath: SchemaFieldPath /** * Flattened name of a field in JSON Path notation. */ + @Deprecated jsonPath: optional string /** @@ -59,7 +60,7 @@ record SchemaField { @Searchable = { "/tags/*/tag": { "fieldName": "fieldTags", - "fieldType": "URN_PARTIAL", + "fieldType": "URN", "boostScore": 0.5 } } @@ -71,7 +72,7 @@ record SchemaField { @Searchable = { "/terms/*/urn": { "fieldName": "fieldGlossaryTerms", - "fieldType": "URN_PARTIAL", + "fieldType": "URN", "boostScore": 0.5 } } diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml index 2c2d1114d0cd4..f8c71cf6214b7 100644 --- a/metadata-models/src/main/resources/entity-registry.yml +++ b/metadata-models/src/main/resources/entity-registry.yml @@ -1,5 +1,6 @@ entities: - name: dataset + doc: Datasets represent logical or physical data assets stored or represented in various data platforms. Tables, Views, Streams are all instances of datasets. keyAspect: datasetKey aspects: - viewProperties @@ -7,10 +8,12 @@ entities: - datasetProfile - datasetUsageStatistics - name: dataHubPolicy + doc: DataHub Policies represent access policies granted to users or groups on metadata operations like edit, view etc. keyAspect: dataHubPolicyKey aspects: - dataHubPolicyInfo - name: corpuser + doc: CorpUser represents an identity of a person (or an account) in the enterprise. 
keyAspect: corpUserKey aspects: - corpUserInfo diff --git a/metadata-models/src/test/java/com/linkedin/metadata/ModelValidation.java b/metadata-models/src/test/java/com/linkedin/metadata/ModelValidation.java index e3e2bbfbb650e..11be355a7a37b 100644 --- a/metadata-models/src/test/java/com/linkedin/metadata/ModelValidation.java +++ b/metadata-models/src/test/java/com/linkedin/metadata/ModelValidation.java @@ -5,8 +5,6 @@ import com.linkedin.data.template.UnionTemplate; import com.linkedin.metadata.validator.AspectValidator; import com.linkedin.metadata.validator.DeltaValidator; -import com.linkedin.metadata.validator.EntityValidator; -import com.linkedin.metadata.validator.RelationshipValidator; import com.linkedin.metadata.validator.SnapshotValidator; import java.io.IOException; import java.util.List; @@ -15,30 +13,14 @@ import javax.annotation.Nonnull; import org.testng.annotations.Test; -import static com.linkedin.metadata.ModelValidationConstants.*; -import static org.testng.AssertJUnit.*; +import static com.linkedin.metadata.ModelValidationConstants.IGNORED_ASPECT_CLASSES; +import static com.linkedin.metadata.ModelValidationConstants.IGNORED_DELTA_CLASSES; +import static com.linkedin.metadata.ModelValidationConstants.IGNORED_SNAPSHOT_CLASSES; +import static org.testng.AssertJUnit.assertFalse; public class ModelValidation { - @Test - public void validateEntities() throws Exception { - List> entities = - getRecordTemplatesInPackage("com.linkedin.metadata.entity", IGNORED_ENTITY_CLASSES); - - assertFalse("Failed to find any entities", entities.isEmpty()); - entities.forEach(EntityValidator::validateEntitySchema); - } - - @Test - public void validateRelationships() throws Exception { - List> relationships = - getRecordTemplatesInPackage("com.linkedin.metadata.relationship", IGNORED_RELATIONSHIP_CLASSES); - - assertFalse("Failed to find any relationships", relationships.isEmpty()); - relationships.forEach(RelationshipValidator::validateRelationshipSchema); - } - 
@Test public void validateAspects() throws Exception { List> aspects = @@ -53,7 +35,6 @@ public void validateSnapshots() throws Exception { List> snapshots = getRecordTemplatesInPackage("com.linkedin.metadata.snapshot", IGNORED_SNAPSHOT_CLASSES); - assertFalse("Failed to find any snapshots", snapshots.isEmpty()); snapshots.forEach(SnapshotValidator::validateSnapshotSchema); } diff --git a/metadata-models/src/test/java/com/linkedin/metadata/ModelValidationConstants.java b/metadata-models/src/test/java/com/linkedin/metadata/ModelValidationConstants.java index a2117471d3f6d..11fa8cdc965d4 100644 --- a/metadata-models/src/test/java/com/linkedin/metadata/ModelValidationConstants.java +++ b/metadata-models/src/test/java/com/linkedin/metadata/ModelValidationConstants.java @@ -3,9 +3,6 @@ import com.google.common.collect.ImmutableSet; import com.linkedin.data.template.RecordTemplate; import com.linkedin.data.template.UnionTemplate; -import com.linkedin.metadata.entity.BaseEntity; -import com.linkedin.metadata.relationship.BaseRelationship; -import com.linkedin.metadata.search.BaseDocument; import java.util.Set; @@ -15,13 +12,6 @@ private ModelValidationConstants() { // Util class } - static final Set> IGNORED_ENTITY_CLASSES = ImmutableSet.of(BaseEntity.class); - - static final Set> IGNORED_RELATIONSHIP_CLASSES = - ImmutableSet.of(BaseRelationship.class); - - static final Set> IGNORED_DOCUMENT_CLASSES = ImmutableSet.of(BaseDocument.class); - static final Set> IGNORED_ASPECT_CLASSES = ImmutableSet.of(); static final Set> IGNORED_SNAPSHOT_CLASSES = ImmutableSet.of(); diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchGraphServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchGraphServiceFactory.java index 4362786e1f0b7..96ccf86387046 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchGraphServiceFactory.java +++ 
b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchGraphServiceFactory.java @@ -1,15 +1,13 @@ package com.linkedin.gms.factory.common; +import com.linkedin.gms.factory.search.BaseElasticSearchComponentsFactory; import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; import com.linkedin.metadata.graph.elastic.ESGraphQueryDAO; import com.linkedin.metadata.graph.elastic.ESGraphWriteDAO; import com.linkedin.metadata.graph.elastic.ElasticSearchGraphService; -import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import javax.annotation.Nonnull; -import org.elasticsearch.client.RestHighLevelClient; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; -import org.springframework.beans.factory.annotation.Value; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; import org.springframework.context.annotation.Import; @@ -18,41 +16,19 @@ @Configuration @PropertySource(value = "classpath:/application.yml", factory = YamlPropertySourceFactory.class) -@Import({RestHighLevelClientFactory.class, IndexConventionFactory.class}) +@Import({BaseElasticSearchComponentsFactory.class}) public class ElasticSearchGraphServiceFactory { @Autowired - @Qualifier("elasticSearchRestHighLevelClient") - private RestHighLevelClient searchClient; - - @Autowired - @Qualifier(IndexConventionFactory.INDEX_CONVENTION_BEAN) - private IndexConvention indexConvention; - - @Value("${elasticsearch.bulkProcessor.requestsLimit}") - private Integer bulkRequestsLimit; - - @Value("${elasticsearch.bulkProcessor.flushPeriod}") - private Integer bulkFlushPeriod; - - @Value("${elasticsearch.bulkProcessor.numRetries}") - private Integer numRetries; - - @Value("${elasticsearch.bulkProcessor.retryInterval}") - private Long retryInterval; + @Qualifier("baseElasticSearchComponents") + private 
BaseElasticSearchComponentsFactory.BaseElasticSearchComponents components; @Bean(name = "elasticSearchGraphService") @Nonnull protected ElasticSearchGraphService getInstance() { - return new ElasticSearchGraphService( - searchClient, - indexConvention, - new ESGraphWriteDAO( - searchClient, - indexConvention, - bulkRequestsLimit, - bulkFlushPeriod, - numRetries, - retryInterval), - new ESGraphQueryDAO(searchClient, indexConvention)); + return new ElasticSearchGraphService(components.getSearchClient(), components.getIndexConvention(), + new ESGraphWriteDAO(components.getSearchClient(), components.getIndexConvention(), + components.getBulkProcessor()), + new ESGraphQueryDAO(components.getSearchClient(), components.getIndexConvention()), + components.getIndexBuilder()); } } diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchSystemMetadataServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchSystemMetadataServiceFactory.java index 38d3b782b8558..25afaef5e8eb7 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchSystemMetadataServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchSystemMetadataServiceFactory.java @@ -1,14 +1,12 @@ package com.linkedin.gms.factory.common; +import com.linkedin.gms.factory.search.BaseElasticSearchComponentsFactory; import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; import com.linkedin.metadata.systemmetadata.ESSystemMetadataDAO; import com.linkedin.metadata.systemmetadata.ElasticSearchSystemMetadataService; -import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import javax.annotation.Nonnull; -import org.elasticsearch.client.RestHighLevelClient; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; -import 
org.springframework.beans.factory.annotation.Value; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; import org.springframework.context.annotation.Import; @@ -17,41 +15,17 @@ @Configuration @PropertySource(value = "classpath:/application.yml", factory = YamlPropertySourceFactory.class) -@Import({RestHighLevelClientFactory.class, IndexConventionFactory.class}) +@Import({BaseElasticSearchComponentsFactory.class}) public class ElasticSearchSystemMetadataServiceFactory { @Autowired - @Qualifier("elasticSearchRestHighLevelClient") - private RestHighLevelClient searchClient; - - @Autowired - @Qualifier(IndexConventionFactory.INDEX_CONVENTION_BEAN) - private IndexConvention indexConvention; - - @Value("${elasticsearch.bulkProcessor.requestsLimit}") - private Integer bulkRequestsLimit; - - @Value("${elasticsearch.bulkProcessor.flushPeriod}") - private Integer bulkFlushPeriod; - - @Value("${elasticsearch.bulkProcessor.numRetries}") - private Integer numRetries; - - @Value("${elasticsearch.bulkProcessor.retryInterval}") - private Long retryInterval; + @Qualifier("baseElasticSearchComponents") + private BaseElasticSearchComponentsFactory.BaseElasticSearchComponents components; @Bean(name = "elasticSearchSystemMetadataService") @Nonnull protected ElasticSearchSystemMetadataService getInstance() { - return new ElasticSearchSystemMetadataService( - searchClient, - indexConvention, - new ESSystemMetadataDAO( - searchClient, - indexConvention, - bulkRequestsLimit, - bulkFlushPeriod, - numRetries, - retryInterval) - ); + return new ElasticSearchSystemMetadataService(components.getSearchClient(), components.getIndexConvention(), + new ESSystemMetadataDAO(components.getSearchClient(), components.getIndexConvention(), + components.getBulkProcessor()), components.getIndexBuilder()); } } diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/IndexConventionFactory.java 
b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/IndexConventionFactory.java index 9efb9323d00fc..a2816830f33ce 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/IndexConventionFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/IndexConventionFactory.java @@ -19,7 +19,7 @@ public class IndexConventionFactory { public static final String INDEX_CONVENTION_BEAN = "searchIndexConvention"; - @Value("${elasticsearch.indexPrefix:}") + @Value("${elasticsearch.index.prefix:}") private String indexPrefix; @Bean(name = INDEX_CONVENTION_BEAN) diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/BaseElasticSearchComponentsFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/BaseElasticSearchComponentsFactory.java new file mode 100644 index 0000000000000..d083c8bec4e43 --- /dev/null +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/BaseElasticSearchComponentsFactory.java @@ -0,0 +1,56 @@ +package com.linkedin.gms.factory.search; + +import com.linkedin.gms.factory.common.IndexConventionFactory; +import com.linkedin.gms.factory.common.RestHighLevelClientFactory; +import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import com.linkedin.metadata.utils.elasticsearch.IndexConvention; +import javax.annotation.Nonnull; +import lombok.Value; +import org.elasticsearch.action.bulk.BulkProcessor; +import org.elasticsearch.client.RestHighLevelClient; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import org.springframework.context.annotation.Import; +import 
org.springframework.context.annotation.PropertySource; + + +/** + * Factory for components required for any services using elasticsearch + */ +@Configuration +@Import({RestHighLevelClientFactory.class}) +@PropertySource(value = "classpath:/application.yml", factory = YamlPropertySourceFactory.class) +public class BaseElasticSearchComponentsFactory { + @Value + public static class BaseElasticSearchComponents { + RestHighLevelClient searchClient; + IndexConvention indexConvention; + BulkProcessor bulkProcessor; + ESIndexBuilder indexBuilder; + } + + @Autowired + @Qualifier("elasticSearchRestHighLevelClient") + private RestHighLevelClient searchClient; + + @Autowired + @Qualifier(IndexConventionFactory.INDEX_CONVENTION_BEAN) + private IndexConvention indexConvention; + + @Autowired + @Qualifier("elasticSearchBulkProcessor") + private BulkProcessor bulkProcessor; + + @Autowired + @Qualifier("elasticSearchIndexBuilder") + private ESIndexBuilder indexBuilder; + + @Bean(name = "baseElasticSearchComponents") + @Nonnull + protected BaseElasticSearchComponents getInstance() { + return new BaseElasticSearchComponents(searchClient, indexConvention, bulkProcessor, indexBuilder); + } +} diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchBulkProcessorFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchBulkProcessorFactory.java new file mode 100644 index 0000000000000..0a7877acce8cf --- /dev/null +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchBulkProcessorFactory.java @@ -0,0 +1,52 @@ +package com.linkedin.gms.factory.search; + +import com.linkedin.gms.factory.common.RestHighLevelClientFactory; +import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.search.elasticsearch.update.BulkListener; +import javax.annotation.Nonnull; +import org.elasticsearch.action.bulk.BackoffPolicy; +import 
org.elasticsearch.action.bulk.BulkProcessor; +import org.elasticsearch.client.RequestOptions; +import org.elasticsearch.client.RestHighLevelClient; +import org.elasticsearch.common.unit.TimeValue; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import org.springframework.context.annotation.Import; +import org.springframework.context.annotation.PropertySource; + + +@Configuration +@Import({RestHighLevelClientFactory.class}) +@PropertySource(value = "classpath:/application.yml", factory = YamlPropertySourceFactory.class) +public class ElasticSearchBulkProcessorFactory { + @Autowired + @Qualifier("elasticSearchRestHighLevelClient") + private RestHighLevelClient searchClient; + + @Value("${elasticsearch.bulkProcessor.requestsLimit}") + private Integer bulkRequestsLimit; + + @Value("${elasticsearch.bulkProcessor.flushPeriod}") + private Integer bulkFlushPeriod; + + @Value("${elasticsearch.bulkProcessor.numRetries}") + private Integer numRetries; + + @Value("${elasticsearch.bulkProcessor.retryInterval}") + private Long retryInterval; + + @Bean(name = "elasticSearchBulkProcessor") + @Nonnull + protected BulkProcessor getInstance() { + return BulkProcessor.builder((request, bulkListener) -> { + searchClient.bulkAsync(request, RequestOptions.DEFAULT, bulkListener); + }, BulkListener.getInstance()) + .setBulkActions(bulkRequestsLimit) + .setFlushInterval(TimeValue.timeValueSeconds(bulkFlushPeriod)) + .setBackoffPolicy(BackoffPolicy.constantBackoff(TimeValue.timeValueSeconds(retryInterval), numRetries)) + .build(); + } +} diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactory.java 
b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactory.java new file mode 100644 index 0000000000000..46562cb54e338 --- /dev/null +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactory.java @@ -0,0 +1,36 @@ +package com.linkedin.gms.factory.search; + +import com.linkedin.gms.factory.common.RestHighLevelClientFactory; +import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import javax.annotation.Nonnull; +import org.elasticsearch.client.RestHighLevelClient; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import org.springframework.context.annotation.Import; +import org.springframework.context.annotation.PropertySource; + + +@Configuration +@Import({RestHighLevelClientFactory.class}) +@PropertySource(value = "classpath:/application.yml", factory = YamlPropertySourceFactory.class) +public class ElasticSearchIndexBuilderFactory { + @Autowired + @Qualifier("elasticSearchRestHighLevelClient") + private RestHighLevelClient searchClient; + + @Value("${elasticsearch.index.numShards}") + private Integer numShards; + + @Value("${elasticsearch.index.numReplicas}") + private Integer numReplicas; + + @Bean(name = "elasticSearchIndexBuilder") + @Nonnull + protected ESIndexBuilder getInstance() { + return new ESIndexBuilder(searchClient, numShards, numReplicas); + } +} \ No newline at end of file diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchServiceFactory.java index 
40c7717a3d0a4..551085fd7e363 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchServiceFactory.java @@ -1,22 +1,17 @@ package com.linkedin.gms.factory.search; -import com.linkedin.gms.factory.common.IndexConventionFactory; -import com.linkedin.gms.factory.common.RestHighLevelClientFactory; import com.linkedin.gms.factory.entityregistry.EntityRegistryFactory; import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.search.elasticsearch.ElasticSearchService; -import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilders; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.EntityIndexBuilders; import com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder; import com.linkedin.metadata.search.elasticsearch.query.ESBrowseDAO; import com.linkedin.metadata.search.elasticsearch.query.ESSearchDAO; import com.linkedin.metadata.search.elasticsearch.update.ESWriteDAO; -import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import javax.annotation.Nonnull; -import org.elasticsearch.client.RestHighLevelClient; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; -import org.springframework.beans.factory.annotation.Value; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; import org.springframework.context.annotation.Import; @@ -25,16 +20,11 @@ @Configuration @PropertySource(value = "classpath:/application.yml", factory = YamlPropertySourceFactory.class) -@Import({RestHighLevelClientFactory.class, IndexConventionFactory.class, EntityRegistryFactory.class, - SettingsBuilderFactory.class}) +@Import({EntityRegistryFactory.class, 
SettingsBuilderFactory.class}) public class ElasticSearchServiceFactory { @Autowired - @Qualifier("elasticSearchRestHighLevelClient") - private RestHighLevelClient searchClient; - - @Autowired - @Qualifier(IndexConventionFactory.INDEX_CONVENTION_BEAN) - private IndexConvention indexConvention; + @Qualifier("baseElasticSearchComponents") + private BaseElasticSearchComponentsFactory.BaseElasticSearchComponents components; @Autowired @Qualifier("entityRegistry") @@ -44,25 +34,16 @@ public class ElasticSearchServiceFactory { @Qualifier("settingsBuilder") private SettingsBuilder settingsBuilder; - @Value("${elasticsearch.bulkProcessor.requestsLimit}") - private Integer bulkRequestsLimit; - - @Value("${elasticsearch.bulkProcessor.flushPeriod}") - private Integer bulkFlushPeriod; - - @Value("${elasticsearch.bulkProcessor.numRetries}") - private Integer numRetries; - - @Value("${elasticsearch.bulkProcessor.retryInterval}") - private Long retryInterval; - @Bean(name = "elasticSearchService") @Nonnull protected ElasticSearchService getInstance() { - ESSearchDAO esSearchDAO = new ESSearchDAO(entityRegistry, searchClient, indexConvention); - return new ElasticSearchService(new ESIndexBuilders(entityRegistry, searchClient, indexConvention, settingsBuilder), - esSearchDAO, new ESBrowseDAO(entityRegistry, searchClient, indexConvention), - new ESWriteDAO(entityRegistry, searchClient, indexConvention, bulkRequestsLimit, bulkFlushPeriod, numRetries, - retryInterval)); + ESSearchDAO esSearchDAO = + new ESSearchDAO(entityRegistry, components.getSearchClient(), components.getIndexConvention()); + return new ElasticSearchService( + new EntityIndexBuilders(components.getIndexBuilder(), entityRegistry, components.getIndexConvention(), + settingsBuilder), esSearchDAO, + new ESBrowseDAO(entityRegistry, components.getSearchClient(), components.getIndexConvention()), + new ESWriteDAO(entityRegistry, components.getSearchClient(), components.getIndexConvention(), + 
components.getBulkProcessor())); } } diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/SearchDocumentTransformerFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/SearchDocumentTransformerFactory.java new file mode 100644 index 0000000000000..f4ba72b4aadce --- /dev/null +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/SearchDocumentTransformerFactory.java @@ -0,0 +1,21 @@ +package com.linkedin.gms.factory.search; + +import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; +import com.linkedin.metadata.search.transformer.SearchDocumentTransformer; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import org.springframework.context.annotation.PropertySource; + + +@Configuration +@PropertySource(value = "classpath:/application.yml", factory = YamlPropertySourceFactory.class) +public class SearchDocumentTransformerFactory { + @Value("${elasticsearch.index.maxArrayLength}") + private int maxArrayLength; + + @Bean("searchDocumentTransformer") + protected SearchDocumentTransformer getInstance() { + return new SearchDocumentTransformer(maxArrayLength); + } +} diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeseries/ElasticSearchTimeseriesAspectServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeseries/ElasticSearchTimeseriesAspectServiceFactory.java index 2f7ddc4af3bae..06d9cf951025e 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeseries/ElasticSearchTimeseriesAspectServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeseries/ElasticSearchTimeseriesAspectServiceFactory.java @@ -1,18 +1,14 @@ package com.linkedin.gms.factory.timeseries; -import 
com.linkedin.gms.factory.common.IndexConventionFactory; -import com.linkedin.gms.factory.common.RestHighLevelClientFactory; import com.linkedin.gms.factory.entityregistry.EntityRegistryFactory; +import com.linkedin.gms.factory.search.BaseElasticSearchComponentsFactory; import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.timeseries.elastic.ElasticSearchTimeseriesAspectService; import com.linkedin.metadata.timeseries.elastic.indexbuilder.TimeseriesAspectIndexBuilders; -import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import javax.annotation.Nonnull; -import org.elasticsearch.client.RestHighLevelClient; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; -import org.springframework.beans.factory.annotation.Value; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; import org.springframework.context.annotation.Import; @@ -21,37 +17,21 @@ @Configuration @PropertySource(value = "classpath:/application.yml", factory = YamlPropertySourceFactory.class) -@Import({RestHighLevelClientFactory.class, IndexConventionFactory.class, EntityRegistryFactory.class}) +@Import({BaseElasticSearchComponentsFactory.class, EntityRegistryFactory.class}) public class ElasticSearchTimeseriesAspectServiceFactory { @Autowired - @Qualifier("elasticSearchRestHighLevelClient") - private RestHighLevelClient searchClient; - - @Autowired - @Qualifier(IndexConventionFactory.INDEX_CONVENTION_BEAN) - private IndexConvention indexConvention; + @Qualifier("baseElasticSearchComponents") + private BaseElasticSearchComponentsFactory.BaseElasticSearchComponents components; @Autowired @Qualifier("entityRegistry") private EntityRegistry entityRegistry; - @Value("${elasticsearch.bulkProcessor.requestsLimit}") - private Integer bulkRequestsLimit; - - 
@Value("${elasticsearch.bulkProcessor.flushPeriod}") - private Integer bulkFlushPeriod; - - @Value("${elasticsearch.bulkProcessor.numRetries}") - private Integer numRetries; - - @Value("${elasticsearch.bulkProcessor.retryInterval}") - private Long retryInterval; - @Bean(name = "elasticSearchTimeseriesAspectService") @Nonnull protected ElasticSearchTimeseriesAspectService getInstance() { - return new ElasticSearchTimeseriesAspectService(searchClient, indexConvention, - new TimeseriesAspectIndexBuilders(entityRegistry, searchClient, indexConvention), entityRegistry, - bulkRequestsLimit, bulkFlushPeriod, numRetries, retryInterval); + return new ElasticSearchTimeseriesAspectService(components.getSearchClient(), components.getIndexConvention(), + new TimeseriesAspectIndexBuilders(components.getIndexBuilder(), entityRegistry, + components.getIndexConvention()), entityRegistry, components.getBulkProcessor()); } } \ No newline at end of file diff --git a/metadata-service/factories/src/main/resources/application.yml b/metadata-service/factories/src/main/resources/application.yml index e41654fff276f..9aa08b31a47c5 100644 --- a/metadata-service/factories/src/main/resources/application.yml +++ b/metadata-service/factories/src/main/resources/application.yml @@ -83,7 +83,11 @@ elasticsearch: flushPeriod: ${ES_BULK_FLUSH_PERIOD:1} numRetries: ${ES_BULK_NUM_RETRIES:3} retryInterval: ${ES_BULK_RETRY_INTERVAL:1} - indexPrefix: ${INDEX_PREFIX:} + index: + prefix: ${INDEX_PREFIX:} + numShards: ${ELASTICSEARCH_NUM_SHARDS_PER_INDEX:1} + numReplicas: ${ELASTICSEARCH_NUM_REPLICAS_PER_INDEX:1} + maxArrayLength: ${SEARCH_DOCUMENT_MAX_ARRAY_LENGTH:1000} # TODO: Kafka topic convention kafka: diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/JavaEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/JavaEntityClient.java index d4457c6c06ad0..e294fe4ca3692 100644 --- 
a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/JavaEntityClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/JavaEntityClient.java @@ -32,6 +32,7 @@ import com.linkedin.mxe.MetadataChangeProposal; import com.linkedin.mxe.SystemMetadata; import com.linkedin.r2.RemoteInvocationException; +import io.opentelemetry.extension.annotations.WithSpan; import java.time.Clock; import java.util.List; import java.util.Map; @@ -183,6 +184,7 @@ public void batchUpdate(@Nonnull final Set entities, @Nonnull final Auth * @throws RemoteInvocationException */ @Nonnull + @WithSpan public SearchResult search( @Nonnull String entity, @Nonnull String input,