diff --git a/metadata-ingestion/examples/recipes/elasticsearch_to_datahub.yml b/metadata-ingestion/examples/recipes/elasticsearch_to_datahub.yml new file mode 100644 index 0000000000000..cd1e7901dce13 --- /dev/null +++ b/metadata-ingestion/examples/recipes/elasticsearch_to_datahub.yml @@ -0,0 +1,11 @@ +source: + type: "elasticsearch" + config: + host: 'localhost:9200' + username: "" + password: "" + +sink: + type: "datahub-rest" + config: + server: "http://localhost:8080" diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 88952c5351ba4..adb8303bef94d 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -100,6 +100,7 @@ def get_long_description(): "datahub-business-glossary": set(), "dbt": {"requests"}, "druid": sql_common | {"pydruid>=0.6.2"}, + "elasticsearch": {"elasticsearch"}, "feast": {"docker"}, "glue": aws_common, "hive": sql_common @@ -204,6 +205,7 @@ def get_long_description(): for plugin in [ "bigquery", "bigquery-usage", + "elasticsearch", "looker", "glue", "mariadb", @@ -278,6 +280,7 @@ def get_long_description(): "bigquery-usage = datahub.ingestion.source.usage.bigquery_usage:BigQueryUsageSource", "dbt = datahub.ingestion.source.dbt:DBTSource", "druid = datahub.ingestion.source.sql.druid:DruidSource", + "elasticsearch = datahub.ingestion.source.elastic_search:ElasticsearchSource", "feast = datahub.ingestion.source.feast:FeastSource", "glue = datahub.ingestion.source.aws.glue:GlueSource", "sagemaker = datahub.ingestion.source.aws.sagemaker:SagemakerSource", diff --git a/metadata-ingestion/source_docs/elastic_search.md b/metadata-ingestion/source_docs/elastic_search.md new file mode 100644 index 0000000000000..541cd59cab4c0 --- /dev/null +++ b/metadata-ingestion/source_docs/elastic_search.md @@ -0,0 +1,62 @@ +# Elastic Search + +For context on getting started with ingestion, check out our [metadata ingestion guide](../README.md). 
+ +## Setup + +To install this plugin, run `pip install 'acryl-datahub[elasticsearch]'`. + +## Capabilities + +This plugin extracts the following: + +- Metadata for indexes +- Column types associated with each index field + +## Quickstart recipe + +Check out the following recipe to get started with ingestion! See [below](#config-details) for full configuration options. + +For general pointers on writing and running a recipe, see our [main recipe guide](../README.md#recipes). + +```yml +source: + type: "elasticsearch" + config: + # Coordinates + host: 'localhost:9200' + # Credentials + username: "" + password: "" + # Options + env: "PROD" + index_pattern: + allow: [".*some_index_name_pattern*"] + deny: [".*skip_index_name_pattern*"] + +sink: + # sink configs +``` + +## Config details + +Note that a `.` is used to denote nested fields in the YAML recipe. + + +| Field | Required | Default | Description | +| --------------------------- | -------- | ---------------- |---------------------------------------------------------------| +| `host` | | "localhost:9092" | The elastic search host URI. | +| `username` | | "" | The username credential. | +| `password` | | "" | The password credential. | +| `env` | | `"PROD"` | Environment to use in namespace when constructing URNs. | +| `index_pattern.allow` | | | List of regex patterns for indexes to include in ingestion. | +| `index_pattern.deny` | | | List of regex patterns for indexes to exclude from ingestion. | +| `index_pattern.ignoreCase` | | `True` | Whether regex matching should ignore case or not | + +## Compatibility + +Coming soon! + +## Questions + +If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! 
# ---------------------------------------------------------------------------
# metadata-ingestion/src/datahub/ingestion/source/elastic_search.py
# ---------------------------------------------------------------------------
import json
import logging
import re
from collections import defaultdict
from dataclasses import dataclass, field
from hashlib import md5
from typing import Any, Dict, Generator, Iterable, List, Optional, Type

from elasticsearch import Elasticsearch
from pydantic import validator

from datahub.configuration import ConfigModel
from datahub.configuration.common import AllowDenyPattern
from datahub.emitter.mce_builder import (
    DEFAULT_ENV,
    make_data_platform_urn,
    make_dataset_urn,
)
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.source import Source, SourceReport
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
    SchemaField,
    SchemaFieldDataType,
    SchemaMetadata,
)
from datahub.metadata.schema_classes import (
    ArrayTypeClass,
    BooleanTypeClass,
    BytesTypeClass,
    ChangeTypeClass,
    DatasetPropertiesClass,
    DateTypeClass,
    NullTypeClass,
    NumberTypeClass,
    OtherSchemaClass,
    RecordTypeClass,
    StringTypeClass,
    SubTypesClass,
)

logger = logging.getLogger(__name__)


class ElasticToSchemaFieldConverter:
    """Converts the ``properties`` of an Elasticsearch mapping into SchemaFields.

    Field paths are built from a stack of path segments joined with ``.``; the
    first segment is always ``version_string``.
    """

    # FieldPath format version.
    version_string: str = "[version=2.0]"

    # Elasticsearch field type name -> DataHub schema type class.
    _field_type_to_schema_field_type: Dict[str, Type] = {
        # Bool
        "boolean": BooleanTypeClass,
        # Binary
        "binary": BytesTypeClass,
        # Numbers
        "byte": NumberTypeClass,
        "integer": NumberTypeClass,
        "long": NumberTypeClass,
        "short": NumberTypeClass,
        "double": NumberTypeClass,
        "float": NumberTypeClass,
        "half_float": NumberTypeClass,
        "scaled_float": NumberTypeClass,
        "unsigned_long": NumberTypeClass,
        "token_count": NumberTypeClass,
        # Dates
        "date": DateTypeClass,
        "date_nanos": DateTypeClass,
        # Strings
        "keyword": StringTypeClass,
        "constant_keyword": StringTypeClass,
        "wildcard": StringTypeClass,
        "text": StringTypeClass,
        "match_only_text": StringTypeClass,
        "completion": StringTypeClass,
        "search_as_you_type": StringTypeClass,
        # Records
        "object": RecordTypeClass,
        "flattened": RecordTypeClass,
        "nested": RecordTypeClass,
        # Arrays
        "histogram": ArrayTypeClass,
        "aggregate_metric_double": ArrayTypeClass,
    }

    @staticmethod
    def get_column_type(elastic_column_type: str) -> SchemaFieldDataType:
        """Map an Elasticsearch type name to a SchemaFieldDataType.

        Unknown type names are logged and mapped to ``NullTypeClass``.
        """
        type_class: Optional[
            Type
        ] = ElasticToSchemaFieldConverter._field_type_to_schema_field_type.get(
            elastic_column_type
        )
        if type_class is None:
            logger.warning(
                f"Cannot map {elastic_column_type!r} to SchemaFieldDataType, using NullTypeClass."
            )
            type_class = NullTypeClass

        return SchemaFieldDataType(type=type_class())

    def __init__(self) -> None:
        # Path segments of the field currently being visited.
        self._prefix_name_stack: List[str] = [self.version_string]

    def _get_cur_field_path(self) -> str:
        """Return the current field path: all stacked segments joined by '.'."""
        return ".".join(self._prefix_name_stack)

    def _get_schema_fields(
        self, elastic_schema_dict: Dict[str, Any]
    ) -> Generator[SchemaField, None, None]:
        """Yield one SchemaField per typed entry of ``elastic_schema_dict``.

        Entries are visited in the dict's own iteration order (no sorting is
        applied). An entry with a ``type`` yields a single field; an entry
        without ``type`` but with non-empty ``properties`` is recursed into;
        anything else is logged and skipped.
        """
        for columnName, column in elastic_schema_dict.items():
            elastic_type: Optional[str] = column.get("type")
            nested_props: Optional[Dict[str, Any]] = column.get("properties")
            if elastic_type is not None:
                # A typed entry is emitted as a leaf; its own "properties"
                # (if any) are not expanded.
                self._prefix_name_stack.append(f"[type={elastic_type}].{columnName}")
                schema_field_data_type = self.get_column_type(elastic_type)
                schema_field = SchemaField(
                    fieldPath=self._get_cur_field_path(),
                    nativeDataType=elastic_type,
                    type=schema_field_data_type,
                    description=None,
                    nullable=True,
                    recursive=False,
                )
                yield schema_field
                self._prefix_name_stack.pop()
            elif nested_props:
                # The segment pushed for an untyped container uses the column
                # name inside the "[type=...]" marker.
                self._prefix_name_stack.append(f"[type={columnName}]")
                yield from self._get_schema_fields(nested_props)
                self._prefix_name_stack.pop()
            else:
                # Unexpected! Log a warning.
                logger.warning(
                    f"Elastic schema does not have either 'type' or 'properties'!"
                    f" Schema={json.dumps(elastic_schema_dict)}"
                )

    @classmethod
    def get_schema_fields(
        cls, elastic_mappings: Dict[str, Any]
    ) -> Generator[SchemaField, None, None]:
        """Yield SchemaFields for an index's ``mappings`` dict.

        Raises:
            ValueError: if ``elastic_mappings`` has no (or empty) ``properties``.
                Because this is a generator, the error surfaces on first
                iteration rather than at call time.
        """
        converter = cls()
        properties = elastic_mappings.get("properties")
        if not properties:
            raise ValueError(
                f"Missing 'properties' in elastic search mappings={json.dumps(elastic_mappings)}!"
            )
        yield from converter._get_schema_fields(properties)


@dataclass
class ElasticsearchSourceReport(SourceReport):
    """Ingestion report: number of indices seen and the names filtered out."""

    index_scanned: int = 0
    filtered: List[str] = field(default_factory=list)

    def report_index_scanned(self, index: str) -> None:
        """Count one scanned index (the name itself is not recorded)."""
        self.index_scanned += 1

    def report_dropped(self, index: str) -> None:
        """Record an index that was excluded from ingestion."""
        self.filtered.append(index)


class ElasticsearchSourceConfig(ConfigModel):
    """Recipe configuration for the Elasticsearch source."""

    # NOTE(review): 9092 looks like a leftover from the Kafka source; the example
    # recipe in this change uses 'localhost:9200'. Kept as-is because the
    # source docs list this value as the default - confirm and change both.
    host: str = "localhost:9092"
    username: str = ""
    password: str = ""
    env: str = DEFAULT_ENV
    index_pattern: AllowDenyPattern = AllowDenyPattern(
        allow=[".*"], deny=["^_.*", "^ilm-history.*"]
    )

    @validator("host")
    def host_colon_port_comma(cls, host_val: str) -> str:
        """Validate a comma-separated list of ``host`` or ``host:port`` entries.

        Raises:
            ValueError: if a host part contains characters outside
                ``[\\w\\-.:]`` or a port part is not all digits.
        """
        for entry in host_val.split(","):
            # The port can be provided but is not required.
            port = None
            if ":" in entry:
                (host, port) = entry.rsplit(":", 1)
            else:
                host = entry
            # This regex is quite loose. Many invalid hostnames or IPs will slip
            # through, but it serves as a good first line of validation. The
            # remaining validation is left to the Elasticsearch client.
            # Raise explicitly instead of using `assert`, which is stripped
            # when Python runs with -O.
            if not re.match(r"^[\w\-\.\:]+$", host):
                raise ValueError(f"host contains bad characters, found {host}")
            if port is not None and not port.isdigit():
                raise ValueError(f"port must be all digits, found {port}")
        return host_val


class ElasticsearchSource(Source):
    """Ingestion source that emits dataset metadata for Elasticsearch indices."""

    def __init__(self, config: ElasticsearchSourceConfig, ctx: PipelineContext):
        super().__init__(ctx)
        self.source_config = config
        self.client = Elasticsearch(
            self.source_config.host,
            http_auth=(self.source_config.username, self.source_config.password),
        )
        self.report = ElasticsearchSourceReport()
        # data stream name -> number of backing indices seen so far.
        self.data_stream_partition_count: Dict[str, int] = defaultdict(int)
        self.platform: str = "elasticsearch"

    @classmethod
    def create(
        cls, config_dict: Dict[str, Any], ctx: PipelineContext
    ) -> "ElasticsearchSource":
        """Build the source from a raw recipe ``config`` dict."""
        config = ElasticsearchSourceConfig.parse_obj(config_dict)
        return cls(config, ctx)

    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        """Yield work units for every allowed index, then data-stream counts.

        Indices rejected by ``index_pattern`` are recorded in the report.
        """
        indices = self.client.indices.get_alias(index="*")

        for index in indices:
            self.report.report_index_scanned(index)

            if self.source_config.index_pattern.allowed(index):
                for mcp in self._extract_mcps(index):
                    wu = MetadataWorkUnit(id=f"index-{index}", mcp=mcp)
                    self.report.report_workunit(wu)
                    yield wu
            else:
                self.report.report_dropped(index)

        for mcp in self._get_data_stream_index_count_mcps():
            # Derive the id from the MCP itself. The loop variable `index`
            # above is unbound when no indices were returned, and otherwise
            # names an unrelated index.
            wu = MetadataWorkUnit(id=f"data_stream-{mcp.entityUrn}", mcp=mcp)
            self.report.report_workunit(wu)
            yield wu

    def _get_data_stream_index_count_mcps(
        self,
    ) -> Iterable[MetadataChangeProposalWrapper]:
        """Yield a ``datasetProperties`` MCP with ``numPartitions`` per data stream.

        NOTE(review): `_extract_mcps` may already have emitted a
        ``datasetProperties`` upsert (aliases) for the same URN; presumably the
        later upsert replaces the earlier one - confirm.
        """
        for data_stream, count in self.data_stream_partition_count.items():
            dataset_urn: str = make_dataset_urn(
                self.platform, data_stream, self.source_config.env
            )
            yield MetadataChangeProposalWrapper(
                entityType="dataset",
                entityUrn=dataset_urn,
                aspectName="datasetProperties",
                aspect=DatasetPropertiesClass(
                    customProperties={"numPartitions": str(count)}
                ),
                changeType=ChangeTypeClass.UPSERT,
            )

    def _extract_mcps(self, index: str) -> Iterable[MetadataChangeProposalWrapper]:
        """Yield the MCPs (schema, status, subtype, aliases) for one index.

        Backing indices of a data stream are collapsed onto the data stream
        name; only the first backing index seen produces MCPs, the rest only
        bump ``data_stream_partition_count``.

        NOTE(review): an index whose mappings have no ``properties`` makes
        ``get_schema_fields`` raise ValueError, which propagates from here.
        """
        logger.debug(f"index = {index}")
        raw_index = self.client.indices.get(index=index)
        raw_index_metadata = raw_index[index]

        # 0. Dedup data_streams.
        data_stream = raw_index_metadata.get("data_stream")
        if data_stream:
            index = data_stream
            self.data_stream_partition_count[index] += 1
            if self.data_stream_partition_count[index] > 1:
                # This is a duplicate, skip processing it further.
                return

        # 1. Construct and emit the schemaMetadata aspect
        # 1.1 Generate the schema fields from ES mappings.
        index_mappings = raw_index_metadata["mappings"]
        index_mappings_json_str: str = json.dumps(index_mappings)
        md5_hash = md5(index_mappings_json_str.encode()).hexdigest()
        schema_fields = list(
            ElasticToSchemaFieldConverter.get_schema_fields(index_mappings)
        )

        # 1.2 Generate the SchemaMetadata aspect
        schema_metadata = SchemaMetadata(
            schemaName=index,
            platform=make_data_platform_urn(self.platform),
            version=0,
            hash=md5_hash,
            platformSchema=OtherSchemaClass(rawSchema=index_mappings_json_str),
            fields=schema_fields,
        )

        # 1.3 Emit the mcp
        dataset_urn: str = make_dataset_urn(
            self.platform, index, self.source_config.env
        )
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="schemaMetadata",
            aspect=schema_metadata,
            changeType=ChangeTypeClass.UPSERT,
        )

        # 2. Construct and emit the status aspect.
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="status",
            aspect=StatusClass(removed=False),
            changeType=ChangeTypeClass.UPSERT,
        )

        # 3. Construct and emit subtype
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="subTypes",
            aspect=SubTypesClass(
                typeNames=["Index" if not data_stream else "DataStream"]
            ),
            changeType=ChangeTypeClass.UPSERT,
        )

        # 4. Construct and emit properties if needed
        index_aliases = raw_index_metadata.get("aliases", {}).keys()
        if index_aliases:
            yield MetadataChangeProposalWrapper(
                entityType="dataset",
                entityUrn=dataset_urn,
                aspectName="datasetProperties",
                aspect=DatasetPropertiesClass(
                    customProperties={"aliases": ",".join(index_aliases)}
                ),
                changeType=ChangeTypeClass.UPSERT,
            )

    def get_report(self) -> ElasticsearchSourceReport:
        """Return the report accumulated during ingestion."""
        return self.report

    def close(self) -> None:
        """Close the Elasticsearch client, if one was created."""
        if self.client:
            self.client.close()


# ---------------------------------------------------------------------------
# metadata-ingestion/tests/unit/test_elasticsearch_source.py
# ---------------------------------------------------------------------------
import json
import logging
import re
from typing import Any, Dict, List, Tuple

import pytest

from datahub.ingestion.source.elastic_search import ElasticToSchemaFieldConverter
from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField

logger = logging.getLogger(__name__)


def assert_field_paths_are_unique(fields: List[SchemaField]) -> None:
    """Assert that no two field paths are equal.

    Only paths whose last character is not ``]`` are compared.
    """
    fields_paths = [f.fieldPath for f in fields if re.match(".*[^]]$", f.fieldPath)]

    if fields_paths:
        assert len(fields_paths) == len(set(fields_paths))


def assert_field_paths_match(
    fields: List[SchemaField], expected_field_paths: List[str]
) -> None:
    """Assert that ``fields`` has exactly ``expected_field_paths``, in order."""
    logger.debug('FieldPaths=\n"' + '",\n"'.join(f.fieldPath for f in fields) + '"')
    assert len(fields) == len(expected_field_paths)
    for f, efp in zip(fields, expected_field_paths):
        assert f.fieldPath == efp
    assert_field_paths_are_unique(fields)


# Backward-compatible alias for the original (misspelled) helper name.
assret_field_paths_match = assert_field_paths_match


# NOTE: Currently this is the list of all elastic indices that datahub uses for reasonable coverage.
# Simplify these later to just have enough coverage.
+schema_test_cases: Dict[str, Tuple[str, List[str]]] = { + ".ds-datahub_usage_event-000001": ( + """{ + "@timestamp": { + "type": "date" + }, + "actorUrn": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "browserId": { + "type": "keyword" + }, + "corp_user_name": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "corp_user_username": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "dataset_name": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "dataset_platform": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "date": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "entityType": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "entityUrn": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "hash": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "height": { + "type": "long" + }, + "index": { + "type": "long" + }, + "moduleId": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "path": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "prevPathname": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "query": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "renderId": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "renderType": { + "type": "text", + "fields": { + 
"keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "scenarioType": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "search": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "timestamp": { + "type": "date" + }, + "title": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "total": { + "type": "long" + }, + "type": { + "type": "keyword" + }, + "url": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "userAgent": { + "type": "keyword" + }, + "width": { + "type": "long" + } +}""", + [ + "[version=2.0].[type=date].@timestamp", + "[version=2.0].[type=text].actorUrn", + "[version=2.0].[type=keyword].browserId", + "[version=2.0].[type=text].corp_user_name", + "[version=2.0].[type=text].corp_user_username", + "[version=2.0].[type=text].dataset_name", + "[version=2.0].[type=text].dataset_platform", + "[version=2.0].[type=text].date", + "[version=2.0].[type=text].entityType", + "[version=2.0].[type=text].entityUrn", + "[version=2.0].[type=text].hash", + "[version=2.0].[type=long].height", + "[version=2.0].[type=long].index", + "[version=2.0].[type=text].moduleId", + "[version=2.0].[type=text].path", + "[version=2.0].[type=text].prevPathname", + "[version=2.0].[type=text].query", + "[version=2.0].[type=text].renderId", + "[version=2.0].[type=text].renderType", + "[version=2.0].[type=text].scenarioType", + "[version=2.0].[type=text].search", + "[version=2.0].[type=date].timestamp", + "[version=2.0].[type=text].title", + "[version=2.0].[type=long].total", + "[version=2.0].[type=keyword].type", + "[version=2.0].[type=text].url", + "[version=2.0].[type=keyword].userAgent", + "[version=2.0].[type=long].width", + ], + ), + "chartindex_v2": ( + """{ + "access": { + "type": "keyword", + "fields": { + "keyword": { + "type": "keyword" + } + 
}, + "normalizer": "keyword_normalizer" + }, + "browsePaths": { + "type": "text", + "fields": { + "length": { + "type": "token_count", + "analyzer": "slash_pattern" + } + }, + "analyzer": "browse_path_hierarchy", + "fielddata": true + }, + "customProperties": { + "type": "keyword", + "normalizer": "keyword_normalizer" + }, + "description": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + } + }, + "normalizer": "keyword_normalizer" + }, + "editedDescription": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + } + }, + "normalizer": "keyword_normalizer" + }, + "glossaryTerms": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "hasOwners": { + "type": "boolean" + }, + "hasTags": { + "type": "boolean" + }, + "owners": { + "type": "text", + "analyzer": "urn_component" + }, + "platform": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "queryType": { + "type": "keyword", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "normalizer": "keyword_normalizer" + }, + "removed": { + "type": "boolean" + }, + "tags": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "title": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "tool": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "type": { + "type": "keyword", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "normalizer": "keyword_normalizer" + }, + "urn": { + "type": "keyword" + } 
+}""", + [ + "[version=2.0].[type=keyword].access", + "[version=2.0].[type=text].browsePaths", + "[version=2.0].[type=keyword].customProperties", + "[version=2.0].[type=keyword].description", + "[version=2.0].[type=keyword].editedDescription", + "[version=2.0].[type=text].glossaryTerms", + "[version=2.0].[type=boolean].hasOwners", + "[version=2.0].[type=boolean].hasTags", + "[version=2.0].[type=text].owners", + "[version=2.0].[type=text].platform", + "[version=2.0].[type=keyword].queryType", + "[version=2.0].[type=boolean].removed", + "[version=2.0].[type=text].tags", + "[version=2.0].[type=keyword].title", + "[version=2.0].[type=keyword].tool", + "[version=2.0].[type=keyword].type", + "[version=2.0].[type=keyword].urn", + ], + ), + "corpgroupindex_v2": ( + """{ + "description": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "displayName": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "hasTags": { + "type": "boolean" + }, + "name": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "removed": { + "type": "boolean" + }, + "tags": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "urn": { + "type": "keyword" + } +}""", + [ + "[version=2.0].[type=keyword].description", + "[version=2.0].[type=keyword].displayName", + "[version=2.0].[type=boolean].hasTags", + "[version=2.0].[type=keyword].name", + "[version=2.0].[type=boolean].removed", + "[version=2.0].[type=text].tags", + "[version=2.0].[type=keyword].urn", + 
], + ), + "corpuserindex_v2": ( + """{ + "active": { + "type": "boolean" + }, + "email": { + "type": "keyword", + "normalizer": "keyword_normalizer" + }, + "fullName": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "hasTags": { + "type": "boolean" + }, + "ldap": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "managerLdap": { + "type": "text", + "analyzer": "urn_component" + }, + "removed": { + "type": "boolean" + }, + "skills": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + } + }, + "normalizer": "keyword_normalizer" + }, + "status": { + "type": "keyword", + "normalizer": "keyword_normalizer" + }, + "tags": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "teams": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + } + }, + "normalizer": "keyword_normalizer" + }, + "title": { + "type": "keyword", + "normalizer": "keyword_normalizer" + }, + "urn": { + "type": "keyword" + } +}""", + [ + "[version=2.0].[type=boolean].active", + "[version=2.0].[type=keyword].email", + "[version=2.0].[type=keyword].fullName", + "[version=2.0].[type=boolean].hasTags", + "[version=2.0].[type=keyword].ldap", + "[version=2.0].[type=text].managerLdap", + "[version=2.0].[type=boolean].removed", + "[version=2.0].[type=keyword].skills", + "[version=2.0].[type=keyword].status", + "[version=2.0].[type=text].tags", + "[version=2.0].[type=keyword].teams", + "[version=2.0].[type=keyword].title", + "[version=2.0].[type=keyword].urn", + ], + ), + "dashboardindex_v2": ( + """{ + "access": { + 
"type": "keyword", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "normalizer": "keyword_normalizer" + }, + "browsePaths": { + "type": "text", + "fields": { + "length": { + "type": "token_count", + "analyzer": "slash_pattern" + } + }, + "analyzer": "browse_path_hierarchy", + "fielddata": true + }, + "customProperties": { + "type": "keyword", + "normalizer": "keyword_normalizer" + }, + "description": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + } + }, + "normalizer": "keyword_normalizer" + }, + "editedDescription": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + } + }, + "normalizer": "keyword_normalizer" + }, + "glossaryTerms": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "hasDescription": { + "type": "boolean" + }, + "hasOwners": { + "type": "boolean" + }, + "hasTags": { + "type": "boolean" + }, + "owners": { + "type": "text", + "analyzer": "urn_component" + }, + "platform": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "removed": { + "type": "boolean" + }, + "tags": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "title": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "tool": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "urn": { + "type": "keyword" + } +}""", + [ + "[version=2.0].[type=keyword].access", + "[version=2.0].[type=text].browsePaths", + "[version=2.0].[type=keyword].customProperties", + 
"[version=2.0].[type=keyword].description", + "[version=2.0].[type=keyword].editedDescription", + "[version=2.0].[type=text].glossaryTerms", + "[version=2.0].[type=boolean].hasDescription", + "[version=2.0].[type=boolean].hasOwners", + "[version=2.0].[type=boolean].hasTags", + "[version=2.0].[type=text].owners", + "[version=2.0].[type=text].platform", + "[version=2.0].[type=boolean].removed", + "[version=2.0].[type=text].tags", + "[version=2.0].[type=keyword].title", + "[version=2.0].[type=keyword].tool", + "[version=2.0].[type=keyword].urn", + ], + ), + "dataflowindex_v2": ( + """{ + "browsePaths": { + "type": "text", + "fields": { + "length": { + "type": "token_count", + "analyzer": "slash_pattern" + } + }, + "analyzer": "browse_path_hierarchy", + "fielddata": true + }, + "cluster": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "customProperties": { + "type": "keyword", + "normalizer": "keyword_normalizer" + }, + "description": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + } + }, + "normalizer": "keyword_normalizer" + }, + "editedDescription": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + } + }, + "normalizer": "keyword_normalizer" + }, + "flowId": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "glossaryTerms": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "hasDescription": { + "type": "boolean" + }, + "hasOwners": { + "type": "boolean" + }, + "hasTags": { + "type": "boolean" + }, + "name": { + "type": "keyword", + "fields": { + "delimited": { + 
"type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "orchestrator": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "owners": { + "type": "text", + "analyzer": "urn_component" + }, + "platform": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "project": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "removed": { + "type": "boolean" + }, + "tags": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "urn": { + "type": "keyword" + } +}""", + [ + "[version=2.0].[type=text].browsePaths", + "[version=2.0].[type=keyword].cluster", + "[version=2.0].[type=keyword].customProperties", + "[version=2.0].[type=keyword].description", + "[version=2.0].[type=keyword].editedDescription", + "[version=2.0].[type=keyword].flowId", + "[version=2.0].[type=text].glossaryTerms", + "[version=2.0].[type=boolean].hasDescription", + "[version=2.0].[type=boolean].hasOwners", + "[version=2.0].[type=boolean].hasTags", + "[version=2.0].[type=keyword].name", + "[version=2.0].[type=keyword].orchestrator", + "[version=2.0].[type=text].owners", + "[version=2.0].[type=text].platform", + "[version=2.0].[type=keyword].project", + "[version=2.0].[type=boolean].removed", + "[version=2.0].[type=text].tags", + "[version=2.0].[type=keyword].urn", + ], + ), + "datahubpolicyindex_v2": ( + """{ + "urn": { + "type": "keyword" + } +}""", + ["[version=2.0].[type=keyword].urn"], + ), + "datahubretentionindex_v2": ( + """{ + "urn": { + "type": "keyword" 
+ } +}""", + ["[version=2.0].[type=keyword].urn"], + ), + "datajobindex_v2": ( + """{ + "browsePaths": { + "type": "text", + "fields": { + "length": { + "type": "token_count", + "analyzer": "slash_pattern" + } + }, + "analyzer": "browse_path_hierarchy", + "fielddata": true + }, + "customProperties": { + "type": "keyword", + "normalizer": "keyword_normalizer" + }, + "dataFlow": { + "type": "text", + "fields": { + "ngram": { + "type": "text", + "analyzer": "partial_urn_component" + } + }, + "analyzer": "urn_component" + }, + "description": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + } + }, + "normalizer": "keyword_normalizer" + }, + "editedDescription": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + } + }, + "normalizer": "keyword_normalizer" + }, + "glossaryTerms": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "hasDescription": { + "type": "boolean" + }, + "hasOwners": { + "type": "boolean" + }, + "hasTags": { + "type": "boolean" + }, + "inputs": { + "type": "text", + "analyzer": "urn_component" + }, + "jobId": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "name": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "numInputDatasets": { + "type": "long" + }, + "numOutputDatasets": { + "type": "long" + }, + "outputs": { + "type": "text", + "analyzer": "urn_component" + }, + "owners": { + "type": "text", + "analyzer": "urn_component" + }, + "platform": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": 
"urn_component" + }, + "removed": { + "type": "boolean" + }, + "tags": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "urn": { + "type": "keyword" + } +}""", + [ + "[version=2.0].[type=text].browsePaths", + "[version=2.0].[type=keyword].customProperties", + "[version=2.0].[type=text].dataFlow", + "[version=2.0].[type=keyword].description", + "[version=2.0].[type=keyword].editedDescription", + "[version=2.0].[type=text].glossaryTerms", + "[version=2.0].[type=boolean].hasDescription", + "[version=2.0].[type=boolean].hasOwners", + "[version=2.0].[type=boolean].hasTags", + "[version=2.0].[type=text].inputs", + "[version=2.0].[type=keyword].jobId", + "[version=2.0].[type=keyword].name", + "[version=2.0].[type=long].numInputDatasets", + "[version=2.0].[type=long].numOutputDatasets", + "[version=2.0].[type=text].outputs", + "[version=2.0].[type=text].owners", + "[version=2.0].[type=text].platform", + "[version=2.0].[type=boolean].removed", + "[version=2.0].[type=text].tags", + "[version=2.0].[type=keyword].urn", + ], + ), + "dataplatformindex_v2": ( + """{ + "urn": { + "type": "keyword" + } +}""", + ["[version=2.0].[type=keyword].urn"], + ), + "dataprocessindex_v2": ( + """{ + "hasOwners": { + "type": "boolean" + }, + "inputs": { + "type": "text", + "analyzer": "urn_component" + }, + "name": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "numInputDatasets": { + "type": "long" + }, + "numOutputDatasets": { + "type": "long" + }, + "orchestrator": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "origin": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + 
"analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "outputs": { + "type": "text", + "analyzer": "urn_component" + }, + "owners": { + "type": "text", + "analyzer": "urn_component" + }, + "removed": { + "type": "boolean" + }, + "urn": { + "type": "keyword" + } +}""", + [ + "[version=2.0].[type=boolean].hasOwners", + "[version=2.0].[type=text].inputs", + "[version=2.0].[type=keyword].name", + "[version=2.0].[type=long].numInputDatasets", + "[version=2.0].[type=long].numOutputDatasets", + "[version=2.0].[type=keyword].orchestrator", + "[version=2.0].[type=keyword].origin", + "[version=2.0].[type=text].outputs", + "[version=2.0].[type=text].owners", + "[version=2.0].[type=boolean].removed", + "[version=2.0].[type=keyword].urn", + ], + ), + "dataset_datasetprofileaspect_v1": ( + """{ + "@timestamp": { + "type": "date" + }, + "event": { + "type": "object", + "enabled": false + }, + "eventGranularity": { + "type": "keyword" + }, + "isExploded": { + "type": "boolean" + }, + "messageId": { + "type": "keyword" + }, + "systemMetadata": { + "type": "object", + "enabled": false + }, + "timestampMillis": { + "type": "date" + }, + "urn": { + "type": "keyword" + } +}""", + [ + "[version=2.0].[type=date].@timestamp", + "[version=2.0].[type=object].event", + "[version=2.0].[type=keyword].eventGranularity", + "[version=2.0].[type=boolean].isExploded", + "[version=2.0].[type=keyword].messageId", + "[version=2.0].[type=object].systemMetadata", + "[version=2.0].[type=date].timestampMillis", + "[version=2.0].[type=keyword].urn", + ], + ), + "dataset_datasetusagestatisticsaspect_v1": ( + """{ + "@timestamp": { + "type": "date" + }, + "event": { + "type": "object", + "enabled": false + }, + "eventGranularity": { + "type": "keyword" + }, + "fieldCounts": { + "properties": { + "count": { + "type": "integer" + }, + "fieldPath": { + "type": "keyword" + } + } + }, + "isExploded": { + "type": "boolean" + }, 
+ "messageId": { + "type": "keyword" + }, + "systemMetadata": { + "type": "object", + "enabled": false + }, + "timestampMillis": { + "type": "date" + }, + "topSqlQueries": { + "type": "keyword" + }, + "totalSqlQueries": { + "type": "integer" + }, + "uniqueUserCount": { + "type": "integer" + }, + "urn": { + "type": "keyword" + }, + "userCounts": { + "properties": { + "count": { + "type": "integer" + }, + "user": { + "type": "keyword" + }, + "userEmail": { + "type": "keyword" + } + } + } +}""", + [ + "[version=2.0].[type=date].@timestamp", + "[version=2.0].[type=object].event", + "[version=2.0].[type=keyword].eventGranularity", + "[version=2.0].[type=fieldCounts].[type=integer].count", + "[version=2.0].[type=fieldCounts].[type=keyword].fieldPath", + "[version=2.0].[type=boolean].isExploded", + "[version=2.0].[type=keyword].messageId", + "[version=2.0].[type=object].systemMetadata", + "[version=2.0].[type=date].timestampMillis", + "[version=2.0].[type=keyword].topSqlQueries", + "[version=2.0].[type=integer].totalSqlQueries", + "[version=2.0].[type=integer].uniqueUserCount", + "[version=2.0].[type=keyword].urn", + "[version=2.0].[type=userCounts].[type=integer].count", + "[version=2.0].[type=userCounts].[type=keyword].user", + "[version=2.0].[type=userCounts].[type=keyword].userEmail", + ], + ), + "datasetindex_v2": ( + """{ + "browsePaths": { + "type": "text", + "fields": { + "length": { + "type": "token_count", + "analyzer": "slash_pattern" + } + }, + "analyzer": "browse_path_hierarchy", + "fielddata": true + }, + "customProperties": { + "type": "keyword", + "normalizer": "keyword_normalizer" + }, + "deprecated": { + "type": "boolean" + }, + "description": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + } + }, + "normalizer": "keyword_normalizer" + }, + "editedDescription": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + } + }, + "normalizer": 
"keyword_normalizer" + }, + "editedFieldDescriptions": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + } + }, + "normalizer": "keyword_normalizer" + }, + "editedFieldGlossaryTerms": { + "type": "text", + "analyzer": "urn_component" + }, + "editedFieldTags": { + "type": "text", + "analyzer": "urn_component" + }, + "fieldDescriptions": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + } + }, + "normalizer": "keyword_normalizer" + }, + "fieldGlossaryTerms": { + "type": "text", + "analyzer": "urn_component" + }, + "fieldPaths": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + } + }, + "normalizer": "keyword_normalizer" + }, + "fieldTags": { + "type": "text", + "analyzer": "urn_component" + }, + "glossaryTerms": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "hasDescription": { + "type": "boolean" + }, + "hasOwners": { + "type": "boolean" + }, + "hasTags": { + "type": "boolean" + }, + "materialized": { + "type": "boolean" + }, + "name": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "origin": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "keyword": { + "type": "keyword" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "owners": { + "type": "text", + "analyzer": "urn_component" + }, + "platform": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "removed": { + "type": "boolean" + }, + "tags": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": 
"urn_component" + }, + "typeNames": { + "type": "keyword", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "normalizer": "keyword_normalizer" + }, + "upstreams": { + "type": "text", + "analyzer": "urn_component" + }, + "urn": { + "type": "keyword" + } +}""", + [ + "[version=2.0].[type=text].browsePaths", + "[version=2.0].[type=keyword].customProperties", + "[version=2.0].[type=boolean].deprecated", + "[version=2.0].[type=keyword].description", + "[version=2.0].[type=keyword].editedDescription", + "[version=2.0].[type=keyword].editedFieldDescriptions", + "[version=2.0].[type=text].editedFieldGlossaryTerms", + "[version=2.0].[type=text].editedFieldTags", + "[version=2.0].[type=keyword].fieldDescriptions", + "[version=2.0].[type=text].fieldGlossaryTerms", + "[version=2.0].[type=keyword].fieldPaths", + "[version=2.0].[type=text].fieldTags", + "[version=2.0].[type=text].glossaryTerms", + "[version=2.0].[type=boolean].hasDescription", + "[version=2.0].[type=boolean].hasOwners", + "[version=2.0].[type=boolean].hasTags", + "[version=2.0].[type=boolean].materialized", + "[version=2.0].[type=keyword].name", + "[version=2.0].[type=keyword].origin", + "[version=2.0].[type=text].owners", + "[version=2.0].[type=text].platform", + "[version=2.0].[type=boolean].removed", + "[version=2.0].[type=text].tags", + "[version=2.0].[type=keyword].typeNames", + "[version=2.0].[type=text].upstreams", + "[version=2.0].[type=keyword].urn", + ], + ), + "glossarynodeindex_v2": ( + """{ + "definition": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + } + }, + "normalizer": "keyword_normalizer" + }, + "hasOwners": { + "type": "boolean" + }, + "name": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "owners": { + "type": "text", + "analyzer": "urn_component" + }, + 
"removed": { + "type": "boolean" + }, + "urn": { + "type": "keyword" + } +}""", + [ + "[version=2.0].[type=keyword].definition", + "[version=2.0].[type=boolean].hasOwners", + "[version=2.0].[type=keyword].name", + "[version=2.0].[type=text].owners", + "[version=2.0].[type=boolean].removed", + "[version=2.0].[type=keyword].urn", + ], + ), + "glossarytermindex_v2": ( + """{ + "browsePaths": { + "type": "text", + "fields": { + "length": { + "type": "token_count", + "analyzer": "slash_pattern" + } + }, + "analyzer": "browse_path_hierarchy", + "fielddata": true + }, + "definition": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + } + }, + "normalizer": "keyword_normalizer" + }, + "hasOwners": { + "type": "boolean" + }, + "hasRelatedTerms": { + "type": "text", + "analyzer": "urn_component" + }, + "isRelatedTerms": { + "type": "text", + "analyzer": "urn_component" + }, + "name": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "owners": { + "type": "text", + "analyzer": "urn_component" + }, + "removed": { + "type": "boolean" + }, + "sourceRef": { + "type": "keyword", + "normalizer": "keyword_normalizer" + }, + "termSource": { + "type": "keyword", + "normalizer": "keyword_normalizer" + }, + "urn": { + "type": "keyword" + } +}""", + [ + "[version=2.0].[type=text].browsePaths", + "[version=2.0].[type=keyword].definition", + "[version=2.0].[type=boolean].hasOwners", + "[version=2.0].[type=text].hasRelatedTerms", + "[version=2.0].[type=text].isRelatedTerms", + "[version=2.0].[type=keyword].name", + "[version=2.0].[type=text].owners", + "[version=2.0].[type=boolean].removed", + "[version=2.0].[type=keyword].sourceRef", + "[version=2.0].[type=keyword].termSource", + "[version=2.0].[type=keyword].urn", + ], + ), + "ilm-history-2-000001": ( + """{ + 
"@timestamp": { + "type": "date", + "format": "epoch_millis" + }, + "error_details": { + "type": "text" + }, + "index": { + "type": "keyword" + }, + "index_age": { + "type": "long" + }, + "policy": { + "type": "keyword" + }, + "state": { + "dynamic": "true", + "properties": { + "action": { + "type": "keyword" + }, + "action_time": { + "type": "date", + "format": "epoch_millis" + }, + "creation_date": { + "type": "date", + "format": "epoch_millis" + }, + "failed_step": { + "type": "keyword" + }, + "is_auto-retryable_error": { + "type": "keyword" + }, + "phase": { + "type": "keyword" + }, + "phase_definition": { + "type": "text" + }, + "phase_time": { + "type": "date", + "format": "epoch_millis" + }, + "step": { + "type": "keyword" + }, + "step_info": { + "type": "text" + }, + "step_time": { + "type": "date", + "format": "epoch_millis" + } + } + }, + "success": { + "type": "boolean" + } +}""", + [ + "[version=2.0].[type=date].@timestamp", + "[version=2.0].[type=text].error_details", + "[version=2.0].[type=keyword].index", + "[version=2.0].[type=long].index_age", + "[version=2.0].[type=keyword].policy", + "[version=2.0].[type=state].[type=keyword].action", + "[version=2.0].[type=state].[type=date].action_time", + "[version=2.0].[type=state].[type=date].creation_date", + "[version=2.0].[type=state].[type=keyword].failed_step", + "[version=2.0].[type=state].[type=keyword].is_auto-retryable_error", + "[version=2.0].[type=state].[type=keyword].phase", + "[version=2.0].[type=state].[type=text].phase_definition", + "[version=2.0].[type=state].[type=date].phase_time", + "[version=2.0].[type=state].[type=keyword].step", + "[version=2.0].[type=state].[type=text].step_info", + "[version=2.0].[type=state].[type=date].step_time", + "[version=2.0].[type=boolean].success", + ], + ), + "mlfeatureindex_v2": ( + """{ + "browsePaths": { + "type": "text", + "fields": { + "length": { + "type": "token_count", + "analyzer": "slash_pattern" + } + }, + "analyzer": "browse_path_hierarchy", + 
"fielddata": true + }, + "deprecated": { + "type": "boolean" + }, + "featureNamespace": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "hasOwners": { + "type": "boolean" + }, + "hasTags": { + "type": "boolean" + }, + "name": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "owners": { + "type": "text", + "analyzer": "urn_component" + }, + "platform": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "removed": { + "type": "boolean" + }, + "tags": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "urn": { + "type": "keyword" + } +}""", + [ + "[version=2.0].[type=text].browsePaths", + "[version=2.0].[type=boolean].deprecated", + "[version=2.0].[type=keyword].featureNamespace", + "[version=2.0].[type=boolean].hasOwners", + "[version=2.0].[type=boolean].hasTags", + "[version=2.0].[type=keyword].name", + "[version=2.0].[type=text].owners", + "[version=2.0].[type=text].platform", + "[version=2.0].[type=boolean].removed", + "[version=2.0].[type=text].tags", + "[version=2.0].[type=keyword].urn", + ], + ), + "mlfeaturetableindex_v2": ( + """{ + "browsePaths": { + "type": "text", + "fields": { + "length": { + "type": "token_count", + "analyzer": "slash_pattern" + } + }, + "analyzer": "browse_path_hierarchy", + "fielddata": true + }, + "customProperties": { + "type": "keyword", + "normalizer": "keyword_normalizer" + }, + "deprecated": { + "type": "boolean" + }, + "hasOwners": { + "type": "boolean" + }, + "hasTags": { + "type": "boolean" + }, + "name": { + "type": "keyword", + "fields": { + "delimited": { + "type": 
"text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "owners": { + "type": "text", + "analyzer": "urn_component" + }, + "platform": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "removed": { + "type": "boolean" + }, + "tags": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "urn": { + "type": "keyword" + } +}""", + [ + "[version=2.0].[type=text].browsePaths", + "[version=2.0].[type=keyword].customProperties", + "[version=2.0].[type=boolean].deprecated", + "[version=2.0].[type=boolean].hasOwners", + "[version=2.0].[type=boolean].hasTags", + "[version=2.0].[type=keyword].name", + "[version=2.0].[type=text].owners", + "[version=2.0].[type=text].platform", + "[version=2.0].[type=boolean].removed", + "[version=2.0].[type=text].tags", + "[version=2.0].[type=keyword].urn", + ], + ), + "mlmodeldeploymentindex_v2": ( + """{ + "customProperties": { + "type": "keyword", + "normalizer": "keyword_normalizer" + }, + "deprecated": { + "type": "boolean" + }, + "description": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + } + }, + "normalizer": "keyword_normalizer" + }, + "hasDescription": { + "type": "boolean" + }, + "hasOwners": { + "type": "boolean" + }, + "hasTags": { + "type": "boolean" + }, + "name": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "origin": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "keyword": { + "type": "keyword" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "owners": { + 
"type": "text", + "analyzer": "urn_component" + }, + "platform": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "removed": { + "type": "boolean" + }, + "tags": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "urn": { + "type": "keyword" + } +}""", + [ + "[version=2.0].[type=keyword].customProperties", + "[version=2.0].[type=boolean].deprecated", + "[version=2.0].[type=keyword].description", + "[version=2.0].[type=boolean].hasDescription", + "[version=2.0].[type=boolean].hasOwners", + "[version=2.0].[type=boolean].hasTags", + "[version=2.0].[type=keyword].name", + "[version=2.0].[type=keyword].origin", + "[version=2.0].[type=text].owners", + "[version=2.0].[type=text].platform", + "[version=2.0].[type=boolean].removed", + "[version=2.0].[type=text].tags", + "[version=2.0].[type=keyword].urn", + ], + ), + "mlmodelgroupindex_v2": ( + """{ + "browsePaths": { + "type": "text", + "fields": { + "length": { + "type": "token_count", + "analyzer": "slash_pattern" + } + }, + "analyzer": "browse_path_hierarchy", + "fielddata": true + }, + "customProperties": { + "type": "keyword", + "normalizer": "keyword_normalizer" + }, + "deprecated": { + "type": "boolean" + }, + "description": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + } + }, + "normalizer": "keyword_normalizer" + }, + "hasDescription": { + "type": "boolean" + }, + "hasOwners": { + "type": "boolean" + }, + "hasTags": { + "type": "boolean" + }, + "name": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "origin": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": 
"partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "owners": { + "type": "text", + "analyzer": "urn_component" + }, + "platform": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "removed": { + "type": "boolean" + }, + "tags": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "urn": { + "type": "keyword" + } +}""", + [ + "[version=2.0].[type=text].browsePaths", + "[version=2.0].[type=keyword].customProperties", + "[version=2.0].[type=boolean].deprecated", + "[version=2.0].[type=keyword].description", + "[version=2.0].[type=boolean].hasDescription", + "[version=2.0].[type=boolean].hasOwners", + "[version=2.0].[type=boolean].hasTags", + "[version=2.0].[type=keyword].name", + "[version=2.0].[type=keyword].origin", + "[version=2.0].[type=text].owners", + "[version=2.0].[type=text].platform", + "[version=2.0].[type=boolean].removed", + "[version=2.0].[type=text].tags", + "[version=2.0].[type=keyword].urn", + ], + ), + "mlmodelindex_v2": ( + """{ + "browsePaths": { + "type": "text", + "fields": { + "length": { + "type": "token_count", + "analyzer": "slash_pattern" + } + }, + "analyzer": "browse_path_hierarchy", + "fielddata": true + }, + "customProperties": { + "type": "keyword", + "normalizer": "keyword_normalizer" + }, + "deprecated": { + "type": "boolean" + }, + "description": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + } + }, + "normalizer": "keyword_normalizer" + }, + "hasDescription": { + "type": "boolean" + }, + "hasOwners": { + "type": "boolean" + }, + "hasTags": { + "type": "boolean" + }, + "name": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "origin": { + "type": "keyword", + "fields": { + 
"delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "keyword": { + "type": "keyword" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "owners": { + "type": "text", + "analyzer": "urn_component" + }, + "platform": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "removed": { + "type": "boolean" + }, + "tags": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "type": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "urn": { + "type": "keyword" + } +}""", + [ + "[version=2.0].[type=text].browsePaths", + "[version=2.0].[type=keyword].customProperties", + "[version=2.0].[type=boolean].deprecated", + "[version=2.0].[type=keyword].description", + "[version=2.0].[type=boolean].hasDescription", + "[version=2.0].[type=boolean].hasOwners", + "[version=2.0].[type=boolean].hasTags", + "[version=2.0].[type=keyword].name", + "[version=2.0].[type=keyword].origin", + "[version=2.0].[type=text].owners", + "[version=2.0].[type=text].platform", + "[version=2.0].[type=boolean].removed", + "[version=2.0].[type=text].tags", + "[version=2.0].[type=keyword].type", + "[version=2.0].[type=keyword].urn", + ], + ), + "mlprimarykeyindex_v2": ( + """{ + "deprecated": { + "type": "boolean" + }, + "featureNamespace": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "hasOwners": { + "type": "boolean" + }, + "hasTags": { + "type": "boolean" + }, + "name": { + "type": "keyword", + "fields": { + "delimited": { + "type": "text", + "analyzer": "word_delimited" + }, 
+ "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "owners": { + "type": "text", + "analyzer": "urn_component" + }, + "platform": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "removed": { + "type": "boolean" + }, + "tags": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "urn_component" + }, + "urn": { + "type": "keyword" + } +}""", + [ + "[version=2.0].[type=boolean].deprecated", + "[version=2.0].[type=keyword].featureNamespace", + "[version=2.0].[type=boolean].hasOwners", + "[version=2.0].[type=boolean].hasTags", + "[version=2.0].[type=keyword].name", + "[version=2.0].[type=text].owners", + "[version=2.0].[type=text].platform", + "[version=2.0].[type=boolean].removed", + "[version=2.0].[type=text].tags", + "[version=2.0].[type=keyword].urn", + ], + ), + "schemafieldindex_v2": ( + """{ + "fieldPath": { + "type": "keyword", + "normalizer": "keyword_normalizer" + }, + "parent": { + "type": "text", + "analyzer": "urn_component" + }, + "urn": { + "type": "keyword" + } +}""", + [ + "[version=2.0].[type=keyword].fieldPath", + "[version=2.0].[type=text].parent", + "[version=2.0].[type=keyword].urn", + ], + ), + "system_metadata_service_v1": ( + """{ + "aspect": { + "type": "keyword" + }, + "lastUpdated": { + "type": "long" + }, + "registryName": { + "type": "keyword" + }, + "registryVersion": { + "type": "keyword" + }, + "runId": { + "type": "keyword" + }, + "urn": { + "type": "keyword" + } +}""", + [ + "[version=2.0].[type=keyword].aspect", + "[version=2.0].[type=long].lastUpdated", + "[version=2.0].[type=keyword].registryName", + "[version=2.0].[type=keyword].registryVersion", + "[version=2.0].[type=keyword].runId", + "[version=2.0].[type=keyword].urn", + ], + ), + "tagindex_v2": ( + """{ + "hasOwners": { + "type": "boolean" + }, + "name": { + "type": "keyword", + "fields": { + "delimited": { + "type": 
"text", + "analyzer": "word_delimited" + }, + "ngram": { + "type": "text", + "analyzer": "partial" + } + }, + "normalizer": "keyword_normalizer" + }, + "owners": { + "type": "text", + "analyzer": "urn_component" + }, + "removed": { + "type": "boolean" + }, + "urn": { + "type": "keyword" + } +}""", + [ + "[version=2.0].[type=boolean].hasOwners", + "[version=2.0].[type=keyword].name", + "[version=2.0].[type=text].owners", + "[version=2.0].[type=boolean].removed", + "[version=2.0].[type=keyword].urn", + ], + ), +} + + +@pytest.mark.parametrize( + "schema, expected_field_paths", + schema_test_cases.values(), + ids=schema_test_cases.keys(), +) +def test_elastic_search_schema_conversion( + schema: str, expected_field_paths: List[str] +) -> None: + schema_dict: Dict[str, Any] = json.loads(schema) + mappings: Dict[str, Any] = {"properties": schema_dict} + actual_fields = list(ElasticToSchemaFieldConverter.get_schema_fields(mappings)) + assret_field_paths_match(actual_fields, expected_field_paths) diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml index e7581990ab779..5417796e3ea53 100644 --- a/metadata-models/src/main/resources/entity-registry.yml +++ b/metadata-models/src/main/resources/entity-registry.yml @@ -8,6 +8,8 @@ entities: - datasetProfile - datasetUsageStatistics - operation + - schemaMetadata + - status - name: dataHubPolicy doc: DataHub Policies represent access policies granted to users or groups on metadata operations like edit, view etc. keyAspect: dataHubPolicyKey