From 8fcd5e8e5711e5c48a950e63dc8ffaafa5f7bce1 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Tue, 8 Oct 2024 10:16:49 -0500 Subject: [PATCH 01/11] fix(datahub-gc): Update ingestion-datahub-gc.yaml (#11556) --- .../main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml b/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml index 47f6deedda2c8..e70ab1162a381 100644 --- a/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml +++ b/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml @@ -30,5 +30,6 @@ extraArgs: {} debugMode: false executorId: default - source: {{source}}{{^source}}SYSTEM{{/source}} - headers: {} \ No newline at end of file + source: + type: {{source}}{{^source}}SYSTEM{{/source}} + headers: {} From 35dfff775576c6a5fe7e1bea6250451fc8982632 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Tue, 8 Oct 2024 10:52:40 -0500 Subject: [PATCH 02/11] fix(ci): fix cassandra test flake (#11557) --- .../CassandraAspectMigrationsDaoTest.java | 6 +++--- .../{ => cassandra}/CassandraEntityServiceTest.java | 8 +++++--- .../{ => cassandra}/CassandraTimelineServiceTest.java | 4 +++- metadata-io/src/test/resources/testng-cassandra.xml | 10 ++++++++++ metadata-io/src/test/resources/testng-other.xml | 2 ++ metadata-io/src/test/resources/testng.xml | 1 + 6 files changed, 24 insertions(+), 7 deletions(-) rename metadata-io/src/test/java/com/linkedin/metadata/entity/{ => cassandra}/CassandraAspectMigrationsDaoTest.java (92%) rename metadata-io/src/test/java/com/linkedin/metadata/entity/{ => cassandra}/CassandraEntityServiceTest.java (97%) rename metadata-io/src/test/java/com/linkedin/metadata/timeline/{ => cassandra}/CassandraTimelineServiceTest.java (94%) create mode 100644 metadata-io/src/test/resources/testng-cassandra.xml diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/CassandraAspectMigrationsDaoTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraAspectMigrationsDaoTest.java similarity index 92% rename from metadata-io/src/test/java/com/linkedin/metadata/entity/CassandraAspectMigrationsDaoTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraAspectMigrationsDaoTest.java index d9077b7279160..11b00011ee6e0 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/CassandraAspectMigrationsDaoTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraAspectMigrationsDaoTest.java @@ -1,12 +1,12 @@ -package com.linkedin.metadata.entity; +package com.linkedin.metadata.entity.cassandra; import static org.mockito.Mockito.*; import com.datastax.oss.driver.api.core.CqlSession; import com.linkedin.metadata.CassandraTestUtils; import com.linkedin.metadata.config.PreProcessHooks; -import com.linkedin.metadata.entity.cassandra.CassandraAspectDao; -import com.linkedin.metadata.entity.cassandra.CassandraRetentionService; +import com.linkedin.metadata.entity.AspectMigrationsDaoTest; +import com.linkedin.metadata.entity.EntityServiceImpl; import com.linkedin.metadata.event.EventProducer; import com.linkedin.metadata.models.registry.EntityRegistryException; import com.linkedin.metadata.service.UpdateIndicesService; diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/CassandraEntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java similarity index 97% rename from metadata-io/src/test/java/com/linkedin/metadata/entity/CassandraEntityServiceTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java index ef6c9e56e132b..5535866f87c99 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/CassandraEntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java @@ -1,4 +1,4 @@ -package com.linkedin.metadata.entity; +package com.linkedin.metadata.entity.cassandra; import static org.mockito.Mockito.*; import static org.testng.Assert.*; @@ -11,8 +11,10 @@ import com.linkedin.metadata.AspectIngestionUtils; import com.linkedin.metadata.CassandraTestUtils; import com.linkedin.metadata.config.PreProcessHooks; -import com.linkedin.metadata.entity.cassandra.CassandraAspectDao; -import com.linkedin.metadata.entity.cassandra.CassandraRetentionService; +import com.linkedin.metadata.entity.EntityServiceAspectRetriever; +import com.linkedin.metadata.entity.EntityServiceImpl; +import com.linkedin.metadata.entity.EntityServiceTest; +import com.linkedin.metadata.entity.ListResult; import com.linkedin.metadata.event.EventProducer; import com.linkedin.metadata.key.CorpUserKey; import com.linkedin.metadata.models.registry.EntityRegistryException; diff --git a/metadata-io/src/test/java/com/linkedin/metadata/timeline/CassandraTimelineServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/timeline/cassandra/CassandraTimelineServiceTest.java similarity index 94% rename from metadata-io/src/test/java/com/linkedin/metadata/timeline/CassandraTimelineServiceTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/timeline/cassandra/CassandraTimelineServiceTest.java index 4f515ba014412..6dbeef32730f7 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/timeline/CassandraTimelineServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/timeline/cassandra/CassandraTimelineServiceTest.java @@ -1,4 +1,4 @@ -package com.linkedin.metadata.timeline; +package com.linkedin.metadata.timeline.cassandra; import static org.mockito.Mockito.mock; @@ -9,6 +9,8 @@ import com.linkedin.metadata.entity.cassandra.CassandraAspectDao; import com.linkedin.metadata.event.EventProducer; import com.linkedin.metadata.models.registry.EntityRegistryException; +import com.linkedin.metadata.timeline.TimelineServiceImpl; +import com.linkedin.metadata.timeline.TimelineServiceTest; import org.testcontainers.containers.CassandraContainer; import org.testng.Assert; import org.testng.annotations.AfterClass; diff --git a/metadata-io/src/test/resources/testng-cassandra.xml b/metadata-io/src/test/resources/testng-cassandra.xml new file mode 100644 index 0000000000000..ea3d0dae0ede6 --- /dev/null +++ b/metadata-io/src/test/resources/testng-cassandra.xml @@ -0,0 +1,10 @@ + + + + + + + + + + \ No newline at end of file diff --git a/metadata-io/src/test/resources/testng-other.xml b/metadata-io/src/test/resources/testng-other.xml index e214fdb8c1f61..c591e8fcc4e43 100644 --- a/metadata-io/src/test/resources/testng-other.xml +++ b/metadata-io/src/test/resources/testng-other.xml @@ -8,6 +8,8 @@ + + diff --git a/metadata-io/src/test/resources/testng.xml b/metadata-io/src/test/resources/testng.xml index fdd1c1a6c8921..f004136dec9a4 100644 --- a/metadata-io/src/test/resources/testng.xml +++ b/metadata-io/src/test/resources/testng.xml @@ -9,6 +9,7 @@ parallel followed by everything else. + \ No newline at end of file From 86d394c1e1eb89b610763d00f2f0e1d42795263b Mon Sep 17 00:00:00 2001 From: RyanHolstien Date: Tue, 8 Oct 2024 13:51:27 -0500 Subject: [PATCH 03/11] feat(scan): add scanning to setup images (#11563) --- .github/workflows/docker-unified.yml | 99 ++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index ceee59215e431..d1c16b567158a 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -480,6 +480,39 @@ jobs: context: . file: ./docker/kafka-setup/Dockerfile platforms: linux/amd64,linux/arm64/v8 + kafka_setup_scan: + permissions: + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status + name: "[Monitoring] Scan Kafka Setup images for vulnerabilities" + runs-on: ubuntu-latest + needs: [ setup, kafka_setup_build ] + if: ${{ needs.setup.outputs.kafka_setup_change == 'true' || (needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true') }} + steps: + - name: Checkout # adding checkout step just to make trivy upload happy + uses: acryldata/sane-checkout-action@v3 + - name: Download image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} + with: + image: ${{ env.DATAHUB_KAFKA_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@0.25.0 + env: + TRIVY_OFFLINE_SCAN: true + with: + image-ref: ${{ env.DATAHUB_KAFKA_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + format: "template" + template: "@/contrib/sarif.tpl" + output: "trivy-results.sarif" + severity: "CRITICAL,HIGH" + ignore-unfixed: true + vuln-type: "os,library" + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v2 + with: + sarif_file: "trivy-results.sarif" mysql_setup_build: name: Build and Push DataHub MySQL Setup Docker Image @@ -501,6 +534,39 @@ jobs: context: . file: ./docker/mysql-setup/Dockerfile platforms: linux/amd64,linux/arm64/v8 + mysql_setup_scan: + permissions: + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status + name: "[Monitoring] Scan MySQL Setup images for vulnerabilities" + runs-on: ubuntu-latest + needs: [ setup, mysql_setup_build ] + if: ${{ needs.setup.outputs.mysql_setup_change == 'true' || (needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true') }} + steps: + - name: Checkout # adding checkout step just to make trivy upload happy + uses: acryldata/sane-checkout-action@v3 + - name: Download image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} + with: + image: ${{ env.DATAHUB_MYSQL_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@0.25.0 + env: + TRIVY_OFFLINE_SCAN: true + with: + image-ref: ${{ env.DATAHUB_MYSQL_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + format: "template" + template: "@/contrib/sarif.tpl" + output: "trivy-results.sarif" + severity: "CRITICAL,HIGH" + ignore-unfixed: true + vuln-type: "os,library" + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v2 + with: + sarif_file: "trivy-results.sarif" elasticsearch_setup_build: name: Build and Push DataHub Elasticsearch Setup Docker Image @@ -522,6 +588,39 @@ jobs: context: . file: ./docker/elasticsearch-setup/Dockerfile platforms: linux/amd64,linux/arm64/v8 + elasticsearch_setup_scan: + permissions: + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status + name: "[Monitoring] Scan ElasticSearch setup images for vulnerabilities" + runs-on: ubuntu-latest + needs: [ setup, elasticsearch_setup_build ] + if: ${{ needs.setup.outputs.elasticsearch_setup_change == 'true' || (needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' ) }} + steps: + - name: Checkout # adding checkout step just to make trivy upload happy + uses: acryldata/sane-checkout-action@v3 + - name: Download image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} + with: + image: ${{ env.DATAHUB_ELASTIC_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@0.25.0 + env: + TRIVY_OFFLINE_SCAN: true + with: + image-ref: ${{ env.DATAHUB_ELASTIC_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + format: "template" + template: "@/contrib/sarif.tpl" + output: "trivy-results.sarif" + severity: "CRITICAL,HIGH" + ignore-unfixed: true + vuln-type: "os,library" + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v2 + with: + sarif_file: "trivy-results.sarif" datahub_ingestion_base_build: name: Build and Push DataHub Ingestion (Base) Docker Image From 5335454d6a2073b1e6d17124b03148bd997d8a9f Mon Sep 17 00:00:00 2001 From: Udbhav Agrawal <92296417+udbhav-hbk@users.noreply.github.com> Date: Wed, 9 Oct 2024 01:37:10 +0530 Subject: [PATCH 04/11] Fix data search for fieldType = OBJECT for non-string fields (#11524) Co-authored-by: david-leifker <114954101+david-leifker@users.noreply.github.com> --- .../query/request/SearchRequestHandler.java | 44 ++++++++++++- .../metadata/search/utils/ESUtils.java | 3 +- .../metadata/search/utils/ESUtilsTest.java | 62 ++++++++++++++++++- 3 files changed, 106 insertions(+), 3 deletions(-) diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java index cb02fb1c8b2f7..01a1e9cb15984 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java @@ -13,6 +13,8 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.linkedin.common.urn.Urn; +import com.linkedin.data.schema.DataSchema; +import com.linkedin.data.schema.MapDataSchema; import com.linkedin.data.template.DoubleMap; import com.linkedin.metadata.config.search.SearchConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; @@ -662,8 +664,48 @@ private static Map> buildSearchableF Collections.emptySet()))) .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + List objectFieldSpec = + entitySpec.getSearchableFieldSpecs().stream() + .filter( + searchableFieldSpec -> + searchableFieldSpec.getSearchableAnnotation().getFieldType() + == SearchableAnnotation.FieldType.OBJECT) + .collect(Collectors.toList()); + + Map> objectFieldTypes = new HashMap<>(); + + objectFieldSpec.forEach( + fieldSpec -> { + String fieldName = fieldSpec.getSearchableAnnotation().getFieldName(); + DataSchema.Type dataType = + ((MapDataSchema) fieldSpec.getPegasusSchema()).getValues().getType(); + + Set fieldType; + + switch (dataType) { + case BOOLEAN: + fieldType = Set.of(SearchableAnnotation.FieldType.BOOLEAN); + break; + case INT: + fieldType = Set.of(SearchableAnnotation.FieldType.COUNT); + break; + case DOUBLE: + case LONG: + case FLOAT: + fieldType = Set.of(SearchableAnnotation.FieldType.DOUBLE); + break; + default: + fieldType = Set.of(SearchableAnnotation.FieldType.TEXT); + break; + } + objectFieldTypes.put(fieldName, fieldType); + annotationFieldTypes.remove(fieldName); + }); + return Stream.concat( - annotationFieldTypes.entrySet().stream(), + Stream.concat( + objectFieldTypes.entrySet().stream(), + annotationFieldTypes.entrySet().stream()), Stream.concat( mappingFieldTypes.entrySet().stream(), aliasFieldTypes.entrySet().stream())); }) diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java index e135f1941bfec..f72b5fc1f6d22 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java @@ -759,7 +759,7 @@ private static Set getFieldTypes( StructuredPropertyUtils.toElasticsearchFieldType(fieldName, aspectRetriever); } else { Set fieldTypes = - searchableFields.getOrDefault(fieldName, Collections.emptySet()); + searchableFields.getOrDefault(fieldName.split("\\.")[0], Collections.emptySet()); finalFieldTypes = fieldTypes.stream().map(ESUtils::getElasticTypeForFieldType).collect(Collectors.toSet()); } @@ -785,6 +785,7 @@ private static RangeQueryBuilder buildRangeQueryFromCriterion( // Determine criterion value, range query only accepts single value so take first value in // values if multiple String criterionValueString = criterion.getValues().get(0).trim(); + Object criterionValue; String documentFieldName; if (fieldTypes.contains(BOOLEAN_FIELD_TYPE)) { diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/utils/ESUtilsTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/utils/ESUtilsTest.java index 8d06594e415e0..6665faacae337 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/utils/ESUtilsTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/utils/ESUtilsTest.java @@ -10,10 +10,13 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; +import com.google.common.collect.ImmutableList; import com.linkedin.common.urn.Urn; import com.linkedin.data.template.SetMode; +import com.linkedin.data.template.StringArray; import com.linkedin.entity.Aspect; import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.models.annotation.SearchableAnnotation; import com.linkedin.metadata.query.filter.Condition; import com.linkedin.metadata.query.filter.Criterion; import com.linkedin.metadata.search.elasticsearch.query.filter.QueryFilterRewriteChain; @@ -38,7 +41,7 @@ public class ESUtilsTest { private static AspectRetriever aspectRetrieverV1; @BeforeClass - public void setup() throws RemoteInvocationException, URISyntaxException { + public static void setup() throws RemoteInvocationException, URISyntaxException { Urn abFghTenUrn = Urn.createFromString("urn:li:structuredProperty:ab.fgh.ten"); // legacy @@ -835,4 +838,61 @@ public void testGetQueryBuilderFromStructPropExistsV1() { + "}"; Assert.assertEquals(result.toString(), expected); } + + @Test + public void testGetQueryBuilderForObjectFields() { + final Criterion singleValueCriterion = + new Criterion() + .setField("testObjectField.numericField") + .setCondition(Condition.EQUAL) + .setValues(new StringArray(ImmutableList.of("10"))); + + Map> searchableFieldTypes = new HashMap<>(); + searchableFieldTypes.put("testObjectField", Set.of(SearchableAnnotation.FieldType.DOUBLE)); + + QueryBuilder result = + ESUtils.getQueryBuilderFromCriterion( + singleValueCriterion, + false, + searchableFieldTypes, + mock(OperationContext.class), + QueryFilterRewriteChain.EMPTY); + String expected = + "{\n" + + " \"terms\" : {\n" + + " \"testObjectField.numericField\" : [\n" + + " 10.0\n" + + " ],\n" + + " \"boost\" : 1.0,\n" + + " \"_name\" : \"testObjectField.numericField\"\n" + + " }\n" + + "}"; + Assert.assertEquals(result.toString(), expected); + + final Criterion multiValueCriterion = + new Criterion() + .setField("testObjectField.numericField") + .setCondition(Condition.EQUAL) + .setValues(new StringArray(ImmutableList.of("10", "20"))); + + result = + ESUtils.getQueryBuilderFromCriterion( + multiValueCriterion, + false, + searchableFieldTypes, + mock(OperationContext.class), + QueryFilterRewriteChain.EMPTY); + expected = + "{\n" + + " \"terms\" : {\n" + + " \"testObjectField.numericField\" : [\n" + + " 10.0,\n" + + " 20.0\n" + + " ],\n" + + " \"boost\" : 1.0,\n" + + " \"_name\" : \"testObjectField.numericField\"\n" + + " }\n" + + "}"; + Assert.assertEquals(result.toString(), expected); + } } From e535d72da9fa49631c44eb82972a20bf6030d12e Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Tue, 8 Oct 2024 17:24:17 -0500 Subject: [PATCH 05/11] docs(search): add structured search examples for structured properties (#11565) --- docs/how/search.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/how/search.md b/docs/how/search.md index 4df5e7c1984d5..2274fe7c09240 100644 --- a/docs/how/search.md +++ b/docs/how/search.md @@ -105,6 +105,20 @@ If you want to: - ```/q customProperties: encoding*``` [Sample results](https://demo.datahubproject.io/search?page=1&query=%2Fq%20customProperties%3A%20encoding%2A) - Dataset Properties are indexed in ElasticSearch the manner of key=value. Hence if you know the precise key-value pair, you can search using ```"key=value"```. However, if you only know the key, you can use wildcards to replace the value and that is what is being done here. +- Find an entity with an **unversioned** structured property + - ```/q structuredProperties.io_acryl_privacy_retentionTime01:60``` + - This will return results for an **unversioned** structured property's qualified name `io.acryl.private.retentionTime01` and value `60`. + - ```/q _exists_:structuredProperties.io_acryl_privacy_retentionTime01``` + - In this example, the query will return any entity which has any value for the **unversioned** structured property with qualified name `io.acryl.private.retentionTime01`. + +- Find an entity with a **versioned** structured property + - ```/q structuredProperties._versioned.io_acryl_privacy_retentionTime.20240614080000.number:365``` + - This query will return results for a **versioned** structured property with qualified name `io.acryl.privacy.retentionTime`, version `20240614080000`, type `number` and value `365`. + - ```/q _exists_:structuredProperties._versioned.io_acryl_privacy_retentionTime.20240614080000.number``` + - Returns results for a **versioned** structured property with qualified name `io.acryl.privacy.retentionTime`, version `20240614080000` and type `number`. + - ```/q structuredProperties._versioned.io_acryl_privacy_retentionTime.\*.\*:365``` + - Returns results for a **versioned** structured property with any version and type with a values of `365` + - Find a dataset with a column name, **latitude** - ```/q fieldPaths: latitude``` [Sample results](https://demo.datahubproject.io/search?page=1&query=%2Fq%20fieldPaths%3A%20latitude) - fieldPaths is the name of the attribute that holds the column name in Datasets. From 732543f2017f3cb9a03f872a8efb1f3931fb12e3 Mon Sep 17 00:00:00 2001 From: Ellie O'Neil <110510035+eboneil@users.noreply.github.com> Date: Wed, 9 Oct 2024 01:05:57 -0700 Subject: [PATCH 06/11] docs(datahub source): Add urn exclusions to docs (#11568) --- .../docs/sources/datahub/datahub_pre.md | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/metadata-ingestion/docs/sources/datahub/datahub_pre.md b/metadata-ingestion/docs/sources/datahub/datahub_pre.md index cb1cc2c4d5903..b35eb5811e4c9 100644 --- a/metadata-ingestion/docs/sources/datahub/datahub_pre.md +++ b/metadata-ingestion/docs/sources/datahub/datahub_pre.md @@ -71,3 +71,27 @@ and [mce-consumer](../../../../metadata-jobs/mce-consumer-job/README.md)) - Increase the number of gms pods to add redundancy and increase resilience to node evictions * If you are migrating large amounts of data, consider increasing elasticsearch's thread count via the `ELASTICSEARCH_THREAD_COUNT` environment variable. + +#### Exclusions +You will likely want to exclude some urn types from your ingestion, as they contain instance-specific +metadata, such as settings, roles, policies, ingestion sources, and ingestion runs. For example, you +will likely want to start with this: + +```yaml +source: + config: + urn_pattern: # URN pattern to ignore/include in the ingestion + deny: + # Ignores all datahub metadata where the urn matches the regex + - ^urn:li:role.* # Only exclude if you do not want to ingest roles + - ^urn:li:dataHubRole.* # Only exclude if you do not want to ingest roles + - ^urn:li:dataHubPolicy.* # Only exclude if you do not want to ingest policies + - ^urn:li:dataHubIngestionSource.* # Only exclude if you do not want to ingest ingestion sources + - ^urn:li:dataHubSecret.* + - ^urn:li:dataHubExecutionRequest.* + - ^urn:li:dataHubAccessToken.* + - ^urn:li:dataHubUpgrade.* + - ^urn:li:inviteToken.* + - ^urn:li:globalSettings.* + - ^urn:li:dataHubStepState.* +``` From ef4805d85247e654552b1bb5d1f3766f783383e6 Mon Sep 17 00:00:00 2001 From: Ellie O'Neil <110510035+eboneil@users.noreply.github.com> Date: Wed, 9 Oct 2024 01:11:28 -0700 Subject: [PATCH 07/11] docs(csv-enricher): Add warning for write_semantics (#11561) --- .../src/datahub/ingestion/source/csv_enricher.py | 4 ++++ .../src/datahub/ingestion/source_config/csv_enricher.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py index e3f9a150ad000..e4829f8713cf7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py +++ b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py @@ -129,6 +129,10 @@ class CSVEnricherSource(Source): If ownership_type_urn is set then ownership_type must be set to CUSTOM. + Note that you have the option in your recipe config to write as a PATCH or as an OVERRIDE. This choice will apply to + all metadata for the entity, not just a single aspect. So OVERRIDE will override all metadata, including performing + deletes if a metadata field is empty. The default is PATCH. + :::note This source will not work on very large csv files that do not fit in memory. ::: diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/csv_enricher.py b/metadata-ingestion/src/datahub/ingestion/source_config/csv_enricher.py index e62c46888ef0e..f0f0ab95ca811 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/csv_enricher.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/csv_enricher.py @@ -11,7 +11,7 @@ class CSVEnricherConfig(ConfigModel): ) write_semantics: str = pydantic.Field( default="PATCH", - description='Whether the new tags, terms and owners to be added will override the existing ones added only by this source or not. Value for this config can be "PATCH" or "OVERRIDE"', + description='Whether the new tags, terms and owners to be added will override the existing ones added only by this source or not. Value for this config can be "PATCH" or "OVERRIDE". NOTE: this will apply to all metadata for the entity, not just a single aspect.', ) delimiter: str = pydantic.Field( default=",", description="Delimiter to use when parsing CSV" From f09e18cdb82433f07963bde5bac67ffae169b553 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Wed, 9 Oct 2024 09:35:42 -0500 Subject: [PATCH 08/11] fix(test): fix test specific for single query (#11567) --- .../linkedin/metadata/entity/ebean/EbeanAspectDaoTest.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/EbeanAspectDaoTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/EbeanAspectDaoTest.java index 109c9b5c44efb..7d1fdd34026f9 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/EbeanAspectDaoTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/EbeanAspectDaoTest.java @@ -56,7 +56,8 @@ public void testGetLatestAspectsForUpdate() throws JsonProcessingException { testDao.runInTransactionWithRetryUnlocked( (txContext) -> { - testDao.getLatestAspects(Map.of("urn:li:corpuser:test", Set.of("status")), true); + testDao.getLatestAspects( + Map.of("urn:li:corpuser:testGetLatestAspectsForUpdate", Set.of("status")), true); return ""; }, mock(AspectsBatch.class), @@ -65,7 +66,7 @@ public void testGetLatestAspectsForUpdate() throws JsonProcessingException { // Get the captured SQL statements List sql = LoggedSql.stop().stream() - .filter(str -> str.contains("(t0.urn,t0.aspect,t0.version)")) + .filter(str -> str.contains("testGetLatestAspectsForUpdate")) .toList(); assertEquals( sql.size(), 1, String.format("Found: %s", new ObjectMapper().writeValueAsString(sql))); From 3cb1678eccd7c20b2a1cc29aac5858c11dde6b2d Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Wed, 9 Oct 2024 17:01:34 +0200 Subject: [PATCH 09/11] feat(ingest/bigquery): Add way to reference existing DataHub Tag from a bigquery label (#11544) --- .../source/bigquery_v2/bigquery_schema_gen.py | 33 +- .../src/datahub/ingestion/source/mongodb.py | 4 +- .../bigquery_v2/bigquery_mcp_golden.json | 73 +- .../bigquery_mcp_queries_golden.json | 1031 +++++++++++++++++ .../bigquery_project_label_mcp_golden.json | 6 +- .../integration/bigquery_v2/test_bigquery.py | 9 +- 6 files changed, 1135 insertions(+), 21 deletions(-) create mode 100644 metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_queries_golden.json diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py index 6ea8f21e8b291..11d06771d4e4f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py @@ -1,5 +1,6 @@ import logging import re +from base64 import b32decode from collections import defaultdict from typing import Dict, Iterable, List, Optional, Set, Type, Union, cast @@ -89,12 +90,13 @@ HiveColumnToAvroConverter, get_schema_fields_for_hive_column, ) -from datahub.utilities.mapping import Constants from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.ratelimiter import RateLimiter from datahub.utilities.registries.domain_registry import DomainRegistry from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor +ENCODED_TAG_PREFIX = "urn_li_encoded_tag_" + logger: logging.Logger = logging.getLogger(__name__) # Handle table snapshots # See https://cloud.google.com/bigquery/docs/table-snapshots-intro. @@ -194,6 +196,18 @@ def store_table_refs(self): or self.config.use_queries_v2 ) + def modified_base32decode(self, text_to_decode: str) -> str: + # When we sync from DataHub to BigQuery, we encode the tags as modified base32 strings. + # BiqQuery labels only support lowercase letters, international characters, numbers, or underscores. + # So we need to modify the base32 encoding to replace the padding character `=` with `_` and convert to lowercase. + if not text_to_decode.startswith("%s" % ENCODED_TAG_PREFIX): + return text_to_decode + text_to_decode = ( + text_to_decode.replace(ENCODED_TAG_PREFIX, "").upper().replace("_", "=") + ) + text = b32decode(text_to_decode.encode("utf-8")).decode("utf-8") + return text + def get_project_workunits( self, project: BigqueryProject ) -> Iterable[MetadataWorkUnit]: @@ -253,7 +267,7 @@ def gen_dataset_containers( tags_joined: Optional[List[str]] = None if tags and self.config.capture_dataset_label_as_tag: tags_joined = [ - f"{k}:{v}" + self.make_tag_from_label(k, v) for k, v in tags.items() if is_tag_allowed(self.config.capture_dataset_label_as_tag, k) ] @@ -662,6 +676,11 @@ def _process_snapshot( dataset_name=dataset_name, ) + def make_tag_from_label(self, key: str, value: str) -> str: + if not value.startswith(ENCODED_TAG_PREFIX): + return make_tag_urn(f"""{key}:{value}""") + return self.modified_base32decode(value) + def gen_table_dataset_workunits( self, table: BigqueryTable, @@ -707,7 +726,7 @@ def gen_table_dataset_workunits( tags_to_add = [] tags_to_add.extend( [ - make_tag_urn(f"""{k}:{v}""") + self.make_tag_from_label(k, v) for k, v in table.labels.items() if is_tag_allowed(self.config.capture_table_label_as_tag, k) ] @@ -733,7 +752,7 @@ def gen_view_dataset_workunits( tags_to_add = None if table.labels and self.config.capture_view_label_as_tag: tags_to_add = [ - make_tag_urn(f"{k}:{v}") + self.make_tag_from_label(k, v) for k, v in table.labels.items() if is_tag_allowed(self.config.capture_view_label_as_tag, k) ] @@ -922,11 +941,6 @@ def gen_schema_fields(self, columns: List[BigqueryColumn]) -> List[SchemaField]: break else: tags = [] - if col.is_partition_column: - tags.append( - TagAssociationClass(make_tag_urn(Constants.TAG_PARTITION_KEY)) - ) - if col.cluster_column_position is not None: tags.append( TagAssociationClass( @@ -944,6 +958,7 @@ def gen_schema_fields(self, columns: List[BigqueryColumn]) -> List[SchemaField]: type=SchemaFieldDataType( self.BIGQUERY_FIELD_TYPE_MAPPINGS.get(col.data_type, NullType)() ), + isPartitioningKey=col.is_partition_column, nativeDataType=col.data_type, description=col.comment, nullable=col.is_nullable, diff --git a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py index e4dadaf602852..e3fd8849ed844 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py @@ -290,8 +290,8 @@ def __init__(self, ctx: PipelineContext, config: MongoDBConfig): # See https://pymongo.readthedocs.io/en/stable/examples/datetimes.html#handling-out-of-range-datetimes self.mongo_client = MongoClient( - self.config.connect_uri, datetime_conversion="DATETIME_AUTO", **options - ) # type: ignore + self.config.connect_uri, datetime_conversion="DATETIME_AUTO", **options # type: ignore + ) # This cheaply tests the connection. For details, see # https://pymongo.readthedocs.io/en/stable/api/pymongo/mongo_client.html#pymongo.mongo_client.MongoClient diff --git a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json index fcf65130df975..02660f0fae08e 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json +++ b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json @@ -269,7 +269,8 @@ "actor": "urn:li:corpuser:datahub" } }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false }, { "fieldPath": "email", @@ -296,7 +297,8 @@ "actor": "urn:li:corpuser:datahub" } }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false } ] } @@ -328,6 +330,29 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [ + { + "tag": "urn:li:tag:priority:high" + }, + { + "tag": "urn:li:tag:purchase" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", @@ -463,7 +488,8 @@ } ] }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false }, { "fieldPath": "email", @@ -479,7 +505,8 @@ "globalTags": { "tags": [] }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false } ] } @@ -620,7 +647,8 @@ } ] }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false }, { "fieldPath": "email", @@ -636,7 +664,8 @@ "globalTags": { "tags": [] }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false } ] } @@ -1021,5 +1050,37 @@ "runId": "bigquery-2022_02_03-07_00_00", "lastRunId": "no-run-id-provided" } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:priority:high", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "priority:high" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:purchase", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "purchase" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_queries_golden.json b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_queries_golden.json new file mode 100644 index 0000000000000..2a7336144f23d --- /dev/null +++ b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_queries_golden.json @@ -0,0 +1,1031 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "bigquery", + "env": "PROD", + "project_id": "project-id-1" + }, + "name": "project-id-1", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Project" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "bigquery", + "env": "PROD", + "project_id": "project-id-1", + "dataset_id": "bigquery-dataset-1", + "location": "US" + }, + "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m4!1m3!3m2!1sproject-id-1!2sbigquery-dataset-1", + "name": "bigquery-dataset-1", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Dataset" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "project-id-1.bigquery-dataset-1.table-1", + "platform": "urn:li:dataPlatform:bigquery", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "age", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "INT", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Test Policy Tag" + } + ] + }, + "glossaryTerms": { + "terms": [ + { + "urn": "urn:li:glossaryTerm:Age" + } + ], + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:datahub" + } + }, + "isPartOfKey": false, + "isPartitioningKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "globalTags": { + "tags": [] + }, + "glossaryTerms": { + "terms": [ + { + "urn": "urn:li:glossaryTerm:Email_Address" + } + ], + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:datahub" + } + }, + "isPartOfKey": false, + "isPartitioningKey": false + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3stable-1", + "name": "table-1", + "qualifiedName": "project-id-1.bigquery-dataset-1.table-1", + "description": "", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + }, + { + "id": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "urn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "project-id-1.bigquery-dataset-1.view-1", + "platform": "urn:li:dataPlatform:bigquery", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "age", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "INT", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Test Policy Tag" + } + ] + }, + "isPartOfKey": false, + "isPartitioningKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "globalTags": { + "tags": [] + }, + "isPartOfKey": false, + "isPartitioningKey": false + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3sview-1", + "name": "view-1", + "qualifiedName": "project-id-1.bigquery-dataset-1.view-1", + "description": "", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "create view `bigquery-dataset-1.view-1` as select email from `bigquery-dataset-1.table-1`", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + }, + { + "id": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "urn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "project-id-1.bigquery-dataset-1.snapshot-table-1", + "platform": "urn:li:dataPlatform:bigquery", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "age", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "INT", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Test Policy Tag" + } + ] + }, + "isPartOfKey": false, + "isPartitioningKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "globalTags": { + "tags": [] + }, + "isPartOfKey": false, + "isPartitioningKey": false + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3ssnapshot-table-1", + "name": "snapshot-table-1", + "qualifiedName": "project-id-1.bigquery-dataset-1.snapshot-table-1", + "description": "", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Bigquery Table Snapshot" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "type": "COPY" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),age)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD),age)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),email)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD),email)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + }, + { + "id": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "urn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),email)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD),email)" + ], + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE VIEW `bigquery-dataset-1.view-1` AS\nSELECT\n email\nFROM `bigquery-dataset-1.table-1`", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),email)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD),email)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "glossaryTerm", + "entityUrn": "urn:li:glossaryTerm:Age", + "changeType": "UPSERT", + "aspectName": "glossaryTermKey", + "aspect": { + "json": { + "name": "Age" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "glossaryTerm", + "entityUrn": "urn:li:glossaryTerm:Email_Address", + "changeType": "UPSERT", + "aspectName": "glossaryTermKey", + "aspect": { + "json": { + "name": "Email_Address" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Test Policy Tag", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Test Policy Tag" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_project_label_mcp_golden.json b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_project_label_mcp_golden.json index ab59b95a9f388..a1e5c3fd18f23 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_project_label_mcp_golden.json +++ b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_project_label_mcp_golden.json @@ -268,7 +268,8 @@ "actor": "urn:li:corpuser:datahub" } }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false }, { "fieldPath": "email", @@ -295,7 +296,8 @@ "actor": "urn:li:corpuser:datahub" } }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false } ] } diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py index f9481d1d83d8b..0ac4e94a5a24f 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py @@ -54,6 +54,7 @@ def recipe(mcp_output_path: str, source_config_override: dict = {}) -> dict: "include_usage_statistics": False, "include_table_lineage": True, "include_data_platform_instance": True, + "capture_table_label_as_tag": True, "classification": ClassificationConfig( enabled=True, classifiers=[ @@ -155,6 +156,10 @@ def test_bigquery_v2_ingest( last_altered=None, size_in_bytes=None, rows_count=None, + labels={ + "priority": "high", + "purchase": "urn_li_encoded_tag_ovzg4otmne5hiylhhjyhk4tdnbqxgzi_", + }, ) get_tables_for_dataset.return_value = iter([bigquery_table]) snapshot_table = BigqueryTableSnapshot( @@ -319,8 +324,8 @@ def test_bigquery_queries_v2_ingest( tmp_path, ): test_resources_dir = pytestconfig.rootpath / "tests/integration/bigquery_v2" - mcp_golden_path = f"{test_resources_dir}/bigquery_mcp_golden.json" - mcp_output_path = "{}/{}".format(tmp_path, "bigquery_mcp_output.json") + mcp_golden_path = f"{test_resources_dir}/bigquery_mcp_queries_golden.json" + mcp_output_path = "{}/{}".format(tmp_path, "bigquery_mcp_queries_output.json") dataset_name = "bigquery-dataset-1" get_datasets_for_project_id.return_value = [ From 867007f209e62243e177bf0bd379caf463d9ad36 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Wed, 9 Oct 2024 20:52:50 +0530 Subject: [PATCH 10/11] chore: remove obsolete attribute from docker (#11573) --- docker/airflow/docker-compose.yaml | 1 - docker/cassandra/docker-compose.cassandra.yml | 1 - docker/docker-compose-with-cassandra.yml | 1 - docker/docker-compose-without-neo4j.override.yml | 1 - docker/docker-compose-without-neo4j.postgres.override.yml | 1 - docker/docker-compose-without-neo4j.yml | 1 - docker/docker-compose.consumers-without-neo4j.yml | 1 - docker/docker-compose.consumers.dev.yml | 1 - docker/docker-compose.consumers.yml | 1 - docker/docker-compose.dev.yml | 1 - docker/docker-compose.kafka-setup.yml | 1 - docker/docker-compose.override.yml | 1 - docker/docker-compose.tools.yml | 1 - docker/docker-compose.yml | 1 - docker/ingestion/docker-compose.yml | 1 - docker/mariadb/docker-compose.mariadb.yml | 1 - docker/monitoring/docker-compose.consumers.monitoring.yml | 1 - docker/monitoring/docker-compose.monitoring.yml | 1 - docker/mysql/docker-compose.mysql.yml | 1 - docker/quickstart/docker-compose-m1.quickstart.yml | 1 - .../docker-compose-without-neo4j-m1.quickstart.yml | 1 - .../quickstart/docker-compose-without-neo4j.quickstart.yml | 1 - .../docker-compose.consumers-without-neo4j.quickstart.yml | 1 - docker/quickstart/docker-compose.consumers.quickstart.yml | 1 - docker/quickstart/docker-compose.kafka-setup.quickstart.yml | 1 - docker/quickstart/docker-compose.monitoring.quickstart.yml | 1 - docker/quickstart/docker-compose.quickstart.yml | 1 - docker/quickstart/generate_docker_quickstart.py | 5 ----- .../spark-smoke-test/docker/spark-docker-compose.yml | 1 - 29 files changed, 33 deletions(-) diff --git a/docker/airflow/docker-compose.yaml b/docker/airflow/docker-compose.yaml index df85f2acb9efd..ca19e3c5aa9d1 100644 --- a/docker/airflow/docker-compose.yaml +++ b/docker/airflow/docker-compose.yaml @@ -41,7 +41,6 @@ # # Feel free to modify this file to suit your needs. --- -version: '3' x-airflow-common: &airflow-common image: ${AIRFLOW_IMAGE_NAME:-acryldata/airflow-datahub:latest} diff --git a/docker/cassandra/docker-compose.cassandra.yml b/docker/cassandra/docker-compose.cassandra.yml index ae7d649ab3d23..3fde920161685 100644 --- a/docker/cassandra/docker-compose.cassandra.yml +++ b/docker/cassandra/docker-compose.cassandra.yml @@ -1,6 +1,5 @@ # Override to use Cassandra as a backing store for datahub-gms. --- -version: '3.8' services: cassandra: hostname: cassandra diff --git a/docker/docker-compose-with-cassandra.yml b/docker/docker-compose-with-cassandra.yml index 6709aee98d697..55bca606c5ba3 100644 --- a/docker/docker-compose-with-cassandra.yml +++ b/docker/docker-compose-with-cassandra.yml @@ -4,7 +4,6 @@ # NOTE: This file does not build! No dockerfiles are set. See the README.md in this directory. --- -version: '3.9' services: datahub-frontend-react: hostname: datahub-frontend-react diff --git a/docker/docker-compose-without-neo4j.override.yml b/docker/docker-compose-without-neo4j.override.yml index 11d7cd0c0c87b..adf610ec3a9ed 100644 --- a/docker/docker-compose-without-neo4j.override.yml +++ b/docker/docker-compose-without-neo4j.override.yml @@ -1,5 +1,4 @@ --- -version: '3.9' services: datahub-gms: env_file: datahub-gms/env/docker-without-neo4j.env diff --git a/docker/docker-compose-without-neo4j.postgres.override.yml b/docker/docker-compose-without-neo4j.postgres.override.yml index b81fb6435c297..2d1e0958dfb8b 100644 --- a/docker/docker-compose-without-neo4j.postgres.override.yml +++ b/docker/docker-compose-without-neo4j.postgres.override.yml @@ -1,6 +1,5 @@ # Override to use PostgreSQL as a backing store for datahub-gms. --- -version: '3.9' services: datahub-gms: env_file: diff --git a/docker/docker-compose-without-neo4j.yml b/docker/docker-compose-without-neo4j.yml index 53fcc77c6e8f3..4350322a17379 100644 --- a/docker/docker-compose-without-neo4j.yml +++ b/docker/docker-compose-without-neo4j.yml @@ -4,7 +4,6 @@ # NOTE: This file will cannot build! No dockerfiles are set. See the README.md in this directory. --- -version: '3.9' services: datahub-frontend-react: hostname: datahub-frontend-react diff --git a/docker/docker-compose.consumers-without-neo4j.yml b/docker/docker-compose.consumers-without-neo4j.yml index f1aa6b30cede0..01dff8c0d7b3d 100644 --- a/docker/docker-compose.consumers-without-neo4j.yml +++ b/docker/docker-compose.consumers-without-neo4j.yml @@ -1,5 +1,4 @@ # Service definitions for standalone Kafka consumer containers. -version: '3.9' services: datahub-gms: environment: diff --git a/docker/docker-compose.consumers.dev.yml b/docker/docker-compose.consumers.dev.yml index 00f7b52df151f..df180e22f55d0 100644 --- a/docker/docker-compose.consumers.dev.yml +++ b/docker/docker-compose.consumers.dev.yml @@ -1,4 +1,3 @@ -version: '3.9' services: datahub-mae-consumer: image: acryldata/datahub-mae-consumer:debug diff --git a/docker/docker-compose.consumers.yml b/docker/docker-compose.consumers.yml index 74b9adaeb9948..36fdb5451ebd6 100644 --- a/docker/docker-compose.consumers.yml +++ b/docker/docker-compose.consumers.yml @@ -1,5 +1,4 @@ # Service definitions for standalone Kafka consumer containers. -version: '3.9' services: datahub-gms: environment: diff --git a/docker/docker-compose.dev.yml b/docker/docker-compose.dev.yml index 2202f362abd99..c68a4c1f5a8fc 100644 --- a/docker/docker-compose.dev.yml +++ b/docker/docker-compose.dev.yml @@ -8,7 +8,6 @@ # To make a JVM app debuggable via IntelliJ, go to its env file and add JVM debug flags, and then add the JVM debug # port to this file. --- -version: '3.9' services: datahub-frontend-react: image: acryldata/datahub-frontend-react:head diff --git a/docker/docker-compose.kafka-setup.yml b/docker/docker-compose.kafka-setup.yml index 59b3459bf4555..67b7641f509b3 100644 --- a/docker/docker-compose.kafka-setup.yml +++ b/docker/docker-compose.kafka-setup.yml @@ -1,3 +1,2 @@ # Empty docker compose for kafka-setup as we have moved kafka-setup back into the main compose -version: '3.9' services: \ No newline at end of file diff --git a/docker/docker-compose.override.yml b/docker/docker-compose.override.yml index 51fbe0060aa5f..25abf247a5d04 100644 --- a/docker/docker-compose.override.yml +++ b/docker/docker-compose.override.yml @@ -1,6 +1,5 @@ # Default override to use MySQL as a backing store for datahub-gms (same as docker-compose.mysql.yml). --- -version: '3.9' services: datahub-gms: env_file: datahub-gms/env/docker.env diff --git a/docker/docker-compose.tools.yml b/docker/docker-compose.tools.yml index 8d2c30c64e6c8..9f0e2639521a4 100644 --- a/docker/docker-compose.tools.yml +++ b/docker/docker-compose.tools.yml @@ -1,6 +1,5 @@ # Tools useful for operating & debugging DataHub. --- -version: '3.8' services: kafka-rest-proxy: image: confluentinc/cp-kafka-rest:7.4.0 diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 5430a8a6fcd5b..e77b6d11ef241 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -4,7 +4,6 @@ # NOTE: This file does not build! No dockerfiles are set. See the README.md in this directory. --- -version: '3.9' services: datahub-frontend-react: hostname: datahub-frontend-react diff --git a/docker/ingestion/docker-compose.yml b/docker/ingestion/docker-compose.yml index 06d4e47aa4a40..a474d9505285d 100644 --- a/docker/ingestion/docker-compose.yml +++ b/docker/ingestion/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: '3.5' services: ingestion: build: diff --git a/docker/mariadb/docker-compose.mariadb.yml b/docker/mariadb/docker-compose.mariadb.yml index 0ee172fb1f1a5..63f5ed929711b 100644 --- a/docker/mariadb/docker-compose.mariadb.yml +++ b/docker/mariadb/docker-compose.mariadb.yml @@ -1,6 +1,5 @@ # Override to use MariaDB as a backing store for datahub-gms. --- -version: '3.8' services: mariadb: hostname: mariadb diff --git a/docker/monitoring/docker-compose.consumers.monitoring.yml b/docker/monitoring/docker-compose.consumers.monitoring.yml index 254b0a58d0223..147ddde37798f 100644 --- a/docker/monitoring/docker-compose.consumers.monitoring.yml +++ b/docker/monitoring/docker-compose.consumers.monitoring.yml @@ -1,5 +1,4 @@ --- -version: '3.8' services: datahub-mae-consumer: environment: diff --git a/docker/monitoring/docker-compose.monitoring.yml b/docker/monitoring/docker-compose.monitoring.yml index c6fa019cf99fc..039aa55bd3867 100644 --- a/docker/monitoring/docker-compose.monitoring.yml +++ b/docker/monitoring/docker-compose.monitoring.yml @@ -1,5 +1,4 @@ --- -version: '3.9' services: datahub-frontend-react: environment: diff --git a/docker/mysql/docker-compose.mysql.yml b/docker/mysql/docker-compose.mysql.yml index 7cc27b9f8b154..ee00e51c09b1f 100644 --- a/docker/mysql/docker-compose.mysql.yml +++ b/docker/mysql/docker-compose.mysql.yml @@ -1,6 +1,5 @@ # Override to use MySQL as a backing store for datahub-gms. --- -version: '3.8' services: mysql: hostname: mysql diff --git a/docker/quickstart/docker-compose-m1.quickstart.yml b/docker/quickstart/docker-compose-m1.quickstart.yml index 046ab96cf3002..5f415d4000178 100644 --- a/docker/quickstart/docker-compose-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-m1.quickstart.yml @@ -291,7 +291,6 @@ services: volumes: - zkdata:/var/lib/zookeeper/data - zklogs:/var/lib/zookeeper/log -version: '3.9' volumes: broker: null esdata: null diff --git a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml index 6295572aac98f..c2a0ddb0391b9 100644 --- a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml @@ -266,7 +266,6 @@ services: volumes: - zkdata:/var/lib/zookeeper/data - zklogs:/var/lib/zookeeper/log -version: '3.9' volumes: broker: null esdata: null diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml index ed5f203ff4d05..46ea765f45e1f 100644 --- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml @@ -266,7 +266,6 @@ services: volumes: - zkdata:/var/lib/zookeeper/data - zklogs:/var/lib/zookeeper/log -version: '3.9' volumes: broker: null esdata: null diff --git a/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml index a4211acedcf10..8801d26eddbf4 100644 --- a/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml @@ -55,4 +55,3 @@ services: image: ${DATAHUB_MCE_CONSUMER_IMAGE:-acryldata/datahub-mce-consumer}:${DATAHUB_VERSION:-head} ports: - 9090:9090 -version: '3.9' diff --git a/docker/quickstart/docker-compose.consumers.quickstart.yml b/docker/quickstart/docker-compose.consumers.quickstart.yml index e7571e4baf8b4..1daa969af82d8 100644 --- a/docker/quickstart/docker-compose.consumers.quickstart.yml +++ b/docker/quickstart/docker-compose.consumers.quickstart.yml @@ -69,4 +69,3 @@ services: image: ${DATAHUB_MCE_CONSUMER_IMAGE:-acryldata/datahub-mce-consumer}:${DATAHUB_VERSION:-head} ports: - 9090:9090 -version: '3.9' diff --git a/docker/quickstart/docker-compose.kafka-setup.quickstart.yml b/docker/quickstart/docker-compose.kafka-setup.quickstart.yml index d126caac6b5a8..ad189ddbec905 100644 --- a/docker/quickstart/docker-compose.kafka-setup.quickstart.yml +++ b/docker/quickstart/docker-compose.kafka-setup.quickstart.yml @@ -1,2 +1 @@ services: {} -version: '3.9' diff --git a/docker/quickstart/docker-compose.monitoring.quickstart.yml b/docker/quickstart/docker-compose.monitoring.quickstart.yml index bddd82e393338..f4afd397e785d 100644 --- a/docker/quickstart/docker-compose.monitoring.quickstart.yml +++ b/docker/quickstart/docker-compose.monitoring.quickstart.yml @@ -41,6 +41,5 @@ services: - 9089:9090 volumes: - ../monitoring/prometheus.yaml:/etc/prometheus/prometheus.yml -version: '3.9' volumes: grafana-storage: null diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml index 66616be98bec1..4d03b22598606 100644 --- a/docker/quickstart/docker-compose.quickstart.yml +++ b/docker/quickstart/docker-compose.quickstart.yml @@ -291,7 +291,6 @@ services: volumes: - zkdata:/var/lib/zookeeper/data - zklogs:/var/lib/zookeeper/log -version: '3.9' volumes: broker: null esdata: null diff --git a/docker/quickstart/generate_docker_quickstart.py b/docker/quickstart/generate_docker_quickstart.py index 9846233b369d7..6a3b64155adea 100644 --- a/docker/quickstart/generate_docker_quickstart.py +++ b/docker/quickstart/generate_docker_quickstart.py @@ -120,11 +120,6 @@ def modify_docker_config(base_path, docker_yaml_config): elif volumes[i].startswith("./"): volumes[i] = "." + volumes[i] - # 10. Set docker compose version to 3. - # We need at least this version, since we use features like start_period for - # healthchecks (with services dependencies based on them) and shell-like variable interpolation. - docker_yaml_config["version"] = "3.9" - def dedup_env_vars(merged_docker_config): for service in merged_docker_config["services"]: diff --git a/metadata-integration/java/spark-lineage-legacy/spark-smoke-test/docker/spark-docker-compose.yml b/metadata-integration/java/spark-lineage-legacy/spark-smoke-test/docker/spark-docker-compose.yml index 865c5dee0f86e..e2e0eba62f3d9 100644 --- a/metadata-integration/java/spark-lineage-legacy/spark-smoke-test/docker/spark-docker-compose.yml +++ b/metadata-integration/java/spark-lineage-legacy/spark-smoke-test/docker/spark-docker-compose.yml @@ -1,4 +1,3 @@ -version: "3.6" services: spark-master: image: spark-master From 576ae8a056603818c4b34ef33c40f47b9e48e631 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 9 Oct 2024 08:34:46 -0700 Subject: [PATCH 11/11] fix(cli): update hard delete confirmation message in delete cli (#11550) --- metadata-ingestion/src/datahub/cli/delete_cli.py | 16 ++++++++++++---- smoke-test/cypress-dev.sh | 3 +++ smoke-test/run-quickstart.sh | 5 ++++- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/src/datahub/cli/delete_cli.py b/metadata-ingestion/src/datahub/cli/delete_cli.py index b5cc67532a9dd..584bfc1f7cda5 100644 --- a/metadata-ingestion/src/datahub/cli/delete_cli.py +++ b/metadata-ingestion/src/datahub/cli/delete_cli.py @@ -338,10 +338,18 @@ def by_filter( # TODO: add some validation on entity_type if not force and not soft and not dry_run: - click.confirm( - "This will permanently delete data from DataHub. Do you want to continue?", - abort=True, - ) + if only_soft_deleted: + click.confirm( + "This will permanently delete data from DataHub. Do you want to continue?", + abort=True, + ) + else: + click.confirm( + "Hard deletion will permanently delete data from DataHub and can be slow. " + "We generally recommend using soft deletes instead. " + "Do you want to continue?", + abort=True, + ) graph = get_default_graph() logger.info(f"Using {graph}") diff --git a/smoke-test/cypress-dev.sh b/smoke-test/cypress-dev.sh index bce2d794b1869..3db81b11c67fa 100755 --- a/smoke-test/cypress-dev.sh +++ b/smoke-test/cypress-dev.sh @@ -8,7 +8,10 @@ if [ "${RUN_QUICKSTART:-true}" == "true" ]; then source ./run-quickstart.sh fi +set +x +echo "Activating virtual environment" source venv/bin/activate +set -x # set environment variables for the test source ./set-test-env-vars.sh diff --git a/smoke-test/run-quickstart.sh b/smoke-test/run-quickstart.sh index e83a116c670a4..902dc1030660a 100755 --- a/smoke-test/run-quickstart.sh +++ b/smoke-test/run-quickstart.sh @@ -5,7 +5,10 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" cd "$DIR" ../gradlew :smoke-test:installDev +set +x +echo "Activating virtual environment" source venv/bin/activate +set -x mkdir -p ~/.datahub/plugins/frontend/auth/ echo "test_user:test_pass" >> ~/.datahub/plugins/frontend/auth/user.props @@ -23,4 +26,4 @@ DATAHUB_SEARCH_IMAGE="$DATAHUB_SEARCH_IMAGE" DATAHUB_SEARCH_TAG="$DATAHUB_SEARCH XPACK_SECURITY_ENABLED="$XPACK_SECURITY_ENABLED" ELASTICSEARCH_USE_SSL="$ELASTICSEARCH_USE_SSL" \ USE_AWS_ELASTICSEARCH="$USE_AWS_ELASTICSEARCH" \ DATAHUB_VERSION=${DATAHUB_VERSION} \ -docker compose --project-directory ../docker/profiles --profile quickstart-consumers up -d --quiet-pull --wait --wait-timeout 900 +docker compose --project-directory ../docker/profiles --profile quickstart-consumers up -d --quiet-pull --wait --wait-timeout 900