diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml new file mode 100644 index 0000000000000..63bab821cc398 --- /dev/null +++ b/.github/workflows/airflow-plugin.yml @@ -0,0 +1,85 @@ +name: Airflow Plugin +on: + push: + branches: + - master + paths: + - ".github/workflows/airflow-plugin.yml" + - "metadata-ingestion-modules/airflow-plugin/**" + - "metadata-ingestion/**" + - "metadata-models/**" + pull_request: + branches: + - master + paths: + - ".github/**" + - "metadata-ingestion-modules/airflow-plugin/**" + - "metadata-ingestion/**" + - "metadata-models/**" + release: + types: [published] + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + airflow-plugin: + runs-on: ubuntu-latest + env: + SPARK_VERSION: 3.0.3 + DATAHUB_TELEMETRY_ENABLED: false + strategy: + matrix: + include: + - python-version: "3.7" + extraPythonRequirement: "apache-airflow~=2.1.0" + - python-version: "3.7" + extraPythonRequirement: "apache-airflow~=2.2.0" + - python-version: "3.10" + extraPythonRequirement: "apache-airflow~=2.4.0" + - python-version: "3.10" + extraPythonRequirement: "apache-airflow~=2.6.0" + - python-version: "3.10" + extraPythonRequirement: "apache-airflow>2.6.0" + fail-fast: false + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + cache: "pip" + - name: Install dependencies + run: ./metadata-ingestion/scripts/install_deps.sh + - name: Install airflow package and test (extras ${{ matrix.extraPythonRequirement }}) + run: ./gradlew -Pextra_pip_requirements='${{ matrix.extraPythonRequirement }}' :metadata-ingestion-modules:airflow-plugin:lint :metadata-ingestion-modules:airflow-plugin:testQuick + - name: pip freeze show list installed + if: always() + run: source metadata-ingestion-modules/airflow-plugin/venv/bin/activate && pip freeze + - uses: actions/upload-artifact@v3 + if: ${{ always() && matrix.python-version == '3.10' && matrix.extraPythonRequirement == 'apache-airflow>2.6.0' }} + with: + name: Test Results (Airflow Plugin ${{ matrix.python-version}}) + path: | + **/build/reports/tests/test/** + **/build/test-results/test/** + **/junit.*.xml + - name: Upload coverage to Codecov + if: always() + uses: codecov/codecov-action@v3 + with: + token: ${{ secrets.CODECOV_TOKEN }} + directory: . + fail_ci_if_error: false + flags: airflow-${{ matrix.python-version }}-${{ matrix.extraPythonRequirement }} + name: pytest-airflow + verbose: true + + event-file: + runs-on: ubuntu-latest + steps: + - name: Upload + uses: actions/upload-artifact@v3 + with: + name: Event File + path: ${{ github.event_path }} diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml index 23d7ee9427f42..fff41e481c3cb 100644 --- a/.github/workflows/metadata-ingestion.yml +++ b/.github/workflows/metadata-ingestion.yml @@ -25,7 +25,7 @@ jobs: metadata-ingestion: runs-on: ubuntu-latest env: - SPARK_VERSION: 3.0.3 + SPARK_VERSION: 3.3.2 DATAHUB_TELEMETRY_ENABLED: false # TODO: Enable this once the test is fixed. 
# DATAHUB_LOOKML_GIT_TEST_SSH_KEY: ${{ secrets.DATAHUB_LOOKML_GIT_TEST_SSH_KEY }} @@ -42,9 +42,7 @@ jobs: ] include: - python-version: "3.7" - extraPythonRequirement: "sqlalchemy==1.3.24 apache-airflow~=2.2.0" - python-version: "3.10" - extraPythonRequirement: "sqlalchemy~=1.4.0 apache-airflow>=2.4.0" fail-fast: false steps: - uses: actions/checkout@v3 @@ -56,8 +54,8 @@ jobs: run: ./metadata-ingestion/scripts/install_deps.sh - name: Install package run: ./gradlew :metadata-ingestion:installPackageOnly - - name: Run metadata-ingestion tests (extras ${{ matrix.extraPythonRequirement }}) - run: ./gradlew -Pextra_pip_requirements='${{ matrix.extraPythonRequirement }}' :metadata-ingestion:${{ matrix.command }} + - name: Run metadata-ingestion tests + run: ./gradlew :metadata-ingestion:${{ matrix.command }} - name: pip freeze show list installed if: always() run: source metadata-ingestion/venv/bin/activate && pip freeze @@ -80,7 +78,6 @@ jobs: name: pytest-${{ matrix.command }} verbose: true - event-file: runs-on: ubuntu-latest steps: diff --git a/.github/workflows/spark-smoke-test.yml b/.github/workflows/spark-smoke-test.yml index 5f501780873f6..ac411d812deea 100644 --- a/.github/workflows/spark-smoke-test.yml +++ b/.github/workflows/spark-smoke-test.yml @@ -40,6 +40,8 @@ jobs: python-version: "3.7" - name: Install dependencies run: ./metadata-ingestion/scripts/install_deps.sh + - name: Remove images + run: docker image prune -a -f || true - name: Smoke test run: | ./gradlew :metadata-integration:java:spark-lineage:integrationTest \ diff --git a/.github/workflows/test-results.yml b/.github/workflows/test-results.yml index 656e4dcbc4e43..0153060692271 100644 --- a/.github/workflows/test-results.yml +++ b/.github/workflows/test-results.yml @@ -2,7 +2,7 @@ name: Test Results on: workflow_run: - workflows: ["build & test", "metadata ingestion"] + workflows: ["build & test", "metadata ingestion", "Airflow Plugin"] types: - completed diff --git a/.gitignore b/.gitignore index b6edbccf71125..49ab5c475096c 100644 --- a/.gitignore +++ b/.gitignore @@ -70,6 +70,7 @@ metadata-ingestion/generated/** # docs docs/generated/ docs-website/versioned_docs/ +docs-website/versioned_sidebars/ tmp* temp/** diff --git a/build.gradle b/build.gradle index a90511d8ecdfc..e12d520e12de6 100644 --- a/build.gradle +++ b/build.gradle @@ -1,7 +1,7 @@ buildscript { ext.junitJupiterVersion = '5.6.1' // Releases: https://github.com/linkedin/rest.li/blob/master/CHANGELOG.md - ext.pegasusVersion = '29.22.16' + ext.pegasusVersion = '29.45.0' ext.mavenVersion = '3.6.3' ext.springVersion = '5.3.29' ext.springBootVersion = '2.7.14' @@ -30,16 +30,16 @@ buildscript { classpath 'io.acryl.gradle.plugin:gradle-avro-plugin:0.8.1' classpath 'org.springframework.boot:spring-boot-gradle-plugin:' + springBootVersion classpath "io.codearte.gradle.nexus:gradle-nexus-staging-plugin:0.30.0" - classpath "com.palantir.gradle.gitversion:gradle-git-version:0.12.3" - classpath "org.gradle.playframework:gradle-playframework:0.12" - classpath "gradle.plugin.org.hidetake:gradle-swagger-generator-plugin:2.18.1" + classpath "com.palantir.gradle.gitversion:gradle-git-version:3.0.0" + classpath "org.gradle.playframework:gradle-playframework:0.14" + classpath "gradle.plugin.org.hidetake:gradle-swagger-generator-plugin:2.19.1" } } plugins { id 'com.gorylenko.gradle-git-properties' version '2.4.0-rc2' id 'com.github.johnrengelman.shadow' version '6.1.0' - id "com.palantir.docker" version "0.35.0" + id 'com.palantir.docker' version '0.35.0' // 
https://blog.ltgt.net/javax-jakarta-mess-and-gradle-solution/ // TODO id "org.gradlex.java-ecosystem-capabilities" version "1.0" } @@ -97,7 +97,7 @@ project.ext.externalDependency = [ 'graphqlJavaScalars': 'com.graphql-java:graphql-java-extended-scalars:19.1', 'gson': 'com.google.code.gson:gson:2.8.9', 'guice': 'com.google.inject:guice:4.2.3', - 'guava': 'com.google.guava:guava:27.0.1-jre', + 'guava': 'com.google.guava:guava:32.1.2-jre', 'h2': 'com.h2database:h2:2.1.214', 'hadoopCommon':'org.apache.hadoop:hadoop-common:2.7.2', 'hadoopMapreduceClient':'org.apache.hadoop:hadoop-mapreduce-client-core:2.7.2', @@ -241,7 +241,7 @@ configure(subprojects.findAll {! it.name.startsWith('spark-lineage')}) { subprojects { - apply plugin: 'maven' + apply plugin: 'maven-publish' apply plugin: 'com.gorylenko.gradle-git-properties' gitProperties { @@ -255,7 +255,7 @@ subprojects { plugins.withType(JavaPlugin) { dependencies { - testCompile externalDependency.testng + testImplementation externalDependency.testng constraints { implementation('io.netty:netty-all:4.1.86.Final') implementation('org.apache.commons:commons-compress:1.21') diff --git a/buildSrc/build.gradle b/buildSrc/build.gradle index 981a0ab221217..f88d2bdb966ce 100644 --- a/buildSrc/build.gradle +++ b/buildSrc/build.gradle @@ -5,12 +5,12 @@ buildscript { } dependencies { - compile('io.acryl:json-schema-avro:0.1.5') { + implementation('io.acryl:json-schema-avro:0.1.5') { exclude group: 'com.fasterxml.jackson.core', module: 'jackson-databind' exclude group: 'com.google.guava', module: 'guava' } - compile 'com.google.guava:guava:27.0.1-jre' - compile 'com.fasterxml.jackson.core:jackson-databind:2.13.5' - compile 'com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.13.5' - compile 'commons-io:commons-io:2.11.0' + implementation 'com.google.guava:guava:32.1.2-jre' + implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.5' + implementation 'com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.13.5' + implementation 'commons-io:commons-io:2.11.0' } \ No newline at end of file diff --git a/buildSrc/src/main/java/io/datahubproject/GenerateJsonSchemaTask.java b/buildSrc/src/main/java/io/datahubproject/GenerateJsonSchemaTask.java index a5a843d91b1eb..796d622860c15 100644 --- a/buildSrc/src/main/java/io/datahubproject/GenerateJsonSchemaTask.java +++ b/buildSrc/src/main/java/io/datahubproject/GenerateJsonSchemaTask.java @@ -21,10 +21,7 @@ import java.util.List; import java.util.stream.Collectors; import org.gradle.api.DefaultTask; -import org.gradle.api.tasks.CacheableTask; -import org.gradle.api.tasks.InputDirectory; -import org.gradle.api.tasks.OutputDirectory; -import org.gradle.api.tasks.TaskAction; +import org.gradle.api.tasks.*; import static com.github.fge.processing.ProcessingUtil.*; import static org.apache.commons.io.FilenameUtils.*; @@ -46,6 +43,7 @@ public void setInputDirectory(String inputDirectory) { } @InputDirectory + @PathSensitive(PathSensitivity.NAME_ONLY) public String getInputDirectory() { return inputDirectory; } diff --git a/datahub-frontend/build.gradle b/datahub-frontend/build.gradle index 5ac16d0bd0706..fdf13bac0accc 100644 --- a/datahub-frontend/build.gradle +++ b/datahub-frontend/build.gradle @@ -1,7 +1,7 @@ plugins { id "io.github.kobylynskyi.graphql.codegen" version "4.1.1" - id 'com.palantir.docker' id 'scala' + id 'com.palantir.docker' } apply from: "../gradle/versioning/versioning.gradle" diff --git a/datahub-frontend/play.gradle b/datahub-frontend/play.gradle index e7121d277926d..e40f8e3eeb96d 
100644 --- a/datahub-frontend/play.gradle +++ b/datahub-frontend/play.gradle @@ -4,7 +4,7 @@ apply plugin: "org.gradle.playframework" project.ext.httpPort = 9001 project.ext.playBinaryBaseName = "datahub-frontend" -tasks.withType(PlayRun) { +runPlay { httpPort = project.ext.httpPort } @@ -33,8 +33,8 @@ dependencies { } } - compile project(":metadata-service:restli-client") - compile project(":metadata-service:auth-config") + implementation project(":metadata-service:restli-client") + implementation project(":metadata-service:auth-config") implementation externalDependency.jettyJaas implementation externalDependency.graphqlJava @@ -70,15 +70,15 @@ dependencies { testImplementation 'no.nav.security:mock-oauth2-server:0.3.1' testImplementation 'org.junit-pioneer:junit-pioneer:1.9.1' testImplementation externalDependency.junitJupiterApi - testRuntime externalDependency.junitJupiterEngine + testRuntimeOnly externalDependency.junitJupiterEngine implementation externalDependency.slf4jApi compileOnly externalDependency.lombok - runtime externalDependency.guice - runtime (externalDependency.playDocs) { + runtimeOnly externalDependency.guice + runtimeOnly (externalDependency.playDocs) { exclude group: 'com.typesafe.akka', module: 'akka-http-core_2.12' } - runtime externalDependency.playGuice + runtimeOnly externalDependency.playGuice implementation externalDependency.log4j2Api implementation externalDependency.logbackClassic diff --git a/datahub-graphql-core/build.gradle b/datahub-graphql-core/build.gradle index 8fd45033373dc..89ba8f17b6aeb 100644 --- a/datahub-graphql-core/build.gradle +++ b/datahub-graphql-core/build.gradle @@ -4,25 +4,26 @@ plugins { apply plugin: 'java' dependencies { - compile project(':metadata-service:restli-client') - compile project(':metadata-service:auth-impl') - compile project(':metadata-service:auth-config') - compile project(':metadata-service:configuration') - compile project(':metadata-service:services') - compile project(':metadata-io') - compile project(':metadata-utils') + implementation project(':metadata-service:restli-client') + implementation project(':metadata-service:auth-impl') + implementation project(':metadata-service:auth-config') + implementation project(':metadata-service:configuration') + implementation project(':metadata-service:services') + implementation project(':metadata-io') + implementation project(':metadata-utils') implementation externalDependency.graphqlJava implementation externalDependency.graphqlJavaScalars - compile externalDependency.antlr4Runtime - compile externalDependency.antlr4 - compile externalDependency.guava + implementation externalDependency.antlr4Runtime + implementation externalDependency.antlr4 + implementation externalDependency.guava + implementation externalDependency.opentelemetryAnnotations implementation externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok - testCompile externalDependency.mockito + testImplementation externalDependency.mockito } graphqlCodegen { diff --git a/datahub-upgrade/build.gradle b/datahub-upgrade/build.gradle index 561ed1f2178a6..5d0edf3ee8427 100644 --- a/datahub-upgrade/build.gradle +++ b/datahub-upgrade/build.gradle @@ -12,14 +12,15 @@ ext { } dependencies { - compile project(':metadata-io') - compile project(':metadata-service:factories') - compile project(':metadata-service:restli-client') - compile project(':metadata-service:configuration') + implementation project(':metadata-io') + implementation 
project(':metadata-service:factories') + implementation project(':metadata-service:restli-client') + implementation project(':metadata-service:configuration') + implementation project(':metadata-dao-impl:kafka-producer') implementation externalDependency.charle - compile externalDependency.javaxInject - compile(externalDependency.hadoopClient) { + implementation externalDependency.javaxInject + implementation(externalDependency.hadoopClient) { exclude group: 'net.minidev', module: 'json-smart' exclude group: 'com.nimbusds', module: 'nimbus-jose-jwt' exclude group: "org.apache.htrace", module: "htrace-core4" @@ -52,18 +53,18 @@ dependencies { implementation externalDependency.slf4jApi compileOnly externalDependency.lombok - compile externalDependency.picocli - compile externalDependency.parquet + implementation externalDependency.picocli + implementation externalDependency.parquet implementation externalDependency.protobuf - compile externalDependency.springBeans - compile externalDependency.springBootAutoconfigure - compile externalDependency.springCore - compile externalDependency.springKafka + implementation externalDependency.springBeans + implementation externalDependency.springBootAutoconfigure + implementation externalDependency.springCore + implementation externalDependency.springKafka - runtime externalDependency.logbackClassic - runtime externalDependency.mariadbConnector - runtime externalDependency.mysqlConnector - runtime externalDependency.postgresql + runtimeOnly externalDependency.logbackClassic + runtimeOnly externalDependency.mariadbConnector + runtimeOnly externalDependency.mysqlConnector + runtimeOnly externalDependency.postgresql implementation externalDependency.awsMskIamAuth @@ -71,9 +72,9 @@ dependencies { annotationProcessor externalDependency.picocli testImplementation externalDependency.springBootTest - testCompile externalDependency.mockito - testCompile externalDependency.testng - testRuntime externalDependency.logbackClassic + testImplementation externalDependency.mockito + testImplementation externalDependency.testng + testRuntimeOnly externalDependency.logbackClassic } bootJar { diff --git a/datahub-web-react/public/robots.txt b/datahub-web-react/public/robots.txt index e9e57dc4d41b9..7a00656bc3073 100644 --- a/datahub-web-react/public/robots.txt +++ b/datahub-web-react/public/robots.txt @@ -1,3 +1,6 @@ # https://www.robotstxt.org/robotstxt.html User-agent: * -Disallow: +Disallow: /api +Disallow: /gms +Disallow: /search +Disallow: /logOut diff --git a/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx b/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx index ed3904bcf4e2d..535a3f569964c 100644 --- a/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx +++ b/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx @@ -317,6 +317,7 @@ export class DatasetEntity implements Entity { subtype: entity?.subTypes?.typeNames?.[0] || undefined, icon: entity?.platform?.properties?.logoUrl || undefined, platform: entity?.platform, + health: entity?.health || undefined, }; }; diff --git a/datahub-web-react/src/app/entity/group/preview/Preview.tsx b/datahub-web-react/src/app/entity/group/preview/Preview.tsx index 67449b9a481f0..5b9a25e198cfe 100644 --- a/datahub-web-react/src/app/entity/group/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/group/preview/Preview.tsx @@ -88,9 +88,7 @@ export const Preview = ({ {entityRegistry.getEntityName(EntityType.CorpGroup)} - - {name ? : urn} - + {name ? 
: urn} {membersCount} members diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHealth.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHealth.tsx index baef67a3d1c88..30713afa888b8 100644 --- a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHealth.tsx +++ b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHealth.tsx @@ -2,7 +2,7 @@ import React from 'react'; import styled from 'styled-components'; import { Link } from 'react-router-dom'; import { Health } from '../../../../../../types.generated'; -import { getHealthSummaryIcon, isUnhealthy } from '../../../../../shared/health/healthUtils'; +import { getHealthSummaryIcon, HealthSummaryIconType, isUnhealthy } from '../../../../../shared/health/healthUtils'; import { EntityHealthPopover } from './EntityHealthPopover'; const Container = styled.div` @@ -14,17 +14,19 @@ const Container = styled.div` type Props = { health: Health[]; baseUrl: string; + fontSize?: number; + tooltipPlacement?: any; }; -export const EntityHealth = ({ health, baseUrl }: Props) => { +export const EntityHealth = ({ health, baseUrl, fontSize, tooltipPlacement }: Props) => { const unhealthy = isUnhealthy(health); - const icon = getHealthSummaryIcon(health); + const icon = getHealthSummaryIcon(health, HealthSummaryIconType.FILLED, fontSize); return ( <> {(unhealthy && ( - + {icon} diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHealthPopover.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHealthPopover.tsx index 0d327a54a62d1..4dde3ffcbb6a4 100644 --- a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHealthPopover.tsx +++ b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHealthPopover.tsx @@ -50,10 +50,12 @@ type Props = { health: Health[]; baseUrl: string; children: React.ReactNode; + fontSize?: number; + placement?: any; }; -export const EntityHealthPopover = ({ health, baseUrl, children }: Props) => { - const icon = getHealthSummaryIcon(health, HealthSummaryIconType.OUTLINED); +export const EntityHealthPopover = ({ health, baseUrl, children, fontSize, placement = 'right' }: Props) => { + const icon = getHealthSummaryIcon(health, HealthSummaryIconType.OUTLINED, fontSize); const message = getHealthSummaryMessage(health); return ( { } color="#262626" - placement="right" + placement={placement} zIndex={10000000} > {children} diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/header/PlatformContent/PlatformContentView.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/header/PlatformContent/PlatformContentView.tsx index 5605bacc1d4e4..51a422ba93418 100644 --- a/datahub-web-react/src/app/entity/shared/containers/profile/header/PlatformContent/PlatformContentView.tsx +++ b/datahub-web-react/src/app/entity/shared/containers/profile/header/PlatformContent/PlatformContentView.tsx @@ -14,6 +14,7 @@ import ParentNodesView, { const LogoIcon = styled.span` display: flex; + gap: 4px; margin-right: 8px; `; diff --git a/datahub-web-react/src/app/entity/user/preview/Preview.tsx b/datahub-web-react/src/app/entity/user/preview/Preview.tsx index 8893d4ab86786..05baefb295b98 100644 --- a/datahub-web-react/src/app/entity/user/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/user/preview/Preview.tsx @@ -81,9 +81,7 @@ export const Preview = ({ {entityRegistry.getEntityName(EntityType.CorpUser)} - - {name ? : urn} - + {name ? 
: urn} diff --git a/datahub-web-react/src/app/lineage/LineageEntityNode.tsx b/datahub-web-react/src/app/lineage/LineageEntityNode.tsx index 4526e3a225ce2..f5be1d57db070 100644 --- a/datahub-web-react/src/app/lineage/LineageEntityNode.tsx +++ b/datahub-web-react/src/app/lineage/LineageEntityNode.tsx @@ -12,11 +12,12 @@ import { getShortenedTitle, nodeHeightFromTitleLength } from './utils/titleUtils import { LineageExplorerContext } from './utils/LineageExplorerContext'; import { useGetEntityLineageLazyQuery } from '../../graphql/lineage.generated'; import { useIsSeparateSiblingsMode } from '../entity/shared/siblingUtils'; -import { centerX, centerY, iconHeight, iconWidth, iconX, iconY, textX, width } from './constants'; +import { centerX, centerY, iconHeight, iconWidth, iconX, iconY, textX, width, healthX, healthY } from './constants'; import LineageEntityColumns from './LineageEntityColumns'; import { convertInputFieldsToSchemaFields } from './utils/columnLineageUtils'; import ManageLineageMenu from './manage/ManageLineageMenu'; import { useGetLineageTimeParams } from './utils/useGetLineageTimeParams'; +import { EntityHealth } from '../entity/shared/containers/profile/header/EntityHealth'; const CLICK_DELAY_THRESHOLD = 1000; const DRAG_DISTANCE_THRESHOLD = 20; @@ -136,6 +137,11 @@ export default function LineageEntityNode({ capitalizeFirstLetterOnly(node.data.subtype) || (node.data.type && entityRegistry.getEntityName(node.data.type)); + // Health + const { health } = node.data; + const baseUrl = node.data.type && node.data.urn && entityRegistry.getEntityUrl(node.data.type, node.data.urn); + const hasHealth = (health && baseUrl) || false; + return ( {unexploredHiddenChildren && (isHovered || isSelected) ? ( @@ -359,6 +365,16 @@ export default function LineageEntityNode({ {getShortenedTitle(node.data.name, width)} )} + + {hasHealth && ( + + )} + {unexploredHiddenChildren && isHovered ? 
( ; downstreamRelationships?: Array; + health?: Health[]; }; export type VizNode = { diff --git a/datahub-web-react/src/app/lineage/utils/constructFetchedNode.ts b/datahub-web-react/src/app/lineage/utils/constructFetchedNode.ts index 143b226bda687..778d0e325f7cb 100644 --- a/datahub-web-react/src/app/lineage/utils/constructFetchedNode.ts +++ b/datahub-web-react/src/app/lineage/utils/constructFetchedNode.ts @@ -67,6 +67,7 @@ export default function constructFetchedNode( canEditLineage: fetchedNode.canEditLineage, upstreamRelationships: fetchedNode?.upstreamRelationships || [], downstreamRelationships: fetchedNode?.downstreamRelationships || [], + health: fetchedNode?.health, }; // eslint-disable-next-line no-param-reassign diff --git a/datahub-web-react/src/app/lineage/utils/constructTree.ts b/datahub-web-react/src/app/lineage/utils/constructTree.ts index 8374509ad74eb..7da6fc56b57bd 100644 --- a/datahub-web-react/src/app/lineage/utils/constructTree.ts +++ b/datahub-web-react/src/app/lineage/utils/constructTree.ts @@ -100,6 +100,7 @@ export default function constructTree( canEditLineage: fetchedEntity?.canEditLineage, upstreamRelationships: fetchedEntity?.upstreamRelationships || [], downstreamRelationships: fetchedEntity?.downstreamRelationships || [], + health: fetchedEntity?.health, }; const lineageConfig = entityRegistry.getLineageVizConfig(entityAndType.type, entityAndType.entity); let updatedLineageConfig = { ...lineageConfig }; diff --git a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx index 0d0a32f7750a8..319c8ed0a3e1d 100644 --- a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx +++ b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx @@ -292,7 +292,7 @@ export default function DefaultPreviewCard({ ) : ( - + )} diff --git a/datahub-web-react/src/app/search/autoComplete/AutoCompleteEntity.tsx b/datahub-web-react/src/app/search/autoComplete/AutoCompleteEntity.tsx index 60bb21713ba58..d241a3895f19f 100644 --- a/datahub-web-react/src/app/search/autoComplete/AutoCompleteEntity.tsx +++ b/datahub-web-react/src/app/search/autoComplete/AutoCompleteEntity.tsx @@ -20,6 +20,7 @@ const AutoCompleteEntityWrapper = styled.div` const IconsContainer = styled.div` display: flex; + gap: 4px; `; const ContentWrapper = styled.div` diff --git a/datahub-web-react/src/app/shared/health/healthUtils.tsx b/datahub-web-react/src/app/shared/health/healthUtils.tsx index 823d77d7eabe9..ff7d9b417617c 100644 --- a/datahub-web-react/src/app/shared/health/healthUtils.tsx +++ b/datahub-web-react/src/app/shared/health/healthUtils.tsx @@ -11,13 +11,17 @@ import { HealthStatus, HealthStatusType, Health } from '../../../types.generated const HEALTH_INDICATOR_COLOR = '#d48806'; -const UnhealthyIconFilled = styled(ExclamationCircleTwoTone)` - font-size: 16px; +const UnhealthyIconFilled = styled(ExclamationCircleTwoTone)<{ fontSize: number }>` + && { + font-size: ${(props) => props.fontSize}px; + } `; -const UnhealthyIconOutlined = styled(ExclamationCircleOutlined)` +const UnhealthyIconOutlined = styled(ExclamationCircleOutlined)<{ fontSize: number }>` color: ${HEALTH_INDICATOR_COLOR}; - font-size: 16px; + && { + font-size: ${(props) => props.fontSize}px; + } `; export enum HealthSummaryIconType { @@ -32,12 +36,16 @@ export const isUnhealthy = (healths: Health[]) => { return isFailingAssertions; }; -export const getHealthSummaryIcon = (healths: Health[], type: HealthSummaryIconType = HealthSummaryIconType.FILLED) => { +export const 
getHealthSummaryIcon = ( + healths: Health[], + type: HealthSummaryIconType = HealthSummaryIconType.FILLED, + fontSize = 16, +) => { const unhealthy = isUnhealthy(healths); return unhealthy - ? (type === HealthSummaryIconType.FILLED && ) || ( - - ) + ? (type === HealthSummaryIconType.FILLED && ( + + )) || : undefined; }; diff --git a/datahub-web-react/src/graphql/lineage.graphql b/datahub-web-react/src/graphql/lineage.graphql index 61c79abf929a0..52385dee8631a 100644 --- a/datahub-web-react/src/graphql/lineage.graphql +++ b/datahub-web-react/src/graphql/lineage.graphql @@ -198,6 +198,12 @@ fragment lineageNodeProperties on EntityWithRelationships { path } } + health { + type + status + message + causes + } } ... on MLModelGroup { urn diff --git a/docker/datahub-frontend/Dockerfile b/docker/datahub-frontend/Dockerfile index 23c04972209ed..9efc0d2ce8753 100644 --- a/docker/datahub-frontend/Dockerfile +++ b/docker/datahub-frontend/Dockerfile @@ -29,6 +29,8 @@ FROM base as dev-install VOLUME [ "/datahub-frontend" ] FROM ${APP_ENV}-install as final +COPY ./docker/datahub-frontend/start.sh / +RUN chown datahub:datahub /start.sh && chmod 755 /start.sh USER datahub ARG SERVER_PORT=9002 @@ -37,5 +39,4 @@ RUN echo $SERVER_PORT EXPOSE $SERVER_PORT HEALTHCHECK --start-period=2m --retries=4 CMD curl --fail http://localhost:$SERVER_PORT/admin || exit 1 -COPY ./docker/datahub-frontend/start.sh / CMD ./start.sh diff --git a/docker/datahub-frontend/start.sh b/docker/datahub-frontend/start.sh index a1548670309b5..9dc1514144bb1 100755 --- a/docker/datahub-frontend/start.sh +++ b/docker/datahub-frontend/start.sh @@ -26,6 +26,21 @@ if [[ ! -z ${SSL_TRUSTSTORE_PASSWORD:-} ]]; then TRUSTSTORE_PASSWORD="-Djavax.net.ssl.trustStorePassword=$SSL_TRUSTSTORE_PASSWORD" fi +HTTP_PROXY="" +if [[ ! -z ${HTTP_PROXY_HOST:-} ]] && [[ ! -z ${HTTP_PROXY_PORT:-} ]]; then + HTTP_PROXY="-Dhttp.proxyHost=$HTTP_PROXY_HOST -Dhttp.proxyPort=$HTTP_PROXY_PORT" +fi + +HTTPS_PROXY="" +if [[ ! -z ${HTTPS_PROXY_HOST:-} ]] && [[ ! -z ${HTTPS_PROXY_PORT:-} ]]; then + HTTPS_PROXY="-Dhttps.proxyHost=$HTTPS_PROXY_HOST -Dhttps.proxyPort=$HTTPS_PROXY_PORT" +fi + +NO_PROXY="" +if [[ ! 
-z ${HTTP_NON_PROXY_HOSTS:-} ]]; then + NO_PROXY="-Dhttp.nonProxyHosts='$HTTP_NON_PROXY_HOSTS'" +fi + # make sure there is no whitespace at the beginning and the end of # this string export JAVA_OPTS="-Xms512m \ @@ -37,6 +52,7 @@ export JAVA_OPTS="-Xms512m \ -Dlogback.debug=false \ ${PROMETHEUS_AGENT:-} ${OTEL_AGENT:-} \ ${TRUSTSTORE_FILE:-} ${TRUSTSTORE_TYPE:-} ${TRUSTSTORE_PASSWORD:-} \ + ${HTTP_PROXY:-} ${HTTPS_PROXY:-} ${NO_PROXY:-} \ -Dpidfile.path=/dev/null" exec ./datahub-frontend/bin/datahub-frontend diff --git a/docs-website/docusaurus.config.js b/docs-website/docusaurus.config.js index df69e8513fbfc..9bdba5f317542 100644 --- a/docs-website/docusaurus.config.js +++ b/docs-website/docusaurus.config.js @@ -178,8 +178,8 @@ module.exports = { appId: "RK0UG797F3", apiKey: "39d7eb90d8b31d464e309375a52d674f", indexName: "datahubproject", - // contextualSearch: true, - // searchParameters: {}, + insights: true, + contextualSearch: true, // debug: true, }, }, diff --git a/docs-website/download_historical_versions.py b/docs-website/download_historical_versions.py index a005445cb1497..83157edc1972c 100644 --- a/docs-website/download_historical_versions.py +++ b/docs-website/download_historical_versions.py @@ -1,7 +1,7 @@ +import json import os import tarfile import urllib.request -import json repo_url = "https://api.github.com/repos/datahub-project/static-assets" @@ -16,30 +16,36 @@ def download_file(url, destination): f.write(chunk) -def fetch_tar_urls(repo_url, folder_path): +def fetch_urls(repo_url: str, folder_path: str, file_format: str): api_url = f"{repo_url}/contents/{folder_path}" response = urllib.request.urlopen(api_url) - data = response.read().decode('utf-8') - tar_urls = [ - file["download_url"] for file in json.loads(data) if file["name"].endswith(".tar.gz") + data = response.read().decode("utf-8") + urls = [ + file["download_url"] + for file in json.loads(data) + if file["name"].endswith(file_format) ] - print(tar_urls) - return tar_urls + print(urls) + return urls -def main(): - folder_path = "versioned_docs" - destination_dir = "versioned_docs" +def extract_tar_file(destination_path): + with tarfile.open(destination_path, "r:gz") as tar: + tar.extractall() + os.remove(destination_path) + + +def download_versioned_docs(folder_path: str, destination_dir: str, file_format: str): if not os.path.exists(destination_dir): os.makedirs(destination_dir) - tar_urls = fetch_tar_urls(repo_url, folder_path) + urls = fetch_urls(repo_url, folder_path, file_format) - for url in tar_urls: + for url in urls: filename = os.path.basename(url) destination_path = os.path.join(destination_dir, filename) - version = '.'.join(filename.split('.')[:3]) + version = ".".join(filename.split(".")[:3]) extracted_path = os.path.join(destination_dir, version) print("extracted_path", extracted_path) if os.path.exists(extracted_path): @@ -48,13 +54,25 @@ def main(): try: download_file(url, destination_path) print(f"Downloaded {filename} to {destination_dir}") - with tarfile.open(destination_path, "r:gz") as tar: - tar.extractall() - os.remove(destination_path) + if file_format == ".tar.gz": + extract_tar_file(destination_path) except urllib.error.URLError as e: print(f"Error while downloading {filename}: {e}") continue +def main(): + download_versioned_docs( + folder_path="versioned_docs", + destination_dir="versioned_docs", + file_format=".tar.gz", + ) + download_versioned_docs( + folder_path="versioned_sidebars", + destination_dir="versioned_sidebars", + file_format=".json", + ) + + if __name__ == 
"__main__": main() diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index 51a57fc41dd36..fcf82b786a1b9 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -138,7 +138,15 @@ module.exports = { ], }, { - Deployment: [ + type: "category", + label: "Deployment", + link: { + type: "generated-index", + title: "Deployment Guides", + description: + "Learn how to deploy DataHub to your environment, set up authentication, manage upgrades, and more.", + }, + items: [ // The purpose of this section is to provide the minimum steps required to deploy DataHub to the vendor of your choosing "docs/deploy/aws", "docs/deploy/gcp", @@ -160,6 +168,7 @@ module.exports = { "docs/authentication/guides/sso/configure-oidc-react-google", "docs/authentication/guides/sso/configure-oidc-react-okta", "docs/authentication/guides/sso/configure-oidc-react-azure", + "docs/authentication/guides/sso/configure-oidc-behind-proxy", ], }, ], @@ -395,7 +404,14 @@ module.exports = { ], }, { - Features: [ + type: "category", + label: "Features", + link: { + type: "generated-index", + title: "Feature Guides", + description: "Learn about the features of DataHub.", + }, + items: [ "docs/ui-ingestion", "docs/how/search", "docs/schema-history", @@ -418,7 +434,10 @@ module.exports = { }, "docs/act-on-metadata/impact-analysis", { - Observability: ["docs/managed-datahub/observe/freshness-assertions"], + Observability: [ + "docs/managed-datahub/observe/freshness-assertions", + "docs/managed-datahub/observe/volume-assertions", + ], }, ], }, diff --git a/docs-website/versioned_sidebars/version-0.10.5-sidebars.json b/docs-website/versioned_sidebars/version-0.10.5-sidebars.json deleted file mode 100644 index 67179075fc994..0000000000000 --- a/docs-website/versioned_sidebars/version-0.10.5-sidebars.json +++ /dev/null @@ -1,594 +0,0 @@ -{ - "overviewSidebar": [ - { - "label": "Getting Started", - "type": "category", - "collapsed": true, - "items": [ - { - "type": "doc", - "label": "Introduction", - "id": "docs/features" - }, - { - "type": "doc", - "label": "Quickstart", - "id": "docs/quickstart" - }, - { - "type": "link", - "label": "Demo", - "href": "https://demo.datahubproject.io/" - }, - "docs/what-is-datahub/datahub-concepts", - "docs/saas" - ] - }, - { - "Integrations": [ - { - "type": "doc", - "label": "Introduction", - "id": "metadata-ingestion/README" - }, - { - "Quickstart Guides": [ - { - "BigQuery": [ - "docs/quick-ingestion-guides/bigquery/overview", - "docs/quick-ingestion-guides/bigquery/setup", - "docs/quick-ingestion-guides/bigquery/configuration" - ] - }, - { - "Redshift": [ - "docs/quick-ingestion-guides/redshift/overview", - "docs/quick-ingestion-guides/redshift/setup", - "docs/quick-ingestion-guides/redshift/configuration" - ] - }, - { - "Snowflake": [ - "docs/quick-ingestion-guides/snowflake/overview", - "docs/quick-ingestion-guides/snowflake/setup", - "docs/quick-ingestion-guides/snowflake/configuration" - ] - }, - { - "Tableau": [ - "docs/quick-ingestion-guides/tableau/overview", - "docs/quick-ingestion-guides/tableau/setup", - "docs/quick-ingestion-guides/tableau/configuration" - ] - }, - { - "PowerBI": [ - "docs/quick-ingestion-guides/powerbi/overview", - "docs/quick-ingestion-guides/powerbi/setup", - "docs/quick-ingestion-guides/powerbi/configuration" - ] - } - ] - }, - { - "Sources": [ - { - "type": "doc", - "id": "docs/lineage/airflow", - "label": "Airflow" - }, - "metadata-integration/java/spark-lineage/README", - "metadata-ingestion/integration_docs/great-expectations", - 
"metadata-integration/java/datahub-protobuf/README", - { - "type": "autogenerated", - "dirName": "docs/generated/ingestion/sources" - } - ] - }, - { - "Sinks": [ - { - "type": "autogenerated", - "dirName": "metadata-ingestion/sink_docs" - } - ] - }, - { - "Transformers": [ - "metadata-ingestion/docs/transformer/intro", - "metadata-ingestion/docs/transformer/dataset_transformer" - ] - }, - { - "Advanced Guides": [ - { - "Scheduling Ingestion": [ - "metadata-ingestion/schedule_docs/intro", - "metadata-ingestion/schedule_docs/cron", - "metadata-ingestion/schedule_docs/airflow", - "metadata-ingestion/schedule_docs/kubernetes" - ] - }, - "docs/platform-instances", - "metadata-ingestion/docs/dev_guides/stateful", - "metadata-ingestion/docs/dev_guides/classification", - "metadata-ingestion/docs/dev_guides/add_stateful_ingestion_to_source", - "metadata-ingestion/docs/dev_guides/sql_profiles" - ] - } - ] - }, - { - "Deployment": [ - "docs/deploy/aws", - "docs/deploy/gcp", - "docker/README", - "docs/deploy/kubernetes", - "docs/deploy/environment-vars", - { - "Authentication": [ - "docs/authentication/README", - "docs/authentication/concepts", - "docs/authentication/changing-default-credentials", - "docs/authentication/guides/add-users", - { - "Frontend Authentication": [ - "docs/authentication/guides/jaas", - { - "OIDC Authentication": [ - "docs/authentication/guides/sso/configure-oidc-react", - "docs/authentication/guides/sso/configure-oidc-react-google", - "docs/authentication/guides/sso/configure-oidc-react-okta", - "docs/authentication/guides/sso/configure-oidc-react-azure" - ] - } - ] - }, - "docs/authentication/introducing-metadata-service-authentication", - "docs/authentication/personal-access-tokens" - ] - }, - { - "Authorization": [ - "docs/authorization/README", - "docs/authorization/roles", - "docs/authorization/policies", - "docs/authorization/groups" - ] - }, - { - "Advanced Guides": [ - "docs/how/delete-metadata", - "docs/how/configuring-authorization-with-apache-ranger", - "docs/how/backup-datahub", - "docs/how/restore-indices", - "docs/advanced/db-retention", - "docs/advanced/monitoring", - "docs/how/extract-container-logs", - "docs/deploy/telemetry", - "docs/how/kafka-config", - "docs/deploy/confluent-cloud", - "docs/advanced/no-code-upgrade", - "docs/how/jattach-guide" - ] - }, - "docs/how/updating-datahub" - ] - }, - { - "API": [ - "docs/api/datahub-apis", - { - "GraphQL API": [ - { - "label": "Overview", - "type": "doc", - "id": "docs/api/graphql/overview" - }, - { - "Reference": [ - { - "type": "doc", - "label": "Queries", - "id": "graphql/queries" - }, - { - "type": "doc", - "label": "Mutations", - "id": "graphql/mutations" - }, - { - "type": "doc", - "label": "Objects", - "id": "graphql/objects" - }, - { - "type": "doc", - "label": "Inputs", - "id": "graphql/inputObjects" - }, - { - "type": "doc", - "label": "Interfaces", - "id": "graphql/interfaces" - }, - { - "type": "doc", - "label": "Unions", - "id": "graphql/unions" - }, - { - "type": "doc", - "label": "Enums", - "id": "graphql/enums" - }, - { - "type": "doc", - "label": "Scalars", - "id": "graphql/scalars" - } - ] - }, - { - "Guides": [ - { - "type": "doc", - "label": "How To Set Up GraphQL", - "id": "docs/api/graphql/how-to-set-up-graphql" - }, - { - "type": "doc", - "label": "Getting Started With GraphQL", - "id": "docs/api/graphql/getting-started" - }, - { - "type": "doc", - "label": "Access Token Management", - "id": "docs/api/graphql/token-management" - } - ] - } - ] - }, - { - "type": "doc", - "label": "OpenAPI", - 
"id": "docs/api/openapi/openapi-usage-guide" - }, - "docs/dev-guides/timeline", - { - "Rest.li API": [ - { - "type": "doc", - "label": "Rest.li API Guide", - "id": "docs/api/restli/restli-overview" - }, - { - "type": "doc", - "label": "Restore Indices", - "id": "docs/api/restli/restore-indices" - }, - { - "type": "doc", - "label": "Get Index Sizes", - "id": "docs/api/restli/get-index-sizes" - }, - { - "type": "doc", - "label": "Truncate Timeseries Aspect", - "id": "docs/api/restli/truncate-time-series-aspect" - }, - { - "type": "doc", - "label": "Get ElasticSearch Task Status Endpoint", - "id": "docs/api/restli/get-elastic-task-status" - }, - { - "type": "doc", - "label": "Evaluate Tests", - "id": "docs/api/restli/evaluate-tests" - }, - { - "type": "doc", - "label": "Aspect Versioning and Rest.li Modeling", - "id": "docs/advanced/aspect-versioning" - } - ] - }, - { - "Python SDK": [ - "metadata-ingestion/as-a-library", - { - "Python SDK Reference": [ - { - "type": "autogenerated", - "dirName": "python-sdk" - } - ] - } - ] - }, - "metadata-integration/java/as-a-library", - { - "API and SDK Guides": [ - "docs/advanced/patch", - "docs/api/tutorials/datasets", - "docs/api/tutorials/lineage", - "docs/api/tutorials/tags", - "docs/api/tutorials/terms", - "docs/api/tutorials/owners", - "docs/api/tutorials/domains", - "docs/api/tutorials/deprecation", - "docs/api/tutorials/descriptions", - "docs/api/tutorials/custom-properties", - "docs/api/tutorials/ml" - ] - }, - { - "type": "category", - "label": "DataHub CLI", - "link": { - "type": "doc", - "id": "docs/cli" - }, - "items": [ - "docs/datahub_lite" - ] - }, - { - "type": "category", - "label": "Datahub Actions", - "link": { - "type": "doc", - "id": "docs/act-on-metadata" - }, - "items": [ - "docs/actions/README", - "docs/actions/quickstart", - "docs/actions/concepts", - { - "Sources": [ - { - "type": "autogenerated", - "dirName": "docs/actions/sources" - } - ] - }, - { - "Events": [ - { - "type": "autogenerated", - "dirName": "docs/actions/events" - } - ] - }, - { - "Actions": [ - { - "type": "autogenerated", - "dirName": "docs/actions/actions" - } - ] - }, - { - "Guides": [ - { - "type": "autogenerated", - "dirName": "docs/actions/guides" - } - ] - } - ] - } - ] - }, - { - "Features": [ - "docs/ui-ingestion", - "docs/how/search", - "docs/schema-history", - "docs/domains", - "docs/dataproducts", - "docs/glossary/business-glossary", - "docs/tags", - "docs/ownership/ownership-types", - "docs/browse", - "docs/authorization/access-policies-guide", - "docs/features/dataset-usage-and-query-history", - "docs/posts", - "docs/sync-status", - "docs/lineage/lineage-feature-guide", - { - "type": "doc", - "id": "docs/tests/metadata-tests", - "className": "saasOnly" - }, - "docs/act-on-metadata/impact-analysis", - { - "Observability": [ - "docs/managed-datahub/observe/freshness-assertions" - ] - } - ] - }, - { - "Develop": [ - { - "DataHub Metadata Model": [ - "docs/modeling/metadata-model", - "docs/modeling/extending-the-metadata-model", - "docs/what/mxe", - { - "Entities": [ - { - "type": "autogenerated", - "dirName": "docs/generated/metamodel/entities" - } - ] - } - ] - }, - { - "Architecture": [ - "docs/architecture/architecture", - "docs/components", - "docs/architecture/metadata-ingestion", - "docs/architecture/metadata-serving", - "docs/architecture/docker-containers" - ] - }, - { - "Developing on DataHub": [ - "docs/developers", - "docs/docker/development", - "metadata-ingestion/developing", - "docs/api/graphql/graphql-endpoint-development", - { - 
"Modules": [ - "datahub-web-react/README", - "datahub-frontend/README", - "datahub-graphql-core/README", - "metadata-service/README", - "metadata-jobs/mae-consumer-job/README", - "metadata-jobs/mce-consumer-job/README" - ] - } - ] - }, - "docs/plugins", - { - "Troubleshooting": [ - "docs/troubleshooting/quickstart", - "docs/troubleshooting/build", - "docs/troubleshooting/general" - ] - }, - { - "Advanced": [ - "metadata-ingestion/docs/dev_guides/reporting_telemetry", - "docs/advanced/mcp-mcl", - "docker/datahub-upgrade/README", - "docs/advanced/no-code-modeling", - "datahub-web-react/src/app/analytics/README", - "docs/how/migrating-graph-service-implementation", - "docs/advanced/field-path-spec-v2", - "metadata-ingestion/adding-source", - "docs/how/add-custom-ingestion-source", - "docs/how/add-custom-data-platform", - "docs/advanced/browse-paths-upgrade", - "docs/browseV2/browse-paths-v2" - ] - } - ] - }, - { - "Community": [ - "docs/slack", - "docs/townhalls", - "docs/townhall-history", - "docs/CODE_OF_CONDUCT", - "docs/CONTRIBUTING", - "docs/links", - "docs/rfc" - ] - }, - { - "Managed DataHub": [ - "docs/managed-datahub/managed-datahub-overview", - "docs/managed-datahub/welcome-acryl", - { - "type": "doc", - "id": "docs/managed-datahub/saas-slack-setup", - "className": "saasOnly" - }, - { - "type": "doc", - "id": "docs/managed-datahub/approval-workflows", - "className": "saasOnly" - }, - { - "Metadata Ingestion With Acryl": [ - "docs/managed-datahub/metadata-ingestion-with-acryl/ingestion" - ] - }, - { - "DataHub API": [ - { - "type": "doc", - "id": "docs/managed-datahub/datahub-api/entity-events-api", - "className": "saasOnly" - }, - { - "GraphQL API": [ - "docs/managed-datahub/datahub-api/graphql-api/getting-started", - { - "type": "doc", - "id": "docs/managed-datahub/datahub-api/graphql-api/incidents-api-beta", - "className": "saasOnly" - } - ] - } - ] - }, - { - "Integrations": [ - { - "type": "doc", - "id": "docs/managed-datahub/integrations/aws-privatelink", - "className": "saasOnly" - }, - { - "type": "doc", - "id": "docs/managed-datahub/integrations/oidc-sso-integration", - "className": "saasOnly" - } - ] - }, - { - "Operator Guide": [ - { - "type": "doc", - "id": "docs/managed-datahub/operator-guide/setting-up-remote-ingestion-executor-on-aws", - "className": "saasOnly" - }, - { - "type": "doc", - "id": "docs/managed-datahub/operator-guide/setting-up-events-api-on-aws-eventbridge", - "className": "saasOnly" - } - ] - }, - { - "type": "doc", - "id": "docs/managed-datahub/chrome-extension", - "className": "saasOnly" - }, - { - "Managed DataHub Release History": [ - "docs/managed-datahub/release-notes/v_0_2_10", - "docs/managed-datahub/release-notes/v_0_2_9", - "docs/managed-datahub/release-notes/v_0_2_8", - "docs/managed-datahub/release-notes/v_0_2_7", - "docs/managed-datahub/release-notes/v_0_2_6", - "docs/managed-datahub/release-notes/v_0_2_5", - "docs/managed-datahub/release-notes/v_0_2_4", - "docs/managed-datahub/release-notes/v_0_2_3", - "docs/managed-datahub/release-notes/v_0_2_2", - "docs/managed-datahub/release-notes/v_0_2_1", - "docs/managed-datahub/release-notes/v_0_2_0", - "docs/managed-datahub/release-notes/v_0_1_73", - "docs/managed-datahub/release-notes/v_0_1_72", - "docs/managed-datahub/release-notes/v_0_1_70", - "docs/managed-datahub/release-notes/v_0_1_69" - ] - } - ] - }, - { - "Release History": [ - "releases" - ] - } - ] -} diff --git a/docs/api/tutorials/lineage.md b/docs/api/tutorials/lineage.md index ce23a4d274e8e..dc43cb178f949 100644 --- 
a/docs/api/tutorials/lineage.md +++ b/docs/api/tutorials/lineage.md @@ -145,8 +145,8 @@ You can now see the column-level lineage between datasets. Note that you have to -```json -mutation searchAcrossLineage { +```graphql +query searchAcrossLineage { searchAcrossLineage( input: { query: "*" diff --git a/docs/authentication/guides/sso/configure-oidc-behind-proxy.md b/docs/authentication/guides/sso/configure-oidc-behind-proxy.md new file mode 100644 index 0000000000000..c998816e04735 --- /dev/null +++ b/docs/authentication/guides/sso/configure-oidc-behind-proxy.md @@ -0,0 +1,64 @@ +# Configuring Frontend to use a Proxy when communicating with SSO Provider +*Authored on 22/08/2023* + +The `datahub-frontend-react` server can be configured to use an http proxy when retrieving the openid-configuration. +This can be needed if your infrastructure is locked down and disallows connectivity by default, using proxies for fine-grained egress control. + +## Configure http proxy and non proxy hosts + +To do this, you will need to pass a set of environment variables to the datahub-frontend-react container (e.g. in the `docker-compose.yml` file or your kubernetes manifest). + +``` +HTTP_PROXY_HOST=host of your http proxy +HTTP_PROXY_PORT=port of your http proxy +HTTPS_PROXY_HOST=host of your http(s) proxy used for https connections (often the same as the http proxy) +HTTPS_PROXY_PORT=port of your http(s) proxy used for https connections (often the same as the http proxy) +HTTP_NON_PROXY_HOSTS=localhost|datahub-gms (or any other hosts that you would like to bypass the proxy for, delimited by pipe) +``` + +## Optional: provide custom truststore +If your upstream proxy performs SSL termination to inspect traffic, this will result in different (self-signed) certificates for HTTPS connections. +The default truststore used in the `datahub-frontend-react` docker image will not trust these kinds of connections. +To address this, you can copy or mount your own truststore (provided by the proxy or network administrators) into the docker container. + +Depending on your setup, you have a few options to achieve this: + +### Make truststore available in the frontend + +#### Option a) Build frontend docker image with your own truststore included + +To build a custom image for your frontend, with the certificates built-in, you can use the official frontend image as a base, then copy in your required files. + +Example Dockerfile: + +```dockerfile +FROM linkedin/datahub-frontend-react: +COPY /truststore-directory /certificates +``` + +Building this Dockerfile will result in your own custom docker image on your local machine. +You will then be able to tag it, publish it to your own registry, etc. + +#### Option b) Mount truststore from your host machine using a docker volume + +Adapt your docker-compose.yml to include a new volume mount in the `datahub-frontend-react` container + +```docker + datahub-frontend-react: + # ... + volumes: + # ... + - /truststore-directory:/certificates +``` + +### Reference new truststore + +Add the following environment values to the `datahub-frontend-react` container: + +``` +SSL_TRUSTSTORE_FILE=path/to/truststore.jks (e.g. /certificates) +SSL_TRUSTSTORE_TYPE=jks +SSL_TRUSTSTORE_PASSWORD=MyTruststorePassword +``` + +Once these steps are done, your frontend container will use the new truststore when validating SSL/HTTPS connections. 
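For reference, a docker-compose service definition that wires in both the proxy settings and the custom truststore described above might look like the following sketch. The host names, ports, truststore path, and password are placeholders for your own environment:

```yaml
  datahub-frontend-react:
    # ... image, ports, and the rest of your existing service definition ...
    environment:
      - HTTP_PROXY_HOST=proxy.internal.example.com   # placeholder proxy host
      - HTTP_PROXY_PORT=3128                         # placeholder proxy port
      - HTTPS_PROXY_HOST=proxy.internal.example.com
      - HTTPS_PROXY_PORT=3128
      - HTTP_NON_PROXY_HOSTS=localhost|datahub-gms
      # Only needed if the proxy performs SSL termination with its own certificates:
      - SSL_TRUSTSTORE_FILE=/certificates/truststore.jks
      - SSL_TRUSTSTORE_TYPE=jks
      - SSL_TRUSTSTORE_PASSWORD=MyTruststorePassword
    volumes:
      - /truststore-directory:/certificates
```

The proxy variables are consumed by `start.sh` (see the change earlier in this diff), which translates them into the corresponding `-Dhttp.proxyHost`, `-Dhttps.proxyHost`, and `-Dhttp.nonProxyHosts` JVM options.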
diff --git a/docs/how/delete-metadata.md b/docs/how/delete-metadata.md index acbb573020be0..f720a66ce5765 100644 --- a/docs/how/delete-metadata.md +++ b/docs/how/delete-metadata.md @@ -43,6 +43,9 @@ datahub delete --platform snowflake # Filters can be combined, which will select entities that match all filters. datahub delete --platform looker --entity-type chart datahub delete --platform bigquery --env PROD + +# You can also do recursive deletes for container and dataPlatformInstance entities. +datahub delete --urn "urn:li:container:f76..." --recursive ``` When performing hard deletes, you can optionally add the `--only-soft-deleted` flag to only hard delete entities that were previously soft deleted. @@ -122,6 +125,14 @@ datahub delete --urn "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted datahub delete --platform snowflake --env DEV ``` +#### Delete everything within a specific Snowflake DB + +```shell +# You can find your container urn by navigating to the relevant +# DB in the DataHub UI and clicking the "copy urn" button. +datahub delete --urn "urn:li:container:77644901c4f574845578ebd18b7c14fa" --recursive +``` + #### Delete all BigQuery datasets in the PROD environment ```shell @@ -129,6 +140,13 @@ datahub delete --platform snowflake --env DEV datahub delete --env PROD --entity-type dataset --platform bigquery ``` +#### Delete everything within a MySQL platform instance + +```shell +# The instance name comes from the `platform_instance` config option in the ingestion recipe. +datahub delete --urn 'urn:li:dataPlatformInstance:(urn:li:dataPlatform:mysql,my_instance_name)' --recursive +``` + #### Delete all pipelines and tasks from Airflow ```shell @@ -138,6 +156,7 @@ datahub delete --platform "airflow" #### Delete all containers for a particular platform ```shell +# Note: this will leave S3 datasets intact. datahub delete --entity-type container --platform s3 ``` diff --git a/docs/how/search.md b/docs/how/search.md index bf1c8e8632e24..6a5e85e547fc5 100644 --- a/docs/how/search.md +++ b/docs/how/search.md @@ -2,14 +2,6 @@ import FeatureAvailability from '@site/src/components/FeatureAvailability'; # About DataHub Search - - - - The **search bar** is an important mechanism for discovering data assets in DataHub. From the search bar, you can find Datasets, Columns, Dashboards, Charts, Data Pipelines, and more. Simply type in a term and press 'enter'. diff --git a/docs/lineage/airflow.md b/docs/lineage/airflow.md index 21d59b777dd7c..49de5352f6d58 100644 --- a/docs/lineage/airflow.md +++ b/docs/lineage/airflow.md @@ -65,7 +65,7 @@ lazy_load_plugins = False | datahub.capture_executions | true | If true, we'll capture task runs in DataHub in addition to DAG definitions. | | datahub.graceful_exceptions | true | If set to true, most runtime errors in the lineage backend will be suppressed and will not cause the overall task to fail. Note that configuration issues will still throw exceptions. | -5. Configure `inlets` and `outlets` for your Airflow operators. For reference, look at the sample DAG in [`lineage_backend_demo.py`](../../metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_demo.py), or reference [`lineage_backend_taskflow_demo.py`](../../metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_taskflow_demo.py) if you're using the [TaskFlow API](https://airflow.apache.org/docs/apache-airflow/stable/concepts/taskflow.html). +5. Configure `inlets` and `outlets` for your Airflow operators. 
For reference, look at the sample DAG in [`lineage_backend_demo.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_demo.py), or reference [`lineage_backend_taskflow_demo.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py) if you're using the [TaskFlow API](https://airflow.apache.org/docs/apache-airflow/stable/concepts/taskflow.html). 6. [optional] Learn more about [Airflow lineage](https://airflow.apache.org/docs/apache-airflow/stable/lineage.html), including shorthand notation and some automation. ### How to validate installation @@ -160,14 +160,14 @@ pip install acryl-datahub[airflow,datahub-kafka] - `capture_executions` (defaults to false): If true, it captures task runs as DataHub DataProcessInstances. - `graceful_exceptions` (defaults to true): If set to true, most runtime errors in the lineage backend will be suppressed and will not cause the overall task to fail. Note that configuration issues will still throw exceptions. -4. Configure `inlets` and `outlets` for your Airflow operators. For reference, look at the sample DAG in [`lineage_backend_demo.py`](../../metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_demo.py), or reference [`lineage_backend_taskflow_demo.py`](../../metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_taskflow_demo.py) if you're using the [TaskFlow API](https://airflow.apache.org/docs/apache-airflow/stable/concepts/taskflow.html). +4. Configure `inlets` and `outlets` for your Airflow operators. For reference, look at the sample DAG in [`lineage_backend_demo.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_demo.py), or reference [`lineage_backend_taskflow_demo.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py) if you're using the [TaskFlow API](https://airflow.apache.org/docs/apache-airflow/stable/concepts/taskflow.html). 5. [optional] Learn more about [Airflow lineage](https://airflow.apache.org/docs/apache-airflow/stable/lineage.html), including shorthand notation and some automation. ## Emitting lineage via a separate operator Take a look at this sample DAG: -- [`lineage_emission_dag.py`](../../metadata-ingestion/src/datahub_provider/example_dags/lineage_emission_dag.py) - emits lineage using the DatahubEmitterOperator. +- [`lineage_emission_dag.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py) - emits lineage using the DatahubEmitterOperator. In order to use this example, you must first configure the Datahub hook. Like in ingestion, we support a Datahub REST hook and a Kafka-based hook. See step 1 above for details. diff --git a/docs/managed-datahub/chrome-extension.md b/docs/managed-datahub/chrome-extension.md index c6840f4e8e221..0aa0860d03b67 100644 --- a/docs/managed-datahub/chrome-extension.md +++ b/docs/managed-datahub/chrome-extension.md @@ -12,7 +12,7 @@ In order to use the Acryl DataHub Chrome extension, you need to download it onto


@@ -26,7 +26,7 @@ Once you have your extension installed, you'll need to configure it to work with


@@ -34,7 +34,7 @@ Once you have your extension installed, you'll need to configure it to work with


@@ -48,7 +48,7 @@ Some organizations have custom SaaS domains for Looker and some Acryl DataHub de


@@ -56,7 +56,7 @@ Some organizations have custom SaaS domains for Looker and some Acryl DataHub de


@@ -74,7 +74,7 @@ Once you have everything configured on your extension, it's time to use it!


diff --git a/docs/managed-datahub/datahub-api/graphql-api/getting-started.md b/docs/managed-datahub/datahub-api/graphql-api/getting-started.md index 57d46f05c4e0c..736bf6fea6811 100644 --- a/docs/managed-datahub/datahub-api/graphql-api/getting-started.md +++ b/docs/managed-datahub/datahub-api/graphql-api/getting-started.md @@ -12,7 +12,7 @@ For a full reference to the Queries & Mutations available for consumption, check

- +

diff --git a/docs/managed-datahub/datahub-api/graphql-api/incidents-api-beta.md b/docs/managed-datahub/datahub-api/graphql-api/incidents-api-beta.md index bfd8e8f2dae1b..16d83d2f57575 100644 --- a/docs/managed-datahub/datahub-api/graphql-api/incidents-api-beta.md +++ b/docs/managed-datahub/datahub-api/graphql-api/incidents-api-beta.md @@ -406,7 +406,7 @@ These notifications are also able to tag the immediate asset's owners, along wit

- +

diff --git a/docs/managed-datahub/integrations/oidc-sso-integration.md b/docs/managed-datahub/integrations/oidc-sso-integration.md index c0f5069d849fa..ec4ca311a0de5 100644 --- a/docs/managed-datahub/integrations/oidc-sso-integration.md +++ b/docs/managed-datahub/integrations/oidc-sso-integration.md @@ -44,6 +44,6 @@ To enable the OIDC integration, start by navigating to **Settings > Platform > S

- +

diff --git a/docs/managed-datahub/metadata-ingestion-with-acryl/ingestion.md b/docs/managed-datahub/metadata-ingestion-with-acryl/ingestion.md index e225fd8b014c8..0444d15b3627c 100644 --- a/docs/managed-datahub/metadata-ingestion-with-acryl/ingestion.md +++ b/docs/managed-datahub/metadata-ingestion-with-acryl/ingestion.md @@ -58,13 +58,13 @@ In Acryl DataHub deployments, you _must_ use a sink of type `datahub-rest`, whic The token can be retrieved by logging in as admin. You can go to Settings page and generate a Personal Access Token with your desired expiration date.
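As a side note on the `datahub-rest` sink and Personal Access Token mentioned in the hunk above: the token is passed as a bearer credential wherever the REST endpoint is called. The sketch below illustrates the same idea from Python using the REST emitter; the server URL, token, and dataset name are placeholders, and this is an illustration rather than part of the documented recipe.

```python
# Sketch: pushing a metadata aspect to a DataHub/Acryl REST endpoint with a token.
# Server URL, token, and dataset name are placeholders.
from datahub.emitter.mce_builder import make_dataset_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import DatasetPropertiesClass

emitter = DatahubRestEmitter(
    gms_server="https://your-account-id.acryl.io/gms",
    token="<personal-access-token>",
)

emitter.emit(
    MetadataChangeProposalWrapper(
        entityUrn=make_dataset_urn(platform="snowflake", name="analytics.raw.clicks", env="PROD"),
        aspect=DatasetPropertiesClass(description="Raw click events"),
    )
)
```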

- +

- +

diff --git a/docs/managed-datahub/observe/freshness-assertions.md b/docs/managed-datahub/observe/freshness-assertions.md index 54b3134151d3a..c5d4ca9081b43 100644 --- a/docs/managed-datahub/observe/freshness-assertions.md +++ b/docs/managed-datahub/observe/freshness-assertions.md @@ -59,7 +59,7 @@ Tables. For example, imagine that we work for a company with a Snowflake Table that stores user clicks collected from our e-commerce website. This table is updated with new data on a specific cadence: once per hour (In practice, daily or even weekly are also common). In turn, there is a downstream Business Analytics Dashboard in Looker that shows important metrics like -the number of people clicking our "Daily Sale" banners, and this dashboard pulls is generated from data stored in our "clicks" table. +the number of people clicking our "Daily Sale" banners, and this dashboard is generated from data stored in our "clicks" table. It is important that our clicks Table continues to be updated each hour because if it stops being updated, it could mean that our downstream metrics dashboard becomes incorrect. And the risk of this situation is obvious: our organization may make bad decisions based on incomplete information. @@ -122,8 +122,12 @@ Change Source types vary by the platform, but generally fall into these categori is higher than the previously observed value, in order to determine whether the Table has been changed within a given period of time. Note that this approach is only supported if the Change Window does not use a fixed interval. - Using the final 2 approaches - column value queries - to determine whether a Table has changed useful because it can be customized to determine whether - specific types of important changes have been made to a given Table. + - **DataHub Operation**: A DataHub "Operation" aspect contains timeseries information used to describe changes made to an entity. Using this + option avoids contacting your data platform, and instead uses the DataHub Operation metadata to evaluate Freshness Assertions. + This relies on Operations being reported to DataHub, either via ingestion or via use of the DataHub APIs (see [Report Operation via API](#reporting-operations-via-api)). + Note if you have not configured an ingestion source through DataHub, then this may be the only option available. + + Using either of the column value approaches (**Last Modified Column** or **High Watermark Column**) to determine whether a Table has changed can be useful because it can be customized to determine whether specific types of important changes have been made to a given Table. Because it does not involve system warehouse tables, it is also easily portable across Data Warehouse and Data Lake providers. Freshness Assertions also have an off switch: they can be started or stopped at any time with the click of button. @@ -178,7 +182,7 @@ _Check whether the table has changed in a specific window of time_ 7. (Optional) Click **Advanced** to customize the evaluation **source**. This is the mechanism that will be used to evaluate -the check. Each Data Platform supports different options including Audit Log, Information Schema, Last Modified Column, and High Watermark Column. +the check. Each Data Platform supports different options including Audit Log, Information Schema, Last Modified Column, High Watermark Column, and DataHub Operation.

@@ -189,11 +193,12 @@ the check. Each Data Platform supports different options including Audit Log, In - **Last Modified Column**: Check for the presence of rows using a "Last Modified Time" column, which should reflect the time at which a given row was last changed in the table, to determine whether the table changed within the evaluation period. - **High Watermark Column**: Monitor changes to a continuously-increasing "high watermark" column value to determine whether a table - has been changed. This option is particularly useful for tables that grow consistently with time, for example fact or event (e.g. click-strea) tables. It is not available + has been changed. This option is particularly useful for tables that grow consistently with time, for example fact or event (e.g. click-stream) tables. It is not available when using a fixed lookback period. +- **DataHub Operation**: Use DataHub Operations to determine whether the table changed within the evaluation period. -8. Click **Next** -9. Configure actions that should be taken when the Freshness Assertion passes or fails +1. Click **Next** +2. Configure actions that should be taken when the Freshness Assertion passes or fails
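To make the column-based change sources in the list above more concrete, here is a rough sketch of what a "Last Modified Column" style check amounts to. This is an illustration only, not Acryl's implementation; the connection string, table, and column names are hypothetical.

```python
# Sketch of a "Last Modified Column" freshness check: did any row change inside
# the evaluation window? Table and column names are hypothetical.
from datetime import datetime, timedelta, timezone

import sqlalchemy as sa

engine = sa.create_engine("snowflake://<user>:<password>@<account>/<db>")  # placeholder

def table_changed_since(window_start: datetime) -> bool:
    query = sa.text(
        "SELECT COUNT(*) FROM analytics.raw.clicks "
        "WHERE last_modified_at >= :window_start"
    )
    with engine.connect() as conn:
        modified_rows = conn.execute(query, {"window_start": window_start}).scalar()
    return (modified_rows or 0) > 0

# A 1-hour lookback window, mirroring an hourly evaluation schedule.
window_start = datetime.now(timezone.utc) - timedelta(hours=1)
print("PASS" if table_changed_since(window_start) else "FAIL")
```

A "High Watermark Column" source follows the same pattern, except it compares the current `MAX()` of the watermark column against the value observed at the previous evaluation instead of using a fixed lookback window.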

@@ -280,7 +285,7 @@ Note that to create or delete Assertions and Monitors for a specific entity on D In order to create a Freshness Assertion that is being monitored on a specific **Evaluation Schedule**, you'll need to use 2 GraphQL mutation queries to create a Freshness Assertion entity and create an Assertion Monitor entity responsible for evaluating it. -Start by creating the Freshness Assertion entity using the `createFreshnessAssertion` query and hang on to the 'urn' field of the Assertion entit y +Start by creating the Freshness Assertion entity using the `createFreshnessAssertion` query and hang on to the 'urn' field of the Assertion entity you get back. Then continue by creating a Monitor entity using the `createAssertionMonitor`. ##### Examples @@ -291,10 +296,10 @@ To create a Freshness Assertion Entity that checks whether a table has been upda mutation createFreshnessAssertion { createFreshnessAssertion( input: { - entityUrn: "" - type: DATASET_CHANGE + entityUrn: "", + type: DATASET_CHANGE, schedule: { - type: FIXED_INTERVAL + type: FIXED_INTERVAL, fixedInterval: { unit: HOUR, multiple: 8 } } } @@ -337,6 +342,28 @@ After creating the monitor, the new assertion will start to be evaluated every 8 You can delete assertions along with their monitors using GraphQL mutations: `deleteAssertion` and `deleteMonitor`. +### Reporting Operations via API + +DataHub Operations can be used to capture changes made to entities. This is useful for cases where the underlying data platform does not provide a mechanism +to capture changes, or where the data platform's mechanism is not reliable. In order to report an operation, you can use the `reportOperation` GraphQL mutation. + + +##### Examples +```json +mutation reportOperation { + reportOperation( + input: { + urn: "", + operationType: INSERT, + sourceType: DATA_PLATFORM, + timestampMillis: 1693252366489 + } + ) +} +``` + +Use the `timestampMillis` field to specify the time at which the operation occurred. If no value is provided, the current time will be used. + ### Tips :::info diff --git a/docs/managed-datahub/observe/volume-assertions.md b/docs/managed-datahub/observe/volume-assertions.md new file mode 100644 index 0000000000000..5f5aff33a5ce2 --- /dev/null +++ b/docs/managed-datahub/observe/volume-assertions.md @@ -0,0 +1,355 @@ +--- +description: This page provides an overview of working with DataHub Volume Assertions +--- +import FeatureAvailability from '@site/src/components/FeatureAvailability'; + + +# Volume Assertions + + + + +> ⚠️ The **Volume Assertions** feature is currently in private beta, part of the **Acryl Observe** module, and may only be available to a +> limited set of design partners. +> +> If you are interested in trying it and providing feedback, please reach out to your Acryl Customer Success +> representative. + +## Introduction + +Can you remember a time when the meaning of a Data Warehouse Table that you depended on fundamentally changed, with little or no notice? +If the answer is yes, how did you find out? We'll take a guess - someone looking at an internal reporting dashboard or worse, a user using your product, sounded an alarm when +a number looked a bit out of the ordinary. Perhaps your table initially tracked purchases made on your company's e-commerce web store, but suddenly began to include purchases made +through your company's new mobile app. 
+ +There are many reasons why an important Table on Snowflake, Redshift, or BigQuery may change in its meaning - application code bugs, new feature rollouts, +changes to key metric definitions, etc. Oftentimes, these changes break important assumptions made about the data used in building key downstream data products +like reporting dashboards or data-driven product features. + +What if you could reduce the time to detect these incidents, so that the people responsible for the data were made aware of data +issues _before_ anyone else? With Acryl DataHub **Volume Assertions**, you can. + +Acryl DataHub allows users to define expectations about the normal volume, or size, of a particular warehouse Table, +and then monitor those expectations over time as the table grows and changes. + +In this article, we'll cover the basics of monitoring Volume Assertions - what they are, how to configure them, and more - so that you and your team can +start building trust in your most important data assets. + +Let's get started! + +## Support + +Volume Assertions are currently supported for: + +1. Snowflake +2. Redshift +3. BigQuery + +Note that an Ingestion Source _must_ be configured with the data platform of your choice in Acryl DataHub's **Ingestion** +tab. + +> Note that Volume Assertions are not yet supported if you are connecting to your warehouse +> using the DataHub CLI or a Remote Ingestion Executor. + +## What is a Volume Assertion? + +A **Volume Assertion** is a configurable Data Quality rule used to monitor a Data Warehouse Table +for unexpected or sudden changes in "volume", or row count. Volume Assertions can be particularly useful when you have frequently-changing +Tables which have a relatively stable pattern of growth or decline. + +For example, imagine that we work for a company with a Snowflake Table that stores user clicks collected from our e-commerce website. +This table is updated with new data on a specific cadence: once per hour (in practice, daily or even weekly are also common). +In turn, there is a downstream Business Analytics Dashboard in Looker that shows important metrics like +the number of people clicking our "Daily Sale" banners, and this dashboard is generated from data stored in our "clicks" table. +It is important that our clicks Table is updated with the correct number of rows each hour, else it could mean +that our downstream metrics dashboard becomes incorrect. The risk of this situation is obvious: our organization +may make bad decisions based on incomplete information. + +In such cases, we can use a **Volume Assertion** that checks whether the Snowflake "clicks" Table is growing in an expected +way, and that there are no sudden increases or sudden decreases in the rows being added or removed from the table. +If too many rows are added or removed within an hour, we can notify key stakeholders and begin to root cause the issue before it impacts consumers of the data. + +### Anatomy of a Volume Assertion + +At the most basic level, **Volume Assertions** consist of a few important parts: + +1. An **Evaluation Schedule** +2. A **Volume Condition** +3. A **Volume Source** + +In this section, we'll give an overview of each. + +#### 1. Evaluation Schedule + +The **Evaluation Schedule**: This defines how often to check a given warehouse Table for its volume. This should usually +be configured to match the expected change frequency of the Table, although it can also be less frequent depending +on the requirements. 
You can also specify specific days of the week, hours in the day, or even +minutes in an hour. + + +#### 2. Volume Condition + +The **Volume Condition**: This defines the type of condition that we'd like to monitor, or when the Assertion +should result in failure. + +There are 2 different categories of conditions: **Total** Volume and **Change** Volume. + +_Total_ volume conditions are those which are defined against the point-in-time total row count for a table. They allow you to specify conditions like: + +1. **Table has too many rows**: The table should always have less than 1000 rows +2. **Table has too few rows**: The table should always have more than 1000 rows +3. **Table row count is outside a range**: The table should always have between 1000 and 2000 rows. + +_Change_ volume conditions are those which are defined against the growth or decline rate of a table, measured between subsequent checks +of the table volume. They allow you to specify conditions like: + +1. **Table growth is too fast**: When the table volume is checked, it should have < 1000 more rows than it had during the previous check. +2. **Table growth is too slow**: When the table volume is checked, it should have > 1000 more rows than it had during the previous check. +3. **Table growth is outside a range**: When the table volume is checked, it should have between 1000 and 2000 more rows than it had during the previous check. + +For change volume conditions, both _absolute_ row count deltas and relative percentage deltas are supported for identifying +tables that are following an abnormal pattern of growth. + + +#### 3. Volume Source + +The **Volume Source**: This is the mechanism that Acryl DataHub should use to determine the table volume (row count). The supported +source types vary by the platform, but generally fall into these categories: + +- **Information Schema**: A system Table that is exposed by the Data Warehouse which contains live information about the Databases + and Tables stored inside the Data Warehouse, including their row count. It is usually efficient to check, but can in some cases be slightly delayed to update + once a change has been made to a table. + +- **Query**: A `COUNT(*)` query is used to retrieve the latest row count for a table, with optional SQL filters applied (depending on platform). + This can be less efficient to check depending on the size of the table. This approach is also more portable: because it does not involve + system warehouse tables, it is easily used across Data Warehouse and Data Lake providers. + +- **DataHub Dataset Profile**: The DataHub Dataset Profile aspect is used to retrieve the latest row count information for a table. + Using this option avoids contacting your data platform, and instead uses the DataHub Dataset Profile metadata to evaluate Volume Assertions. + Note that if you have not configured an ingestion source through DataHub, then this may be the only option available. + +Volume Assertions also have an off switch: they can be started or stopped at any time with the click of a button. + + +## Creating a Volume Assertion + +### Prerequisites + +1. **Permissions**: To create or delete Volume Assertions for a specific entity on DataHub, you'll need to be granted the + `Edit Assertions` and `Edit Monitors` privileges for the entity. This is granted to Entity owners by default. + +2. 
**Data Platform Connection**: In order to create a Volume Assertion, you'll need to have an **Ingestion Source** configured to your + Data Platform: Snowflake, BigQuery, or Redshift under the **Integrations** tab. + +Once these are in place, you're ready to create your Volume Assertions! + +### Steps + +1. Navigate to the Table that you want to monitor for volume +2. Click the **Validations** tab +

+ +

+ +3. Click **+ Create Assertion** + +

+ +

+ +4. Choose **Volume** + +5. Configure the evaluation **schedule**. This is the frequency at which the assertion will be evaluated to produce a pass or fail result, and the times + when the table volume will be checked. + +6. Configure the evaluation **condition type**. This determines the cases in which the new assertion will fail when it is evaluated. + +

+ +

+ +7. (Optional) Click **Advanced** to customize the volume **source**. This is the mechanism that will be used to obtain the table + row count metric. Each Data Platform supports different options including Information Schema, Query, and DataHub Dataset Profile. + +

+ +

+ +- **Information Schema**: Check the Data Platform system metadata tables to determine the table row count. +- **Query**: Issue a `COUNT(*)` query to the table to determine the row count. +- **DataHub Dataset Profile**: Use the DataHub Dataset Profile metadata to determine the row count. + +8. Click **Next** +9. Configure actions that should be taken when the Volume Assertion passes or fails + +

+ +

+ +- **Raise incident**: Automatically raise a new DataHub `Volume` Incident for the Table whenever the Volume Assertion is failing. This + may indicate that the Table is unfit for consumption. Configure Slack Notifications under **Settings** to be notified when + an incident is created due to an Assertion failure. +- **Resolve incident**: Automatically resolve any incidents that were raised due to failures in this Volume Assertion. Note that + any other incidents will not be impacted. + +10. Click **Save**. + +And that's it! DataHub will now begin to monitor your Volume Assertion for the table. + +To view the time of the next Volume Assertion evaluation, simply click **Volume** and then click on your +new Assertion: + +

+ +

+ +Once your assertion has run, you will begin to see Success or Failure status for the Table + +

+ +

+ + +## Stopping a Volume Assertion + +In order to temporarily stop the evaluation of a Volume Assertion: + +1. Navigate to the **Validations** tab of the Table with the assertion +2. Click **Volume** to open the Volume Assertions list +3. Click the three-dot menu on the right side of the assertion you want to disable +4. Click **Stop** + +

+ +

+ +To resume the Volume Assertion, simply click **Turn On**. + +

+ +

+ + +## Smart Assertions ⚡ + +As part of the **Acryl Observe** module, Acryl DataHub also provides **Smart Assertions** out of the box. These are +dynamic, AI-powered Volume Assertions that you can use to monitor the volume of important warehouse Tables, without +requiring any manual setup. + +If Acryl DataHub is able to detect a pattern in the volume of a Snowflake, Redshift, or BigQuery Table, you'll find +a recommended Smart Assertion under the `Validations` tab on the Table profile page: + +

+ +

+ +In order to enable it, simply click **Turn On**. From this point forward, the Smart Assertion will check for changes on a cadence +based on the Table history. + +Don't need it anymore? Smart Assertions can just as easily be turned off by clicking the three-dot "more" button and then **Stop**. + + +## Creating Volume Assertions via API + +Under the hood, Acryl DataHub implements Volume Assertion Monitoring using two "entity" concepts: + +- **Assertion**: The specific expectation for volume, e.g. "The table was changed in the past 7 hours" + or "The table is changed on a schedule of every day by 8am". This is the "what". + +- **Monitor**: The process responsible for evaluating the Assertion on a given evaluation schedule and using specific + mechanisms. This is the "how". + +Note that to create or delete Assertions and Monitors for a specific entity on DataHub, you'll need the +`Edit Assertions` and `Edit Monitors` privileges for it. + +#### GraphQL + +In order to create a Volume Assertion that is being monitored on a specific **Evaluation Schedule**, you'll need to use 2 +GraphQL mutation queries to create a Volume Assertion entity and create an Assertion Monitor entity responsible for evaluating it. + +Start by creating the Volume Assertion entity using the `createVolumeAssertion` query and hang on to the 'urn' field of the Assertion entity +you get back. Then continue by creating a Monitor entity using the `createAssertionMonitor`. + +##### Examples + +To create a Volume Assertion Entity that checks whether a table's total row count always falls between 10 and 20 rows: + +```json +mutation createVolumeAssertion { + createVolumeAssertion( + input: { + entityUrn: "", + type: ROW_COUNT_TOTAL, + rowCountTotal: { + operator: BETWEEN, + parameters: { + minValue: { + "value": 10, + "type": NUMBER + }, + maxValue: { + "value": 20, + "type": NUMBER + } + } + } + } + ) { + urn +} +} +``` + +This assertion specifies that the row count total should always fall between 10 and 20. + +The supported volume assertion types are `ROW_COUNT_TOTAL` and `ROW_COUNT_CHANGE`. Other (e.g. incrementing segment) types are not yet supported. +The supported operator types are `GREATER_THAN`, `GREATER_THAN_OR_EQUAL_TO`, `LESS_THAN`, `LESS_THAN_OR_EQUAL_TO`, and `BETWEEN` (requires minValue, maxValue). +The supported parameter types are `NUMBER`. + +To create an Assertion Monitor Entity that evaluates the volume assertion every 8 hours using the Information Schema: + +```json +mutation createAssertionMonitor { + createAssertionMonitor( + input: { + entityUrn: "", + assertionUrn: "", + schedule: { + cron: "0 */8 * * *", + timezone: "America/Los_Angeles" + }, + parameters: { + type: DATASET_VOLUME, + datasetVolumeParameters: { + sourceType: INFORMATION_SCHEMA, + } + } + } + ) { + urn + } +} +``` + +This entity defines _when_ to run the check (using CRON format - every 8th hour) and _how_ to run the check (using the Information Schema). + +After creating the monitor, the new assertion will start to be evaluated every 8 hours in your selected timezone. + +You can delete assertions along with their monitors using GraphQL mutations: `deleteAssertion` and `deleteMonitor`. + +### Tips + +:::info +**Authorization** + +Remember to always provide a DataHub Personal Access Token when calling the GraphQL API. 
To do so, just add the 'Authorization' header as follows: + +``` +Authorization: Bearer +``` + +**Exploring GraphQL API** + +Also, remember that you can play with an interactive version of the Acryl GraphQL API at `https://your-account-id.acryl.io/api/graphiql` +::: diff --git a/docs/managed-datahub/operator-guide/setting-up-remote-ingestion-executor-on-aws.md b/docs/managed-datahub/operator-guide/setting-up-remote-ingestion-executor-on-aws.md index 6c6cce51ea098..b8fb0ea9e80f1 100644 --- a/docs/managed-datahub/operator-guide/setting-up-remote-ingestion-executor-on-aws.md +++ b/docs/managed-datahub/operator-guide/setting-up-remote-ingestion-executor-on-aws.md @@ -19,7 +19,7 @@ For example, if an ingestion source is not publicly accessible via the internet,
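Tying the authorization tip above to the volume-assertion mutations shown earlier: programmatically, any of these mutations can be sent as a plain HTTP POST to the GraphQL endpoint with the bearer token attached. The sketch below uses the `reportOperation` mutation from the freshness guide as its payload; the host, token, and dataset URN are placeholders, and `/api/graphql` is assumed to be the GraphQL path for your deployment.

```python
# Sketch: calling the DataHub GraphQL API with a Personal Access Token.
# Host, token, and URN are placeholders.
import requests

GRAPHQL_URL = "https://your-account-id.acryl.io/api/graphql"
TOKEN = "<personal-access-token>"

mutation = """
mutation reportOperation($urn: String!) {
  reportOperation(
    input: { urn: $urn, operationType: INSERT, sourceType: DATA_PLATFORM }
  )
}
"""

response = requests.post(
    GRAPHQL_URL,
    headers={"Authorization": f"Bearer {TOKEN}"},
    json={
        "query": mutation,
        "variables": {
            "urn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics.raw.clicks,PROD)"
        },
    },
    timeout=30,
)
response.raise_for_status()
print(response.json())
```

The same pattern applies to `createVolumeAssertion` and `createAssertionMonitor`: substitute the mutation text and pass the assertion or monitor inputs as variables or inline arguments.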

- +

@@ -27,7 +27,7 @@ To accommodate these cases, Acryl supports configuring a remote ingestion execut

- +

@@ -50,13 +50,13 @@ To accommodate these cases, Acryl supports configuring a remote ingestion execut

- +

- +

@@ -67,7 +67,7 @@ To accommodate these cases, Acryl supports configuring a remote ingestion execut

- +

3. In the 'Finish Up' step, click '**Advanced'**. @@ -78,7 +78,7 @@ To accommodate these cases, Acryl supports configuring a remote ingestion execut

- +

## Updating a Remote Ingestion Executor @@ -92,7 +92,7 @@ In order to update the executor, ie. to deploy a new container version, you'll n 6. Upload a copy of the Acryl Remote Executor [CloudFormation Template](https://raw.githubusercontent.com/acryldata/datahub-cloudformation/master/Ingestion/templates/python.ecs.template.yaml)

- +

7. Click **Next** diff --git a/entity-registry/build.gradle b/entity-registry/build.gradle index af742d240d1e6..3da0bf5bb4fb8 100644 --- a/entity-registry/build.gradle +++ b/entity-registry/build.gradle @@ -1,16 +1,17 @@ apply plugin: 'pegasus' +apply plugin: 'java-library' dependencies { - compile spec.product.pegasus.data - compile spec.product.pegasus.generator - compile project(path: ':metadata-models') + implementation spec.product.pegasus.data + implementation spec.product.pegasus.generator + api project(path: ':metadata-models') implementation externalDependency.slf4jApi compileOnly externalDependency.lombok - compile externalDependency.guava - compile externalDependency.jacksonDataBind - compile externalDependency.jacksonDataFormatYaml - compile externalDependency.reflections - compile externalDependency.jsonPatch + implementation externalDependency.guava + implementation externalDependency.jacksonDataBind + implementation externalDependency.jacksonDataFormatYaml + implementation externalDependency.reflections + implementation externalDependency.jsonPatch constraints { implementation(externalDependency.snakeYaml) { because("previous versions are vulnerable to CVE-2022-25857") @@ -19,12 +20,13 @@ dependencies { dataModel project(':li-utils') annotationProcessor externalDependency.lombok - compile externalDependency.mavenArtifact + api externalDependency.mavenArtifact - testCompile project(':test-models') - testCompile externalDependency.testng - testCompile externalDependency.mockito - testCompile externalDependency.mockitoInline + testImplementation project(':test-models') + testImplementation project(path: ':test-models', configuration: 'testDataTemplate') + testImplementation externalDependency.testng + testImplementation externalDependency.mockito + testImplementation externalDependency.mockitoInline } compileTestJava.dependsOn tasks.getByPath(':entity-registry:custom-test-model:modelDeploy') diff --git a/entity-registry/custom-test-model/build.gradle b/entity-registry/custom-test-model/build.gradle index 90f50fe1f2992..778e2e42b95c4 100644 --- a/entity-registry/custom-test-model/build.gradle +++ b/entity-registry/custom-test-model/build.gradle @@ -23,11 +23,11 @@ if (project.hasProperty('projVersion')) { dependencies { - compile spec.product.pegasus.data + implementation spec.product.pegasus.data // Uncomment these if you want to depend on models defined in core datahub - //compile project(':li-utils') + //implementation project(':li-utils') //dataModel project(':li-utils') - //compile project(':metadata-models') + //implementation project(':metadata-models') //dataModel project(':metadata-models') } diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar index e708b1c023ec8..afba109285af7 100644 Binary files a/gradle/wrapper/gradle-wrapper.jar and b/gradle/wrapper/gradle-wrapper.jar differ diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index ec991f9aa12cb..4e86b9270786f 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,5 +1,6 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-6.9.2-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-7.6.2-bin.zip +networkTimeout=10000 zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists diff --git a/gradlew b/gradlew index 1b6c787337ffb..65dcd68d65c82 100755 --- a/gradlew +++ b/gradlew @@ -55,7 +55,7 @@ 
# Darwin, MinGW, and NonStop. # # (3) This script is generated from the Groovy template -# https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt +# https://github.com/gradle/gradle/blob/HEAD/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt # within the Gradle project. # # You can find Gradle at https://github.com/gradle/gradle/. @@ -80,10 +80,10 @@ do esac done -APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit - -APP_NAME="Gradle" +# This is normally unused +# shellcheck disable=SC2034 APP_BASE_NAME=${0##*/} +APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' @@ -143,12 +143,16 @@ fi if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then case $MAX_FD in #( max*) + # In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked. + # shellcheck disable=SC3045 MAX_FD=$( ulimit -H -n ) || warn "Could not query maximum file descriptor limit" esac case $MAX_FD in #( '' | soft) :;; #( *) + # In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked. + # shellcheck disable=SC3045 ulimit -n "$MAX_FD" || warn "Could not set maximum file descriptor limit to $MAX_FD" esac @@ -205,6 +209,12 @@ set -- \ org.gradle.wrapper.GradleWrapperMain \ "$@" +# Stop when "xargs" is not available. +if ! command -v xargs >/dev/null 2>&1 +then + die "xargs is not available" +fi + # Use "xargs" to parse quoted args. # # With -n1 it outputs one arg per line, with the quotes and backslashes removed. diff --git a/gradlew.bat b/gradlew.bat index ac1b06f93825d..6689b85beecde 100644 --- a/gradlew.bat +++ b/gradlew.bat @@ -14,7 +14,7 @@ @rem limitations under the License. @rem -@if "%DEBUG%" == "" @echo off +@if "%DEBUG%"=="" @echo off @rem ########################################################################## @rem @rem Gradle startup script for Windows @@ -25,7 +25,8 @@ if "%OS%"=="Windows_NT" setlocal set DIRNAME=%~dp0 -if "%DIRNAME%" == "" set DIRNAME=. +if "%DIRNAME%"=="" set DIRNAME=. +@rem This is normally unused set APP_BASE_NAME=%~n0 set APP_HOME=%DIRNAME% @@ -40,7 +41,7 @@ if defined JAVA_HOME goto findJavaFromJavaHome set JAVA_EXE=java.exe %JAVA_EXE% -version >NUL 2>&1 -if "%ERRORLEVEL%" == "0" goto execute +if %ERRORLEVEL% equ 0 goto execute echo. echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. @@ -75,13 +76,15 @@ set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar :end @rem End local scope for the variables with windows NT shell -if "%ERRORLEVEL%"=="0" goto mainEnd +if %ERRORLEVEL% equ 0 goto mainEnd :fail rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of rem the _cmd.exe /c_ return code! 
-if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 -exit /b 1 +set EXIT_CODE=%ERRORLEVEL% +if %EXIT_CODE% equ 0 set EXIT_CODE=1 +if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE% +exit /b %EXIT_CODE% :mainEnd if "%OS%"=="Windows_NT" endlocal diff --git a/ingestion-scheduler/build.gradle b/ingestion-scheduler/build.gradle index b15b5b8c52673..dc9887406b8b4 100644 --- a/ingestion-scheduler/build.gradle +++ b/ingestion-scheduler/build.gradle @@ -1,16 +1,17 @@ apply plugin: 'java' dependencies { - compile project(path: ':metadata-models') - compile project(path: ':metadata-io') - compile project(path: ':metadata-service:restli-client') - compile project(':metadata-service:configuration') + implementation project(path: ':metadata-models') + implementation project(path: ':metadata-io') + implementation project(path: ':metadata-service:restli-client') + implementation project(':metadata-service:configuration') + implementation externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok - testCompile externalDependency.mockito - testCompile externalDependency.testng + testImplementation externalDependency.mockito + testImplementation externalDependency.testng constraints { implementation(externalDependency.log4jCore) { diff --git a/li-utils/build.gradle b/li-utils/build.gradle index d11cd86659605..e8b672a3a21fa 100644 --- a/li-utils/build.gradle +++ b/li-utils/build.gradle @@ -1,4 +1,4 @@ -apply plugin: 'java' +apply plugin: 'java-library' apply plugin: 'pegasus' tasks.withType(JavaCompile).configureEach { @@ -13,19 +13,21 @@ tasks.withType(Test).configureEach { } dependencies { - compile spec.product.pegasus.data - compile externalDependency.commonsLang - compile(externalDependency.reflections) { + api spec.product.pegasus.data + implementation externalDependency.commonsLang + implementation(externalDependency.reflections) { exclude group: 'com.google.guava', module: 'guava' } - compile externalDependency.guava + implementation externalDependency.guava implementation externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok - testCompile externalDependency.assertJ - testCompile project(':test-models') + testImplementation externalDependency.assertJ + testImplementation externalDependency.commonsIo + testImplementation project(':test-models') + testImplementation project(path: ':test-models', configuration: 'testDataTemplate') } idea { diff --git a/metadata-auth/auth-api/build.gradle b/metadata-auth/auth-api/build.gradle index f82f488b6f182..2bf9e5243e152 100644 --- a/metadata-auth/auth-api/build.gradle +++ b/metadata-auth/auth-api/build.gradle @@ -3,7 +3,7 @@ plugins { } apply plugin: 'com.github.johnrengelman.shadow' -apply plugin: 'java' +apply plugin: 'java-library' apply plugin: 'signing' apply plugin: 'maven-publish' apply plugin: 'io.codearte.nexus-staging' @@ -28,14 +28,14 @@ shadowJar { dependencies() { implementation spec.product.pegasus.data implementation project(path: ':li-utils') - implementation project(path: ':metadata-utils') + api project(path: ':metadata-utils') - compile externalDependency.guava - compile externalDependency.lombok + implementation externalDependency.guava + implementation externalDependency.lombok annotationProcessor externalDependency.lombok - - testCompile externalDependency.testng + + testImplementation externalDependency.testng } task sourcesJar(type: Jar) { diff --git a/metadata-dao-impl/kafka-producer/build.gradle 
b/metadata-dao-impl/kafka-producer/build.gradle index 6b08ac50a4c17..393b10b0e9d24 100644 --- a/metadata-dao-impl/kafka-producer/build.gradle +++ b/metadata-dao-impl/kafka-producer/build.gradle @@ -1,20 +1,23 @@ apply plugin: 'java' dependencies { - compile project(':metadata-events:mxe-avro-1.7') - compile project(':metadata-events:mxe-registration') - compile project(':metadata-events:mxe-utils-avro-1.7') - compile project(':entity-registry') - compile project(':metadata-io') + implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-events:mxe-registration') + implementation project(':metadata-events:mxe-utils-avro-1.7') + implementation project(':entity-registry') + implementation project(':metadata-io') - compile externalDependency.kafkaClients + implementation externalDependency.kafkaClients + implementation externalDependency.springBeans + implementation externalDependency.springContext + implementation externalDependency.opentelemetryAnnotations implementation externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok - testCompile externalDependency.mockito + testImplementation externalDependency.mockito constraints { implementation(externalDependency.log4jCore) { diff --git a/metadata-events/mxe-avro-1.7/build.gradle b/metadata-events/mxe-avro-1.7/build.gradle index 6bde1511bf280..e30406644913c 100644 --- a/metadata-events/mxe-avro-1.7/build.gradle +++ b/metadata-events/mxe-avro-1.7/build.gradle @@ -3,11 +3,11 @@ configurations { } apply plugin: 'io.acryl.gradle.plugin.avro' -apply plugin: 'java' +apply plugin: 'java-library' dependencies { - compile externalDependency.avro_1_7 - compile(externalDependency.avroCompiler_1_7) { + api externalDependency.avro_1_7 + implementation(externalDependency.avroCompiler_1_7) { exclude group: 'org.apache.velocity', module: 'velocity' } constraints { diff --git a/metadata-events/mxe-registration/build.gradle b/metadata-events/mxe-registration/build.gradle index aa5fad09f3fec..60e0da59616d9 100644 --- a/metadata-events/mxe-registration/build.gradle +++ b/metadata-events/mxe-registration/build.gradle @@ -5,11 +5,12 @@ configurations { } dependencies { - compile project(':metadata-events:mxe-avro-1.7') - compile project(':metadata-models') - compile spec.product.pegasus.dataAvro1_6 + implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-models') + implementation spec.product.pegasus.dataAvro1_6 - testCompile project(':test-models') + testImplementation project(':test-models') + testImplementation project(path: ':test-models', configuration: 'testDataTemplate') avroOriginal project(path: ':metadata-models', configuration: 'avroSchema') diff --git a/metadata-events/mxe-schemas/build.gradle b/metadata-events/mxe-schemas/build.gradle index 0b3e621b8db15..fe46601fb68b7 100644 --- a/metadata-events/mxe-schemas/build.gradle +++ b/metadata-events/mxe-schemas/build.gradle @@ -11,6 +11,10 @@ task copyMetadataModels(type: Copy) { } generateAvroSchema.dependsOn copyMetadataModels +validateSchemaAnnotation.dependsOn copyMetadataModels +mainTranslateSchemas.dependsOn copyMetadataModels +generateDataTemplate.dependsOn copyMetadataModels +mainCopySchemas.dependsOn copyMetadataModels pegasus.main.generationModes = [PegasusGenerationMode.PEGASUS, PegasusGenerationMode.AVRO] task copyOriginalAvsc(type: Copy, dependsOn: generateAvroSchema) { diff --git a/metadata-events/mxe-utils-avro-1.7/build.gradle 
b/metadata-events/mxe-utils-avro-1.7/build.gradle index f8474e21daa0b..82249d393578c 100644 --- a/metadata-events/mxe-utils-avro-1.7/build.gradle +++ b/metadata-events/mxe-utils-avro-1.7/build.gradle @@ -1,11 +1,12 @@ -apply plugin: 'java' +apply plugin: 'java-library' dependencies { - compile project(':metadata-events:mxe-avro-1.7') - compile project(':metadata-models') - compile spec.product.pegasus.dataAvro1_6 + api project(':metadata-events:mxe-avro-1.7') + api project(':metadata-models') + api spec.product.pegasus.dataAvro1_6 - testCompile project(':test-models') + testImplementation project(':test-models') + testImplementation project(path: ':test-models', configuration: 'testDataTemplate') constraints { implementation(externalDependency.log4jCore) { diff --git a/metadata-ingestion-modules/airflow-plugin/build.gradle b/metadata-ingestion-modules/airflow-plugin/build.gradle index 336be8fc94d44..d1e6f2f646491 100644 --- a/metadata-ingestion-modules/airflow-plugin/build.gradle +++ b/metadata-ingestion-modules/airflow-plugin/build.gradle @@ -7,6 +7,10 @@ ext { venv_name = 'venv' } +if (!project.hasProperty("extra_pip_requirements")) { + ext.extra_pip_requirements = "" +} + def pip_install_command = "${venv_name}/bin/pip install -e ../../metadata-ingestion" task checkPythonVersion(type: Exec) { @@ -14,30 +18,37 @@ task checkPythonVersion(type: Exec) { } task environmentSetup(type: Exec, dependsOn: checkPythonVersion) { + def sentinel_file = "${venv_name}/.venv_environment_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") - commandLine 'bash', '-c', "${python_executable} -m venv ${venv_name} && ${venv_name}/bin/python -m pip install --upgrade pip wheel 'setuptools>=63.0.0'" + outputs.file(sentinel_file) + commandLine 'bash', '-c', + "${python_executable} -m venv ${venv_name} &&" + + "${venv_name}/bin/python -m pip install --upgrade pip wheel 'setuptools>=63.0.0' && " + + "touch ${sentinel_file}" } -task installPackage(type: Exec, dependsOn: environmentSetup) { +task installPackage(type: Exec, dependsOn: [environmentSetup, ':metadata-ingestion:codegen']) { + def sentinel_file = "${venv_name}/.build_install_package_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") + outputs.file(sentinel_file) // Workaround for https://github.com/yaml/pyyaml/issues/601. // See https://github.com/yaml/pyyaml/issues/601#issuecomment-1638509577. // and https://github.com/datahub-project/datahub/pull/8435. commandLine 'bash', '-x', '-c', "${pip_install_command} install 'Cython<3.0' 'PyYAML<6' --no-build-isolation && " + - "${pip_install_command} -e ." + "${pip_install_command} -e . ${extra_pip_requirements} &&" + + "touch ${sentinel_file}" } task install(dependsOn: [installPackage]) task installDev(type: Exec, dependsOn: [install]) { + def sentinel_file = "${venv_name}/.build_install_dev_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") - outputs.file("${venv_name}/.build_install_dev_sentinel") + outputs.file("${sentinel_file}") commandLine 'bash', '-x', '-c', - "${pip_install_command} -e .[dev] && touch ${venv_name}/.build_install_dev_sentinel" + "${pip_install_command} -e .[dev] ${extra_pip_requirements} && " + + "touch ${sentinel_file}" } task lint(type: Exec, dependsOn: installDev) { @@ -45,9 +56,13 @@ task lint(type: Exec, dependsOn: installDev) { The find/sed combo below is a temporary work-around for the following mypy issue with airflow 2.2.0: "venv/lib/python3.8/site-packages/airflow/_vendor/connexion/spec.py:169: error: invalid syntax". 
*/ - commandLine 'bash', '-x', '-c', + commandLine 'bash', '-c', "find ${venv_name}/lib -path *airflow/_vendor/connexion/spec.py -exec sed -i.bak -e '169,169s/ # type: List\\[str\\]//g' {} \\; && " + - "source ${venv_name}/bin/activate && black --check --diff src/ tests/ && isort --check --diff src/ tests/ && flake8 --count --statistics src/ tests/ && mypy src/ tests/" + "source ${venv_name}/bin/activate && set -x && " + + "black --check --diff src/ tests/ && " + + "isort --check --diff src/ tests/ && " + + "flake8 --count --statistics src/ tests/ && " + + "mypy --show-traceback --show-error-codes src/ tests/" } task lintFix(type: Exec, dependsOn: installDev) { commandLine 'bash', '-x', '-c', @@ -58,21 +73,13 @@ task lintFix(type: Exec, dependsOn: installDev) { "mypy src/ tests/ " } -task testQuick(type: Exec, dependsOn: installDev) { - // We can't enforce the coverage requirements if we run a subset of the tests. - inputs.files(project.fileTree(dir: "src/", include: "**/*.py")) - inputs.files(project.fileTree(dir: "tests/")) - outputs.dir("${venv_name}") - commandLine 'bash', '-x', '-c', - "source ${venv_name}/bin/activate && pytest -vv --continue-on-collection-errors --junit-xml=junit.quick.xml" -} - task installDevTest(type: Exec, dependsOn: [installDev]) { + def sentinel_file = "${venv_name}/.build_install_dev_test_sentinel" inputs.file file('setup.py') outputs.dir("${venv_name}") - outputs.file("${venv_name}/.build_install_dev_test_sentinel") + outputs.file("${sentinel_file}") commandLine 'bash', '-x', '-c', - "${pip_install_command} -e .[dev,integration-tests] && touch ${venv_name}/.build_install_dev_test_sentinel" + "${pip_install_command} -e .[dev,integration-tests] && touch ${sentinel_file}" } def testFile = hasProperty('testFile') ? testFile : 'unknown' @@ -89,6 +96,16 @@ task testSingle(dependsOn: [installDevTest]) { } } +task testQuick(type: Exec, dependsOn: installDevTest) { + // We can't enforce the coverage requirements if we run a subset of the tests. 
+ inputs.files(project.fileTree(dir: "src/", include: "**/*.py")) + inputs.files(project.fileTree(dir: "tests/")) + outputs.dir("${venv_name}") + commandLine 'bash', '-x', '-c', + "source ${venv_name}/bin/activate && pytest -vv --continue-on-collection-errors --junit-xml=junit.quick.xml" +} + + task testFull(type: Exec, dependsOn: [testQuick, installDevTest]) { commandLine 'bash', '-x', '-c', "source ${venv_name}/bin/activate && pytest -m 'not slow_integration' -vv --continue-on-collection-errors --junit-xml=junit.full.xml" diff --git a/metadata-ingestion-modules/airflow-plugin/pyproject.toml b/metadata-ingestion-modules/airflow-plugin/pyproject.toml index 83b79e3146176..fba81486b9f67 100644 --- a/metadata-ingestion-modules/airflow-plugin/pyproject.toml +++ b/metadata-ingestion-modules/airflow-plugin/pyproject.toml @@ -9,7 +9,6 @@ extend-exclude = ''' ^/tmp ''' include = '\.pyi?$' -target-version = ['py36', 'py37', 'py38'] [tool.isort] indent = ' ' diff --git a/metadata-ingestion-modules/airflow-plugin/setup.cfg b/metadata-ingestion-modules/airflow-plugin/setup.cfg index c9a2ba93e9933..157bcce1c298d 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.cfg +++ b/metadata-ingestion-modules/airflow-plugin/setup.cfg @@ -69,4 +69,6 @@ exclude_lines = pragma: no cover @abstract if TYPE_CHECKING: -#omit = +omit = + # omit example dags + src/datahub_airflow_plugin/example_dags/* diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py index c2571916ca5d0..c5bdc7ea329cd 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.py +++ b/metadata-ingestion-modules/airflow-plugin/setup.py @@ -13,16 +13,21 @@ def get_long_description(): return pathlib.Path(os.path.join(root, "README.md")).read_text() +rest_common = {"requests", "requests_file"} + base_requirements = { # Compatibility. "dataclasses>=0.6; python_version < '3.7'", - "typing_extensions>=3.10.0.2", + # Typing extension should be >=3.10.0.2 ideally but we can't restrict due to Airflow 2.0.2 dependency conflict + "typing_extensions>=3.7.4.3 ; python_version < '3.8'", + "typing_extensions>=3.10.0.2,<4.6.0 ; python_version >= '3.8'", "mypy_extensions>=0.4.3", # Actual dependencies. "typing-inspect", "pydantic>=1.5.1", "apache-airflow >= 2.0.2", - f"acryl-datahub[airflow] == {package_metadata['__version__']}", + *rest_common, + f"acryl-datahub == {package_metadata['__version__']}", } @@ -47,19 +52,18 @@ def get_long_description(): base_dev_requirements = { *base_requirements, *mypy_stubs, - "black>=21.12b0", + "black==22.12.0", "coverage>=5.1", "flake8>=3.8.3", "flake8-tidy-imports>=4.3.0", "isort>=5.7.0", - "mypy>=0.920", + "mypy>=1.4.0", # pydantic 1.8.2 is incompatible with mypy 0.910. # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910. - "pydantic>=1.9.0", + "pydantic>=1.10", "pytest>=6.2.2", "pytest-asyncio>=0.16.0", "pytest-cov>=2.8.1", - "pytest-docker>=0.10.3,<0.12", "tox", "deepdiff", "requests-mock", @@ -127,5 +131,13 @@ def get_long_description(): "datahub-kafka": [ f"acryl-datahub[datahub-kafka] == {package_metadata['__version__']}" ], + "integration-tests": [ + f"acryl-datahub[datahub-kafka] == {package_metadata['__version__']}", + # Extra requirements for Airflow. + "apache-airflow[snowflake]>=2.0.2", # snowflake is used in example dags + # Because of https://github.com/snowflakedb/snowflake-sqlalchemy/issues/350 we need to restrict SQLAlchemy's max version. 
+ "SQLAlchemy<1.4.42", + "virtualenv", # needed by PythonVirtualenvOperator + ], }, ) diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_airflow_compat.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_airflow_compat.py new file mode 100644 index 0000000000000..67c3348ec987c --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_airflow_compat.py @@ -0,0 +1,12 @@ +# This module must be imported before any Airflow imports in any of our files. +# The AIRFLOW_PATCHED just helps avoid flake8 errors. + +from datahub.utilities._markupsafe_compat import MARKUPSAFE_PATCHED + +assert MARKUPSAFE_PATCHED + +AIRFLOW_PATCHED = True + +__all__ = [ + "AIRFLOW_PATCHED", +] diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_airflow_shims.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_airflow_shims.py new file mode 100644 index 0000000000000..5ad20e1f72551 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_airflow_shims.py @@ -0,0 +1,29 @@ +from airflow.models.baseoperator import BaseOperator + +from datahub_airflow_plugin._airflow_compat import AIRFLOW_PATCHED + +try: + from airflow.models.mappedoperator import MappedOperator + from airflow.models.operator import Operator + from airflow.operators.empty import EmptyOperator +except ModuleNotFoundError: + # Operator isn't a real class, but rather a type alias defined + # as the union of BaseOperator and MappedOperator. + # Since older versions of Airflow don't have MappedOperator, we can just use BaseOperator. + Operator = BaseOperator # type: ignore + MappedOperator = None # type: ignore + from airflow.operators.dummy import DummyOperator as EmptyOperator # type: ignore + +try: + from airflow.sensors.external_task import ExternalTaskSensor +except ImportError: + from airflow.sensors.external_task_sensor import ExternalTaskSensor # type: ignore + +assert AIRFLOW_PATCHED + +__all__ = [ + "Operator", + "MappedOperator", + "EmptyOperator", + "ExternalTaskSensor", +] diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_lineage_core.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_lineage_core.py new file mode 100644 index 0000000000000..d91c039ffa718 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_lineage_core.py @@ -0,0 +1,115 @@ +from datetime import datetime +from typing import TYPE_CHECKING, Dict, List + +import datahub.emitter.mce_builder as builder +from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult +from datahub.configuration.common import ConfigModel +from datahub.utilities.urns.dataset_urn import DatasetUrn + +from datahub_airflow_plugin.client.airflow_generator import AirflowGenerator +from datahub_airflow_plugin.entities import _Entity + +if TYPE_CHECKING: + from airflow import DAG + from airflow.models.dagrun import DagRun + from airflow.models.taskinstance import TaskInstance + + from datahub_airflow_plugin._airflow_shims import Operator + from datahub_airflow_plugin.hooks.datahub import DatahubGenericHook + + +def _entities_to_urn_list(iolets: List[_Entity]) -> List[DatasetUrn]: + return [DatasetUrn.create_from_string(let.urn) for let in iolets] + + +class DatahubBasicLineageConfig(ConfigModel): + enabled: bool = True + + # DataHub hook connection ID. + datahub_conn_id: str + + # Cluster to associate with the pipelines and tasks. 
Defaults to "prod". + cluster: str = builder.DEFAULT_FLOW_CLUSTER + + # If true, the owners field of the DAG will be capture as a DataHub corpuser. + capture_ownership_info: bool = True + + # If true, the tags field of the DAG will be captured as DataHub tags. + capture_tags_info: bool = True + + capture_executions: bool = False + + def make_emitter_hook(self) -> "DatahubGenericHook": + # This is necessary to avoid issues with circular imports. + from datahub_airflow_plugin.hooks.datahub import DatahubGenericHook + + return DatahubGenericHook(self.datahub_conn_id) + + +def send_lineage_to_datahub( + config: DatahubBasicLineageConfig, + operator: "Operator", + inlets: List[_Entity], + outlets: List[_Entity], + context: Dict, +) -> None: + if not config.enabled: + return + + dag: "DAG" = context["dag"] + task: "Operator" = context["task"] + ti: "TaskInstance" = context["task_instance"] + + hook = config.make_emitter_hook() + emitter = hook.make_emitter() + + dataflow = AirflowGenerator.generate_dataflow( + cluster=config.cluster, + dag=dag, + capture_tags=config.capture_tags_info, + capture_owner=config.capture_ownership_info, + ) + dataflow.emit(emitter) + operator.log.info(f"Emitted from Lineage: {dataflow}") + + datajob = AirflowGenerator.generate_datajob( + cluster=config.cluster, + task=task, + dag=dag, + capture_tags=config.capture_tags_info, + capture_owner=config.capture_ownership_info, + ) + datajob.inlets.extend(_entities_to_urn_list(inlets)) + datajob.outlets.extend(_entities_to_urn_list(outlets)) + + datajob.emit(emitter) + operator.log.info(f"Emitted from Lineage: {datajob}") + + if config.capture_executions: + dag_run: "DagRun" = context["dag_run"] + + dpi = AirflowGenerator.run_datajob( + emitter=emitter, + cluster=config.cluster, + ti=ti, + dag=dag, + dag_run=dag_run, + datajob=datajob, + emit_templates=False, + ) + + operator.log.info(f"Emitted from Lineage: {dpi}") + + dpi = AirflowGenerator.complete_datajob( + emitter=emitter, + cluster=config.cluster, + ti=ti, + dag=dag, + dag_run=dag_run, + datajob=datajob, + result=InstanceRunResult.SUCCESS, + end_timestamp_millis=int(datetime.utcnow().timestamp() * 1000), + ) + operator.log.info(f"Emitted from Lineage: {dpi}") + + emitter.flush() diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure/__init__.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/client/__init__.py similarity index 100% rename from metadata-ingestion/src/datahub/ingestion/source/azure/__init__.py rename to metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/client/__init__.py diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/client/airflow_generator.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/client/airflow_generator.py new file mode 100644 index 0000000000000..b5e86e14d85d0 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/client/airflow_generator.py @@ -0,0 +1,512 @@ +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union, cast + +from airflow.configuration import conf +from datahub.api.entities.datajob import DataFlow, DataJob +from datahub.api.entities.dataprocess.dataprocess_instance import ( + DataProcessInstance, + InstanceRunResult, +) +from datahub.metadata.schema_classes import DataProcessTypeClass +from datahub.utilities.urns.data_flow_urn import DataFlowUrn +from datahub.utilities.urns.data_job_urn import DataJobUrn + +from datahub_airflow_plugin._airflow_compat import 
AIRFLOW_PATCHED + +assert AIRFLOW_PATCHED + +if TYPE_CHECKING: + from airflow import DAG + from airflow.models import DagRun, TaskInstance + from datahub.emitter.kafka_emitter import DatahubKafkaEmitter + from datahub.emitter.rest_emitter import DatahubRestEmitter + + from datahub_airflow_plugin._airflow_shims import Operator + + +def _task_downstream_task_ids(operator: "Operator") -> Set[str]: + if hasattr(operator, "downstream_task_ids"): + return operator.downstream_task_ids + return operator._downstream_task_id # type: ignore[attr-defined,union-attr] + + +class AirflowGenerator: + @staticmethod + def _get_dependencies( + task: "Operator", dag: "DAG", flow_urn: DataFlowUrn + ) -> List[DataJobUrn]: + from datahub_airflow_plugin._airflow_shims import ExternalTaskSensor + + # resolve URNs for upstream nodes in subdags upstream of the current task. + upstream_subdag_task_urns: List[DataJobUrn] = [] + + for upstream_task_id in task.upstream_task_ids: + upstream_task = dag.task_dict[upstream_task_id] + + # if upstream task is not a subdag, then skip it + upstream_subdag = getattr(upstream_task, "subdag", None) + if upstream_subdag is None: + continue + + # else, link the leaf tasks of the upstream subdag as upstream tasks + for upstream_subdag_task_id in upstream_subdag.task_dict: + upstream_subdag_task = upstream_subdag.task_dict[ + upstream_subdag_task_id + ] + + upstream_subdag_task_urn = DataJobUrn.create_from_ids( + job_id=upstream_subdag_task_id, data_flow_urn=str(flow_urn) + ) + + # if subdag task is a leaf task, then link it as an upstream task + if len(_task_downstream_task_ids(upstream_subdag_task)) == 0: + upstream_subdag_task_urns.append(upstream_subdag_task_urn) + + # resolve URNs for upstream nodes that trigger the subdag containing the current task. + # (if it is in a subdag at all) + upstream_subdag_triggers: List[DataJobUrn] = [] + + # subdags are always named with 'parent.child' style or Airflow won't run them + # add connection from subdag trigger(s) if subdag task has no upstreams + if ( + dag.is_subdag + and dag.parent_dag is not None + and len(task.upstream_task_ids) == 0 + ): + # filter through the parent dag's tasks and find the subdag trigger(s) + subdags = [ + x for x in dag.parent_dag.task_dict.values() if x.subdag is not None + ] + matched_subdags = [ + x for x in subdags if x.subdag and x.subdag.dag_id == dag.dag_id + ] + + # id of the task containing the subdag + subdag_task_id = matched_subdags[0].task_id + + # iterate through the parent dag's tasks and find the ones that trigger the subdag + for upstream_task_id in dag.parent_dag.task_dict: + upstream_task = dag.parent_dag.task_dict[upstream_task_id] + upstream_task_urn = DataJobUrn.create_from_ids( + data_flow_urn=str(flow_urn), job_id=upstream_task_id + ) + + # if the task triggers the subdag, link it to this node in the subdag + if subdag_task_id in _task_downstream_task_ids(upstream_task): + upstream_subdag_triggers.append(upstream_task_urn) + + # If the operator is an ExternalTaskSensor then we set the remote task as upstream. + # It is possible to tie an external sensor to DAG if external_task_id is omitted but currently we can't tie + # jobflow to anothet jobflow. 
+ external_task_upstreams = [] + if task.task_type == "ExternalTaskSensor": + task = cast(ExternalTaskSensor, task) + if hasattr(task, "external_task_id") and task.external_task_id is not None: + external_task_upstreams = [ + DataJobUrn.create_from_ids( + job_id=task.external_task_id, + data_flow_urn=str( + DataFlowUrn.create_from_ids( + orchestrator=flow_urn.get_orchestrator_name(), + flow_id=task.external_dag_id, + env=flow_urn.get_env(), + ) + ), + ) + ] + # exclude subdag operator tasks since these are not emitted, resulting in empty metadata + upstream_tasks = ( + [ + DataJobUrn.create_from_ids(job_id=task_id, data_flow_urn=str(flow_urn)) + for task_id in task.upstream_task_ids + if getattr(dag.task_dict[task_id], "subdag", None) is None + ] + + upstream_subdag_task_urns + + upstream_subdag_triggers + + external_task_upstreams + ) + return upstream_tasks + + @staticmethod + def generate_dataflow( + cluster: str, + dag: "DAG", + capture_owner: bool = True, + capture_tags: bool = True, + ) -> DataFlow: + """ + Generates a Dataflow object from an Airflow DAG + :param cluster: str - name of the cluster + :param dag: DAG - + :param capture_tags: + :param capture_owner: + :return: DataFlow - Data generated dataflow + """ + id = dag.dag_id + orchestrator = "airflow" + description = f"{dag.description}\n\n{dag.doc_md or ''}" + data_flow = DataFlow( + env=cluster, id=id, orchestrator=orchestrator, description=description + ) + + flow_property_bag: Dict[str, str] = {} + + allowed_flow_keys = [ + "_access_control", + "_concurrency", + "_default_view", + "catchup", + "fileloc", + "is_paused_upon_creation", + "start_date", + "tags", + "timezone", + ] + + for key in allowed_flow_keys: + if hasattr(dag, key): + flow_property_bag[key] = repr(getattr(dag, key)) + + data_flow.properties = flow_property_bag + base_url = conf.get("webserver", "base_url") + data_flow.url = f"{base_url}/tree?dag_id={dag.dag_id}" + + if capture_owner and dag.owner: + data_flow.owners.add(dag.owner) + + if capture_tags and dag.tags: + data_flow.tags.update(dag.tags) + + return data_flow + + @staticmethod + def _get_description(task: "Operator") -> Optional[str]: + from airflow.models.baseoperator import BaseOperator + + if not isinstance(task, BaseOperator): + # TODO: Get docs for mapped operators. 
+            return None +  +        if hasattr(task, "doc") and task.doc: +            return task.doc +        elif hasattr(task, "doc_md") and task.doc_md: +            return task.doc_md +        elif hasattr(task, "doc_json") and task.doc_json: +            return task.doc_json +        elif hasattr(task, "doc_yaml") and task.doc_yaml: +            return task.doc_yaml +        elif hasattr(task, "doc_rst") and task.doc_rst: +            return task.doc_rst +        return None +  +    @staticmethod +    def generate_datajob( +        cluster: str, +        task: "Operator", +        dag: "DAG", +        set_dependencies: bool = True, +        capture_owner: bool = True, +        capture_tags: bool = True, +    ) -> DataJob: +        """ +        Generates a DataJob object from an Airflow task. +  +        :param cluster: str - name of the cluster +        :param task: Operator - the Airflow task +        :param dag: DAG - the DAG that contains the task +        :param set_dependencies: bool - whether to extract dependencies from airflow task +        :param capture_owner: bool - whether to extract owner from airflow task +        :param capture_tags: bool - whether to set tags automatically from airflow task +        :return: DataJob - returns the generated DataJob object +        """ +        dataflow_urn = DataFlowUrn.create_from_ids( +            orchestrator="airflow", env=cluster, flow_id=dag.dag_id +        ) +        datajob = DataJob(id=task.task_id, flow_urn=dataflow_urn) +  +        # TODO add support for MappedOperator +        datajob.description = AirflowGenerator._get_description(task) +  +        job_property_bag: Dict[str, str] = {} +  +        allowed_task_keys = [ +            "_downstream_task_ids", +            "_inlets", +            "_outlets", +            "_task_type", +            "_task_module", +            "depends_on_past", +            "email", +            "label", +            "execution_timeout", +            "sla", +            "sql", +            "task_id", +            "trigger_rule", +            "wait_for_downstream", +            # In Airflow 2.3, _downstream_task_ids was renamed to downstream_task_ids +            "downstream_task_ids", +            # In Airflow 2.4, _inlets and _outlets were removed in favor of non-private versions. +            "inlets", +            "outlets", +        ] +  +        for key in allowed_task_keys: +            if hasattr(task, key): +                job_property_bag[key] = repr(getattr(task, key)) +  +        datajob.properties = job_property_bag +        base_url = conf.get("webserver", "base_url") +        datajob.url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={datajob.flow_urn.get_flow_id()}&_flt_3_task_id={task.task_id}" +  +        if capture_owner and dag.owner: +            datajob.owners.add(dag.owner) +  +        if capture_tags and dag.tags: +            datajob.tags.update(dag.tags) +  +        if set_dependencies: +            datajob.upstream_urns.extend( +                AirflowGenerator._get_dependencies( +                    task=task, dag=dag, flow_urn=datajob.flow_urn +                ) +            ) +  +        return datajob +  +    @staticmethod +    def create_datajob_instance( +        cluster: str, +        task: "Operator", +        dag: "DAG", +        data_job: Optional[DataJob] = None, +    ) -> DataProcessInstance: +        if data_job is None: +            data_job = AirflowGenerator.generate_datajob(cluster, task=task, dag=dag) +        dpi = DataProcessInstance.from_datajob( +            datajob=data_job, id=task.task_id, clone_inlets=True, clone_outlets=True +        ) +        return dpi +  +    @staticmethod +    def run_dataflow( +        emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], +        cluster: str, +        dag_run: "DagRun", +        start_timestamp_millis: Optional[int] = None, +        dataflow: Optional[DataFlow] = None, +    ) -> None: +        if dataflow is None: +            assert dag_run.dag +            dataflow = AirflowGenerator.generate_dataflow(cluster, dag_run.dag) +  +        if start_timestamp_millis is None: +            assert dag_run.execution_date +            start_timestamp_millis = int(dag_run.execution_date.timestamp() * 1000) +  +        assert dag_run.run_id +        dpi = DataProcessInstance.from_dataflow(dataflow=dataflow, id=dag_run.run_id) +  +        # This property only exists in Airflow2 +        if hasattr(dag_run, "run_type"): +            from airflow.utils.types import DagRunType +  +            if dag_run.run_type == DagRunType.SCHEDULED: 
+ dpi.type = DataProcessTypeClass.BATCH_SCHEDULED + elif dag_run.run_type == DagRunType.MANUAL: + dpi.type = DataProcessTypeClass.BATCH_AD_HOC + else: + if dag_run.run_id.startswith("scheduled__"): + dpi.type = DataProcessTypeClass.BATCH_SCHEDULED + else: + dpi.type = DataProcessTypeClass.BATCH_AD_HOC + + property_bag: Dict[str, str] = {} + property_bag["run_id"] = str(dag_run.run_id) + property_bag["execution_date"] = str(dag_run.execution_date) + property_bag["end_date"] = str(dag_run.end_date) + property_bag["start_date"] = str(dag_run.start_date) + property_bag["creating_job_id"] = str(dag_run.creating_job_id) + # These properties only exists in Airflow>=2.2.0 + if hasattr(dag_run, "data_interval_start") and hasattr( + dag_run, "data_interval_end" + ): + property_bag["data_interval_start"] = str(dag_run.data_interval_start) + property_bag["data_interval_end"] = str(dag_run.data_interval_end) + property_bag["external_trigger"] = str(dag_run.external_trigger) + dpi.properties.update(property_bag) + + dpi.emit_process_start( + emitter=emitter, start_timestamp_millis=start_timestamp_millis + ) + + @staticmethod + def complete_dataflow( + emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + cluster: str, + dag_run: "DagRun", + end_timestamp_millis: Optional[int] = None, + dataflow: Optional[DataFlow] = None, + ) -> None: + """ + + :param emitter: DatahubRestEmitter - the datahub rest emitter to emit the generated mcps + :param cluster: str - name of the cluster + :param dag_run: DagRun + :param end_timestamp_millis: Optional[int] - the completion time in milliseconds if not set the current time will be used. + :param dataflow: Optional[Dataflow] + """ + if dataflow is None: + assert dag_run.dag + dataflow = AirflowGenerator.generate_dataflow(cluster, dag_run.dag) + + assert dag_run.run_id + dpi = DataProcessInstance.from_dataflow(dataflow=dataflow, id=dag_run.run_id) + if end_timestamp_millis is None: + if dag_run.end_date is None: + raise Exception( + f"Dag {dag_run.dag_id}_{dag_run.run_id} is still running and unable to get end_date..." 
+ ) + end_timestamp_millis = int(dag_run.end_date.timestamp() * 1000) + + # We should use DagRunState but it is not available in Airflow 1 + if dag_run.state == "success": + result = InstanceRunResult.SUCCESS + elif dag_run.state == "failed": + result = InstanceRunResult.FAILURE + else: + raise Exception( + f"Result should be either success or failure and it was {dag_run.state}" + ) + + dpi.emit_process_end( + emitter=emitter, + end_timestamp_millis=end_timestamp_millis, + result=result, + result_type="airflow", + ) + + @staticmethod + def run_datajob( + emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + cluster: str, + ti: "TaskInstance", + dag: "DAG", + dag_run: "DagRun", + start_timestamp_millis: Optional[int] = None, + datajob: Optional[DataJob] = None, + attempt: Optional[int] = None, + emit_templates: bool = True, + ) -> DataProcessInstance: + if datajob is None: + datajob = AirflowGenerator.generate_datajob(cluster, ti.task, dag) + + assert dag_run.run_id + dpi = DataProcessInstance.from_datajob( + datajob=datajob, + id=f"{dag.dag_id}_{ti.task_id}_{dag_run.run_id}", + clone_inlets=True, + clone_outlets=True, + ) + job_property_bag: Dict[str, str] = {} + job_property_bag["run_id"] = str(dag_run.run_id) + job_property_bag["duration"] = str(ti.duration) + job_property_bag["start_date"] = str(ti.start_date) + job_property_bag["end_date"] = str(ti.end_date) + job_property_bag["execution_date"] = str(ti.execution_date) + job_property_bag["try_number"] = str(ti.try_number - 1) + job_property_bag["hostname"] = str(ti.hostname) + job_property_bag["max_tries"] = str(ti.max_tries) + # Not compatible with Airflow 1 + if hasattr(ti, "external_executor_id"): + job_property_bag["external_executor_id"] = str(ti.external_executor_id) + job_property_bag["pid"] = str(ti.pid) + job_property_bag["state"] = str(ti.state) + job_property_bag["operator"] = str(ti.operator) + job_property_bag["priority_weight"] = str(ti.priority_weight) + job_property_bag["unixname"] = str(ti.unixname) + job_property_bag["log_url"] = ti.log_url + dpi.properties.update(job_property_bag) + dpi.url = ti.log_url + + # This property only exists in Airflow2 + if hasattr(ti, "dag_run") and hasattr(ti.dag_run, "run_type"): + from airflow.utils.types import DagRunType + + if ti.dag_run.run_type == DagRunType.SCHEDULED: + dpi.type = DataProcessTypeClass.BATCH_SCHEDULED + elif ti.dag_run.run_type == DagRunType.MANUAL: + dpi.type = DataProcessTypeClass.BATCH_AD_HOC + else: + if dag_run.run_id.startswith("scheduled__"): + dpi.type = DataProcessTypeClass.BATCH_SCHEDULED + else: + dpi.type = DataProcessTypeClass.BATCH_AD_HOC + + if start_timestamp_millis is None: + assert ti.start_date + start_timestamp_millis = int(ti.start_date.timestamp() * 1000) + + if attempt is None: + attempt = ti.try_number + + dpi.emit_process_start( + emitter=emitter, + start_timestamp_millis=start_timestamp_millis, + attempt=attempt, + emit_template=emit_templates, + ) + return dpi + + @staticmethod + def complete_datajob( + emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + cluster: str, + ti: "TaskInstance", + dag: "DAG", + dag_run: "DagRun", + end_timestamp_millis: Optional[int] = None, + result: Optional[InstanceRunResult] = None, + datajob: Optional[DataJob] = None, + ) -> DataProcessInstance: + """ + + :param emitter: DatahubRestEmitter + :param cluster: str + :param ti: TaskInstance + :param dag: DAG + :param dag_run: DagRun + :param end_timestamp_millis: Optional[int] + :param result: Optional[str] One of the result from 
datahub.metadata.schema_class.RunResultTypeClass + :param datajob: Optional[DataJob] + :return: DataProcessInstance + """ + if datajob is None: + datajob = AirflowGenerator.generate_datajob(cluster, ti.task, dag) + + if end_timestamp_millis is None: + assert ti.end_date + end_timestamp_millis = int(ti.end_date.timestamp() * 1000) + + if result is None: + # We should use TaskInstanceState but it is not available in Airflow 1 + if ti.state == "success": + result = InstanceRunResult.SUCCESS + elif ti.state == "failed": + result = InstanceRunResult.FAILURE + else: + raise Exception( + f"Result should be either success or failure and it was {ti.state}" + ) + + dpi = DataProcessInstance.from_datajob( + datajob=datajob, + id=f"{dag.dag_id}_{ti.task_id}_{dag_run.run_id}", + clone_inlets=True, + clone_outlets=True, + ) + dpi.emit_process_end( + emitter=emitter, + end_timestamp_millis=end_timestamp_millis, + result=result, + result_type="airflow", + ) + return dpi diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py index 226a7382f7595..d1cec9e5c1b54 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py @@ -1,4 +1,367 @@ -# This package serves as a shim, but the actual implementation lives in datahub_provider -# from the acryl-datahub package. We leave this shim here to avoid breaking existing -# Airflow installs. -from datahub_provider._plugin import DatahubPlugin # noqa: F401 +import contextlib +import logging +import traceback +from typing import Any, Callable, Iterable, List, Optional, Union + +from airflow.configuration import conf +from airflow.lineage import PIPELINE_OUTLETS +from airflow.models.baseoperator import BaseOperator +from airflow.plugins_manager import AirflowPlugin +from airflow.utils.module_loading import import_string +from cattr import structure +from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult + +from datahub_airflow_plugin._airflow_compat import AIRFLOW_PATCHED +from datahub_airflow_plugin._airflow_shims import MappedOperator, Operator +from datahub_airflow_plugin.client.airflow_generator import AirflowGenerator +from datahub_airflow_plugin.hooks.datahub import DatahubGenericHook +from datahub_airflow_plugin.lineage.datahub import DatahubLineageConfig + +assert AIRFLOW_PATCHED +logger = logging.getLogger(__name__) + +TASK_ON_FAILURE_CALLBACK = "on_failure_callback" +TASK_ON_SUCCESS_CALLBACK = "on_success_callback" + + +def get_lineage_config() -> DatahubLineageConfig: + """Load the lineage config from airflow.cfg.""" + + enabled = conf.get("datahub", "enabled", fallback=True) + datahub_conn_id = conf.get("datahub", "conn_id", fallback="datahub_rest_default") + cluster = conf.get("datahub", "cluster", fallback="prod") + graceful_exceptions = conf.get("datahub", "graceful_exceptions", fallback=True) + capture_tags_info = conf.get("datahub", "capture_tags_info", fallback=True) + capture_ownership_info = conf.get( + "datahub", "capture_ownership_info", fallback=True + ) + capture_executions = conf.get("datahub", "capture_executions", fallback=True) + return DatahubLineageConfig( + enabled=enabled, + datahub_conn_id=datahub_conn_id, + cluster=cluster, + graceful_exceptions=graceful_exceptions, + capture_ownership_info=capture_ownership_info, + 
capture_tags_info=capture_tags_info, + capture_executions=capture_executions, + ) + + +def _task_inlets(operator: "Operator") -> List: + # From Airflow 2.4 _inlets is dropped and inlets used consistently. Earlier it was not the case, so we have to stick there to _inlets + if hasattr(operator, "_inlets"): + return operator._inlets # type: ignore[attr-defined, union-attr] + return operator.inlets + + +def _task_outlets(operator: "Operator") -> List: + # From Airflow 2.4 _outlets is dropped and inlets used consistently. Earlier it was not the case, so we have to stick there to _outlets + # We have to use _outlets because outlets is empty in Airflow < 2.4.0 + if hasattr(operator, "_outlets"): + return operator._outlets # type: ignore[attr-defined, union-attr] + return operator.outlets + + +def get_inlets_from_task(task: BaseOperator, context: Any) -> Iterable[Any]: + # TODO: Fix for https://github.com/apache/airflow/commit/1b1f3fabc5909a447a6277cafef3a0d4ef1f01ae + # in Airflow 2.4. + # TODO: ignore/handle airflow's dataset type in our lineage + + inlets: List[Any] = [] + task_inlets = _task_inlets(task) + # From Airflow 2.3 this should be AbstractOperator but due to compatibility reason lets use BaseOperator + if isinstance(task_inlets, (str, BaseOperator)): + inlets = [ + task_inlets, + ] + + if task_inlets and isinstance(task_inlets, list): + inlets = [] + task_ids = ( + {o for o in task_inlets if isinstance(o, str)} + .union(op.task_id for op in task_inlets if isinstance(op, BaseOperator)) + .intersection(task.get_flat_relative_ids(upstream=True)) + ) + + from airflow.lineage import AUTO + + # pick up unique direct upstream task_ids if AUTO is specified + if AUTO.upper() in task_inlets or AUTO.lower() in task_inlets: + print("Picking up unique direct upstream task_ids as AUTO is specified") + task_ids = task_ids.union( + task_ids.symmetric_difference(task.upstream_task_ids) + ) + + inlets = task.xcom_pull( + context, task_ids=list(task_ids), dag_id=task.dag_id, key=PIPELINE_OUTLETS + ) + + # re-instantiate the obtained inlets + inlets = [ + structure(item["data"], import_string(item["type_name"])) + # _get_instance(structure(item, Metadata)) + for sublist in inlets + if sublist + for item in sublist + ] + + for inlet in task_inlets: + if not isinstance(inlet, str): + inlets.append(inlet) + + return inlets + + +def _make_emit_callback( + logger: logging.Logger, +) -> Callable[[Optional[Exception], str], None]: + def emit_callback(err: Optional[Exception], msg: str) -> None: + if err: + logger.error(f"Error sending metadata to datahub: {msg}", exc_info=err) + + return emit_callback + + +def datahub_task_status_callback(context, status): + ti = context["ti"] + task: "BaseOperator" = ti.task + dag = context["dag"] + + # This code is from the original airflow lineage code -> + # https://github.com/apache/airflow/blob/main/airflow/lineage/__init__.py + inlets = get_inlets_from_task(task, context) + + emitter = ( + DatahubGenericHook(context["_datahub_config"].datahub_conn_id) + .get_underlying_hook() + .make_emitter() + ) + + dataflow = AirflowGenerator.generate_dataflow( + cluster=context["_datahub_config"].cluster, + dag=dag, + capture_tags=context["_datahub_config"].capture_tags_info, + capture_owner=context["_datahub_config"].capture_ownership_info, + ) + task.log.info(f"Emitting Datahub Dataflow: {dataflow}") + dataflow.emit(emitter, callback=_make_emit_callback(task.log)) + + datajob = AirflowGenerator.generate_datajob( + cluster=context["_datahub_config"].cluster, + task=task, + dag=dag, 
+ capture_tags=context["_datahub_config"].capture_tags_info, + capture_owner=context["_datahub_config"].capture_ownership_info, + ) + + for inlet in inlets: + datajob.inlets.append(inlet.urn) + + task_outlets = _task_outlets(task) + for outlet in task_outlets: + datajob.outlets.append(outlet.urn) + + task.log.info(f"Emitting Datahub Datajob: {datajob}") + datajob.emit(emitter, callback=_make_emit_callback(task.log)) + + if context["_datahub_config"].capture_executions: + dpi = AirflowGenerator.run_datajob( + emitter=emitter, + cluster=context["_datahub_config"].cluster, + ti=context["ti"], + dag=dag, + dag_run=context["dag_run"], + datajob=datajob, + start_timestamp_millis=int(ti.start_date.timestamp() * 1000), + ) + + task.log.info(f"Emitted Start Datahub Dataprocess Instance: {dpi}") + + dpi = AirflowGenerator.complete_datajob( + emitter=emitter, + cluster=context["_datahub_config"].cluster, + ti=context["ti"], + dag_run=context["dag_run"], + result=status, + dag=dag, + datajob=datajob, + end_timestamp_millis=int(ti.end_date.timestamp() * 1000), + ) + task.log.info(f"Emitted Completed Data Process Instance: {dpi}") + + emitter.flush() + + +def datahub_pre_execution(context): + ti = context["ti"] + task: "BaseOperator" = ti.task + dag = context["dag"] + + task.log.info("Running Datahub pre_execute method") + + emitter = ( + DatahubGenericHook(context["_datahub_config"].datahub_conn_id) + .get_underlying_hook() + .make_emitter() + ) + + # This code is from the original airflow lineage code -> + # https://github.com/apache/airflow/blob/main/airflow/lineage/__init__.py + inlets = get_inlets_from_task(task, context) + + datajob = AirflowGenerator.generate_datajob( + cluster=context["_datahub_config"].cluster, + task=context["ti"].task, + dag=dag, + capture_tags=context["_datahub_config"].capture_tags_info, + capture_owner=context["_datahub_config"].capture_ownership_info, + ) + + for inlet in inlets: + datajob.inlets.append(inlet.urn) + + task_outlets = _task_outlets(task) + + for outlet in task_outlets: + datajob.outlets.append(outlet.urn) + + task.log.info(f"Emitting Datahub dataJob {datajob}") + datajob.emit(emitter, callback=_make_emit_callback(task.log)) + + if context["_datahub_config"].capture_executions: + dpi = AirflowGenerator.run_datajob( + emitter=emitter, + cluster=context["_datahub_config"].cluster, + ti=context["ti"], + dag=dag, + dag_run=context["dag_run"], + datajob=datajob, + start_timestamp_millis=int(ti.start_date.timestamp() * 1000), + ) + + task.log.info(f"Emitting Datahub Dataprocess Instance: {dpi}") + + emitter.flush() + + +def _wrap_pre_execution(pre_execution): + def custom_pre_execution(context): + config = get_lineage_config() + if config.enabled: + context["_datahub_config"] = config + datahub_pre_execution(context) + + # Call original policy + if pre_execution: + pre_execution(context) + + return custom_pre_execution + + +def _wrap_on_failure_callback(on_failure_callback): + def custom_on_failure_callback(context): + config = get_lineage_config() + if config.enabled: + context["_datahub_config"] = config + try: + datahub_task_status_callback(context, status=InstanceRunResult.FAILURE) + except Exception as e: + if not config.graceful_exceptions: + raise e + else: + print(f"Exception: {traceback.format_exc()}") + + # Call original policy + if on_failure_callback: + on_failure_callback(context) + + return custom_on_failure_callback + + +def _wrap_on_success_callback(on_success_callback): + def custom_on_success_callback(context): + config = get_lineage_config() + 
if config.enabled: + context["_datahub_config"] = config + try: + datahub_task_status_callback(context, status=InstanceRunResult.SUCCESS) + except Exception as e: + if not config.graceful_exceptions: + raise e + else: + print(f"Exception: {traceback.format_exc()}") + + # Call original policy + if on_success_callback: + on_success_callback(context) + + return custom_on_success_callback + + +def task_policy(task: Union[BaseOperator, MappedOperator]) -> None: + task.log.debug(f"Setting task policy for Dag: {task.dag_id} Task: {task.task_id}") + # task.add_inlets(["auto"]) + # task.pre_execute = _wrap_pre_execution(task.pre_execute) + + # MappedOperator's callbacks don't have setters until Airflow 2.X.X + # https://github.com/apache/airflow/issues/24547 + # We can bypass this by going through partial_kwargs for now + if MappedOperator and isinstance(task, MappedOperator): # type: ignore + on_failure_callback_prop: property = getattr( + MappedOperator, TASK_ON_FAILURE_CALLBACK + ) + on_success_callback_prop: property = getattr( + MappedOperator, TASK_ON_SUCCESS_CALLBACK + ) + if not on_failure_callback_prop.fset or not on_success_callback_prop.fset: + task.log.debug( + "Using MappedOperator's partial_kwargs instead of callback properties" + ) + task.partial_kwargs[TASK_ON_FAILURE_CALLBACK] = _wrap_on_failure_callback( + task.on_failure_callback + ) + task.partial_kwargs[TASK_ON_SUCCESS_CALLBACK] = _wrap_on_success_callback( + task.on_success_callback + ) + return + + task.on_failure_callback = _wrap_on_failure_callback(task.on_failure_callback) # type: ignore + task.on_success_callback = _wrap_on_success_callback(task.on_success_callback) # type: ignore + # task.pre_execute = _wrap_pre_execution(task.pre_execute) + + +def _wrap_task_policy(policy): + if policy and hasattr(policy, "_task_policy_patched_by"): + return policy + + def custom_task_policy(task): + policy(task) + task_policy(task) + + # Add a flag to the policy to indicate that we've patched it. 
+ custom_task_policy._task_policy_patched_by = "datahub_plugin" # type: ignore[attr-defined] + return custom_task_policy + + +def _patch_policy(settings): + if hasattr(settings, "task_policy"): + datahub_task_policy = _wrap_task_policy(settings.task_policy) + settings.task_policy = datahub_task_policy + + +def _patch_datahub_policy(): + with contextlib.suppress(ImportError): + import airflow_local_settings + + _patch_policy(airflow_local_settings) + + from airflow.models.dagbag import settings + + _patch_policy(settings) + + +_patch_datahub_policy() + + +class DatahubPlugin(AirflowPlugin): + name = "datahub_plugin" diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/entities.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/entities.py new file mode 100644 index 0000000000000..69f667cad3241 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/entities.py @@ -0,0 +1,47 @@ +from abc import abstractmethod +from typing import Optional + +import attr +import datahub.emitter.mce_builder as builder +from datahub.utilities.urns.urn import guess_entity_type + + +class _Entity: + @property + @abstractmethod + def urn(self) -> str: + pass + + +@attr.s(auto_attribs=True, str=True) +class Dataset(_Entity): + platform: str + name: str + env: str = builder.DEFAULT_ENV + platform_instance: Optional[str] = None + + @property + def urn(self): + return builder.make_dataset_urn_with_platform_instance( + platform=self.platform, + name=self.name, + platform_instance=self.platform_instance, + env=self.env, + ) + + +@attr.s(str=True) +class Urn(_Entity): + _urn: str = attr.ib() + + @_urn.validator + def _validate_urn(self, attribute, value): + if not value.startswith("urn:"): + raise ValueError("invalid urn provided: urns must start with 'urn:'") + if guess_entity_type(value) != "dataset": + # This is because DataJobs only support Dataset lineage. 
+ raise ValueError("Airflow lineage currently only supports datasets") + + @property + def urn(self): + return self._urn diff --git a/metadata-ingestion/src/datahub_provider/example_dags/.airflowignore b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/.airflowignore similarity index 100% rename from metadata-ingestion/src/datahub_provider/example_dags/.airflowignore rename to metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/.airflowignore diff --git a/metadata-ingestion/src/datahub_provider/example_dags/__init__.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/__init__.py similarity index 100% rename from metadata-ingestion/src/datahub_provider/example_dags/__init__.py rename to metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/__init__.py diff --git a/metadata-ingestion/src/datahub_provider/example_dags/generic_recipe_sample_dag.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/generic_recipe_sample_dag.py similarity index 98% rename from metadata-ingestion/src/datahub_provider/example_dags/generic_recipe_sample_dag.py rename to metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/generic_recipe_sample_dag.py index d0e4aa944e840..ff8dba457066f 100644 --- a/metadata-ingestion/src/datahub_provider/example_dags/generic_recipe_sample_dag.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/generic_recipe_sample_dag.py @@ -9,7 +9,6 @@ from airflow import DAG from airflow.operators.python import PythonOperator from airflow.utils.dates import days_ago - from datahub.configuration.config_loader import load_config_file from datahub.ingestion.run.pipeline import Pipeline @@ -41,6 +40,7 @@ def datahub_recipe(): schedule_interval=timedelta(days=1), start_date=days_ago(2), catchup=False, + default_view="tree", ) as dag: ingest_task = PythonOperator( task_id="ingest_using_recipe", diff --git a/metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_demo.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_demo.py similarity index 94% rename from metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_demo.py rename to metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_demo.py index 95b594e4052a5..3caea093b932d 100644 --- a/metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_demo.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_demo.py @@ -9,7 +9,7 @@ from airflow.operators.bash import BashOperator from airflow.utils.dates import days_ago -from datahub_provider.entities import Dataset, Urn +from datahub_airflow_plugin.entities import Dataset, Urn default_args = { "owner": "airflow", @@ -28,6 +28,7 @@ start_date=days_ago(2), tags=["example_tag"], catchup=False, + default_view="tree", ) as dag: task1 = BashOperator( task_id="run_data_task", diff --git a/metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_taskflow_demo.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py similarity index 94% rename from metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_taskflow_demo.py rename to 
metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py index 1fe321eb5c80a..ceb0f452b540a 100644 --- a/metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_taskflow_demo.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py @@ -8,7 +8,7 @@ from airflow.decorators import dag, task from airflow.utils.dates import days_ago -from datahub_provider.entities import Dataset, Urn +from datahub_airflow_plugin.entities import Dataset, Urn default_args = { "owner": "airflow", @@ -26,6 +26,7 @@ start_date=days_ago(2), tags=["example_tag"], catchup=False, + default_view="tree", ) def datahub_lineage_backend_taskflow_demo(): @task( diff --git a/metadata-ingestion/src/datahub_provider/example_dags/lineage_emission_dag.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py similarity index 96% rename from metadata-ingestion/src/datahub_provider/example_dags/lineage_emission_dag.py rename to metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py index 153464246cef7..f40295c6bb883 100644 --- a/metadata-ingestion/src/datahub_provider/example_dags/lineage_emission_dag.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py @@ -5,12 +5,12 @@ from datetime import timedelta +import datahub.emitter.mce_builder as builder from airflow import DAG from airflow.providers.snowflake.operators.snowflake import SnowflakeOperator from airflow.utils.dates import days_ago -import datahub.emitter.mce_builder as builder -from datahub_provider.operators.datahub import DatahubEmitterOperator +from datahub_airflow_plugin.operators.datahub import DatahubEmitterOperator default_args = { "owner": "airflow", @@ -31,6 +31,7 @@ schedule_interval=timedelta(days=1), start_date=days_ago(2), catchup=False, + default_view="tree", ) as dag: # This example shows a SnowflakeOperator followed by a lineage emission. However, the # same DatahubEmitterOperator can be used to emit lineage in any context. 
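The example DAGs above now import lineage entities from `datahub_airflow_plugin.entities` rather than `datahub_provider.entities`. As a rough sketch of how those entities are attached to a task after this rename (the DAG id, table names, and URN below are illustrative placeholders, not part of this change):

```python
from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.utils.dates import days_ago

from datahub_airflow_plugin.entities import Dataset, Urn

with DAG(
    "datahub_lineage_sketch",  # illustrative DAG id
    start_date=days_ago(2),
    schedule_interval=None,
    catchup=False,
    default_view="tree",
) as dag:
    # Inlets/outlets declared on the operator are picked up by the DataHub
    # lineage backend/plugin and emitted as dataset-level lineage edges.
    transform = BashOperator(
        task_id="transform",
        bash_command="echo transform",
        inlets=[Dataset("snowflake", "mydb.schema.tableA")],
        outlets=[
            Dataset("snowflake", "mydb.schema.tableB"),
            Urn("urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)"),
        ],
    )
```

A `Dataset(platform, name)` entry is resolved to a dataset URN from its platform, name, and env, while `Urn(...)` accepts an already-constructed dataset URN directly.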
diff --git a/metadata-ingestion/src/datahub_provider/example_dags/mysql_sample_dag.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/mysql_sample_dag.py similarity index 98% rename from metadata-ingestion/src/datahub_provider/example_dags/mysql_sample_dag.py rename to metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/mysql_sample_dag.py index 2c833e1425634..77b29711d7688 100644 --- a/metadata-ingestion/src/datahub_provider/example_dags/mysql_sample_dag.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/mysql_sample_dag.py @@ -47,6 +47,7 @@ def ingest_from_mysql(): start_date=datetime(2022, 1, 1), schedule_interval=timedelta(days=1), catchup=False, + default_view="tree", ) as dag: # While it is also possible to use the PythonOperator, we recommend using # the PythonVirtualenvOperator to ensure that there are no dependency diff --git a/metadata-ingestion/src/datahub_provider/example_dags/snowflake_sample_dag.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/snowflake_sample_dag.py similarity index 99% rename from metadata-ingestion/src/datahub_provider/example_dags/snowflake_sample_dag.py rename to metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/snowflake_sample_dag.py index c107bb479262c..30e63b68e459f 100644 --- a/metadata-ingestion/src/datahub_provider/example_dags/snowflake_sample_dag.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/snowflake_sample_dag.py @@ -57,6 +57,7 @@ def ingest_from_snowflake(snowflake_credentials, datahub_gms_server): start_date=datetime(2022, 1, 1), schedule_interval=timedelta(days=1), catchup=False, + default_view="tree", ) as dag: # This example pulls credentials from Airflow's connection store. # For this to work, you must have previously configured these connections in Airflow. diff --git a/.github/workflows/docker-ingestion-base.yml b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/__init__.py similarity index 100% rename from .github/workflows/docker-ingestion-base.yml rename to metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/__init__.py diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/datahub.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/datahub.py new file mode 100644 index 0000000000000..aed858c6c4df0 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/datahub.py @@ -0,0 +1,214 @@ +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union + +from airflow.exceptions import AirflowException +from airflow.hooks.base import BaseHook +from datahub.metadata.com.linkedin.pegasus2avro.mxe import ( + MetadataChangeEvent, + MetadataChangeProposal, +) + +if TYPE_CHECKING: + from airflow.models.connection import Connection + from datahub.emitter.kafka_emitter import DatahubKafkaEmitter + from datahub.emitter.rest_emitter import DatahubRestEmitter + from datahub.ingestion.sink.datahub_kafka import KafkaSinkConfig + + +class DatahubRestHook(BaseHook): + """ + Creates a DataHub Rest API connection used to send metadata to DataHub. + Takes the endpoint for your DataHub Rest API in the Server Endpoint(host) field. 
+ + URI example: :: + + AIRFLOW_CONN_DATAHUB_REST_DEFAULT='datahub-rest://rest-endpoint' + + :param datahub_rest_conn_id: Reference to the DataHub Rest connection. + :type datahub_rest_conn_id: str + """ + + conn_name_attr = "datahub_rest_conn_id" + default_conn_name = "datahub_rest_default" + conn_type = "datahub_rest" + hook_name = "DataHub REST Server" + + def __init__(self, datahub_rest_conn_id: str = default_conn_name) -> None: + super().__init__() + self.datahub_rest_conn_id = datahub_rest_conn_id + + @staticmethod + def get_connection_form_widgets() -> Dict[str, Any]: + return {} + + @staticmethod + def get_ui_field_behaviour() -> Dict: + """Returns custom field behavior""" + return { + "hidden_fields": ["port", "schema", "login"], + "relabeling": { + "host": "Server Endpoint", + }, + } + + def _get_config(self) -> Tuple[str, Optional[str], Optional[int]]: + conn: "Connection" = self.get_connection(self.datahub_rest_conn_id) + + host = conn.host + if not host: + raise AirflowException("host parameter is required") + if conn.port: + if ":" in host: + raise AirflowException( + "host parameter should not contain a port number if the port is specified separately" + ) + host = f"{host}:{conn.port}" + password = conn.password + timeout_sec = conn.extra_dejson.get("timeout_sec") + return (host, password, timeout_sec) + + def make_emitter(self) -> "DatahubRestEmitter": + import datahub.emitter.rest_emitter + + return datahub.emitter.rest_emitter.DatahubRestEmitter(*self._get_config()) + + def emit_mces(self, mces: List[MetadataChangeEvent]) -> None: + emitter = self.make_emitter() + + for mce in mces: + emitter.emit_mce(mce) + + def emit_mcps(self, mcps: List[MetadataChangeProposal]) -> None: + emitter = self.make_emitter() + + for mce in mcps: + emitter.emit_mcp(mce) + + +class DatahubKafkaHook(BaseHook): + """ + Creates a DataHub Kafka connection used to send metadata to DataHub. + Takes your kafka broker in the Kafka Broker(host) field. + + URI example: :: + + AIRFLOW_CONN_DATAHUB_KAFKA_DEFAULT='datahub-kafka://kafka-broker' + + :param datahub_kafka_conn_id: Reference to the DataHub Kafka connection. 
+ :type datahub_kafka_conn_id: str + """ + + conn_name_attr = "datahub_kafka_conn_id" + default_conn_name = "datahub_kafka_default" + conn_type = "datahub_kafka" + hook_name = "DataHub Kafka Sink" + + def __init__(self, datahub_kafka_conn_id: str = default_conn_name) -> None: + super().__init__() + self.datahub_kafka_conn_id = datahub_kafka_conn_id + + @staticmethod + def get_connection_form_widgets() -> Dict[str, Any]: + return {} + + @staticmethod + def get_ui_field_behaviour() -> Dict: + """Returns custom field behavior""" + return { + "hidden_fields": ["port", "schema", "login", "password"], + "relabeling": { + "host": "Kafka Broker", + }, + } + + def _get_config(self) -> "KafkaSinkConfig": + import datahub.ingestion.sink.datahub_kafka + + conn = self.get_connection(self.datahub_kafka_conn_id) + obj = conn.extra_dejson + obj.setdefault("connection", {}) + if conn.host is not None: + if "bootstrap" in obj["connection"]: + raise AirflowException( + "Kafka broker specified twice (present in host and extra)" + ) + obj["connection"]["bootstrap"] = ":".join( + map(str, filter(None, [conn.host, conn.port])) + ) + config = datahub.ingestion.sink.datahub_kafka.KafkaSinkConfig.parse_obj(obj) + return config + + def make_emitter(self) -> "DatahubKafkaEmitter": + import datahub.emitter.kafka_emitter + + sink_config = self._get_config() + return datahub.emitter.kafka_emitter.DatahubKafkaEmitter(sink_config) + + def emit_mces(self, mces: List[MetadataChangeEvent]) -> None: + emitter = self.make_emitter() + errors = [] + + def callback(exc, msg): + if exc: + errors.append(exc) + + for mce in mces: + emitter.emit_mce_async(mce, callback) + + emitter.flush() + + if errors: + raise AirflowException(f"failed to push some MCEs: {errors}") + + def emit_mcps(self, mcps: List[MetadataChangeProposal]) -> None: + emitter = self.make_emitter() + errors = [] + + def callback(exc, msg): + if exc: + errors.append(exc) + + for mcp in mcps: + emitter.emit_mcp_async(mcp, callback) + + emitter.flush() + + if errors: + raise AirflowException(f"failed to push some MCPs: {errors}") + + +class DatahubGenericHook(BaseHook): + """ + Emits Metadata Change Events using either the DatahubRestHook or the + DatahubKafkaHook. Set up a DataHub Rest or Kafka connection to use. + + :param datahub_conn_id: Reference to the DataHub connection. + :type datahub_conn_id: str + """ + + def __init__(self, datahub_conn_id: str) -> None: + super().__init__() + self.datahub_conn_id = datahub_conn_id + + def get_underlying_hook(self) -> Union[DatahubRestHook, DatahubKafkaHook]: + conn = self.get_connection(self.datahub_conn_id) + + # We need to figure out the underlying hook type. First check the + # conn_type. If that fails, attempt to guess using the conn id name. 
+ if conn.conn_type == DatahubRestHook.conn_type: + return DatahubRestHook(self.datahub_conn_id) + elif conn.conn_type == DatahubKafkaHook.conn_type: + return DatahubKafkaHook(self.datahub_conn_id) + elif "rest" in self.datahub_conn_id: + return DatahubRestHook(self.datahub_conn_id) + elif "kafka" in self.datahub_conn_id: + return DatahubKafkaHook(self.datahub_conn_id) + else: + raise AirflowException( + f"DataHub cannot handle conn_type {conn.conn_type} in {conn}" + ) + + def make_emitter(self) -> Union["DatahubRestEmitter", "DatahubKafkaEmitter"]: + return self.get_underlying_hook().make_emitter() + + def emit_mces(self, mces: List[MetadataChangeEvent]) -> None: + return self.get_underlying_hook().emit_mces(mces) diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/__init__.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/datahub.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/datahub.py new file mode 100644 index 0000000000000..c41bb2b2a1e37 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/datahub.py @@ -0,0 +1,91 @@ +import json +from typing import TYPE_CHECKING, Dict, List, Optional + +from airflow.configuration import conf +from airflow.lineage.backend import LineageBackend + +from datahub_airflow_plugin._lineage_core import ( + DatahubBasicLineageConfig, + send_lineage_to_datahub, +) + +if TYPE_CHECKING: + from airflow.models.baseoperator import BaseOperator + + +class DatahubLineageConfig(DatahubBasicLineageConfig): + # If set to true, most runtime errors in the lineage backend will be + # suppressed and will not cause the overall task to fail. Note that + # configuration issues will still throw exceptions. + graceful_exceptions: bool = True + + +def get_lineage_config() -> DatahubLineageConfig: + """Load the lineage config from airflow.cfg.""" + + # The kwargs pattern is also used for secret backends. + kwargs_str = conf.get("lineage", "datahub_kwargs", fallback="{}") + kwargs = json.loads(kwargs_str) + + # Continue to support top-level datahub_conn_id config. + datahub_conn_id = conf.get("lineage", "datahub_conn_id", fallback=None) + if datahub_conn_id: + kwargs["datahub_conn_id"] = datahub_conn_id + + return DatahubLineageConfig.parse_obj(kwargs) + + +class DatahubLineageBackend(LineageBackend): + """ + Sends lineage data from tasks to DataHub. + + Configurable via ``airflow.cfg`` as follows: :: + + # For REST-based: + airflow connections add --conn-type 'datahub_rest' 'datahub_rest_default' --conn-host 'http://localhost:8080' + # For Kafka-based (standard Kafka sink config can be passed via extras): + airflow connections add --conn-type 'datahub_kafka' 'datahub_kafka_default' --conn-host 'broker:9092' --conn-extra '{}' + + [lineage] + backend = datahub_provider.lineage.datahub.DatahubLineageBackend + datahub_kwargs = { + "datahub_conn_id": "datahub_rest_default", + "capture_ownership_info": true, + "capture_tags_info": true, + "graceful_exceptions": true } + # The above indentation is important! + """ + + def __init__(self) -> None: + super().__init__() + + # By attempting to get and parse the config, we can detect configuration errors + # ahead of time. The init method is only called in Airflow 2.x. 
+ _ = get_lineage_config() + + # With Airflow 2.0, this can be an instance method. However, with Airflow 1.10.x, this + # method is used statically, even though LineageBackend declares it as an instance variable. + @staticmethod + def send_lineage( + operator: "BaseOperator", + inlets: Optional[List] = None, # unused + outlets: Optional[List] = None, # unused + context: Optional[Dict] = None, + ) -> None: + config = get_lineage_config() + if not config.enabled: + return + + try: + context = context or {} # ensure not None to satisfy mypy + send_lineage_to_datahub( + config, operator, operator.inlets, operator.outlets, context + ) + except Exception as e: + if config.graceful_exceptions: + operator.log.error(e) + operator.log.info( + "Suppressing error because graceful_exceptions is set" + ) + else: + raise diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/__init__.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub.py new file mode 100644 index 0000000000000..109e7ddfe4dfa --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub.py @@ -0,0 +1,63 @@ +from typing import List, Union + +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults +from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent + +from datahub_airflow_plugin.hooks.datahub import ( + DatahubGenericHook, + DatahubKafkaHook, + DatahubRestHook, +) + + +class DatahubBaseOperator(BaseOperator): + """ + The DatahubBaseOperator is used as a base operator all DataHub operators. + """ + + ui_color = "#4398c8" + + hook: Union[DatahubRestHook, DatahubKafkaHook] + + # mypy is not a fan of this. Newer versions of Airflow support proper typing for the decorator + # using PEP 612. However, there is not yet a good way to inherit the types of the kwargs from + # the superclass. + @apply_defaults # type: ignore[misc] + def __init__( # type: ignore[no-untyped-def] + self, + *, + datahub_conn_id: str, + **kwargs, + ): + super().__init__(**kwargs) + + self.datahub_conn_id = datahub_conn_id + self.generic_hook = DatahubGenericHook(datahub_conn_id) + + +class DatahubEmitterOperator(DatahubBaseOperator): + """ + Emits a Metadata Change Event to DataHub using either a DataHub + Rest or Kafka connection. + + :param datahub_conn_id: Reference to the DataHub Rest or Kafka Connection. + :type datahub_conn_id: str + """ + + # See above for why these mypy type issues are ignored here. 
+ @apply_defaults # type: ignore[misc] + def __init__( # type: ignore[no-untyped-def] + self, + mces: List[MetadataChangeEvent], + datahub_conn_id: str, + **kwargs, + ): + super().__init__( + datahub_conn_id=datahub_conn_id, + **kwargs, + ) + self.mces = mces + + def execute(self, context): + self.generic_hook.get_underlying_hook().emit_mces(self.mces) diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_assertion_operator.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_assertion_operator.py new file mode 100644 index 0000000000000..6f93c09a9e287 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_assertion_operator.py @@ -0,0 +1,78 @@ +import datetime +from typing import Any, List, Optional, Sequence, Union + +from airflow.models import BaseOperator +from datahub.api.circuit_breaker import ( + AssertionCircuitBreaker, + AssertionCircuitBreakerConfig, +) + +from datahub_airflow_plugin.hooks.datahub import DatahubRestHook + + +class DataHubAssertionOperator(BaseOperator): + r""" + DataHub Assertion Circuit Breaker Operator. + + :param urn: The DataHub dataset unique identifier. (templated) + :param datahub_rest_conn_id: The REST datahub connection id to communicate with DataHub + which is set as Airflow connection. + :param check_last_assertion_time: If set it checks assertions after the last operation was set on the dataset. + By default it is True. + :param time_delta: If verify_after_last_update is False it checks for assertion within the time delta. + """ + + template_fields: Sequence[str] = ("urn",) + circuit_breaker: AssertionCircuitBreaker + urn: Union[List[str], str] + + def __init__( # type: ignore[no-untyped-def] + self, + *, + urn: Union[List[str], str], + datahub_rest_conn_id: Optional[str] = None, + check_last_assertion_time: bool = True, + time_delta: Optional[datetime.timedelta] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + hook: DatahubRestHook + if datahub_rest_conn_id is not None: + hook = DatahubRestHook(datahub_rest_conn_id=datahub_rest_conn_id) + else: + hook = DatahubRestHook() + + host, password, timeout_sec = hook._get_config() + self.urn = urn + config: AssertionCircuitBreakerConfig = AssertionCircuitBreakerConfig( + datahub_host=host, + datahub_token=password, + timeout=timeout_sec, + verify_after_last_update=check_last_assertion_time, + time_delta=time_delta if time_delta else datetime.timedelta(days=1), + ) + + self.circuit_breaker = AssertionCircuitBreaker(config=config) + + def execute(self, context: Any) -> bool: + if "datahub_silence_circuit_breakers" in context["dag_run"].conf: + self.log.info( + "Circuit breaker is silenced because datahub_silence_circuit_breakers config is set" + ) + return True + + self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") + if isinstance(self.urn, str): + urns = [self.urn] + elif isinstance(self.urn, list): + urns = self.urn + else: + raise Exception(f"urn parameter has invalid type {type(self.urn)}") + + for urn in urns: + self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") + ret = self.circuit_breaker.is_circuit_breaker_active(urn=urn) + if ret: + raise Exception(f"Dataset {self.urn} is not in consumable state") + + return True diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_assertion_sensor.py 
b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_assertion_sensor.py new file mode 100644 index 0000000000000..16e5d1cbe8b1f --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_assertion_sensor.py @@ -0,0 +1,78 @@ +import datetime +from typing import Any, List, Optional, Sequence, Union + +from airflow.sensors.base import BaseSensorOperator +from datahub.api.circuit_breaker import ( + AssertionCircuitBreaker, + AssertionCircuitBreakerConfig, +) + +from datahub_airflow_plugin.hooks.datahub import DatahubRestHook + + +class DataHubAssertionSensor(BaseSensorOperator): + r""" + DataHub Assertion Circuit Breaker Sensor. + + :param urn: The DataHub dataset unique identifier. (templated) + :param datahub_rest_conn_id: The REST datahub connection id to communicate with DataHub + which is set as Airflow connection. + :param check_last_assertion_time: If set it checks assertions after the last operation was set on the dataset. + By default it is True. + :param time_delta: If verify_after_last_update is False it checks for assertion within the time delta. + """ + + template_fields: Sequence[str] = ("urn",) + circuit_breaker: AssertionCircuitBreaker + urn: Union[List[str], str] + + def __init__( # type: ignore[no-untyped-def] + self, + *, + urn: Union[List[str], str], + datahub_rest_conn_id: Optional[str] = None, + check_last_assertion_time: bool = True, + time_delta: datetime.timedelta = datetime.timedelta(days=1), + **kwargs, + ) -> None: + super().__init__(**kwargs) + hook: DatahubRestHook + if datahub_rest_conn_id is not None: + hook = DatahubRestHook(datahub_rest_conn_id=datahub_rest_conn_id) + else: + hook = DatahubRestHook() + + host, password, timeout_sec = hook._get_config() + self.urn = urn + config: AssertionCircuitBreakerConfig = AssertionCircuitBreakerConfig( + datahub_host=host, + datahub_token=password, + timeout=timeout_sec, + verify_after_last_update=check_last_assertion_time, + time_delta=time_delta, + ) + self.circuit_breaker = AssertionCircuitBreaker(config=config) + + def poke(self, context: Any) -> bool: + if "datahub_silence_circuit_breakers" in context["dag_run"].conf: + self.log.info( + "Circuit breaker is silenced because datahub_silence_circuit_breakers config is set" + ) + return True + + self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") + if isinstance(self.urn, str): + urns = [self.urn] + elif isinstance(self.urn, list): + urns = self.urn + else: + raise Exception(f"urn parameter has invalid type {type(self.urn)}") + + for urn in urns: + self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") + ret = self.circuit_breaker.is_circuit_breaker_active(urn=urn) + if ret: + self.log.info(f"Dataset {self.urn} is not in consumable state") + return False + + return True diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_operation_operator.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_operation_operator.py new file mode 100644 index 0000000000000..94e105309537b --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_operation_operator.py @@ -0,0 +1,97 @@ +import datetime +from typing import Any, List, Optional, Sequence, Union + +from airflow.sensors.base import BaseSensorOperator +from datahub.api.circuit_breaker import ( + OperationCircuitBreaker, + OperationCircuitBreakerConfig, +) + +from 
datahub_airflow_plugin.hooks.datahub import DatahubRestHook +  +  +class DataHubOperationCircuitBreakerOperator(BaseSensorOperator): +    r""" +    DataHub Operation Circuit Breaker Operator. +  +    :param urn: The DataHub dataset unique identifier. (templated) +    :param datahub_rest_conn_id: The REST datahub connection id to communicate with DataHub +        which is set as Airflow connection. +    :param partition: The partition to check the operation. +    :param source_type: The source type to filter on. If not set it will accept any source type. +        See valid values at: https://datahubproject.io/docs/graphql/enums#operationsourcetype +    :param operation_type: The operation type to filter on. If not set it will accept any operation type. +        See valid values at: https://datahubproject.io/docs/graphql/enums#operationtype +    """ +  +    template_fields: Sequence[str] = ( +        "urn", +        "partition", +        "source_type", +        "operation_type", +    ) +    circuit_breaker: OperationCircuitBreaker +    urn: Union[List[str], str] +    partition: Optional[str] +    source_type: Optional[str] +    operation_type: Optional[str] +  +    def __init__(  # type: ignore[no-untyped-def] +        self, +        *, +        urn: Union[List[str], str], +        datahub_rest_conn_id: Optional[str] = None, +        time_delta: Optional[datetime.timedelta] = datetime.timedelta(days=1), +        partition: Optional[str] = None, +        source_type: Optional[str] = None, +        operation_type: Optional[str] = None, +        **kwargs, +    ) -> None: +        super().__init__(**kwargs) +        hook: DatahubRestHook +        if datahub_rest_conn_id is not None: +            hook = DatahubRestHook(datahub_rest_conn_id=datahub_rest_conn_id) +        else: +            hook = DatahubRestHook() +  +        host, password, timeout_sec = hook._get_config() +  +        self.urn = urn +        self.partition = partition +        self.operation_type = operation_type +        self.source_type = source_type +  +        config: OperationCircuitBreakerConfig = OperationCircuitBreakerConfig( +            datahub_host=host, +            datahub_token=password, +            timeout=timeout_sec, +            time_delta=time_delta, +        ) +  +        self.circuit_breaker = OperationCircuitBreaker(config=config) +  +    def execute(self, context: Any) -> bool: +        if "datahub_silence_circuit_breakers" in context["dag_run"].conf: +            self.log.info( +                "Circuit breaker is silenced because datahub_silence_circuit_breakers config is set" +            ) +            return True +  +        self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") +        if isinstance(self.urn, str): +            urns = [self.urn] +        elif isinstance(self.urn, list): +            urns = self.urn +        else: +            raise Exception(f"urn parameter has invalid type {type(self.urn)}") +  +        for urn in urns: +            self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") +            ret = self.circuit_breaker.is_circuit_breaker_active( +                urn=urn, +                partition=self.partition, +                operation_type=self.operation_type, +                source_type=self.source_type, +            ) +            if ret: +                raise Exception(f"Dataset {self.urn} is not in consumable state") +  +        return True diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_operation_sensor.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_operation_sensor.py new file mode 100644 index 0000000000000..434c60754064d --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub_operation_sensor.py @@ -0,0 +1,100 @@ +import datetime +from typing import Any, List, Optional, Sequence, Union +  +from airflow.sensors.base import BaseSensorOperator +from datahub.api.circuit_breaker import ( +    OperationCircuitBreaker, +    OperationCircuitBreakerConfig, +) +  +from datahub_airflow_plugin.hooks.datahub import DatahubRestHook +  +  +class DataHubOperationCircuitBreakerSensor(BaseSensorOperator): +    r""" +    DataHub Operation Circuit Breaker Sensor. 
+ + :param urn: The DataHub dataset unique identifier. (templated) + :param datahub_rest_conn_id: The REST datahub connection id to communicate with DataHub + which is set as Airflow connection. + :param partition: The partition to check the operation. + :param source_type: The source type to filter on. If not set it will accept any source type. + See valid values at: https://datahubproject.io/docs/graphql/enums#operationsourcetype + :param operation_type: The operation type to filter on. If not set it will accept any source type. + See valid values at: https://datahubproject.io/docs/graphql/enums/#operationtype + """ + + template_fields: Sequence[str] = ( + "urn", + "partition", + "source_type", + "operation_type", + ) + circuit_breaker: OperationCircuitBreaker + urn: Union[List[str], str] + partition: Optional[str] + source_type: Optional[str] + operation_type: Optional[str] + + def __init__( # type: ignore[no-untyped-def] + self, + *, + urn: Union[List[str], str], + datahub_rest_conn_id: Optional[str] = None, + time_delta: Optional[datetime.timedelta] = datetime.timedelta(days=1), + partition: Optional[str] = None, + source_type: Optional[str] = None, + operation_type: Optional[str] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + hook: DatahubRestHook + if datahub_rest_conn_id is not None: + hook = DatahubRestHook(datahub_rest_conn_id=datahub_rest_conn_id) + else: + hook = DatahubRestHook() + + host, password, timeout_sec = hook._get_config() + + self.urn = urn + self.partition = partition + self.operation_type = operation_type + self.source_type = source_type + + config: OperationCircuitBreakerConfig = OperationCircuitBreakerConfig( + datahub_host=host, + datahub_token=password, + timeout=timeout_sec, + time_delta=time_delta, + ) + + self.circuit_breaker = OperationCircuitBreaker(config=config) + + def poke(self, context: Any) -> bool: + if "datahub_silence_circuit_breakers" in context["dag_run"].conf: + self.log.info( + "Circuit breaker is silenced because datahub_silence_circuit_breakers config is set" + ) + return True + + self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") + if isinstance(self.urn, str): + urns = [self.urn] + elif isinstance(self.urn, list): + urns = self.urn + else: + raise Exception(f"urn parameter has invalid type {type(self.urn)}") + + for urn in urns: + self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") + ret = self.circuit_breaker.is_circuit_breaker_active( + urn=urn, + partition=self.partition, + operation_type=self.operation_type, + source_type=self.source_type, + ) + if ret: + self.log.info(f"Dataset {self.urn} is not in consumable state") + return False + + return True diff --git a/metadata-ingestion/tests/unit/test_airflow.py b/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py similarity index 97% rename from metadata-ingestion/tests/unit/test_airflow.py rename to metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py index 980dc5550fafa..9aa901171cfa6 100644 --- a/metadata-ingestion/tests/unit/test_airflow.py +++ b/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py @@ -9,12 +9,11 @@ import airflow.configuration import airflow.version +import datahub.emitter.mce_builder as builder import packaging.version import pytest from airflow.lineage import apply_lineage, prepare_lineage from airflow.models import DAG, Connection, DagBag, DagRun, TaskInstance - -import datahub.emitter.mce_builder as builder from datahub_provider import get_provider_info 
from datahub_provider._airflow_shims import AIRFLOW_PATCHED, EmptyOperator from datahub_provider.entities import Dataset, Urn @@ -23,7 +22,7 @@ assert AIRFLOW_PATCHED -pytestmark = pytest.mark.airflow +# TODO: Remove default_view="tree" arg. Figure out why is default_view being picked as "grid" and how to fix it ? # Approach suggested by https://stackoverflow.com/a/11887885/5004662. AIRFLOW_VERSION = packaging.version.parse(airflow.version.version) @@ -75,7 +74,7 @@ def test_airflow_provider_info(): @pytest.mark.filterwarnings("ignore:.*is deprecated.*") def test_dags_load_with_no_errors(pytestconfig: pytest.Config) -> None: airflow_examples_folder = ( - pytestconfig.rootpath / "src/datahub_provider/example_dags" + pytestconfig.rootpath / "src/datahub_airflow_plugin/example_dags" ) # Note: the .airflowignore file skips the snowflake DAG. @@ -233,7 +232,11 @@ def test_lineage_backend(mock_emit, inlets, outlets, capture_executions): func = mock.Mock() func.__name__ = "foo" - dag = DAG(dag_id="test_lineage_is_sent_to_backend", start_date=DEFAULT_DATE) + dag = DAG( + dag_id="test_lineage_is_sent_to_backend", + start_date=DEFAULT_DATE, + default_view="tree", + ) with dag: op1 = EmptyOperator( @@ -252,6 +255,7 @@ def test_lineage_backend(mock_emit, inlets, outlets, capture_executions): # versions do not require it, but will attempt to find the associated # run_id in the database if execution_date is provided. As such, we # must fake the run_id parameter for newer Airflow versions. + # We need to add type:ignore in else to suppress mypy error in Airflow < 2.2 if AIRFLOW_VERSION < packaging.version.parse("2.2.0"): ti = TaskInstance(task=op2, execution_date=DEFAULT_DATE) # Ignoring type here because DagRun state is just a sring at Airflow 1 @@ -259,7 +263,7 @@ def test_lineage_backend(mock_emit, inlets, outlets, capture_executions): else: from airflow.utils.state import DagRunState - ti = TaskInstance(task=op2, run_id=f"test_airflow-{DEFAULT_DATE}") + ti = TaskInstance(task=op2, run_id=f"test_airflow-{DEFAULT_DATE}") # type: ignore[call-arg] dag_run = DagRun( state=DagRunState.SUCCESS, run_id=f"scheduled_{DEFAULT_DATE.isoformat()}", diff --git a/metadata-ingestion/developing.md b/metadata-ingestion/developing.md index 5d49b9a866a3d..f529590e2ab39 100644 --- a/metadata-ingestion/developing.md +++ b/metadata-ingestion/developing.md @@ -26,6 +26,16 @@ source venv/bin/activate datahub version # should print "DataHub CLI version: unavailable (installed in develop mode)" ``` +### (Optional) Set up your Python environment for developing on Airflow Plugin + +From the repository root: + +```shell +cd metadata-ingestion-modules/airflow-plugin +../../gradlew :metadata-ingestion-modules:airflow-plugin:installDev +source venv/bin/activate +datahub version # should print "DataHub CLI version: unavailable (installed in develop mode)" +``` ### Common setup issues Common issues (click to expand): @@ -183,7 +193,7 @@ pytest -m 'slow_integration' ../gradlew :metadata-ingestion:testFull ../gradlew :metadata-ingestion:check # Run all tests in a single file -../gradlew :metadata-ingestion:testSingle -PtestFile=tests/unit/test_airflow.py +../gradlew :metadata-ingestion:testSingle -PtestFile=tests/unit/test_bigquery_source.py # Run all tests under tests/unit ../gradlew :metadata-ingestion:testSingle -PtestFile=tests/unit ``` diff --git a/metadata-ingestion/docs/sources/datahub/README.md b/metadata-ingestion/docs/sources/datahub/README.md new file mode 100644 index 0000000000000..45afc6e166889 --- /dev/null +++ 
b/metadata-ingestion/docs/sources/datahub/README.md
@@ -0,0 +1,4 @@
+Migrate data from one DataHub instance to another.
+
+Requires direct access to the database, kafka broker, and kafka schema registry
+of the source DataHub instance.
diff --git a/metadata-ingestion/docs/sources/datahub/datahub_pre.md b/metadata-ingestion/docs/sources/datahub/datahub_pre.md
new file mode 100644
index 0000000000000..c98cce7047836
--- /dev/null
+++ b/metadata-ingestion/docs/sources/datahub/datahub_pre.md
@@ -0,0 +1,66 @@
+### Overview
+
+This source pulls data from two locations:
+- The DataHub database, containing a single table holding all versioned aspects
+- The DataHub Kafka cluster, reading from the [MCL Log](../../../../docs/what/mxe.md#metadata-change-log-mcl)
+topic for timeseries aspects.
+
+All data is first read from the database, before timeseries data is ingested from kafka.
+To prevent this source from potentially running forever, it will not ingest data produced after the
+datahub_source ingestion job is started. This `stop_time` is reflected in the report.
+
+Data from the database and kafka are read in chronological order, specifically by the
+createdon timestamp in the database and by kafka offset per partition. In order to
+properly read from the database, please ensure that the `createdon` column is indexed.
+Newly created databases should have this index (named `timeIndex`) by default, but for older
+ones you may have to create it yourself with the statement:
+
+```
+CREATE INDEX timeIndex ON metadata_aspect_v2 (createdon);
+```
+
+*If you do not have this index, the source may run incredibly slowly and produce
+significant database load.*
+
+#### Stateful Ingestion
+On first run, the source will read from the earliest data in the database and the earliest
+kafka offsets. Every `commit_state_interval` (default 1000) records, the source will store
+a checkpoint to remember its place, i.e. the last createdon timestamp and kafka offsets.
+This allows you to stop and restart the source without losing much progress, but note that
+you will re-ingest some data at the start of the new run.
+
+If any errors are encountered in the ingestion process, e.g. if we are unable to emit an aspect
+due to network errors, the source will keep running, but will stop committing checkpoints,
+unless `commit_with_parse_errors` (default `false`) is set. Thus, if you re-run the ingestion,
+you can re-ingest the data that was missed, but note that it will also re-ingest all subsequent data.
+
+If you want to re-ingest all data, you can set a different `pipeline_name` in your recipe,
+or set `stateful_ingestion.ignore_old_state`:
+
+```yaml
+source:
+  config:
+    # ... connection config, etc.
+    stateful_ingestion:
+      enabled: true
+      ignore_old_state: true
+```
+
+#### Limitations
+- Can only pull timeseries aspects retained by Kafka, which are kept for 90 days by default.
+- Does not detect hard timeseries deletions, e.g. those made via a `datahub delete` CLI command.
+Therefore, if you deleted data in this way, it will still exist in the destination instance.
+- If you have a significant amount of aspects with the exact same `createdon` timestamp,
+stateful ingestion will not be able to save checkpoints partially through that timestamp.
+On a subsequent run, all aspects for that timestamp will be ingested.
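As a companion to the indexing advice above, here is a hedged sketch of checking for (and creating) the `createdon` index on a MySQL-backed instance; the connection string is a placeholder and the `SHOW INDEX` query is MySQL-specific.

```python
# Hedged sketch: ensure the createdon index exists on a MySQL-backed DataHub
# database. The connection string is a placeholder.
from sqlalchemy import create_engine, text

engine = create_engine("mysql+pymysql://datahub:datahub@localhost:3306/datahub")

with engine.connect() as conn:
    existing = conn.execute(
        text("SHOW INDEX FROM metadata_aspect_v2 WHERE Column_name = 'createdon'")
    ).fetchall()
    if not existing:
        # Same statement as recommended above.
        conn.execute(text("CREATE INDEX timeIndex ON metadata_aspect_v2 (createdon)"))
```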
+ +#### Performance +On your destination DataHub instance, we suggest the following settings: +- Enable [async ingestion](../../../../docs/deploy/environment-vars.md#ingestion) +- Use standalone consumers +([mae-consumer](../../../../metadata-jobs/mae-consumer-job/README.md) +and [mce-consumer](../../../../metadata-jobs/mce-consumer-job/README.md)) + * If you are migrating large amounts of data, consider scaling consumer replicas. +- Increase the number of gms pods to add redundancy and increase resilience to node evictions + * If you are migrating large amounts of data, consider increasing elasticsearch's + thread count via the `ELASTICSEARCH_THREAD_COUNT` environment variable. diff --git a/metadata-ingestion/docs/sources/datahub/datahub_recipe.yml b/metadata-ingestion/docs/sources/datahub/datahub_recipe.yml new file mode 100644 index 0000000000000..cb7fc97a39b9f --- /dev/null +++ b/metadata-ingestion/docs/sources/datahub/datahub_recipe.yml @@ -0,0 +1,30 @@ +pipeline_name: datahub_source_1 +datahub_api: + server: "http://localhost:8080" # Migrate data from DataHub instance on localhost:8080 + token: "" +source: + type: datahub + config: + include_all_versions: false + database_connection: + scheme: "mysql+pymysql" # or "postgresql+psycopg2" for Postgres + host_port: ":" + username: "" + password: "" + database: "" + kafka_connection: + bootstrap: ":9092" + schema_registry_url: ":8081" + stateful_ingestion: + enabled: true + ignore_old_state: false + extractor_config: + set_system_metadata: false # Replicate system metadata + +# Here, we write to a DataHub instance +# You can also use a different sink, e.g. to write the data to a file instead +sink: + type: datahub + config: + server: "" + token: "" diff --git a/metadata-ingestion/docs/sources/iceberg/iceberg_recipe.yml b/metadata-ingestion/docs/sources/iceberg/iceberg_recipe.yml index 28bce8a478211..8caedafbea50e 100644 --- a/metadata-ingestion/docs/sources/iceberg/iceberg_recipe.yml +++ b/metadata-ingestion/docs/sources/iceberg/iceberg_recipe.yml @@ -2,14 +2,17 @@ source: type: "iceberg" config: env: PROD - adls: - # Will be translated to https://{account_name}.dfs.core.windows.net - account_name: my_adls_account - # Can use sas_token or account_key - sas_token: "${SAS_TOKEN}" - # account_key: "${ACCOUNT_KEY}" - container_name: warehouse - base_path: iceberg + catalog: + name: my_iceberg_catalog + type: rest + # Catalog configuration follows pyiceberg's documentation (https://py.iceberg.apache.org/configuration) + config: + uri: http://localhost:8181 + s3.access-key-id: admin + s3.secret-access-key: password + s3.region: us-east-1 + warehouse: s3a://warehouse/wh/ + s3.endpoint: http://localhost:9000 platform_instance: my_iceberg_catalog table_pattern: allow: diff --git a/metadata-ingestion/docs/sources/s3/s3.md b/metadata-ingestion/docs/sources/s3/s3.md index 93715629d0b8e..9484cd8de6666 100644 --- a/metadata-ingestion/docs/sources/s3/s3.md +++ b/metadata-ingestion/docs/sources/s3/s3.md @@ -196,3 +196,9 @@ If you are ingesting datasets from AWS S3, we recommend running the ingestion on Profiles are computed with PyDeequ, which relies on PySpark. Therefore, for computing profiles, we currently require Spark 3.0.3 with Hadoop 3.2 to be installed and the `SPARK_HOME` and `SPARK_VERSION` environment variables to be set. The Spark+Hadoop binary can be downloaded [here](https://www.apache.org/dyn/closer.lua/spark/spark-3.0.3/spark-3.0.3-bin-hadoop3.2.tgz). 
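For the catalog-based Iceberg recipe shown above, here is a hedged sketch of the underlying pyiceberg calls the connector now relies on; the URI, credential, and warehouse values simply mirror the sample recipe and are placeholders.

```python
# Sketch of the pyiceberg calls behind the catalog-based Iceberg config above.
# Values mirror the sample recipe and are placeholders.
from pyiceberg.catalog import load_catalog

catalog = load_catalog(
    "my_iceberg_catalog",
    **{
        "type": "rest",
        "uri": "http://localhost:8181",
        "s3.access-key-id": "admin",
        "s3.secret-access-key": "password",
        "s3.region": "us-east-1",
        "warehouse": "s3a://warehouse/wh/",
        "s3.endpoint": "http://localhost:9000",
    },
)

# The source walks namespaces and tables in the same way:
for namespace in catalog.list_namespaces():
    for identifier in catalog.list_tables(namespace):
        table = catalog.load_table(identifier)
        print(".".join(identifier), table.metadata.location)
```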
For an example guide on setting up PyDeequ on AWS, see [this guide](https://aws.amazon.com/blogs/big-data/testing-data-quality-at-scale-with-pydeequ/). + +:::caution + +From Spark 3.2.0+, Avro reader fails on column names that don't start with a letter and contains other character than letters, number, and underscore. [https://github.com/apache/spark/blob/72c62b6596d21e975c5597f8fff84b1a9d070a02/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroFileFormat.scala#L158] +Avro files that contain such columns won't be profiled. +::: \ No newline at end of file diff --git a/metadata-ingestion/schedule_docs/airflow.md b/metadata-ingestion/schedule_docs/airflow.md index e48710964b01c..95393c3cc9919 100644 --- a/metadata-ingestion/schedule_docs/airflow.md +++ b/metadata-ingestion/schedule_docs/airflow.md @@ -4,9 +4,9 @@ If you are using Apache Airflow for your scheduling then you might want to also We've provided a few examples of how to configure your DAG: -- [`mysql_sample_dag`](../src/datahub_provider/example_dags/mysql_sample_dag.py) embeds the full MySQL ingestion configuration inside the DAG. +- [`mysql_sample_dag`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/mysql_sample_dag.py) embeds the full MySQL ingestion configuration inside the DAG. -- [`snowflake_sample_dag`](../src/datahub_provider/example_dags/snowflake_sample_dag.py) avoids embedding credentials inside the recipe, and instead fetches them from Airflow's [Connections](https://airflow.apache.org/docs/apache-airflow/stable/howto/connection/index.html) feature. You must configure your connections in Airflow to use this approach. +- [`snowflake_sample_dag`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/snowflake_sample_dag.py) avoids embedding credentials inside the recipe, and instead fetches them from Airflow's [Connections](https://airflow.apache.org/docs/apache-airflow/stable/howto/connection/index.html) feature. You must configure your connections in Airflow to use this approach. :::tip @@ -37,6 +37,6 @@ In more advanced cases, you might want to store your ingestion recipe in a file - Create a DAG task to read your DataHub ingestion recipe file and run it. See the example below for reference. - Deploy the DAG file into airflow for scheduling. Typically this involves checking in the DAG file into your dags folder which is accessible to your Airflow instance. -Example: [`generic_recipe_sample_dag`](../src/datahub_provider/example_dags/generic_recipe_sample_dag.py) +Example: [`generic_recipe_sample_dag`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/generic_recipe_sample_dag.py) diff --git a/metadata-ingestion/setup.cfg b/metadata-ingestion/setup.cfg index 59d847395ec47..fad55b99ec938 100644 --- a/metadata-ingestion/setup.cfg +++ b/metadata-ingestion/setup.cfg @@ -75,7 +75,6 @@ disallow_untyped_defs = yes asyncio_mode = auto addopts = --cov=src --cov-report= --cov-config setup.cfg --strict-markers markers = - airflow: marks tests related to airflow (deselect with '-m not airflow') slow_unit: marks tests to only run slow unit tests (deselect with '-m not slow_unit') integration: marks tests to only run in integration (deselect with '-m "not integration"') integration_batch_1: mark tests to only run in batch 1 of integration tests. 
This is done mainly for parallelisation (deselect with '-m not integration_batch_1') @@ -112,5 +111,3 @@ exclude_lines = omit = # omit codegen src/datahub/metadata/* - # omit example dags - src/datahub_provider/example_dags/* diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index ded9186e08a22..32e1cf926cc68 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -229,8 +229,8 @@ def get_long_description(): iceberg_common = { # Iceberg Python SDK - "acryl-iceberg-legacy==0.0.4", - "azure-identity==1.10.0", + "pyiceberg", + "pyarrow>=9.0.0, <13.0.0", } s3_base = { @@ -247,8 +247,8 @@ def get_long_description(): } data_lake_profiling = { - "pydeequ>=1.0.1, <1.1", - "pyspark==3.0.3", + "pydeequ==1.1.0", + "pyspark~=3.3.0", } delta_lake = { @@ -283,8 +283,7 @@ def get_long_description(): }, # Integrations. "airflow": { - "apache-airflow >= 2.0.2", - *rest_common, + f"acryl-datahub-airflow-plugin == {package_metadata['__version__']}", }, "circuit-breaker": { "gql>=3.3.0", @@ -421,6 +420,7 @@ def get_long_description(): # The boto3-stubs package seems to have regularly breaking minor releases, # we pin to a specific version to avoid this. "boto3-stubs[s3,glue,sagemaker,sts]==1.28.15", + "mypy-boto3-sagemaker==1.28.15", # For some reason, above pin only restricts `mypy-boto3-sagemaker<1.29.0,>=1.28.0` "types-tabulate", # avrogen package requires this "types-pytz", @@ -477,7 +477,7 @@ def get_long_description(): "druid", "elasticsearch", "feast" if sys.version_info >= (3, 8) else None, - "iceberg", + "iceberg" if sys.version_info >= (3, 8) else None, "json-schema", "ldap", "looker", @@ -507,8 +507,8 @@ def get_long_description(): "salesforce", "unity-catalog", "nifi", - "vertica" - # airflow is added below + "vertica", + "mode", ] if plugin for dependency in plugins[plugin] @@ -517,9 +517,6 @@ def get_long_description(): dev_requirements = { *base_dev_requirements, - # Extra requirements for Airflow. - "apache-airflow[snowflake]>=2.0.2", # snowflake is used in example dags - "virtualenv", # needed by PythonVirtualenvOperator } full_test_dev_requirements = { @@ -533,7 +530,7 @@ def get_long_description(): "druid", "hana", "hive", - "iceberg", + "iceberg" if sys.version_info >= (3, 8) else None, "kafka-connect", "ldap", "mongodb", @@ -543,6 +540,7 @@ def get_long_description(): "redash", "vertica", ] + if plugin for dependency in plugins[plugin] ), } diff --git a/metadata-ingestion/src/datahub/cli/delete_cli.py b/metadata-ingestion/src/datahub/cli/delete_cli.py index 0d3c35e933e25..7ab7605ef6363 100644 --- a/metadata-ingestion/src/datahub/cli/delete_cli.py +++ b/metadata-ingestion/src/datahub/cli/delete_cli.py @@ -37,6 +37,11 @@ "glossaryNode", } +_RECURSIVE_DELETE_TYPES = { + "container", + "dataPlatformInstance", +} + @click.group(cls=DefaultGroup, default="by-filter") def delete() -> None: @@ -252,6 +257,12 @@ def references(urn: str, dry_run: bool, force: bool) -> None: help="Entity type filter (e.g. 
dataset)", ) @click.option("--query", required=False, type=str, help="Elasticsearch query string") +@click.option( + "--recursive", + required=False, + is_flag=True, + help="Recursively delete all contained entities (only for containers and dataPlatformInstances)", +) @click.option( "--start-time", required=False, @@ -298,6 +309,7 @@ def by_filter( platform: Optional[str], entity_type: Optional[str], query: Optional[str], + recursive: bool, start_time: Optional[datetime], end_time: Optional[datetime], batch_size: int, @@ -308,7 +320,12 @@ def by_filter( # Validate the cli arguments. _validate_user_urn_and_filters( - urn=urn, entity_type=entity_type, platform=platform, env=env, query=query + urn=urn, + entity_type=entity_type, + platform=platform, + env=env, + query=query, + recursive=recursive, ) soft_delete_filter = _validate_user_soft_delete_flags( soft=soft, aspect=aspect, only_soft_deleted=only_soft_deleted @@ -327,11 +344,29 @@ def by_filter( logger.info(f"Using {graph}") # Determine which urns to delete. + delete_by_urn = bool(urn) and not recursive if urn: - delete_by_urn = True urns = [urn] + + if recursive: + # Add children urns to the list. + if guess_entity_type(urn) == "dataPlatformInstance": + urns.extend( + graph.get_urns_by_filter( + platform_instance=urn, + status=soft_delete_filter, + batch_size=batch_size, + ) + ) + else: + urns.extend( + graph.get_urns_by_filter( + container=urn, + status=soft_delete_filter, + batch_size=batch_size, + ) + ) else: - delete_by_urn = False urns = list( graph.get_urns_by_filter( entity_types=[entity_type] if entity_type else None, @@ -348,20 +383,22 @@ def by_filter( ) return + # Print out a summary of the urns to be deleted and confirm with the user. + if not delete_by_urn: urns_by_type: Dict[str, List[str]] = {} for urn in urns: entity_type = guess_entity_type(urn) urns_by_type.setdefault(entity_type, []).append(urn) if len(urns_by_type) > 1: # Display a breakdown of urns by entity type if there's multiple. - click.echo("Filter matched urns of multiple entity types") + click.echo("Found urns of multiple entity types") for entity_type, entity_urns in urns_by_type.items(): click.echo( f"- {len(entity_urns)} {entity_type} urn(s). Sample: {choices(entity_urns, k=min(5, len(entity_urns)))}" ) else: click.echo( - f"Filter matched {len(urns)} {entity_type} urn(s). Sample: {choices(urns, k=min(5, len(urns)))}" + f"Found {len(urns)} {entity_type} urn(s). Sample: {choices(urns, k=min(5, len(urns)))}" ) if not force and not dry_run: @@ -403,6 +440,7 @@ def _validate_user_urn_and_filters( platform: Optional[str], env: Optional[str], query: Optional[str], + recursive: bool, ) -> None: # Check urn / filters options. if urn: @@ -423,6 +461,21 @@ def _validate_user_urn_and_filters( f"Using --env without other filters will delete all metadata in the {env} environment. Please use with caution." ) + # Check recursive flag. + if recursive: + if not urn: + raise click.UsageError( + "The --recursive flag can only be used with a single urn." + ) + elif guess_entity_type(urn) not in _RECURSIVE_DELETE_TYPES: + raise click.UsageError( + f"The --recursive flag can only be used with these entity types: {_RECURSIVE_DELETE_TYPES}." + ) + elif urn and guess_entity_type(urn) in _RECURSIVE_DELETE_TYPES: + logger.warning( + f"This will only delete {urn}. Use --recursive to delete all contained entities." 
+ ) + def _validate_user_soft_delete_flags( soft: bool, aspect: Optional[str], only_soft_deleted: bool diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/mce_extractor.py b/metadata-ingestion/src/datahub/ingestion/extractor/mce_extractor.py index 62e880a2e5334..36450dda153d7 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/mce_extractor.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/mce_extractor.py @@ -27,6 +27,9 @@ def _try_reformat_with_black(code: str) -> str: class WorkUnitRecordExtractorConfig(ConfigModel): set_system_metadata = True + set_system_metadata_pipeline_name = ( + False # false for now until the models are available in OSS + ) unpack_mces_into_mcps = False @@ -66,6 +69,10 @@ def get_records( workunit.metadata.systemMetadata = SystemMetadata( lastObserved=get_sys_time(), runId=self.ctx.run_id ) + if self.config.set_system_metadata_pipeline_name: + workunit.metadata.systemMetadata.pipelineName = ( + self.ctx.pipeline_name + ) if ( isinstance(workunit.metadata, MetadataChangeEvent) and len(workunit.metadata.proposedSnapshot.aspects) == 0 diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index 50ea69b6c13a9..b371ab181e133 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -16,7 +16,12 @@ from datahub.cli.cli_utils import get_url_and_token from datahub.configuration.common import ConfigModel, GraphError, OperationalError from datahub.emitter.aspect import TIMESERIES_ASPECT_MAP -from datahub.emitter.mce_builder import DEFAULT_ENV, Aspect, make_data_platform_urn +from datahub.emitter.mce_builder import ( + DEFAULT_ENV, + Aspect, + make_data_platform_urn, + make_dataplatform_instance_urn, +) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.rest_emitter import DatahubRestEmitter from datahub.emitter.serialization_helper import post_json_transform @@ -543,8 +548,10 @@ def get_urns_by_filter( *, entity_types: Optional[List[str]] = None, platform: Optional[str] = None, + platform_instance: Optional[str] = None, env: Optional[str] = None, query: Optional[str] = None, + container: Optional[str] = None, status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED, batch_size: int = 10000, extraFilters: Optional[List[SearchFilterRule]] = None, @@ -557,15 +564,25 @@ def get_urns_by_filter( :param entity_types: List of entity types to include. If None, all entity types will be returned. :param platform: Platform to filter on. If None, all platforms will be returned. + :param platform_instance: Platform instance to filter on. If None, all platform instances will be returned. :param env: Environment (e.g. PROD, DEV) to filter on. If None, all environments will be returned. + :param query: Query string to filter on. If None, all entities will be returned. + :param container: A container urn that entities must be within. + This works recursively, so it will include entities within sub-containers as well. + If None, all entities will be returned. + Note that this requires browsePathV2 aspects (added in 0.10.4+). :param status: Filter on the deletion status of the entity. The default is only return non-soft-deleted entities. :param extraFilters: Additional filters to apply. If specified, the results will match all of the filters. + + :return: An iterable of urns that match the filters. 
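To illustrate the extended `get_urns_by_filter` API documented above, which also backs the new `datahub delete --recursive` flag, here is a hedged sketch; the server address and urns are placeholders.

```python
# Hedged sketch of the extended get_urns_by_filter API; the server address and
# urns are placeholders.
from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig

graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

# Datasets inside a container (recursive, via the browsePathV2 filter).
dataset_urns = list(
    graph.get_urns_by_filter(
        entity_types=["dataset"],
        container="urn:li:container:0123456789abcdef0123456789abcdef",  # placeholder
    )
)

# All entities attached to a data platform instance.
instance_urns = list(
    graph.get_urns_by_filter(
        platform_instance="urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,my_instance)",
    )
)
```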
""" types: Optional[List[str]] = None if entity_types is not None: if not entity_types: - raise ValueError("entity_types cannot be an empty list") + raise ValueError( + "entity_types cannot be an empty list; use None for all entities" + ) types = [_graphql_entity_type(entity_type) for entity_type in entity_types] @@ -584,6 +601,44 @@ def get_urns_by_filter( } ] + # Platform instance filter. + if platform_instance: + if platform: + # Massage the platform instance into a fully qualified urn, if necessary. + platform_instance = make_dataplatform_instance_urn( + platform, platform_instance + ) + + # Warn if platform_instance is not a fully qualified urn. + # TODO: Change this once we have a first-class data platform instance urn type. + if guess_entity_type(platform_instance) != "dataPlatformInstance": + raise ValueError( + f"Invalid data platform instance urn: {platform_instance}" + ) + + andFilters += [ + { + "field": "platformInstance", + "values": [platform_instance], + "condition": "EQUAL", + } + ] + + # Browse path v2 filter. + if container: + # Warn if container is not a fully qualified urn. + # TODO: Change this once we have a first-class container urn type. + if guess_entity_type(container) != "container": + raise ValueError(f"Invalid container urn: {container}") + + andFilters += [ + { + "field": "browsePathV2", + "values": [container], + "condition": "CONTAIN", + } + ] + # Status filter. if status == RemovedStatusFilter.NOT_SOFT_DELETED: # Subtle: in some cases (e.g. when the dataset doesn't have a status aspect), the diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure/azure_common.py b/metadata-ingestion/src/datahub/ingestion/source/azure/azure_common.py deleted file mode 100644 index 1a48725330df9..0000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source/azure/azure_common.py +++ /dev/null @@ -1,88 +0,0 @@ -from typing import Dict, Optional, Union - -from azure.identity import ClientSecretCredential -from azure.storage.filedatalake import DataLakeServiceClient, FileSystemClient -from pydantic import Field, root_validator - -from datahub.configuration import ConfigModel -from datahub.configuration.common import ConfigurationError - - -class AdlsSourceConfig(ConfigModel): - """ - Common Azure credentials config. - - https://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-directory-file-acl-python - """ - - base_path: str = Field( - default="/", - description="Base folder in hierarchical namespaces to start from.", - ) - container_name: str = Field( - description="Azure storage account container name.", - ) - account_name: str = Field( - description="Name of the Azure storage account. See [Microsoft official documentation on how to create a storage account.](https://docs.microsoft.com/en-us/azure/storage/blobs/create-data-lake-storage-account)", - ) - account_key: Optional[str] = Field( - description="Azure storage account access key that can be used as a credential. **An account key, a SAS token or a client secret is required for authentication.**", - default=None, - ) - sas_token: Optional[str] = Field( - description="Azure storage account Shared Access Signature (SAS) token that can be used as a credential. **An account key, a SAS token or a client secret is required for authentication.**", - default=None, - ) - client_secret: Optional[str] = Field( - description="Azure client secret that can be used as a credential. 
**An account key, a SAS token or a client secret is required for authentication.**", - default=None, - ) - client_id: Optional[str] = Field( - description="Azure client (Application) ID required when a `client_secret` is used as a credential.", - default=None, - ) - tenant_id: Optional[str] = Field( - description="Azure tenant (Directory) ID required when a `client_secret` is used as a credential.", - default=None, - ) - - def get_abfss_url(self, folder_path: str = "") -> str: - if not folder_path.startswith("/"): - folder_path = f"/{folder_path}" - return f"abfss://{self.container_name}@{self.account_name}.dfs.core.windows.net{folder_path}" - - def get_filesystem_client(self) -> FileSystemClient: - return self.get_service_client().get_file_system_client(self.container_name) - - def get_service_client(self) -> DataLakeServiceClient: - return DataLakeServiceClient( - account_url=f"https://{self.account_name}.dfs.core.windows.net", - credential=self.get_credentials(), - ) - - def get_credentials( - self, - ) -> Union[Optional[str], ClientSecretCredential]: - if self.client_id and self.client_secret and self.tenant_id: - return ClientSecretCredential( - tenant_id=self.tenant_id, - client_id=self.client_id, - client_secret=self.client_secret, - ) - return self.sas_token if self.sas_token is not None else self.account_key - - @root_validator() - def _check_credential_values(cls, values: Dict) -> Dict: - if ( - values.get("account_key") - or values.get("sas_token") - or ( - values.get("client_id") - and values.get("client_secret") - and values.get("tenant_id") - ) - ): - return values - raise ConfigurationError( - "credentials missing, requires one combination of account_key or sas_token or (client_id and client_secret and tenant_id)" - ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 7690723837165..1107a54a1896b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -1,5 +1,4 @@ import atexit -import hashlib import logging import os import re @@ -146,10 +145,6 @@ def cleanup(config: BigQueryV2Config) -> None: os.unlink(config._credentials_path) -def _generate_sql_id(sql: str) -> str: - return hashlib.md5(sql.encode("utf-8")).hexdigest() - - @platform_name("BigQuery", doc_order=1) @config_class(BigQueryV2Config) @support_status(SupportStatus.CERTIFIED) @@ -286,10 +281,9 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): # Global store of table identifiers for lineage filtering self.table_refs: Set[str] = set() - # We do this so that the SQL is stored in a file-backed dict, but the sql IDs are stored in memory. 
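The BigQuery change below drops the intermediate sql-id mapping and keys the file-backed dictionary directly by view ref. A hedged sketch of that pattern follows; the import path is an assumption based on DataHub's utilities package.

```python
# Hedged sketch of the FileBackedDict pattern used below for view definitions.
# The import path is assumed; entries are spilled to local disk rather than
# kept fully in memory.
from datahub.utilities.file_backed_collections import FileBackedDict

view_definitions: FileBackedDict[str] = FileBackedDict()

# Keyed directly by the view's table ref; no separate sql-id indirection.
view_definitions["myproject.dataset.my_view"] = "SELECT * FROM myproject.dataset.base"

for ref, sql in view_definitions.items():
    print(ref, len(sql))
```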
- # Maps project -> view_ref -> sql ID (will be used when generating lineage) - self.view_definition_ids: Dict[str, Dict[str, str]] = defaultdict(dict) - # Maps sql ID -> actual sql + # Maps project -> view_ref, so we can find all views in a project + self.view_refs_by_project: Dict[str, Set[str]] = defaultdict(set) + # Maps view ref -> actual sql self.view_definitions: FileBackedDict[str] = FileBackedDict() self.sql_parser_schema_resolver = SchemaResolver( @@ -684,10 +678,8 @@ def generate_lineage(self, project_id: str) -> Iterable[MetadataWorkUnit]: ) if self.config.lineage_parse_view_ddl: - for view, view_definition_id in self.view_definition_ids[ - project_id - ].items(): - view_definition = self.view_definitions[view_definition_id] + for view in self.view_refs_by_project[project_id]: + view_definition = self.view_definitions[view] raw_view_lineage = sqlglot_lineage( view_definition, schema_resolver=self.sql_parser_schema_resolver, @@ -896,10 +888,9 @@ def _process_view( BigQueryTableRef(table_identifier).get_sanitized_table_ref() ) self.table_refs.add(table_ref) - if self.config.lineage_parse_view_ddl: - view_definition_id = _generate_sql_id(view.view_definition) - self.view_definition_ids[project_id][table_ref] = view_definition_id - self.view_definitions[view_definition_id] = view.view_definition + if self.config.lineage_parse_view_ddl and view.view_definition: + self.view_refs_by_project[project_id].add(table_ref) + self.view_definitions[table_ref] = view.view_definition view.column_count = len(columns) if not view.column_count: @@ -989,7 +980,7 @@ def gen_view_dataset_workunits( view_properties_aspect = ViewProperties( materialized=view.materialized, viewLanguage="SQL", - viewLogic=view_definition_string, + viewLogic=view_definition_string or "", ) yield MetadataChangeProposalWrapper( entityUrn=self.gen_dataset_urn( diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index 842e3d2144600..341952d95e7d7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -183,6 +183,7 @@ def make_lineage_edges_from_parsing_result( column_mapping=frozenset( LineageEdgeColumnMapping(out_column=out_column, in_columns=in_columns) for out_column, in_columns in column_mapping.items() + if in_columns ), auditStamp=audit_stamp, type=lineage_type, diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py index fe7ab8c49c79a..e112db31c5c63 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py @@ -56,10 +56,7 @@ from datahub.ingestion.source.state.redundant_run_skip_handler import ( RedundantUsageRunSkipHandler, ) -from datahub.ingestion.source.usage.usage_common import ( - TOTAL_BUDGET_FOR_QUERY_LIST, - make_usage_workunit, -) +from datahub.ingestion.source.usage.usage_common import make_usage_workunit from datahub.ingestion.source_report.ingestion_stage import ( USAGE_EXTRACTION_INGESTION, USAGE_EXTRACTION_OPERATIONAL_STATS, @@ -101,7 +98,6 @@ READ_STATEMENT_TYPES: List[str] = ["SELECT"] STRING_ENCODING = "utf-8" -MAX_QUERY_LENGTH = TOTAL_BUDGET_FOR_QUERY_LIST @dataclass(frozen=True, order=True) @@ -601,6 +597,7 @@ def _generate_usage_workunits( resource_urn_builder=self.dataset_urn_builder, 
top_n_queries=self.config.usage.top_n_queries, format_sql_queries=self.config.usage.format_sql_queries, + queries_character_limit=self.config.usage.queries_character_limit, ) self.report.num_usage_workunits_emitted += 1 except Exception as e: @@ -662,7 +659,8 @@ def _store_usage_event( usage_state.column_accesses[str(uuid.uuid4())] = key, field_read return True elif event.query_event and event.query_event.job_name: - query = event.query_event.query[:MAX_QUERY_LENGTH] + max_query_length = self.config.usage.queries_character_limit + query = event.query_event.query[:max_query_length] query_hash = hashlib.md5(query.encode(STRING_ENCODING)).hexdigest() if usage_state.queries.get(query_hash, query) != query: key = str(uuid.uuid4()) diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py index a054067d92334..053d136305527 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py @@ -1,27 +1,27 @@ from typing import Optional -from pydantic import Field +from pydantic import Field, root_validator from datahub.configuration.kafka import KafkaConsumerConnectionConfig -from datahub.ingestion.source.sql.mysql import MySQLConnectionConfig +from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionConfig, StatefulIngestionConfigBase, ) -DEFAULT_MYSQL_TABLE_NAME = "metadata_aspect_v2" +DEFAULT_DATABASE_TABLE_NAME = "metadata_aspect_v2" DEFAULT_KAFKA_TOPIC_NAME = "MetadataChangeLog_Timeseries_v1" -DEFAULT_MYSQL_BATCH_SIZE = 10_000 +DEFAULT_DATABASE_BATCH_SIZE = 10_000 class DataHubSourceConfig(StatefulIngestionConfigBase): - mysql_connection: MySQLConnectionConfig = Field( - default=MySQLConnectionConfig(), - description="MySQL connection config", + database_connection: Optional[SQLAlchemyConnectionConfig] = Field( + default=None, + description="Database connection config", ) - kafka_connection: KafkaConsumerConnectionConfig = Field( - default=KafkaConsumerConnectionConfig(), + kafka_connection: Optional[KafkaConsumerConnectionConfig] = Field( + default=None, description="Kafka connection config", ) @@ -29,18 +29,18 @@ class DataHubSourceConfig(StatefulIngestionConfigBase): default=False, description=( "If enabled, include all versions of each aspect. " - "Otherwise, only include the latest version of each aspect." + "Otherwise, only include the latest version of each aspect. " ), ) - mysql_batch_size: int = Field( - default=DEFAULT_MYSQL_BATCH_SIZE, - description="Number of records to fetch from MySQL at a time", + database_query_batch_size: int = Field( + default=DEFAULT_DATABASE_BATCH_SIZE, + description="Number of records to fetch from the database at a time", ) - mysql_table_name: str = Field( - default=DEFAULT_MYSQL_TABLE_NAME, - description="Name of MySQL table containing all versioned aspects", + database_table_name: str = Field( + default=DEFAULT_DATABASE_TABLE_NAME, + description="Name of database table containing all versioned aspects", ) kafka_topic_name: str = Field( @@ -66,3 +66,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase): "Enable if you want to ignore the errors." ), ) + + @root_validator + def check_ingesting_data(cls, values): + if not values.get("database_connection") and not values.get("kafka_connection"): + raise ValueError( + "Your current config will not ingest any data." 
+ " Please specify at least one of `database_connection` or `kafka_connection`, ideally both." + ) + return values diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_mysql_reader.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py similarity index 67% rename from metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_mysql_reader.py rename to metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py index adf4c1db57395..39702ba3ce347 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_mysql_reader.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py @@ -10,33 +10,42 @@ from datahub.emitter.serialization_helper import post_json_transform from datahub.ingestion.source.datahub.config import DataHubSourceConfig from datahub.ingestion.source.datahub.report import DataHubSourceReport +from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig from datahub.metadata.schema_classes import ChangeTypeClass, SystemMetadataClass from datahub.utilities.lossy_collections import LossyDict, LossyList logger = logging.getLogger(__name__) -MYSQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S.%f" +# Should work for at least mysql, mariadb, postgres +DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S.%f" -class DataHubMySQLReader: - def __init__(self, config: DataHubSourceConfig, report: DataHubSourceReport): +class DataHubDatabaseReader: + def __init__( + self, + config: DataHubSourceConfig, + connection_config: SQLAlchemyConnectionConfig, + report: DataHubSourceReport, + ): self.config = config self.report = report self.engine = create_engine( - url=config.mysql_connection.get_sql_alchemy_url(), - **config.mysql_connection.options, + url=connection_config.get_sql_alchemy_url(), + **connection_config.options, ) @property def query(self) -> str: # May repeat rows for the same date - # Offset is generally 0, unless we repeat the same date twice + # Offset is generally 0, unless we repeat the same createdon twice return f""" SELECT urn, aspect, metadata, systemmetadata, createdon - FROM `{self.config.mysql_table_name}` + FROM `{self.config.database_table_name}` WHERE createdon >= %(since_createdon)s {"" if self.config.include_all_versions else "AND version = 0"} - ORDER BY createdon, urn, aspect, version # Ensures stable ordering + ORDER BY createdon, urn, aspect, # Ensure stable order, chronological per (urn, aspect) + CASE WHEN version = 0 THEN 1 ELSE 0 END, version + # Version 0 last, only when createdon is the same. 
Otherwise relies on createdon order LIMIT %(limit)s OFFSET %(offset)s """ @@ -48,11 +57,11 @@ def get_aspects( ts = from_createdon offset = 0 while ts.timestamp() <= stop_time.timestamp(): - logger.debug(f"Polling MySQL aspects from {ts}") + logger.debug(f"Polling database aspects from {ts}") rows = conn.execute( self.query, - since_createdon=ts.strftime(MYSQL_DATETIME_FORMAT), - limit=self.config.mysql_batch_size, + since_createdon=ts.strftime(DATETIME_FORMAT), + limit=self.config.database_query_batch_size, offset=offset, ) if not rows.rowcount: @@ -64,7 +73,7 @@ def get_aspects( row_dict = row._asdict() else: row_dict = dict(row) - mcp = self._parse_mysql_row(row_dict) + mcp = self._parse_row(row_dict) if mcp: yield mcp, row_dict["createdon"] @@ -72,15 +81,13 @@ def get_aspects( offset += i else: ts = row_dict["createdon"] - print(ts) offset = 0 - def _parse_mysql_row(self, d: Dict) -> Optional[MetadataChangeProposalWrapper]: + def _parse_row(self, d: Dict) -> Optional[MetadataChangeProposalWrapper]: try: json_aspect = post_json_transform(json.loads(d["metadata"])) json_metadata = post_json_transform(json.loads(d["systemmetadata"] or "{}")) system_metadata = SystemMetadataClass.from_obj(json_metadata) - system_metadata.lastObserved = int(d["createdon"].timestamp() * 1000) return MetadataChangeProposalWrapper( entityUrn=d["urn"], aspect=ASPECT_MAP[d["aspect"]].from_obj(json_aspect), @@ -91,8 +98,8 @@ def _parse_mysql_row(self, d: Dict) -> Optional[MetadataChangeProposalWrapper]: logger.warning( f"Failed to parse metadata for {d['urn']}: {e}", exc_info=True ) - self.report.num_mysql_parse_errors += 1 - self.report.mysql_parse_errors.setdefault(str(e), LossyDict()).setdefault( - d["aspect"], LossyList() - ).append(d["urn"]) + self.report.num_database_parse_errors += 1 + self.report.database_parse_errors.setdefault( + str(e), LossyDict() + ).setdefault(d["aspect"], LossyList()).append(d["urn"]) return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_kafka_reader.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_kafka_reader.py index b165d70dd53b0..d9e53e87c2cea 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_kafka_reader.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_kafka_reader.py @@ -11,6 +11,7 @@ from confluent_kafka.schema_registry import SchemaRegistryClient from confluent_kafka.schema_registry.avro import AvroDeserializer +from datahub.configuration.kafka import KafkaConsumerConnectionConfig from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.datahub.config import DataHubSourceConfig @@ -27,10 +28,12 @@ class DataHubKafkaReader(Closeable): def __init__( self, config: DataHubSourceConfig, + connection_config: KafkaConsumerConnectionConfig, report: DataHubSourceReport, ctx: PipelineContext, ): self.config = config + self.connection_config = connection_config self.report = report self.group_id = f"{KAFKA_GROUP_PREFIX}-{ctx.pipeline_name}" @@ -38,13 +41,13 @@ def __enter__(self) -> "DataHubKafkaReader": self.consumer = DeserializingConsumer( { "group.id": self.group_id, - "bootstrap.servers": self.config.kafka_connection.bootstrap, - **self.config.kafka_connection.consumer_config, + "bootstrap.servers": self.connection_config.bootstrap, + **self.connection_config.consumer_config, "auto.offset.reset": "earliest", "enable.auto.commit": False, "value.deserializer": AvroDeserializer( 
schema_registry_client=SchemaRegistryClient( - {"url": self.config.kafka_connection.schema_registry_url} + {"url": self.connection_config.schema_registry_url} ), return_record_name=True, ), diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py index 636e65a244dad..2368febe1ff57 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py @@ -1,5 +1,6 @@ import logging from datetime import datetime, timezone +from functools import partial from typing import Dict, Iterable, List, Optional from datahub.emitter.mcp import MetadataChangeProposalWrapper @@ -11,10 +12,13 @@ support_status, ) from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport +from datahub.ingestion.api.source_helpers import auto_workunit_reporter from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.datahub.config import DataHubSourceConfig +from datahub.ingestion.source.datahub.datahub_database_reader import ( + DataHubDatabaseReader, +) from datahub.ingestion.source.datahub.datahub_kafka_reader import DataHubKafkaReader -from datahub.ingestion.source.datahub.datahub_mysql_reader import DataHubMySQLReader from datahub.ingestion.source.datahub.report import DataHubSourceReport from datahub.ingestion.source.datahub.state import StatefulDataHubIngestionHandler from datahub.ingestion.source.state.stateful_ingestion_base import ( @@ -46,30 +50,50 @@ def get_report(self) -> SourceReport: return self.report def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: - return [] # Exactly replicate data from DataHub source + # Exactly replicate data from DataHub source + return [partial(auto_workunit_reporter, self.get_report())] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: - stop_time = datetime.now(tz=timezone.utc) - logger.info(f"Ingesting DataHub metadata up until roughly {stop_time}") + self.report.stop_time = datetime.now(tz=timezone.utc) + logger.info(f"Ingesting DataHub metadata up until {self.report.stop_time}") state = self.stateful_ingestion_handler.get_last_run_state() - yield from self._get_mysql_workunits(state.mysql_createdon_datetime, stop_time) - self._commit_progress() - yield from self._get_kafka_workunits(state.kafka_offsets, stop_time) - self._commit_progress() - def _get_mysql_workunits( - self, from_createdon: datetime, stop_time: datetime + if self.config.database_connection is not None: + yield from self._get_database_workunits( + from_createdon=state.database_createdon_datetime + ) + self._commit_progress() + else: + logger.info( + "Skipping ingestion of versioned aspects as no database_connection provided" + ) + + if self.config.kafka_connection is not None: + yield from self._get_kafka_workunits(from_offsets=state.kafka_offsets) + self._commit_progress() + else: + logger.info( + "Skipping ingestion of timeseries aspects as no kafka_connection provided" + ) + + def _get_database_workunits( + self, from_createdon: datetime ) -> Iterable[MetadataWorkUnit]: - logger.info(f"Fetching MySQL aspects from {from_createdon}") - reader = DataHubMySQLReader(self.config, self.report) - mcps = reader.get_aspects(from_createdon, stop_time) + if self.config.database_connection is None: + return + + logger.info(f"Fetching database aspects starting from {from_createdon}") + reader = DataHubDatabaseReader( + 
self.config, self.config.database_connection, self.report + ) + mcps = reader.get_aspects(from_createdon, self.report.stop_time) for i, (mcp, createdon) in enumerate(mcps): yield mcp.as_workunit() - self.report.num_mysql_aspects_ingested += 1 + self.report.num_database_aspects_ingested += 1 if ( self.config.commit_with_parse_errors - or not self.report.num_mysql_parse_errors + or not self.report.num_database_parse_errors ): self.stateful_ingestion_handler.update_checkpoint( last_createdon=createdon @@ -77,12 +101,18 @@ def _get_mysql_workunits( self._commit_progress(i) def _get_kafka_workunits( - self, from_offsets: Dict[int, int], stop_time: datetime + self, from_offsets: Dict[int, int] ) -> Iterable[MetadataWorkUnit]: - logger.info(f"Fetching timeseries aspects from kafka until {stop_time}") - - with DataHubKafkaReader(self.config, self.report, self.ctx) as reader: - mcls = reader.get_mcls(from_offsets=from_offsets, stop_time=stop_time) + if self.config.kafka_connection is None: + return + + logger.info("Fetching timeseries aspects from kafka") + with DataHubKafkaReader( + self.config, self.config.kafka_connection, self.report, self.ctx + ) as reader: + mcls = reader.get_mcls( + from_offsets=from_offsets, stop_time=self.report.stop_time + ) for i, (mcl, offset) in enumerate(mcls): mcp = MetadataChangeProposalWrapper.try_from_mcl(mcl) if mcp.changeType == ChangeTypeClass.DELETE: diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/report.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/report.py index 3aa93d6a4577b..73e5a798a1553 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/report.py @@ -1,4 +1,5 @@ from dataclasses import dataclass, field +from datetime import datetime, timezone from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionReport, @@ -8,10 +9,12 @@ @dataclass class DataHubSourceReport(StatefulIngestionReport): - num_mysql_aspects_ingested: int = 0 - num_mysql_parse_errors: int = 0 + stop_time: datetime = field(default_factory=lambda: datetime.now(tz=timezone.utc)) + + num_database_aspects_ingested: int = 0 + num_database_parse_errors: int = 0 # error -> aspect -> [urn] - mysql_parse_errors: LossyDict[str, LossyDict[str, LossyList[str]]] = field( + database_parse_errors: LossyDict[str, LossyDict[str, LossyList[str]]] = field( default_factory=LossyDict ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/state.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/state.py index deea9772fae20..4bedd331a9aea 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/state.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/state.py @@ -16,14 +16,16 @@ class DataHubIngestionState(CheckpointStateBase): - mysql_createdon_ts: NonNegativeInt = 0 + database_createdon_ts: NonNegativeInt = 0 # Maps partition -> offset kafka_offsets: Dict[int, NonNegativeInt] = Field(default_factory=dict) @property - def mysql_createdon_datetime(self) -> datetime: - return datetime.fromtimestamp(self.mysql_createdon_ts / 1000, tz=timezone.utc) + def database_createdon_datetime(self) -> datetime: + return datetime.fromtimestamp( + self.database_createdon_ts / 1000, tz=timezone.utc + ) class PartitionOffset(NamedTuple): @@ -81,7 +83,7 @@ def update_checkpoint( if cur_checkpoint: cur_state = cast(DataHubIngestionState, cur_checkpoint.state) if last_createdon: - cur_state.mysql_createdon_ts = 
int(last_createdon.timestamp() * 1000) + cur_state.database_createdon_ts = int(last_createdon.timestamp() * 1000) if last_offset: cur_state.kafka_offsets[last_offset.partition] = last_offset.offset + 1 diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py index b7ae50eb766af..cc7f646dcb884 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py +++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py @@ -1,14 +1,37 @@ +import sys + +if sys.version_info < (3, 8): + raise ImportError("Iceberg is only supported on Python 3.8+") + import json import logging import uuid -from typing import Any, Dict, Iterable, List, Optional, Tuple - -from iceberg.api import types as IcebergTypes -from iceberg.api.table import Table -from iceberg.api.types.types import NestedField -from iceberg.core.base_table import BaseTable -from iceberg.core.filesystem.filesystem_tables import FilesystemTables -from iceberg.exceptions import NoSuchTableException +from typing import Any, Dict, Iterable, List, Optional + +from pyiceberg.catalog import Catalog +from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit +from pyiceberg.table import Table +from pyiceberg.typedef import Identifier +from pyiceberg.types import ( + BinaryType, + BooleanType, + DateType, + DecimalType, + DoubleType, + FixedType, + FloatType, + IntegerType, + ListType, + LongType, + MapType, + NestedField, + StringType, + StructType, + TimestampType, + TimestamptzType, + TimeType, + UUIDType, +) from datahub.emitter.mce_builder import ( make_data_platform_urn, @@ -59,23 +82,13 @@ LOGGER = logging.getLogger(__name__) -_all_atomic_types = { - IcebergTypes.BooleanType: "boolean", - IcebergTypes.IntegerType: "int", - IcebergTypes.LongType: "long", - IcebergTypes.FloatType: "float", - IcebergTypes.DoubleType: "double", - IcebergTypes.BinaryType: "bytes", - IcebergTypes.StringType: "string", -} - @platform_name("Iceberg") @support_status(SupportStatus.TESTING) @config_class(IcebergSourceConfig) @capability( SourceCapability.PLATFORM_INSTANCE, - "Optionally enabled via configuration, an Iceberg instance represents the datalake name where the table is stored.", + "Optionally enabled via configuration, an Iceberg instance represents the catalog name where the table is stored.", ) @capability(SourceCapability.DOMAINS, "Currently not supported.", supported=False) @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration.") @@ -95,16 +108,7 @@ class IcebergSource(StatefulIngestionSourceBase): The DataHub Iceberg source plugin extracts metadata from [Iceberg tables](https://iceberg.apache.org/spec/) stored in a distributed or local file system. Typically, Iceberg tables are stored in a distributed file system like S3 or Azure Data Lake Storage (ADLS) and registered in a catalog. There are various catalog implementations like Filesystem-based, RDBMS-based or even REST-based catalogs. This Iceberg source plugin relies on the - [Iceberg python_legacy library](https://github.com/apache/iceberg/tree/master/python_legacy) and its support for catalogs is limited at the moment. - A new version of the [Iceberg Python library](https://github.com/apache/iceberg/tree/master/python) is currently in development and should fix this. - Because of this limitation, this source plugin **will only ingest HadoopCatalog-based tables that have a `version-hint.text` metadata file**. 
- - Ingestion of tables happens in 2 steps: - 1. Discover Iceberg tables stored in file system. - 2. Load discovered tables using Iceberg python_legacy library - - The current implementation of the Iceberg source plugin will only discover tables stored in a local file system or in ADLS. Support for S3 could - be added fairly easily. + [pyiceberg library](https://py.iceberg.apache.org/). """ def __init__(self, config: IcebergSourceConfig, ctx: PipelineContext) -> None: @@ -112,7 +116,6 @@ def __init__(self, config: IcebergSourceConfig, ctx: PipelineContext) -> None: self.platform: str = "iceberg" self.report: IcebergSourceReport = IcebergSourceReport() self.config: IcebergSourceConfig = config - self.iceberg_client: FilesystemTables = config.filesystem_tables @classmethod def create(cls, config_dict: Dict, ctx: PipelineContext) -> "IcebergSource": @@ -127,23 +130,31 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: ).workunit_processor, ] + def _get_datasets(self, catalog: Catalog) -> Iterable[Identifier]: + for namespace in catalog.list_namespaces(): + yield from catalog.list_tables(namespace) + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: - for dataset_path, dataset_name in self.config.get_paths(): # Tuple[str, str] - try: - if not self.config.table_pattern.allowed(dataset_name): - # Path contained a valid Iceberg table, but is rejected by pattern. - self.report.report_dropped(dataset_name) - continue + try: + catalog = self.config.get_catalog() + except Exception as e: + LOGGER.error("Failed to get catalog", exc_info=True) + self.report.report_failure( + "get-catalog", f"Failed to get catalog {self.config.catalog.name}: {e}" + ) + return + + for dataset_path in self._get_datasets(catalog): + dataset_name = ".".join(dataset_path) + if not self.config.table_pattern.allowed(dataset_name): + # Dataset name is rejected by pattern, report as dropped. + self.report.report_dropped(dataset_name) + continue - # Try to load an Iceberg table. Might not contain one, this will be caught by NoSuchTableException. - table: Table = self.iceberg_client.load(dataset_path) + try: + # Try to load an Iceberg table. Might not contain one, this will be caught by NoSuchIcebergTableError. + table = catalog.load_table(dataset_path) yield from self._create_iceberg_workunit(dataset_name, table) - except NoSuchTableException: - # Path did not contain a valid Iceberg table. Silently ignore this. - LOGGER.debug( - f"Path {dataset_path} does not contain table {dataset_name}" - ) - pass except Exception as e: self.report.report_failure("general", f"Failed to create workunit: {e}") LOGGER.exception( @@ -165,26 +176,21 @@ def _create_iceberg_workunit( aspects=[Status(removed=False)], ) - custom_properties: Dict = dict(table.properties()) - custom_properties["location"] = table.location() - try: - if isinstance(table, BaseTable) and table.current_snapshot(): - custom_properties["snapshot-id"] = str( - table.current_snapshot().snapshot_id - ) - custom_properties[ - "manifest-list" - ] = table.current_snapshot().manifest_location - except KeyError: - # The above API is not well implemented, and can throw KeyError when there is no data. - pass + # Dataset properties aspect. 
+ custom_properties = table.metadata.properties.copy() + custom_properties["location"] = table.metadata.location + custom_properties["format-version"] = str(table.metadata.format_version) + if table.current_snapshot(): + custom_properties["snapshot-id"] = str(table.current_snapshot().snapshot_id) + custom_properties["manifest-list"] = table.current_snapshot().manifest_list dataset_properties = DatasetPropertiesClass( tags=[], - description=table.properties().get("comment", None), + description=table.metadata.properties.get("comment", None), customProperties=custom_properties, ) dataset_snapshot.aspects.append(dataset_properties) + # Dataset ownership aspect. dataset_ownership = self._get_ownership_aspect(table) if dataset_ownership: dataset_snapshot.aspects.append(dataset_ownership) @@ -206,8 +212,10 @@ def _create_iceberg_workunit( def _get_ownership_aspect(self, table: Table) -> Optional[OwnershipClass]: owners = [] if self.config.user_ownership_property: - if self.config.user_ownership_property in table.properties(): - user_owner = table.properties()[self.config.user_ownership_property] + if self.config.user_ownership_property in table.metadata.properties: + user_owner = table.metadata.properties[ + self.config.user_ownership_property + ] owners.append( OwnerClass( owner=make_user_urn(user_owner), @@ -216,8 +224,10 @@ def _get_ownership_aspect(self, table: Table) -> Optional[OwnershipClass]: ) ) if self.config.group_ownership_property: - if self.config.group_ownership_property in table.properties(): - group_owner = table.properties()[self.config.group_ownership_property] + if self.config.group_ownership_property in table.metadata.properties: + group_owner = table.metadata.properties[ + self.config.group_ownership_property + ] owners.append( OwnerClass( owner=make_group_urn(group_owner), @@ -225,9 +235,7 @@ def _get_ownership_aspect(self, table: Table) -> Optional[OwnershipClass]: source=None, ) ) - if owners: - return OwnershipClass(owners=owners) - return None + return OwnershipClass(owners=owners) if owners else None def _get_dataplatform_instance_aspect( self, dataset_urn: str @@ -249,191 +257,171 @@ def _get_dataplatform_instance_aspect( def _create_schema_metadata( self, dataset_name: str, table: Table ) -> SchemaMetadata: - schema_fields: List[SchemaField] = self._get_schema_fields( - table.schema().columns() - ) + schema_fields = self._get_schema_fields_for_schema(table.schema()) schema_metadata = SchemaMetadata( schemaName=dataset_name, platform=make_data_platform_urn(self.platform), version=0, hash="", - platformSchema=OtherSchema(rawSchema=repr(table.schema())), + platformSchema=OtherSchema(rawSchema=str(table.schema())), fields=schema_fields, ) return schema_metadata - def _get_schema_fields(self, columns: Tuple) -> List[SchemaField]: - canonical_schema: List[SchemaField] = [] - for column in columns: - fields = self._get_schema_fields_for_column(column) - canonical_schema.extend(fields) - return canonical_schema - - def _get_schema_fields_for_column( + def _get_schema_fields_for_schema( self, - column: NestedField, + schema: Schema, ) -> List[SchemaField]: - field_type: IcebergTypes.Type = column.type - if field_type.is_primitive_type() or field_type.is_nested_type(): - avro_schema: Dict = self._get_avro_schema_from_data_type(column) - schema_fields: List[SchemaField] = schema_util.avro_schema_to_mce_fields( - json.dumps(avro_schema), default_nullable=column.is_optional - ) - return schema_fields + avro_schema = visit(schema, ToAvroSchemaIcebergVisitor()) + schema_fields = 
schema_util.avro_schema_to_mce_fields( + json.dumps(avro_schema), default_nullable=False + ) + return schema_fields + + def get_report(self) -> SourceReport: + return self.report + + +class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]): + """Implementation of a visitor to build an Avro schema as a dictionary from an Iceberg schema.""" - raise ValueError(f"Invalid Iceberg field type: {field_type}") + @staticmethod + def _gen_name(prefix: str) -> str: + return f"{prefix}{str(uuid.uuid4()).replace('-', '')}" - def _get_avro_schema_from_data_type(self, column: NestedField) -> Dict[str, Any]: - """ - See Iceberg documentation for Avro mapping: - https://iceberg.apache.org/#spec/#appendix-a-format-specific-requirements - """ - # The record structure represents the dataset level. - # The inner fields represent the complex field (struct/array/map/union). + def schema(self, schema: Schema, struct_result: Dict[str, Any]) -> Dict[str, Any]: + return struct_result + + def struct( + self, struct: StructType, field_results: List[Dict[str, Any]] + ) -> Dict[str, Any]: + nullable = True return { "type": "record", - "name": "__struct_", - "fields": [ - { - "name": column.name, - "type": _parse_datatype(column.type, column.is_optional), - "doc": column.doc, - } - ], + "name": self._gen_name("__struct_"), + "fields": field_results, + "native_data_type": str(struct), + "_nullable": nullable, } - def get_report(self) -> SourceReport: - return self.report - + def field(self, field: NestedField, field_result: Dict[str, Any]) -> Dict[str, Any]: + field_result["_nullable"] = not field.required + return { + "name": field.name, + "type": field_result, + "doc": field.doc, + } -def _parse_datatype(type: IcebergTypes.Type, nullable: bool = False) -> Dict[str, Any]: - # Check for complex types: struct, list, map - if type.is_list_type(): - list_type: IcebergTypes.ListType = type + def list( + self, list_type: ListType, element_result: Dict[str, Any] + ) -> Dict[str, Any]: return { "type": "array", - "items": _parse_datatype(list_type.element_type), - "native_data_type": str(type), - "_nullable": nullable, + "items": element_result, + "native_data_type": str(list_type), + "_nullable": not list_type.element_required, } - elif type.is_map_type(): + + def map( + self, + map_type: MapType, + key_result: Dict[str, Any], + value_result: Dict[str, Any], + ) -> Dict[str, Any]: # The Iceberg Map type will be handled differently. The idea is to translate the map # similar to the Map.Entry struct of Java i.e. as an array of map_entry struct, where # the map_entry struct has a key field and a value field. The key and value type can # be complex or primitive types. 
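To make the Map.Entry-style translation described in the comment above concrete: for an Iceberg `map<string, int>` with optional values, the code that follows would produce roughly this Avro fragment. The record name suffix is a random UUID at runtime, so `__map_entry_abc123` and the exact `native_data_type` rendering are illustrative only:

    map_as_avro = {
        "type": "array",
        "items": {
            "type": "record",
            "name": "__map_entry_abc123",
            "fields": [
                {"name": "key", "type": {"type": "string", "native_data_type": "string", "_nullable": False}},
                {"name": "value", "type": {"type": "int", "native_data_type": "int", "_nullable": True}},
            ],
        },
        "native_data_type": "map<string, int>",
    }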
- map_type: IcebergTypes.MapType = type - map_entry: Dict[str, Any] = { + key_result["_nullable"] = False + value_result["_nullable"] = not map_type.value_required + map_entry = { "type": "record", - "name": _gen_name("__map_entry_"), + "name": self._gen_name("__map_entry_"), "fields": [ { "name": "key", - "type": _parse_datatype(map_type.key_type(), False), + "type": key_result, }, { "name": "value", - "type": _parse_datatype(map_type.value_type(), True), + "type": value_result, }, ], } return { "type": "array", "items": map_entry, - "native_data_type": str(type), - "_nullable": nullable, + "native_data_type": str(map_type), } - elif type.is_struct_type(): - structType: IcebergTypes.StructType = type - return _parse_struct_fields(structType.fields, nullable) - else: - # Primitive types - return _parse_basic_datatype(type, nullable) - - -def _parse_struct_fields(parts: Tuple[NestedField], nullable: bool) -> Dict[str, Any]: - fields = [] - for nested_field in parts: # type: NestedField - field_name = nested_field.name - field_type = _parse_datatype(nested_field.type, nested_field.is_optional) - fields.append({"name": field_name, "type": field_type, "doc": nested_field.doc}) - return { - "type": "record", - "name": _gen_name("__struct_"), - "fields": fields, - "native_data_type": "struct<{}>".format(parts), - "_nullable": nullable, - } - - -def _parse_basic_datatype( - type: IcebergTypes.PrimitiveType, nullable: bool -) -> Dict[str, Any]: - """ - See https://iceberg.apache.org/#spec/#avro - """ - # Check for an atomic types. - for iceberg_type in _all_atomic_types.keys(): - if isinstance(type, iceberg_type): - return { - "type": _all_atomic_types[iceberg_type], - "native_data_type": repr(type), - "_nullable": nullable, - } - - # Fixed is a special case where it is not an atomic type and not a logical type. - if isinstance(type, IcebergTypes.FixedType): - fixed_type: IcebergTypes.FixedType = type + + def visit_fixed(self, fixed_type: FixedType) -> Dict[str, Any]: return { "type": "fixed", - "name": _gen_name("__fixed_"), - "size": fixed_type.length, - "native_data_type": repr(fixed_type), - "_nullable": nullable, + "name": self._gen_name("__fixed_"), + "size": len(fixed_type), + "native_data_type": str(fixed_type), } - # Not an atomic type, so check for a logical type. - if isinstance(type, IcebergTypes.DecimalType): + def visit_decimal(self, decimal_type: DecimalType) -> Dict[str, Any]: # Also of interest: https://avro.apache.org/docs/current/spec.html#Decimal - decimal_type: IcebergTypes.DecimalType = type return { # "type": "bytes", # when using bytes, avro drops _nullable attribute and others. See unit test. 
"type": "fixed", # to fix avro bug ^ resolved by using a fixed type - "name": _gen_name( + "name": self._gen_name( "__fixed_" ), # to fix avro bug ^ resolved by using a fixed type "size": 1, # to fix avro bug ^ resolved by using a fixed type "logicalType": "decimal", "precision": decimal_type.precision, "scale": decimal_type.scale, - "native_data_type": repr(decimal_type), - "_nullable": nullable, + "native_data_type": str(decimal_type), + } + + def visit_boolean(self, boolean_type: BooleanType) -> Dict[str, Any]: + return { + "type": "boolean", + "native_data_type": str(boolean_type), } - elif isinstance(type, IcebergTypes.UUIDType): - uuid_type: IcebergTypes.UUIDType = type + + def visit_integer(self, integer_type: IntegerType) -> Dict[str, Any]: return { - "type": "string", - "logicalType": "uuid", - "native_data_type": repr(uuid_type), - "_nullable": nullable, + "type": "int", + "native_data_type": str(integer_type), + } + + def visit_long(self, long_type: LongType) -> Dict[str, Any]: + return { + "type": "long", + "native_data_type": str(long_type), } - elif isinstance(type, IcebergTypes.DateType): - date_type: IcebergTypes.DateType = type + + def visit_float(self, float_type: FloatType) -> Dict[str, Any]: + return { + "type": "float", + "native_data_type": str(float_type), + } + + def visit_double(self, double_type: DoubleType) -> Dict[str, Any]: + return { + "type": "double", + "native_data_type": str(double_type), + } + + def visit_date(self, date_type: DateType) -> Dict[str, Any]: return { "type": "int", "logicalType": "date", - "native_data_type": repr(date_type), - "_nullable": nullable, + "native_data_type": str(date_type), } - elif isinstance(type, IcebergTypes.TimeType): - time_type: IcebergTypes.TimeType = type + + def visit_time(self, time_type: TimeType) -> Dict[str, Any]: return { "type": "long", "logicalType": "time-micros", - "native_data_type": repr(time_type), - "_nullable": nullable, + "native_data_type": str(time_type), } - elif isinstance(type, IcebergTypes.TimestampType): - timestamp_type: IcebergTypes.TimestampType = type + + def visit_timestamp(self, timestamp_type: TimestampType) -> Dict[str, Any]: # Avro supports 2 types of timestamp: # - Timestamp: independent of a particular timezone or calendar (TZ information is lost) # - Local Timestamp: represents a timestamp in a local timezone, regardless of what specific time zone is considered local @@ -446,12 +434,40 @@ def _parse_basic_datatype( # "logicalType": "timestamp-micros" # if timestamp_type.adjust_to_utc # else "local-timestamp-micros", - "native_data_type": repr(timestamp_type), - "_nullable": nullable, + "native_data_type": str(timestamp_type), } - return {"type": "null", "native_data_type": repr(type)} + def visit_timestampz(self, timestamptz_type: TimestamptzType) -> Dict[str, Any]: + # Avro supports 2 types of timestamp: + # - Timestamp: independent of a particular timezone or calendar (TZ information is lost) + # - Local Timestamp: represents a timestamp in a local timezone, regardless of what specific time zone is considered local + # utcAdjustment: bool = True + return { + "type": "long", + "logicalType": "timestamp-micros", + # Commented out since Avro's Python implementation (1.11.0) does not support local-timestamp-micros, even though it exists in the spec. 
+ # See bug report: https://issues.apache.org/jira/browse/AVRO-3476 and PR https://github.com/apache/avro/pull/1634 + # "logicalType": "timestamp-micros" + # if timestamp_type.adjust_to_utc + # else "local-timestamp-micros", + "native_data_type": str(timestamptz_type), + } + def visit_string(self, string_type: StringType) -> Dict[str, Any]: + return { + "type": "string", + "native_data_type": str(string_type), + } -def _gen_name(prefix: str) -> str: - return f"{prefix}{str(uuid.uuid4()).replace('-', '')}" + def visit_uuid(self, uuid_type: UUIDType) -> Dict[str, Any]: + return { + "type": "string", + "logicalType": "uuid", + "native_data_type": str(uuid_type), + } + + def visit_binary(self, binary_type: BinaryType) -> Dict[str, Any]: + return { + "type": "bytes", + "native_data_type": str(binary_type), + } diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py index d5b9092912d4e..f4d93f67b27af 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py @@ -1,20 +1,11 @@ -import os from dataclasses import dataclass, field -from typing import Dict, Iterable, List, Optional, Tuple - -import pydantic -from azure.storage.filedatalake import FileSystemClient, PathProperties -from iceberg.core.filesystem.abfss_filesystem import AbfssFileSystem -from iceberg.core.filesystem.filesystem_tables import FilesystemTables -from pydantic import Field, root_validator - -from datahub.configuration.common import ( - AllowDenyPattern, - ConfigModel, - ConfigurationError, -) +from typing import Dict, List, Optional + +from pydantic import Field +from pyiceberg.catalog import Catalog, load_catalog + +from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.source_common import DatasetSourceConfigMixin -from datahub.ingestion.source.azure.azure_common import AdlsSourceConfig from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalSourceReport, StatefulStaleMetadataRemovalConfig, @@ -59,22 +50,32 @@ class IcebergProfilingConfig(ConfigModel): # include_field_sample_values: bool = True +class IcebergCatalogConfig(ConfigModel): + """ + Iceberg catalog config. + + https://py.iceberg.apache.org/configuration/ + """ + + name: str = Field( + default="default", + description="Name of catalog", + ) + type: str = Field( + description="Type of catalog. See [PyIceberg](https://py.iceberg.apache.org/configuration/) for list of possible values.", + ) + config: Dict[str, str] = Field( + description="Catalog specific configuration. See [PyIceberg documentation](https://py.iceberg.apache.org/configuration/) for details.", + ) + + class IcebergSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin): # Override the stateful_ingestion config param with the Iceberg custom stateful ingestion config in the IcebergSourceConfig - stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = pydantic.Field( + stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field( default=None, description="Iceberg Stateful Ingestion Config." ) - adls: Optional[AdlsSourceConfig] = Field( - default=None, - description="[Azure Data Lake Storage](https://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction) to crawl for Iceberg tables. 
This is one filesystem type supported by this source and **only one can be configured**.", - ) - localfs: Optional[str] = Field( - default=None, - description="Local path to crawl for Iceberg tables. This is one filesystem type supported by this source and **only one can be configured**.", - ) - max_path_depth: int = Field( - default=2, - description="Maximum folder depth to crawl for Iceberg tables. Folders deeper than this value will be silently ignored.", + catalog: IcebergCatalogConfig = Field( + description="Catalog configuration where to find Iceberg tables. See [pyiceberg's catalog configuration details](https://py.iceberg.apache.org/configuration/).", ) table_pattern: AllowDenyPattern = Field( default=AllowDenyPattern.allow_all(), @@ -95,92 +96,15 @@ def is_profiling_enabled(self) -> bool: self.profiling.operation_config ) - @root_validator() - def _ensure_one_filesystem_is_configured( - cls: "IcebergSourceConfig", values: Dict - ) -> Dict: - if values.get("adls") and values.get("localfs"): - raise ConfigurationError( - "Only one filesystem can be configured: adls or localfs" - ) - elif not values.get("adls") and not values.get("localfs"): - raise ConfigurationError( - "One filesystem (adls or localfs) needs to be configured." - ) - return values - - @property - def adls_filesystem_client(self) -> FileSystemClient: - """Azure Filesystem client if configured. - - Raises: - ConfigurationError: If ADLS is not configured. - - Returns: - FileSystemClient: Azure Filesystem client instance to access storage account files and folders. - """ - if self.adls: # TODO Use local imports for abfss - AbfssFileSystem.get_instance().set_conf(self.adls.dict()) - return self.adls.get_filesystem_client() - raise ConfigurationError("No ADLS filesystem client configured") - - @property - def filesystem_tables(self) -> FilesystemTables: - """Iceberg FilesystemTables abstraction to access tables on a filesystem. - Currently supporting ADLS (Azure Storage Account) and local filesystem. - - Raises: - ConfigurationError: If no filesystem was configured. + def get_catalog(self) -> Catalog: + """Returns the Iceberg catalog instance as configured by the `catalog` dictionary. Returns: - FilesystemTables: An Iceberg FilesystemTables abstraction instance to access tables on a filesystem + Catalog: Iceberg catalog instance. """ - if self.adls: - return FilesystemTables(self.adls.dict()) - elif self.localfs: - return FilesystemTables() - raise ConfigurationError("No filesystem client configured") - - def _get_adls_paths(self, root_path: str, depth: int) -> Iterable[Tuple[str, str]]: - if self.adls and depth < self.max_path_depth: - sub_paths = self.adls_filesystem_client.get_paths( - path=root_path, recursive=False - ) - sub_path: PathProperties - for sub_path in sub_paths: - if sub_path.is_directory: - dataset_name = ".".join( - sub_path.name[len(self.adls.base_path) + 1 :].split("/") - ) - yield self.adls.get_abfss_url(sub_path.name), dataset_name - yield from self._get_adls_paths(sub_path.name, depth + 1) - - def _get_localfs_paths( - self, root_path: str, depth: int - ) -> Iterable[Tuple[str, str]]: - if self.localfs and depth < self.max_path_depth: - for f in os.scandir(root_path): - if f.is_dir(): - dataset_name = ".".join(f.path[len(self.localfs) + 1 :].split("/")) - yield f.path, dataset_name - yield from self._get_localfs_paths(f.path, depth + 1) - - def get_paths(self) -> Iterable[Tuple[str, str]]: - """Generates a sequence of data paths and dataset names. 
- - Raises: - ConfigurationError: If no filesystem configured. - - Yields: - Iterator[Iterable[Tuple[str, str]]]: A sequence of tuples where the first item is the location of the dataset - and the second item is the associated dataset name. - """ - if self.adls: - yield from self._get_adls_paths(self.adls.base_path, 0) - elif self.localfs: - yield from self._get_localfs_paths(self.localfs, 0) - else: - raise ConfigurationError("No filesystem client configured") + return load_catalog( + name=self.catalog.name, **{"type": self.catalog.type, **self.catalog.config} + ) @dataclass diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_profiler.py index 1437847ee4343..e1d52752d779a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_profiler.py @@ -1,17 +1,26 @@ -from datetime import datetime, timedelta from typing import Any, Callable, Dict, Iterable, Union, cast -from iceberg.api import types as IcebergTypes -from iceberg.api.data_file import DataFile -from iceberg.api.manifest_file import ManifestFile -from iceberg.api.schema import Schema -from iceberg.api.snapshot import Snapshot -from iceberg.api.table import Table -from iceberg.api.types import Conversions, NestedField, Type, TypeID -from iceberg.core.base_table import BaseTable -from iceberg.core.filesystem import FileSystemInputFile -from iceberg.core.manifest_reader import ManifestReader -from iceberg.exceptions.exceptions import FileSystemNotFound +from pyiceberg.conversions import from_bytes +from pyiceberg.schema import Schema +from pyiceberg.table import Table +from pyiceberg.types import ( + DateType, + DecimalType, + DoubleType, + FloatType, + IcebergType, + IntegerType, + LongType, + TimestampType, + TimestamptzType, + TimeType, +) +from pyiceberg.utils.datetime import ( + days_to_date, + to_human_time, + to_human_timestamp, + to_human_timestamptz, +) from datahub.emitter.mce_builder import get_sys_time from datahub.emitter.mcp import MetadataChangeProposalWrapper @@ -51,15 +60,18 @@ def _aggregate_bounds( schema: Schema, aggregator: Callable, aggregated_values: Dict[int, Any], - manifest_values: Dict[int, Any], + manifest_values: Dict[int, bytes], ) -> None: for field_id, value_encoded in manifest_values.items(): # type: int, Any - field: NestedField = schema.find_field(field_id) - # Bounds in manifests can reference historical field IDs that are not part of the current schema. - # We simply not profile those since we only care about the current snapshot. - if field and IcebergProfiler._is_numeric_type(field.type): - value_decoded = Conversions.from_byte_buffer(field.type, value_encoded) - if value_decoded: + try: + field = schema.find_field(field_id) + except ValueError: + # Bounds in manifests can reference historical field IDs that are not part of the current schema. + # We simply not profile those since we only care about the current snapshot. + continue + if IcebergProfiler._is_numeric_type(field.field_type): + value_decoded = from_bytes(field.field_type, value_encoded) + if value_decoded is not None: agg_value = aggregated_values.get(field_id) aggregated_values[field_id] = ( aggregator(agg_value, value_decoded) @@ -97,12 +109,23 @@ def profile_table( Yields: Iterator[Iterable[MetadataWorkUnit]]: Workunits related to datasetProfile. 
""" - if not table.snapshots() or not isinstance(table, BaseTable): + current_snapshot = table.current_snapshot() + if not current_snapshot: # Table has no data, cannot profile, or we can't get current_snapshot. return - row_count: int = int(table.current_snapshot().summary["total-records"]) - column_count: int = len(table.schema()._id_to_name) + row_count = ( + int(current_snapshot.summary.additional_properties["total-records"]) + if current_snapshot.summary + else 0 + ) + column_count = len( + [ + field.field_id + for field in table.schema().fields + if field.field_type.is_primitive + ] + ) dataset_profile = DatasetProfileClass( timestampMillis=get_sys_time(), rowCount=row_count, @@ -110,47 +133,44 @@ def profile_table( ) dataset_profile.fieldProfiles = [] - field_paths: Dict[int, str] = table.schema()._id_to_name - current_snapshot: Snapshot = table.current_snapshot() - total_count: int = 0 + total_count = 0 null_counts: Dict[int, int] = {} min_bounds: Dict[int, Any] = {} max_bounds: Dict[int, Any] = {} - manifest: ManifestFile try: - for manifest in current_snapshot.manifests: - manifest_input_file = FileSystemInputFile.from_location( - manifest.manifest_path, table.ops.conf - ) - manifest_reader = ManifestReader.read(manifest_input_file) - data_file: DataFile - for data_file in manifest_reader.iterator(): + for manifest in current_snapshot.manifests(table.io): + for manifest_entry in manifest.fetch_manifest_entry(table.io): + data_file = manifest_entry.data_file if self.config.include_field_null_count: null_counts = self._aggregate_counts( - null_counts, data_file.null_value_counts() + null_counts, data_file.null_value_counts ) if self.config.include_field_min_value: self._aggregate_bounds( table.schema(), min, min_bounds, - data_file.lower_bounds(), + data_file.lower_bounds, ) if self.config.include_field_max_value: self._aggregate_bounds( table.schema(), max, max_bounds, - data_file.upper_bounds(), + data_file.upper_bounds, ) - total_count += data_file.record_count() - # TODO Work on error handling to provide better feedback. Iceberg exceptions are weak... - except FileSystemNotFound as e: - raise Exception("Error loading table manifests") from e + total_count += data_file.record_count + except Exception as e: + # Catch any errors that arise from attempting to read the Iceberg table's manifests + # This will prevent stateful ingestion from being blocked by an error (profiling is not critical) + self.report.report_warning( + "profiling", + f"Error while profiling dataset {dataset_name}: {e}", + ) if row_count: # Iterating through fieldPaths introduces unwanted stats for list element fields... 
- for field_id, field_path in field_paths.items(): - field: NestedField = table.schema().find_field(field_id) + for field_path, field_id in table.schema()._name_to_id.items(): + field = table.schema().find_field(field_id) column_profile = DatasetFieldProfileClass(fieldPath=field_path) if self.config.include_field_null_count: column_profile.nullCount = cast(int, null_counts.get(field_id, 0)) @@ -160,16 +180,16 @@ def profile_table( if self.config.include_field_min_value: column_profile.min = ( - self._renderValue( - dataset_name, field.type, min_bounds.get(field_id) + self._render_value( + dataset_name, field.field_type, min_bounds.get(field_id) ) if field_id in min_bounds else None ) if self.config.include_field_max_value: column_profile.max = ( - self._renderValue( - dataset_name, field.type, max_bounds.get(field_id) + self._render_value( + dataset_name, field.field_type, max_bounds.get(field_id) ) if field_id in max_bounds else None @@ -181,24 +201,18 @@ def profile_table( aspect=dataset_profile, ).as_workunit() - # The following will eventually be done by the Iceberg API (in the new Python refactored API). - def _renderValue( - self, dataset_name: str, value_type: Type, value: Any + def _render_value( + self, dataset_name: str, value_type: IcebergType, value: Any ) -> Union[str, None]: try: - if value_type.type_id == TypeID.TIMESTAMP: - if value_type.adjust_to_utc: - # TODO Deal with utc when required - microsecond_unix_ts = value - else: - microsecond_unix_ts = value - return datetime.fromtimestamp(microsecond_unix_ts / 1000000.0).strftime( - "%Y-%m-%d %H:%M:%S" - ) - elif value_type.type_id == TypeID.DATE: - return (datetime(1970, 1, 1, 0, 0) + timedelta(value - 1)).strftime( - "%Y-%m-%d" - ) + if isinstance(value_type, TimestampType): + return to_human_timestamp(value) + if isinstance(value_type, TimestamptzType): + return to_human_timestamptz(value) + elif isinstance(value_type, DateType): + return days_to_date(value).strftime("%Y-%m-%d") + elif isinstance(value_type, TimeType): + return to_human_time(value) return str(value) except Exception as e: self.report.report_warning( @@ -208,17 +222,18 @@ def _renderValue( return None @staticmethod - def _is_numeric_type(type: Type) -> bool: + def _is_numeric_type(type: IcebergType) -> bool: return isinstance( type, ( - IcebergTypes.DateType, - IcebergTypes.DecimalType, - IcebergTypes.DoubleType, - IcebergTypes.FloatType, - IcebergTypes.IntegerType, - IcebergTypes.LongType, - IcebergTypes.TimestampType, - IcebergTypes.TimeType, + DateType, + DecimalType, + DoubleType, + FloatType, + IntegerType, + LongType, + TimestampType, + TimestamptzType, + TimeType, ), ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py index 5805790fe8bb7..5e8413bbb6f30 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py +++ b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py @@ -303,11 +303,13 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: # This method can be called on the main thread or an async thread, so we must create a new loop if one doesn't exist # See https://docs.python.org/3/library/asyncio-eventloop.html for more info. + created_event_loop = False try: event_loop: asyncio.AbstractEventLoop = asyncio.get_event_loop() except RuntimeError: event_loop = asyncio.new_event_loop() asyncio.set_event_loop(event_loop) + created_event_loop = True # Step 1: Produce MetadataWorkUnits for CorpGroups. 
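The Okta fix above addresses a subtle lifecycle bug: the source used to close whatever event loop it found, even one owned by the calling thread. The pattern it adopts only closes loops it created itself; a generic sketch of that pattern:

    import asyncio

    created_event_loop = False
    try:
        event_loop = asyncio.get_event_loop()
    except RuntimeError:
        # No loop on this (non-main) thread: create one and remember that we own it.
        event_loop = asyncio.new_event_loop()
        asyncio.set_event_loop(event_loop)
        created_event_loop = True
    try:
        event_loop.run_until_complete(asyncio.sleep(0))  # stand-in for the Okta SDK calls
    finally:
        if created_event_loop:
            event_loop.close()  # never close a loop we merely borrowed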
okta_groups: Optional[Iterable[Group]] = None @@ -408,7 +410,8 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ).as_workunit() # Step 4: Close the event loop - event_loop.close() + if created_event_loop: + event_loop.close() def get_report(self): return self.report diff --git a/metadata-ingestion/src/datahub/ingestion/source/openapi.py b/metadata-ingestion/src/datahub/ingestion/source/openapi.py index 42924a09a39e9..3925ba51c16dd 100755 --- a/metadata-ingestion/src/datahub/ingestion/source/openapi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/openapi.py @@ -108,7 +108,7 @@ class ApiWorkUnit(MetadataWorkUnit): @platform_name("OpenAPI", id="openapi") @config_class(OpenApiConfig) -@support_status(SupportStatus.CERTIFIED) +@support_status(SupportStatus.INCUBATING) @capability(SourceCapability.PLATFORM_INSTANCE, supported=False, description="") class APISource(Source, ABC): """ diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py index 953f0edd7c2bb..bbb1876102578 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py @@ -449,6 +449,7 @@ def _make_usage_stat(self, agg: AggregatedDataset) -> MetadataWorkUnit: self.config.top_n_queries, self.config.format_sql_queries, self.config.include_top_n_queries, + self.config.queries_character_limit, ) def report_status(self, step: str, status: bool) -> None: diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index 4247ee9330cfb..ab5d3a4e007ac 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -261,13 +261,14 @@ def init_spark(self): import pydeequ conf = SparkConf() - + spark_version = os.getenv("SPARK_VERSION", "3.3") conf.set( "spark.jars.packages", ",".join( [ "org.apache.hadoop:hadoop-aws:3.0.3", - "org.apache.spark:spark-avro_2.12:3.0.3", + # Spark's avro version needs to be matched with the Spark version + f"org.apache.spark:spark-avro_2.12:{spark_version}{'.0' if spark_version.count('.') == 1 else ''}", pydeequ.deequ_maven_coord, ] ), @@ -374,10 +375,10 @@ def read_file_spark(self, file: str, ext: str) -> Optional[DataFrame]: elif ext.endswith(".avro"): try: df = self.spark.read.format("avro").load(file) - except AnalysisException: + except AnalysisException as e: self.report.report_warning( file, - "To ingest avro files, please install the spark-avro package: https://mvnrepository.com/artifact/org.apache.spark/spark-avro_2.12/3.0.3", + f"Avro file reading failed with exception. 
The error was: {e}", ) return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py index f79be7174dbd9..a64921ea01759 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py @@ -25,7 +25,6 @@ from datahub.ingestion.source.state.redundant_run_skip_handler import ( RedundantUsageRunSkipHandler, ) -from datahub.ingestion.source.usage.usage_common import TOTAL_BUDGET_FOR_QUERY_LIST from datahub.ingestion.source_report.ingestion_stage import ( USAGE_EXTRACTION_OPERATIONAL_STATS, USAGE_EXTRACTION_USAGE_AGGREGATION, @@ -280,7 +279,7 @@ def build_usage_statistics_for_dataset(self, dataset_identifier, row): def _map_top_sql_queries(self, top_sql_queries: Dict) -> List[str]: budget_per_query: int = int( - TOTAL_BUDGET_FOR_QUERY_LIST / self.config.top_n_queries + self.config.queries_character_limit / self.config.top_n_queries ) return sorted( [ diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 90b751c875add..e561ed0e2d146 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -770,7 +770,8 @@ def _process_schema( if self.config.parse_view_ddl: for view in views: key = self.get_dataset_identifier(view.name, schema_name, db_name) - self.view_definitions[key] = view.view_definition + if view.view_definition: + self.view_definitions[key] = view.view_definition if self.config.include_technical_schema or self.config.parse_view_ddl: for view in views: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py index c95e20252e421..9cb613bde1e9f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py @@ -9,7 +9,7 @@ from sqlalchemy.engine.reflection import Inspector from datahub.configuration.validate_field_rename import pydantic_renamed_field -from datahub.emitter.mcp_builder import ContainerKey +from datahub.emitter.mcp_builder import ContainerKey, DatabaseKey from datahub.ingestion.api.decorators import ( SourceCapability, SupportStatus, @@ -192,15 +192,12 @@ def gen_schema_containers( database: str, extra_properties: Optional[Dict[str, Any]] = None, ) -> Iterable[MetadataWorkUnit]: - database_container_key = gen_database_key( - database, - platform=self.platform, - platform_instance=self.config.platform_instance, - env=self.config.env, + database_container_key = self.get_database_container_key( + db_name=database, schema=schema ) yield from gen_database_container( - database=database, + database=database_container_key.database, database_container_key=database_container_key, sub_types=[DatasetContainerSubTypes.DATABASE], domain_registry=self.domain_registry, @@ -208,7 +205,7 @@ def gen_schema_containers( extra_properties=extra_properties, ) - def get_database_container_key(self, db_name: str, schema: str) -> ContainerKey: + def get_database_container_key(self, db_name: str, schema: str) -> DatabaseKey: # Because our overridden get_allowed_schemas method returns db_name as the schema name, # the db_name and schema here will be the same. 
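The S3 source change a bit further up stops pinning spark-avro to 3.0.3 and instead derives it from the SPARK_VERSION environment variable, padding two-part versions with a trailing `.0`. A quick check of that expression, factored into a helper purely for illustration (the helper name is not part of the change):

    def spark_avro_package(spark_version: str) -> str:
        # Mirrors the f-string in init_spark(): "3.3" becomes "3.3.0", "3.3.2" is kept as-is.
        suffix = ".0" if spark_version.count(".") == 1 else ""
        return f"org.apache.spark:spark-avro_2.12:{spark_version}{suffix}"

    assert spark_avro_package("3.3") == "org.apache.spark:spark-avro_2.12:3.3.0"
    assert spark_avro_package("3.3.2") == "org.apache.spark:spark-avro_2.12:3.3.2"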
Hence, we just ignore the schema parameter. # Based on community feedback, db_name only available if it is explicitly specified in the connection string. diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic.py index aa0493a18ab58..345f5bd57b44c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic.py @@ -44,7 +44,7 @@ class BaseView: comment: Optional[str] created: Optional[datetime] last_altered: Optional[datetime] - view_definition: str + view_definition: Optional[str] size_in_bytes: Optional[int] = None rows_count: Optional[int] = None column_count: Optional[int] = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_utils.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_utils.py index c5baf148b0e5e..723a8c5fd8669 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_utils.py @@ -35,7 +35,7 @@ def gen_schema_key( platform: str, platform_instance: Optional[str], env: Optional[str], -) -> ContainerKey: +) -> SchemaKey: return SchemaKey( database=db_name, schema=schema, @@ -48,7 +48,7 @@ def gen_schema_key( def gen_database_key( database: str, platform: str, platform_instance: Optional[str], env: Optional[str] -) -> ContainerKey: +) -> DatabaseKey: return DatabaseKey( database=database, platform=platform, diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/clickhouse_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/clickhouse_usage.py index ffa08752070dd..855958f0755e1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/usage/clickhouse_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/usage/clickhouse_usage.py @@ -248,6 +248,7 @@ def _make_usage_stat(self, agg: AggregatedDataset) -> MetadataWorkUnit: self.config.top_n_queries, self.config.format_sql_queries, self.config.include_top_n_queries, + self.config.queries_character_limit, ) def get_report(self) -> SourceReport: diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py index ea817f40f6a2b..99a980b326e53 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/usage/redshift_usage.py @@ -392,6 +392,7 @@ def _make_usage_stat(self, agg: AggregatedDataset) -> MetadataWorkUnit: self.config.top_n_queries, self.config.format_sql_queries, self.config.include_top_n_queries, + self.config.queries_character_limit, ) def get_report(self) -> RedshiftUsageSourceReport: diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py index 7dd66fd1e3d0c..9394a8bba5e0b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py @@ -282,6 +282,7 @@ def _make_usage_stat(self, agg: AggregatedDataset) -> MetadataWorkUnit: self.config.top_n_queries, self.config.format_sql_queries, self.config.include_top_n_queries, + self.config.queries_character_limit, ) def get_report(self) -> SourceReport: diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py 
b/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py index 92f8223f34d14..4547f9f368198 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/usage/usage_common.py @@ -47,7 +47,7 @@ ResourceType = TypeVar("ResourceType") # The total number of characters allowed across all queries in a single workunit. -TOTAL_BUDGET_FOR_QUERY_LIST = 24000 +DEFAULT_QUERIES_CHARACTER_LIMIT = 24000 def default_user_urn_builder(email: str) -> str: @@ -65,8 +65,8 @@ def make_usage_workunit( resource_urn_builder: Callable[[ResourceType], str], top_n_queries: int, format_sql_queries: bool, + queries_character_limit: int, user_urn_builder: Optional[Callable[[str], str]] = None, - total_budget_for_query_list: int = TOTAL_BUDGET_FOR_QUERY_LIST, query_trimmer_string: str = " ...", ) -> MetadataWorkUnit: if user_urn_builder is None: @@ -74,7 +74,7 @@ def make_usage_workunit( top_sql_queries: Optional[List[str]] = None if query_freq is not None: - budget_per_query: int = int(total_budget_for_query_list / top_n_queries) + budget_per_query: int = int(queries_character_limit / top_n_queries) top_sql_queries = [ trim_query( format_sql_query(query, keyword_case="upper", reindent_aligned=True) @@ -154,8 +154,8 @@ def make_usage_workunit( top_n_queries: int, format_sql_queries: bool, include_top_n_queries: bool, + queries_character_limit: int, user_urn_builder: Optional[Callable[[str], str]] = None, - total_budget_for_query_list: int = TOTAL_BUDGET_FOR_QUERY_LIST, query_trimmer_string: str = " ...", ) -> MetadataWorkUnit: query_freq = ( @@ -173,12 +173,21 @@ def make_usage_workunit( user_urn_builder=user_urn_builder, top_n_queries=top_n_queries, format_sql_queries=format_sql_queries, - total_budget_for_query_list=total_budget_for_query_list, + queries_character_limit=queries_character_limit, query_trimmer_string=query_trimmer_string, ) class BaseUsageConfig(BaseTimeWindowConfig): + queries_character_limit: int = Field( + default=DEFAULT_QUERIES_CHARACTER_LIMIT, + description=( + "Total character limit for all queries in a single usage aspect." + " Queries will be truncated to length `queries_character_limit / top_n_queries`." + ), + hidden_from_docs=True, # Don't want to encourage people to break elasticsearch + ) + top_n_queries: pydantic.PositiveInt = Field( default=10, description="Number of top queries to save to each table." 
) @@ -203,10 +212,10 @@ class BaseUsageConfig(BaseTimeWindowConfig): ) @pydantic.validator("top_n_queries") - def ensure_top_n_queries_is_not_too_big(cls, v: int) -> int: + def ensure_top_n_queries_is_not_too_big(cls, v: int, values: dict) -> int: minimum_query_size = 20 - max_queries = int(TOTAL_BUDGET_FOR_QUERY_LIST / minimum_query_size) + max_queries = int(values["queries_character_limit"] / minimum_query_size) if v > max_queries: raise ValueError( f"top_n_queries is set to {v} but it can be maximum {max_queries}" @@ -259,6 +268,7 @@ def generate_workunits( include_top_n_queries=self.config.include_top_n_queries, resource_urn_builder=resource_urn_builder, user_urn_builder=user_urn_builder, + queries_character_limit=self.config.queries_character_limit, ) diff --git a/metadata-ingestion/src/datahub/testing/compare_metadata_json.py b/metadata-ingestion/src/datahub/testing/compare_metadata_json.py index b017afc8c1448..5c52e1ab4f0b3 100644 --- a/metadata-ingestion/src/datahub/testing/compare_metadata_json.py +++ b/metadata-ingestion/src/datahub/testing/compare_metadata_json.py @@ -55,7 +55,6 @@ def assert_metadata_files_equal( output = load_json_file(output_path) if update_golden and not golden_exists: - golden = load_json_file(output_path) shutil.copyfile(str(output_path), str(golden_path)) return else: diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index 534cac5cef2aa..d677b0874b985 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -459,6 +459,19 @@ def _sqlglot_force_column_normalizer( # statement.sql(pretty=True, dialect=dialect), # ) + def _schema_aware_fuzzy_column_resolve( + table: Optional[_TableName], sqlglot_column: str + ) -> str: + default_col_name = ( + sqlglot_column.lower() if use_case_insensitive_cols else sqlglot_column + ) + if table: + return table_schema_normalized_mapping[table].get( + sqlglot_column, default_col_name + ) + else: + return default_col_name + # Optimize the statement + qualify column references. logger.debug( "Prior to qualification sql %s", statement.sql(pretty=True, dialect=dialect) @@ -540,10 +553,8 @@ def _sqlglot_force_column_normalizer( normalized_col = sqlglot.parse_one(node.name).this.name if node.subfield: normalized_col = f"{normalized_col}.{node.subfield}" - col = table_schema_normalized_mapping[table_ref].get( - normalized_col, normalized_col - ) + col = _schema_aware_fuzzy_column_resolve(table_ref, normalized_col) direct_col_upstreams.add(_ColumnRef(table=table_ref, column=col)) else: # This branch doesn't matter. For example, a count(*) column would go here, and @@ -557,6 +568,9 @@ def _sqlglot_force_column_normalizer( # This is a bit jank since we're relying on sqlglot internals, but it seems to be # the best way to do it. output_col = original_col_expression.this.sql(dialect=dialect) + + output_col = _schema_aware_fuzzy_column_resolve(output_table, output_col) + if not direct_col_upstreams: logger.debug(f' "{output_col}" has no upstreams') column_lineage.append( @@ -699,10 +713,7 @@ def _sqlglot_lineage_inner( # Fetch schema info for the relevant tables. 
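Tying the usage-common changes above together: the per-query trimming budget is now derived from the configurable `queries_character_limit` instead of the old module constant, and the `top_n_queries` validator is bounded by the same value. With the defaults visible in the diff, the arithmetic works out as follows:

    DEFAULT_QUERIES_CHARACTER_LIMIT = 24000
    top_n_queries = 10
    minimum_query_size = 20

    budget_per_query = int(DEFAULT_QUERIES_CHARACTER_LIMIT / top_n_queries)
    assert budget_per_query == 2400  # each stored query is trimmed to roughly 2400 characters

    # The validator rejects top_n_queries values that would leave fewer than 20 characters per query.
    max_queries = int(DEFAULT_QUERIES_CHARACTER_LIMIT / minimum_query_size)
    assert max_queries == 1200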
table_name_urn_mapping: Dict[_TableName, str] = {} table_name_schema_mapping: Dict[_TableName, SchemaInfo] = {} - for table, is_input in itertools.chain( - [(table, True) for table in tables], - [(table, False) for table in modified], - ): + for table in itertools.chain(tables, modified): # For select statements, qualification will be a no-op. For other statements, this # is where the qualification actually happens. qualified_table = table.qualified( @@ -712,19 +723,21 @@ def _sqlglot_lineage_inner( urn, schema_info = schema_resolver.resolve_table(qualified_table) table_name_urn_mapping[qualified_table] = urn - if is_input and schema_info: + if schema_info: table_name_schema_mapping[qualified_table] = schema_info # Also include the original, non-qualified table name in the urn mapping. table_name_urn_mapping[table] = urn + total_tables_discovered = len(tables) + len(modified) + total_schemas_resolved = len(table_name_schema_mapping) debug_info = SqlParsingDebugInfo( - confidence=0.9 if len(tables) == len(table_name_schema_mapping) + confidence=0.9 if total_tables_discovered == total_schemas_resolved # If we're missing any schema info, our confidence will be in the 0.2-0.5 range depending # on how many tables we were able to resolve. - else 0.2 + 0.3 * len(table_name_schema_mapping) / len(tables), - tables_discovered=len(tables), - table_schemas_resolved=len(table_name_schema_mapping), + else 0.2 + 0.3 * total_schemas_resolved / total_tables_discovered, + tables_discovered=total_tables_discovered, + table_schemas_resolved=total_schemas_resolved, ) logger.debug( f"Resolved {len(table_name_schema_mapping)} of {len(tables)} table schemas" @@ -789,7 +802,8 @@ def sqlglot_lineage( This is a schema-aware lineage generator, meaning that it will use the schema information for the tables involved to generate lineage information for the columns involved. The schema_resolver is responsible for providing - the table schema information. + the table schema information. In most cases, the DataHubGraph can be used + to construct a schema_resolver that will fetch schemas from DataHub. The parser supports most types of DML statements (SELECT, INSERT, UPDATE, DELETE, MERGE) as well as CREATE TABLE AS SELECT (CTAS) statements. It @@ -859,7 +873,6 @@ def create_lineage_sql_parsed_result( schema: Optional[str] = None, graph: Optional[DataHubGraph] = None, ) -> Optional["SqlParsingResult"]: - parsed_result: Optional["SqlParsingResult"] = None try: schema_resolver = ( diff --git a/metadata-ingestion/src/datahub_provider/__init__.py b/metadata-ingestion/src/datahub_provider/__init__.py index 4c0b2bd8e714e..306076dadf82b 100644 --- a/metadata-ingestion/src/datahub_provider/__init__.py +++ b/metadata-ingestion/src/datahub_provider/__init__.py @@ -1,28 +1 @@ -import datahub - - -# This is needed to allow Airflow to pick up specific metadata fields it needs for -# certain features. We recognize it's a bit unclean to define these in multiple places, -# but at this point it's the only workaround if you'd like your custom conn type to -# show up in the Airflow UI. 
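The confidence tweak above counts both read and modified tables when judging how complete schema resolution was. A worked example of the heuristic, with hypothetical table counts and an illustrative helper name:

    import math

    def parse_confidence(tables_discovered: int, schemas_resolved: int) -> float:
        # Mirrors the heuristic in _sqlglot_lineage_inner: full schema coverage scores 0.9,
        # anything less scales between 0.2 and 0.5 with the resolved fraction.
        if tables_discovered == schemas_resolved:
            return 0.9
        return 0.2 + 0.3 * schemas_resolved / tables_discovered

    assert parse_confidence(4, 4) == 0.9
    assert math.isclose(parse_confidence(4, 3), 0.425)  # three of four schemas resolved
    assert math.isclose(parse_confidence(4, 0), 0.2)    # nothing resolved, minimum confidence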
-def get_provider_info(): - return { - "name": "DataHub", - "description": "`DataHub `__\n", - "connection-types": [ - { - "hook-class-name": "datahub_provider.hooks.datahub.DatahubRestHook", - "connection-type": "datahub_rest", - }, - { - "hook-class-name": "datahub_provider.hooks.datahub.DatahubKafkaHook", - "connection-type": "datahub_kafka", - }, - ], - "hook-class-names": [ - "datahub_provider.hooks.datahub.DatahubRestHook", - "datahub_provider.hooks.datahub.DatahubKafkaHook", - ], - "package-name": datahub.__package_name__, - "versions": [datahub.__version__], - } +from datahub_airflow_plugin import get_provider_info diff --git a/metadata-ingestion/src/datahub_provider/_airflow_compat.py b/metadata-ingestion/src/datahub_provider/_airflow_compat.py index 67c3348ec987c..98b96e32fee78 100644 --- a/metadata-ingestion/src/datahub_provider/_airflow_compat.py +++ b/metadata-ingestion/src/datahub_provider/_airflow_compat.py @@ -1,12 +1,3 @@ -# This module must be imported before any Airflow imports in any of our files. -# The AIRFLOW_PATCHED just helps avoid flake8 errors. +from datahub_airflow_plugin._airflow_compat import AIRFLOW_PATCHED -from datahub.utilities._markupsafe_compat import MARKUPSAFE_PATCHED - -assert MARKUPSAFE_PATCHED - -AIRFLOW_PATCHED = True - -__all__ = [ - "AIRFLOW_PATCHED", -] +__all__ = ["AIRFLOW_PATCHED"] diff --git a/metadata-ingestion/src/datahub_provider/_airflow_shims.py b/metadata-ingestion/src/datahub_provider/_airflow_shims.py index 31e1237c0d21d..d5e4a019a4b81 100644 --- a/metadata-ingestion/src/datahub_provider/_airflow_shims.py +++ b/metadata-ingestion/src/datahub_provider/_airflow_shims.py @@ -1,29 +1,15 @@ -from datahub_provider._airflow_compat import AIRFLOW_PATCHED - -from airflow.models.baseoperator import BaseOperator - -try: - from airflow.models.mappedoperator import MappedOperator - from airflow.models.operator import Operator - from airflow.operators.empty import EmptyOperator -except ModuleNotFoundError: - # Operator isn't a real class, but rather a type alias defined - # as the union of BaseOperator and MappedOperator. - # Since older versions of Airflow don't have MappedOperator, we can just use BaseOperator. 
- Operator = BaseOperator # type: ignore - MappedOperator = None # type: ignore - from airflow.operators.dummy import DummyOperator as EmptyOperator # type: ignore - -try: - from airflow.sensors.external_task import ExternalTaskSensor -except ImportError: - from airflow.sensors.external_task_sensor import ExternalTaskSensor # type: ignore - -assert AIRFLOW_PATCHED +from datahub_airflow_plugin._airflow_shims import ( + AIRFLOW_PATCHED, + EmptyOperator, + ExternalTaskSensor, + MappedOperator, + Operator, +) __all__ = [ - "Operator", - "MappedOperator", + "AIRFLOW_PATCHED", "EmptyOperator", "ExternalTaskSensor", + "Operator", + "MappedOperator", ] diff --git a/metadata-ingestion/src/datahub_provider/_lineage_core.py b/metadata-ingestion/src/datahub_provider/_lineage_core.py index 07c70eeca4e6d..4305b39cac684 100644 --- a/metadata-ingestion/src/datahub_provider/_lineage_core.py +++ b/metadata-ingestion/src/datahub_provider/_lineage_core.py @@ -1,114 +1,3 @@ -from datetime import datetime -from typing import TYPE_CHECKING, Dict, List +from datahub_airflow_plugin._lineage_core import DatahubBasicLineageConfig -import datahub.emitter.mce_builder as builder -from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult -from datahub.configuration.common import ConfigModel -from datahub.utilities.urns.dataset_urn import DatasetUrn -from datahub_provider.client.airflow_generator import AirflowGenerator -from datahub_provider.entities import _Entity - -if TYPE_CHECKING: - from airflow import DAG - from airflow.models.dagrun import DagRun - from airflow.models.taskinstance import TaskInstance - - from datahub_provider._airflow_shims import Operator - from datahub_provider.hooks.datahub import DatahubGenericHook - - -def _entities_to_urn_list(iolets: List[_Entity]) -> List[DatasetUrn]: - return [DatasetUrn.create_from_string(let.urn) for let in iolets] - - -class DatahubBasicLineageConfig(ConfigModel): - enabled: bool = True - - # DataHub hook connection ID. - datahub_conn_id: str - - # Cluster to associate with the pipelines and tasks. Defaults to "prod". - cluster: str = builder.DEFAULT_FLOW_CLUSTER - - # If true, the owners field of the DAG will be capture as a DataHub corpuser. - capture_ownership_info: bool = True - - # If true, the tags field of the DAG will be captured as DataHub tags. - capture_tags_info: bool = True - - capture_executions: bool = False - - def make_emitter_hook(self) -> "DatahubGenericHook": - # This is necessary to avoid issues with circular imports. 
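As the `_lineage_core` hunk above shows, the module body moves into `datahub_airflow_plugin` and the old module becomes a one-line re-export, so existing imports keep resolving. A sketch of what that means for callers, assuming the `datahub_airflow_plugin` package is installed alongside:

    # Both imports now resolve to the same class; the first goes through the re-export shim.
    from datahub_provider._lineage_core import DatahubBasicLineageConfig as LegacyPathConfig
    from datahub_airflow_plugin._lineage_core import DatahubBasicLineageConfig as NewPathConfig

    assert LegacyPathConfig is NewPathConfig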
- from datahub_provider.hooks.datahub import DatahubGenericHook - - return DatahubGenericHook(self.datahub_conn_id) - - -def send_lineage_to_datahub( - config: DatahubBasicLineageConfig, - operator: "Operator", - inlets: List[_Entity], - outlets: List[_Entity], - context: Dict, -) -> None: - if not config.enabled: - return - - dag: "DAG" = context["dag"] - task: "Operator" = context["task"] - ti: "TaskInstance" = context["task_instance"] - - hook = config.make_emitter_hook() - emitter = hook.make_emitter() - - dataflow = AirflowGenerator.generate_dataflow( - cluster=config.cluster, - dag=dag, - capture_tags=config.capture_tags_info, - capture_owner=config.capture_ownership_info, - ) - dataflow.emit(emitter) - operator.log.info(f"Emitted from Lineage: {dataflow}") - - datajob = AirflowGenerator.generate_datajob( - cluster=config.cluster, - task=task, - dag=dag, - capture_tags=config.capture_tags_info, - capture_owner=config.capture_ownership_info, - ) - datajob.inlets.extend(_entities_to_urn_list(inlets)) - datajob.outlets.extend(_entities_to_urn_list(outlets)) - - datajob.emit(emitter) - operator.log.info(f"Emitted from Lineage: {datajob}") - - if config.capture_executions: - dag_run: "DagRun" = context["dag_run"] - - dpi = AirflowGenerator.run_datajob( - emitter=emitter, - cluster=config.cluster, - ti=ti, - dag=dag, - dag_run=dag_run, - datajob=datajob, - emit_templates=False, - ) - - operator.log.info(f"Emitted from Lineage: {dpi}") - - dpi = AirflowGenerator.complete_datajob( - emitter=emitter, - cluster=config.cluster, - ti=ti, - dag=dag, - dag_run=dag_run, - datajob=datajob, - result=InstanceRunResult.SUCCESS, - end_timestamp_millis=int(datetime.utcnow().timestamp() * 1000), - ) - operator.log.info(f"Emitted from Lineage: {dpi}") - - emitter.flush() +__all__ = ["DatahubBasicLineageConfig"] diff --git a/metadata-ingestion/src/datahub_provider/_plugin.py b/metadata-ingestion/src/datahub_provider/_plugin.py index ed2e4e1c93d80..3d74e715bd644 100644 --- a/metadata-ingestion/src/datahub_provider/_plugin.py +++ b/metadata-ingestion/src/datahub_provider/_plugin.py @@ -1,368 +1,3 @@ -from datahub_provider._airflow_compat import AIRFLOW_PATCHED +from datahub_airflow_plugin.datahub_plugin import DatahubPlugin -import contextlib -import logging -import traceback -from typing import Any, Callable, Iterable, List, Optional, Union - -from airflow.configuration import conf -from airflow.lineage import PIPELINE_OUTLETS -from airflow.models.baseoperator import BaseOperator -from airflow.plugins_manager import AirflowPlugin -from airflow.utils.module_loading import import_string -from cattr import structure - -from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult -from datahub_provider._airflow_shims import MappedOperator, Operator -from datahub_provider.client.airflow_generator import AirflowGenerator -from datahub_provider.hooks.datahub import DatahubGenericHook -from datahub_provider.lineage.datahub import DatahubLineageConfig - -assert AIRFLOW_PATCHED -logger = logging.getLogger(__name__) - -TASK_ON_FAILURE_CALLBACK = "on_failure_callback" -TASK_ON_SUCCESS_CALLBACK = "on_success_callback" - - -def get_lineage_config() -> DatahubLineageConfig: - """Load the lineage config from airflow.cfg.""" - - enabled = conf.get("datahub", "enabled", fallback=True) - datahub_conn_id = conf.get("datahub", "conn_id", fallback="datahub_rest_default") - cluster = conf.get("datahub", "cluster", fallback="prod") - graceful_exceptions = conf.get("datahub", "graceful_exceptions", 
fallback=True) - capture_tags_info = conf.get("datahub", "capture_tags_info", fallback=True) - capture_ownership_info = conf.get( - "datahub", "capture_ownership_info", fallback=True - ) - capture_executions = conf.get("datahub", "capture_executions", fallback=True) - return DatahubLineageConfig( - enabled=enabled, - datahub_conn_id=datahub_conn_id, - cluster=cluster, - graceful_exceptions=graceful_exceptions, - capture_ownership_info=capture_ownership_info, - capture_tags_info=capture_tags_info, - capture_executions=capture_executions, - ) - - -def _task_inlets(operator: "Operator") -> List: - # From Airflow 2.4 _inlets is dropped and inlets used consistently. Earlier it was not the case, so we have to stick there to _inlets - if hasattr(operator, "_inlets"): - return operator._inlets # type: ignore[attr-defined, union-attr] - return operator.inlets - - -def _task_outlets(operator: "Operator") -> List: - # From Airflow 2.4 _outlets is dropped and inlets used consistently. Earlier it was not the case, so we have to stick there to _outlets - # We have to use _outlets because outlets is empty in Airflow < 2.4.0 - if hasattr(operator, "_outlets"): - return operator._outlets # type: ignore[attr-defined, union-attr] - return operator.outlets - - -def get_inlets_from_task(task: BaseOperator, context: Any) -> Iterable[Any]: - # TODO: Fix for https://github.com/apache/airflow/commit/1b1f3fabc5909a447a6277cafef3a0d4ef1f01ae - # in Airflow 2.4. - # TODO: ignore/handle airflow's dataset type in our lineage - - inlets: List[Any] = [] - task_inlets = _task_inlets(task) - # From Airflow 2.3 this should be AbstractOperator but due to compatibility reason lets use BaseOperator - if isinstance(task_inlets, (str, BaseOperator)): - inlets = [ - task_inlets, - ] - - if task_inlets and isinstance(task_inlets, list): - inlets = [] - task_ids = ( - {o for o in task_inlets if isinstance(o, str)} - .union(op.task_id for op in task_inlets if isinstance(op, BaseOperator)) - .intersection(task.get_flat_relative_ids(upstream=True)) - ) - - from airflow.lineage import AUTO - - # pick up unique direct upstream task_ids if AUTO is specified - if AUTO.upper() in task_inlets or AUTO.lower() in task_inlets: - print("Picking up unique direct upstream task_ids as AUTO is specified") - task_ids = task_ids.union( - task_ids.symmetric_difference(task.upstream_task_ids) - ) - - inlets = task.xcom_pull( - context, task_ids=list(task_ids), dag_id=task.dag_id, key=PIPELINE_OUTLETS - ) - - # re-instantiate the obtained inlets - inlets = [ - structure(item["data"], import_string(item["type_name"])) - # _get_instance(structure(item, Metadata)) - for sublist in inlets - if sublist - for item in sublist - ] - - for inlet in task_inlets: - if not isinstance(inlet, str): - inlets.append(inlet) - - return inlets - - -def _make_emit_callback( - logger: logging.Logger, -) -> Callable[[Optional[Exception], str], None]: - def emit_callback(err: Optional[Exception], msg: str) -> None: - if err: - logger.error(f"Error sending metadata to datahub: {msg}", exc_info=err) - - return emit_callback - - -def datahub_task_status_callback(context, status): - ti = context["ti"] - task: "BaseOperator" = ti.task - dag = context["dag"] - - # This code is from the original airflow lineage code -> - # https://github.com/apache/airflow/blob/main/airflow/lineage/__init__.py - inlets = get_inlets_from_task(task, context) - - emitter = ( - DatahubGenericHook(context["_datahub_config"].datahub_conn_id) - .get_underlying_hook() - .make_emitter() - ) - - dataflow = 
AirflowGenerator.generate_dataflow( - cluster=context["_datahub_config"].cluster, - dag=dag, - capture_tags=context["_datahub_config"].capture_tags_info, - capture_owner=context["_datahub_config"].capture_ownership_info, - ) - task.log.info(f"Emitting Datahub Dataflow: {dataflow}") - dataflow.emit(emitter, callback=_make_emit_callback(task.log)) - - datajob = AirflowGenerator.generate_datajob( - cluster=context["_datahub_config"].cluster, - task=task, - dag=dag, - capture_tags=context["_datahub_config"].capture_tags_info, - capture_owner=context["_datahub_config"].capture_ownership_info, - ) - - for inlet in inlets: - datajob.inlets.append(inlet.urn) - - task_outlets = _task_outlets(task) - for outlet in task_outlets: - datajob.outlets.append(outlet.urn) - - task.log.info(f"Emitting Datahub Datajob: {datajob}") - datajob.emit(emitter, callback=_make_emit_callback(task.log)) - - if context["_datahub_config"].capture_executions: - dpi = AirflowGenerator.run_datajob( - emitter=emitter, - cluster=context["_datahub_config"].cluster, - ti=context["ti"], - dag=dag, - dag_run=context["dag_run"], - datajob=datajob, - start_timestamp_millis=int(ti.start_date.timestamp() * 1000), - ) - - task.log.info(f"Emitted Start Datahub Dataprocess Instance: {dpi}") - - dpi = AirflowGenerator.complete_datajob( - emitter=emitter, - cluster=context["_datahub_config"].cluster, - ti=context["ti"], - dag_run=context["dag_run"], - result=status, - dag=dag, - datajob=datajob, - end_timestamp_millis=int(ti.end_date.timestamp() * 1000), - ) - task.log.info(f"Emitted Completed Data Process Instance: {dpi}") - - emitter.flush() - - -def datahub_pre_execution(context): - ti = context["ti"] - task: "BaseOperator" = ti.task - dag = context["dag"] - - task.log.info("Running Datahub pre_execute method") - - emitter = ( - DatahubGenericHook(context["_datahub_config"].datahub_conn_id) - .get_underlying_hook() - .make_emitter() - ) - - # This code is from the original airflow lineage code -> - # https://github.com/apache/airflow/blob/main/airflow/lineage/__init__.py - inlets = get_inlets_from_task(task, context) - - datajob = AirflowGenerator.generate_datajob( - cluster=context["_datahub_config"].cluster, - task=context["ti"].task, - dag=dag, - capture_tags=context["_datahub_config"].capture_tags_info, - capture_owner=context["_datahub_config"].capture_ownership_info, - ) - - for inlet in inlets: - datajob.inlets.append(inlet.urn) - - task_outlets = _task_outlets(task) - - for outlet in task_outlets: - datajob.outlets.append(outlet.urn) - - task.log.info(f"Emitting Datahub dataJob {datajob}") - datajob.emit(emitter, callback=_make_emit_callback(task.log)) - - if context["_datahub_config"].capture_executions: - dpi = AirflowGenerator.run_datajob( - emitter=emitter, - cluster=context["_datahub_config"].cluster, - ti=context["ti"], - dag=dag, - dag_run=context["dag_run"], - datajob=datajob, - start_timestamp_millis=int(ti.start_date.timestamp() * 1000), - ) - - task.log.info(f"Emitting Datahub Dataprocess Instance: {dpi}") - - emitter.flush() - - -def _wrap_pre_execution(pre_execution): - def custom_pre_execution(context): - config = get_lineage_config() - if config.enabled: - context["_datahub_config"] = config - datahub_pre_execution(context) - - # Call original policy - if pre_execution: - pre_execution(context) - - return custom_pre_execution - - -def _wrap_on_failure_callback(on_failure_callback): - def custom_on_failure_callback(context): - config = get_lineage_config() - if config.enabled: - context["_datahub_config"] = config 
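# Illustrative sketch (not part of this patch) of the callback-wrapping pattern used in the
# surrounding _wrap_on_failure_callback/_wrap_on_success_callback code: emit task status to
# DataHub first, swallow errors when graceful_exceptions is enabled, then invoke whatever
# callback the user originally configured. `emit_to_datahub` is a hypothetical stand-in for
# datahub_task_status_callback.
import traceback
from typing import Callable, Optional


def wrap_callback(
    original: Optional[Callable], emit_to_datahub: Callable, graceful: bool = True
) -> Callable:
    def wrapped(context):
        try:
            emit_to_datahub(context)  # emit lineage/run status first
        except Exception:
            if not graceful:
                raise  # surface the error if graceful_exceptions is turned off
            print(f"Exception: {traceback.format_exc()}")
        if original:
            original(context)  # always run the user's original callback afterwards

    return wrapped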
- try: - datahub_task_status_callback(context, status=InstanceRunResult.FAILURE) - except Exception as e: - if not config.graceful_exceptions: - raise e - else: - print(f"Exception: {traceback.format_exc()}") - - # Call original policy - if on_failure_callback: - on_failure_callback(context) - - return custom_on_failure_callback - - -def _wrap_on_success_callback(on_success_callback): - def custom_on_success_callback(context): - config = get_lineage_config() - if config.enabled: - context["_datahub_config"] = config - try: - datahub_task_status_callback(context, status=InstanceRunResult.SUCCESS) - except Exception as e: - if not config.graceful_exceptions: - raise e - else: - print(f"Exception: {traceback.format_exc()}") - - # Call original policy - if on_success_callback: - on_success_callback(context) - - return custom_on_success_callback - - -def task_policy(task: Union[BaseOperator, MappedOperator]) -> None: - task.log.debug(f"Setting task policy for Dag: {task.dag_id} Task: {task.task_id}") - # task.add_inlets(["auto"]) - # task.pre_execute = _wrap_pre_execution(task.pre_execute) - - # MappedOperator's callbacks don't have setters until Airflow 2.X.X - # https://github.com/apache/airflow/issues/24547 - # We can bypass this by going through partial_kwargs for now - if MappedOperator and isinstance(task, MappedOperator): # type: ignore - on_failure_callback_prop: property = getattr( - MappedOperator, TASK_ON_FAILURE_CALLBACK - ) - on_success_callback_prop: property = getattr( - MappedOperator, TASK_ON_SUCCESS_CALLBACK - ) - if not on_failure_callback_prop.fset or not on_success_callback_prop.fset: - task.log.debug( - "Using MappedOperator's partial_kwargs instead of callback properties" - ) - task.partial_kwargs[TASK_ON_FAILURE_CALLBACK] = _wrap_on_failure_callback( - task.on_failure_callback - ) - task.partial_kwargs[TASK_ON_SUCCESS_CALLBACK] = _wrap_on_success_callback( - task.on_success_callback - ) - return - - task.on_failure_callback = _wrap_on_failure_callback(task.on_failure_callback) # type: ignore - task.on_success_callback = _wrap_on_success_callback(task.on_success_callback) # type: ignore - # task.pre_execute = _wrap_pre_execution(task.pre_execute) - - -def _wrap_task_policy(policy): - if policy and hasattr(policy, "_task_policy_patched_by"): - return policy - - def custom_task_policy(task): - policy(task) - task_policy(task) - - # Add a flag to the policy to indicate that we've patched it. 
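# Illustrative sketch of the idempotent-patching idea used just below: the wrapper is tagged
# with a marker attribute so that patching both airflow_local_settings and the dagbag settings
# (or re-importing the plugin) does not stack wrappers. Function names here are generic
# stand-ins, not the plugin's actual symbols.
def patch_task_policy(existing_policy, datahub_policy):
    """Return a policy that runs the user's policy first, then DataHub's, exactly once."""
    if getattr(existing_policy, "_task_policy_patched_by", None) == "datahub_plugin":
        return existing_policy  # already wrapped; avoid double-patching

    def combined_policy(task):
        if existing_policy:
            existing_policy(task)  # preserve any user-defined cluster policy
        datahub_policy(task)  # then apply DataHub's task policy

    combined_policy._task_policy_patched_by = "datahub_plugin"  # marker checked above
    return combined_policy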
- custom_task_policy._task_policy_patched_by = "datahub_plugin" # type: ignore[attr-defined] - return custom_task_policy - - -def _patch_policy(settings): - if hasattr(settings, "task_policy"): - datahub_task_policy = _wrap_task_policy(settings.task_policy) - settings.task_policy = datahub_task_policy - - -def _patch_datahub_policy(): - with contextlib.suppress(ImportError): - import airflow_local_settings - - _patch_policy(airflow_local_settings) - - from airflow.models.dagbag import settings - - _patch_policy(settings) - - -_patch_datahub_policy() - - -class DatahubPlugin(AirflowPlugin): - name = "datahub_plugin" +__all__ = ["DatahubPlugin"] diff --git a/metadata-ingestion/src/datahub_provider/client/airflow_generator.py b/metadata-ingestion/src/datahub_provider/client/airflow_generator.py index d2d29b00d244f..d50ae152f2b1e 100644 --- a/metadata-ingestion/src/datahub_provider/client/airflow_generator.py +++ b/metadata-ingestion/src/datahub_provider/client/airflow_generator.py @@ -1,509 +1,3 @@ -from datahub_provider._airflow_compat import AIRFLOW_PATCHED +from datahub_airflow_plugin.client.airflow_generator import AirflowGenerator -from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union, cast - -from airflow.configuration import conf - -from datahub.api.entities.datajob import DataFlow, DataJob -from datahub.api.entities.dataprocess.dataprocess_instance import ( - DataProcessInstance, - InstanceRunResult, -) -from datahub.metadata.schema_classes import DataProcessTypeClass -from datahub.utilities.urns.data_flow_urn import DataFlowUrn -from datahub.utilities.urns.data_job_urn import DataJobUrn - -assert AIRFLOW_PATCHED - -if TYPE_CHECKING: - from airflow import DAG - from airflow.models import DagRun, TaskInstance - - from datahub.emitter.kafka_emitter import DatahubKafkaEmitter - from datahub.emitter.rest_emitter import DatahubRestEmitter - from datahub_provider._airflow_shims import Operator - - -def _task_downstream_task_ids(operator: "Operator") -> Set[str]: - if hasattr(operator, "downstream_task_ids"): - return operator.downstream_task_ids - return operator._downstream_task_id # type: ignore[attr-defined,union-attr] - - -class AirflowGenerator: - @staticmethod - def _get_dependencies( - task: "Operator", dag: "DAG", flow_urn: DataFlowUrn - ) -> List[DataJobUrn]: - from datahub_provider._airflow_shims import ExternalTaskSensor - - # resolve URNs for upstream nodes in subdags upstream of the current task. - upstream_subdag_task_urns: List[DataJobUrn] = [] - - for upstream_task_id in task.upstream_task_ids: - upstream_task = dag.task_dict[upstream_task_id] - - # if upstream task is not a subdag, then skip it - upstream_subdag = getattr(upstream_task, "subdag", None) - if upstream_subdag is None: - continue - - # else, link the leaf tasks of the upstream subdag as upstream tasks - for upstream_subdag_task_id in upstream_subdag.task_dict: - upstream_subdag_task = upstream_subdag.task_dict[ - upstream_subdag_task_id - ] - - upstream_subdag_task_urn = DataJobUrn.create_from_ids( - job_id=upstream_subdag_task_id, data_flow_urn=str(flow_urn) - ) - - # if subdag task is a leaf task, then link it as an upstream task - if len(_task_downstream_task_ids(upstream_subdag_task)) == 0: - upstream_subdag_task_urns.append(upstream_subdag_task_urn) - - # resolve URNs for upstream nodes that trigger the subdag containing the current task. 
- # (if it is in a subdag at all) - upstream_subdag_triggers: List[DataJobUrn] = [] - - # subdags are always named with 'parent.child' style or Airflow won't run them - # add connection from subdag trigger(s) if subdag task has no upstreams - if ( - dag.is_subdag - and dag.parent_dag is not None - and len(task.upstream_task_ids) == 0 - ): - # filter through the parent dag's tasks and find the subdag trigger(s) - subdags = [ - x for x in dag.parent_dag.task_dict.values() if x.subdag is not None - ] - matched_subdags = [ - x for x in subdags if x.subdag and x.subdag.dag_id == dag.dag_id - ] - - # id of the task containing the subdag - subdag_task_id = matched_subdags[0].task_id - - # iterate through the parent dag's tasks and find the ones that trigger the subdag - for upstream_task_id in dag.parent_dag.task_dict: - upstream_task = dag.parent_dag.task_dict[upstream_task_id] - upstream_task_urn = DataJobUrn.create_from_ids( - data_flow_urn=str(flow_urn), job_id=upstream_task_id - ) - - # if the task triggers the subdag, link it to this node in the subdag - if subdag_task_id in _task_downstream_task_ids(upstream_task): - upstream_subdag_triggers.append(upstream_task_urn) - - # If the operator is an ExternalTaskSensor then we set the remote task as upstream. - # It is possible to tie an external sensor to DAG if external_task_id is omitted but currently we can't tie - # jobflow to anothet jobflow. - external_task_upstreams = [] - if task.task_type == "ExternalTaskSensor": - task = cast(ExternalTaskSensor, task) - if hasattr(task, "external_task_id") and task.external_task_id is not None: - external_task_upstreams = [ - DataJobUrn.create_from_ids( - job_id=task.external_task_id, - data_flow_urn=str( - DataFlowUrn.create_from_ids( - orchestrator=flow_urn.get_orchestrator_name(), - flow_id=task.external_dag_id, - env=flow_urn.get_env(), - ) - ), - ) - ] - # exclude subdag operator tasks since these are not emitted, resulting in empty metadata - upstream_tasks = ( - [ - DataJobUrn.create_from_ids(job_id=task_id, data_flow_urn=str(flow_urn)) - for task_id in task.upstream_task_ids - if getattr(dag.task_dict[task_id], "subdag", None) is None - ] - + upstream_subdag_task_urns - + upstream_subdag_triggers - + external_task_upstreams - ) - return upstream_tasks - - @staticmethod - def generate_dataflow( - cluster: str, - dag: "DAG", - capture_owner: bool = True, - capture_tags: bool = True, - ) -> DataFlow: - """ - Generates a Dataflow object from an Airflow DAG - :param cluster: str - name of the cluster - :param dag: DAG - - :param capture_tags: - :param capture_owner: - :return: DataFlow - Data generated dataflow - """ - id = dag.dag_id - orchestrator = "airflow" - description = f"{dag.description}\n\n{dag.doc_md or ''}" - data_flow = DataFlow( - env=cluster, id=id, orchestrator=orchestrator, description=description - ) - - flow_property_bag: Dict[str, str] = {} - - allowed_flow_keys = [ - "_access_control", - "_concurrency", - "_default_view", - "catchup", - "fileloc", - "is_paused_upon_creation", - "start_date", - "tags", - "timezone", - ] - - for key in allowed_flow_keys: - if hasattr(dag, key): - flow_property_bag[key] = repr(getattr(dag, key)) - - data_flow.properties = flow_property_bag - base_url = conf.get("webserver", "base_url") - data_flow.url = f"{base_url}/tree?dag_id={dag.dag_id}" - - if capture_owner and dag.owner: - data_flow.owners.add(dag.owner) - - if capture_tags and dag.tags: - data_flow.tags.update(dag.tags) - - return data_flow - - @staticmethod - def _get_description(task: 
"Operator") -> Optional[str]: - from airflow.models.baseoperator import BaseOperator - - if not isinstance(task, BaseOperator): - # TODO: Get docs for mapped operators. - return None - - if hasattr(task, "doc") and task.doc: - return task.doc - elif hasattr(task, "doc_md") and task.doc_md: - return task.doc_md - elif hasattr(task, "doc_json") and task.doc_json: - return task.doc_json - elif hasattr(task, "doc_yaml") and task.doc_yaml: - return task.doc_yaml - elif hasattr(task, "doc_rst") and task.doc_yaml: - return task.doc_yaml - return None - - @staticmethod - def generate_datajob( - cluster: str, - task: "Operator", - dag: "DAG", - set_dependencies: bool = True, - capture_owner: bool = True, - capture_tags: bool = True, - ) -> DataJob: - """ - - :param cluster: str - :param task: TaskIntance - :param dag: DAG - :param set_dependencies: bool - whether to extract dependencies from airflow task - :param capture_owner: bool - whether to extract owner from airflow task - :param capture_tags: bool - whether to set tags automatically from airflow task - :return: DataJob - returns the generated DataJob object - """ - dataflow_urn = DataFlowUrn.create_from_ids( - orchestrator="airflow", env=cluster, flow_id=dag.dag_id - ) - datajob = DataJob(id=task.task_id, flow_urn=dataflow_urn) - - # TODO add support for MappedOperator - datajob.description = AirflowGenerator._get_description(task) - - job_property_bag: Dict[str, str] = {} - - allowed_task_keys = [ - "_downstream_task_ids", - "_inlets", - "_outlets", - "_task_type", - "_task_module", - "depends_on_past", - "email", - "label", - "execution_timeout", - "sla", - "sql", - "task_id", - "trigger_rule", - "wait_for_downstream", - # In Airflow 2.3, _downstream_task_ids was renamed to downstream_task_ids - "downstream_task_ids", - # In Airflow 2.4, _inlets and _outlets were removed in favor of non-private versions. 
- "inlets", - "outlets", - ] - - for key in allowed_task_keys: - if hasattr(task, key): - job_property_bag[key] = repr(getattr(task, key)) - - datajob.properties = job_property_bag - base_url = conf.get("webserver", "base_url") - datajob.url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={datajob.flow_urn.get_flow_id()}&_flt_3_task_id={task.task_id}" - - if capture_owner and dag.owner: - datajob.owners.add(dag.owner) - - if capture_tags and dag.tags: - datajob.tags.update(dag.tags) - - if set_dependencies: - datajob.upstream_urns.extend( - AirflowGenerator._get_dependencies( - task=task, dag=dag, flow_urn=datajob.flow_urn - ) - ) - - return datajob - - @staticmethod - def create_datajob_instance( - cluster: str, - task: "Operator", - dag: "DAG", - data_job: Optional[DataJob] = None, - ) -> DataProcessInstance: - if data_job is None: - data_job = AirflowGenerator.generate_datajob(cluster, task=task, dag=dag) - dpi = DataProcessInstance.from_datajob( - datajob=data_job, id=task.task_id, clone_inlets=True, clone_outlets=True - ) - return dpi - - @staticmethod - def run_dataflow( - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], - cluster: str, - dag_run: "DagRun", - start_timestamp_millis: Optional[int] = None, - dataflow: Optional[DataFlow] = None, - ) -> None: - if dataflow is None: - assert dag_run.dag - dataflow = AirflowGenerator.generate_dataflow(cluster, dag_run.dag) - - if start_timestamp_millis is None: - assert dag_run.execution_date - start_timestamp_millis = int(dag_run.execution_date.timestamp() * 1000) - - assert dag_run.run_id - dpi = DataProcessInstance.from_dataflow(dataflow=dataflow, id=dag_run.run_id) - - # This property only exists in Airflow2 - if hasattr(dag_run, "run_type"): - from airflow.utils.types import DagRunType - - if dag_run.run_type == DagRunType.SCHEDULED: - dpi.type = DataProcessTypeClass.BATCH_SCHEDULED - elif dag_run.run_type == DagRunType.MANUAL: - dpi.type = DataProcessTypeClass.BATCH_AD_HOC - else: - if dag_run.run_id.startswith("scheduled__"): - dpi.type = DataProcessTypeClass.BATCH_SCHEDULED - else: - dpi.type = DataProcessTypeClass.BATCH_AD_HOC - - property_bag: Dict[str, str] = {} - property_bag["run_id"] = str(dag_run.run_id) - property_bag["execution_date"] = str(dag_run.execution_date) - property_bag["end_date"] = str(dag_run.end_date) - property_bag["start_date"] = str(dag_run.start_date) - property_bag["creating_job_id"] = str(dag_run.creating_job_id) - property_bag["data_interval_start"] = str(dag_run.data_interval_start) - property_bag["data_interval_end"] = str(dag_run.data_interval_end) - property_bag["external_trigger"] = str(dag_run.external_trigger) - dpi.properties.update(property_bag) - - dpi.emit_process_start( - emitter=emitter, start_timestamp_millis=start_timestamp_millis - ) - - @staticmethod - def complete_dataflow( - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], - cluster: str, - dag_run: "DagRun", - end_timestamp_millis: Optional[int] = None, - dataflow: Optional[DataFlow] = None, - ) -> None: - """ - - :param emitter: DatahubRestEmitter - the datahub rest emitter to emit the generated mcps - :param cluster: str - name of the cluster - :param dag_run: DagRun - :param end_timestamp_millis: Optional[int] - the completion time in milliseconds if not set the current time will be used. 
- :param dataflow: Optional[Dataflow] - """ - if dataflow is None: - assert dag_run.dag - dataflow = AirflowGenerator.generate_dataflow(cluster, dag_run.dag) - - assert dag_run.run_id - dpi = DataProcessInstance.from_dataflow(dataflow=dataflow, id=dag_run.run_id) - if end_timestamp_millis is None: - if dag_run.end_date is None: - raise Exception( - f"Dag {dag_run.dag_id}_{dag_run.run_id} is still running and unable to get end_date..." - ) - end_timestamp_millis = int(dag_run.end_date.timestamp() * 1000) - - # We should use DagRunState but it is not available in Airflow 1 - if dag_run.state == "success": - result = InstanceRunResult.SUCCESS - elif dag_run.state == "failed": - result = InstanceRunResult.FAILURE - else: - raise Exception( - f"Result should be either success or failure and it was {dag_run.state}" - ) - - dpi.emit_process_end( - emitter=emitter, - end_timestamp_millis=end_timestamp_millis, - result=result, - result_type="airflow", - ) - - @staticmethod - def run_datajob( - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], - cluster: str, - ti: "TaskInstance", - dag: "DAG", - dag_run: "DagRun", - start_timestamp_millis: Optional[int] = None, - datajob: Optional[DataJob] = None, - attempt: Optional[int] = None, - emit_templates: bool = True, - ) -> DataProcessInstance: - if datajob is None: - datajob = AirflowGenerator.generate_datajob(cluster, ti.task, dag) - - assert dag_run.run_id - dpi = DataProcessInstance.from_datajob( - datajob=datajob, - id=f"{dag.dag_id}_{ti.task_id}_{dag_run.run_id}", - clone_inlets=True, - clone_outlets=True, - ) - job_property_bag: Dict[str, str] = {} - job_property_bag["run_id"] = str(dag_run.run_id) - job_property_bag["duration"] = str(ti.duration) - job_property_bag["start_date"] = str(ti.start_date) - job_property_bag["end_date"] = str(ti.end_date) - job_property_bag["execution_date"] = str(ti.execution_date) - job_property_bag["try_number"] = str(ti.try_number - 1) - job_property_bag["hostname"] = str(ti.hostname) - job_property_bag["max_tries"] = str(ti.max_tries) - # Not compatible with Airflow 1 - if hasattr(ti, "external_executor_id"): - job_property_bag["external_executor_id"] = str(ti.external_executor_id) - job_property_bag["pid"] = str(ti.pid) - job_property_bag["state"] = str(ti.state) - job_property_bag["operator"] = str(ti.operator) - job_property_bag["priority_weight"] = str(ti.priority_weight) - job_property_bag["unixname"] = str(ti.unixname) - job_property_bag["log_url"] = ti.log_url - dpi.properties.update(job_property_bag) - dpi.url = ti.log_url - - # This property only exists in Airflow2 - if hasattr(ti, "dag_run") and hasattr(ti.dag_run, "run_type"): - from airflow.utils.types import DagRunType - - if ti.dag_run.run_type == DagRunType.SCHEDULED: - dpi.type = DataProcessTypeClass.BATCH_SCHEDULED - elif ti.dag_run.run_type == DagRunType.MANUAL: - dpi.type = DataProcessTypeClass.BATCH_AD_HOC - else: - if dag_run.run_id.startswith("scheduled__"): - dpi.type = DataProcessTypeClass.BATCH_SCHEDULED - else: - dpi.type = DataProcessTypeClass.BATCH_AD_HOC - - if start_timestamp_millis is None: - assert ti.start_date - start_timestamp_millis = int(ti.start_date.timestamp() * 1000) - - if attempt is None: - attempt = ti.try_number - - dpi.emit_process_start( - emitter=emitter, - start_timestamp_millis=start_timestamp_millis, - attempt=attempt, - emit_template=emit_templates, - ) - return dpi - - @staticmethod - def complete_datajob( - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], - cluster: str, - ti: 
"TaskInstance", - dag: "DAG", - dag_run: "DagRun", - end_timestamp_millis: Optional[int] = None, - result: Optional[InstanceRunResult] = None, - datajob: Optional[DataJob] = None, - ) -> DataProcessInstance: - """ - - :param emitter: DatahubRestEmitter - :param cluster: str - :param ti: TaskInstance - :param dag: DAG - :param dag_run: DagRun - :param end_timestamp_millis: Optional[int] - :param result: Optional[str] One of the result from datahub.metadata.schema_class.RunResultTypeClass - :param datajob: Optional[DataJob] - :return: DataProcessInstance - """ - if datajob is None: - datajob = AirflowGenerator.generate_datajob(cluster, ti.task, dag) - - if end_timestamp_millis is None: - assert ti.end_date - end_timestamp_millis = int(ti.end_date.timestamp() * 1000) - - if result is None: - # We should use TaskInstanceState but it is not available in Airflow 1 - if ti.state == "success": - result = InstanceRunResult.SUCCESS - elif ti.state == "failed": - result = InstanceRunResult.FAILURE - else: - raise Exception( - f"Result should be either success or failure and it was {ti.state}" - ) - - dpi = DataProcessInstance.from_datajob( - datajob=datajob, - id=f"{dag.dag_id}_{ti.task_id}_{dag_run.run_id}", - clone_inlets=True, - clone_outlets=True, - ) - dpi.emit_process_end( - emitter=emitter, - end_timestamp_millis=end_timestamp_millis, - result=result, - result_type="airflow", - ) - return dpi +__all__ = ["AirflowGenerator"] diff --git a/metadata-ingestion/src/datahub_provider/entities.py b/metadata-ingestion/src/datahub_provider/entities.py index bfccc2f22eeb8..13be4ecdad655 100644 --- a/metadata-ingestion/src/datahub_provider/entities.py +++ b/metadata-ingestion/src/datahub_provider/entities.py @@ -1,48 +1,3 @@ -from abc import abstractmethod -from typing import Optional +from datahub_airflow_plugin.entities import Dataset, Urn, _Entity -import attr - -import datahub.emitter.mce_builder as builder -from datahub.utilities.urns.urn import guess_entity_type - - -class _Entity: - @property - @abstractmethod - def urn(self) -> str: - pass - - -@attr.s(auto_attribs=True, str=True) -class Dataset(_Entity): - platform: str - name: str - env: str = builder.DEFAULT_ENV - platform_instance: Optional[str] = None - - @property - def urn(self): - return builder.make_dataset_urn_with_platform_instance( - platform=self.platform, - name=self.name, - platform_instance=self.platform_instance, - env=self.env, - ) - - -@attr.s(str=True) -class Urn(_Entity): - _urn: str = attr.ib() - - @_urn.validator - def _validate_urn(self, attribute, value): - if not value.startswith("urn:"): - raise ValueError("invalid urn provided: urns must start with 'urn:'") - if guess_entity_type(value) != "dataset": - # This is because DataJobs only support Dataset lineage. 
- raise ValueError("Airflow lineage currently only supports datasets") - - @property - def urn(self): - return self._urn +__all__ = ["_Entity", "Dataset", "Urn"] diff --git a/metadata-ingestion/src/datahub_provider/hooks/datahub.py b/metadata-ingestion/src/datahub_provider/hooks/datahub.py index e2e523fc5d6af..949d98ce631ed 100644 --- a/metadata-ingestion/src/datahub_provider/hooks/datahub.py +++ b/metadata-ingestion/src/datahub_provider/hooks/datahub.py @@ -1,216 +1,8 @@ -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union - -from airflow.exceptions import AirflowException -from airflow.hooks.base import BaseHook - -from datahub.metadata.com.linkedin.pegasus2avro.mxe import ( - MetadataChangeEvent, - MetadataChangeProposal, +from datahub_airflow_plugin.hooks.datahub import ( + BaseHook, + DatahubGenericHook, + DatahubKafkaHook, + DatahubRestHook, ) -if TYPE_CHECKING: - from airflow.models.connection import Connection - - from datahub.emitter.kafka_emitter import DatahubKafkaEmitter - from datahub.emitter.rest_emitter import DatahubRestEmitter - from datahub.ingestion.sink.datahub_kafka import KafkaSinkConfig - - -class DatahubRestHook(BaseHook): - """ - Creates a DataHub Rest API connection used to send metadata to DataHub. - Takes the endpoint for your DataHub Rest API in the Server Endpoint(host) field. - - URI example: :: - - AIRFLOW_CONN_DATAHUB_REST_DEFAULT='datahub-rest://rest-endpoint' - - :param datahub_rest_conn_id: Reference to the DataHub Rest connection. - :type datahub_rest_conn_id: str - """ - - conn_name_attr = "datahub_rest_conn_id" - default_conn_name = "datahub_rest_default" - conn_type = "datahub_rest" - hook_name = "DataHub REST Server" - - def __init__(self, datahub_rest_conn_id: str = default_conn_name) -> None: - super().__init__() - self.datahub_rest_conn_id = datahub_rest_conn_id - - @staticmethod - def get_connection_form_widgets() -> Dict[str, Any]: - return {} - - @staticmethod - def get_ui_field_behaviour() -> Dict: - """Returns custom field behavior""" - return { - "hidden_fields": ["port", "schema", "login"], - "relabeling": { - "host": "Server Endpoint", - }, - } - - def _get_config(self) -> Tuple[str, Optional[str], Optional[int]]: - conn: "Connection" = self.get_connection(self.datahub_rest_conn_id) - - host = conn.host - if not host: - raise AirflowException("host parameter is required") - if conn.port: - if ":" in host: - raise AirflowException( - "host parameter should not contain a port number if the port is specified separately" - ) - host = f"{host}:{conn.port}" - password = conn.password - timeout_sec = conn.extra_dejson.get("timeout_sec") - return (host, password, timeout_sec) - - def make_emitter(self) -> "DatahubRestEmitter": - import datahub.emitter.rest_emitter - - return datahub.emitter.rest_emitter.DatahubRestEmitter(*self._get_config()) - - def emit_mces(self, mces: List[MetadataChangeEvent]) -> None: - emitter = self.make_emitter() - - for mce in mces: - emitter.emit_mce(mce) - - def emit_mcps(self, mcps: List[MetadataChangeProposal]) -> None: - emitter = self.make_emitter() - - for mce in mcps: - emitter.emit_mcp(mce) - - -class DatahubKafkaHook(BaseHook): - """ - Creates a DataHub Kafka connection used to send metadata to DataHub. - Takes your kafka broker in the Kafka Broker(host) field. - - URI example: :: - - AIRFLOW_CONN_DATAHUB_KAFKA_DEFAULT='datahub-kafka://kafka-broker' - - :param datahub_kafka_conn_id: Reference to the DataHub Kafka connection. 
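# Illustrative sketch of wiring up the two connection types these hooks expect, using
# Airflow's AIRFLOW_CONN_* environment-variable convention (matching the URI examples in the
# hook docstrings). Hostnames/ports are placeholders; for the REST hook the connection
# password is what gets passed through to the emitter (typically a DataHub access token).
import os

# REST hook: the host field carries the DataHub GMS endpoint.
os.environ["AIRFLOW_CONN_DATAHUB_REST_DEFAULT"] = "datahub-rest://datahub-gms:8080"

# Kafka hook: the host field carries the broker; extra sink options can go into the
# connection's extras JSON.
os.environ["AIRFLOW_CONN_DATAHUB_KAFKA_DEFAULT"] = "datahub-kafka://broker:9092"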
- :type datahub_kafka_conn_id: str - """ - - conn_name_attr = "datahub_kafka_conn_id" - default_conn_name = "datahub_kafka_default" - conn_type = "datahub_kafka" - hook_name = "DataHub Kafka Sink" - - def __init__(self, datahub_kafka_conn_id: str = default_conn_name) -> None: - super().__init__() - self.datahub_kafka_conn_id = datahub_kafka_conn_id - - @staticmethod - def get_connection_form_widgets() -> Dict[str, Any]: - return {} - - @staticmethod - def get_ui_field_behaviour() -> Dict: - """Returns custom field behavior""" - return { - "hidden_fields": ["port", "schema", "login", "password"], - "relabeling": { - "host": "Kafka Broker", - }, - } - - def _get_config(self) -> "KafkaSinkConfig": - import datahub.ingestion.sink.datahub_kafka - - conn = self.get_connection(self.datahub_kafka_conn_id) - obj = conn.extra_dejson - obj.setdefault("connection", {}) - if conn.host is not None: - if "bootstrap" in obj["connection"]: - raise AirflowException( - "Kafka broker specified twice (present in host and extra)" - ) - obj["connection"]["bootstrap"] = ":".join( - map(str, filter(None, [conn.host, conn.port])) - ) - config = datahub.ingestion.sink.datahub_kafka.KafkaSinkConfig.parse_obj(obj) - return config - - def make_emitter(self) -> "DatahubKafkaEmitter": - import datahub.emitter.kafka_emitter - - sink_config = self._get_config() - return datahub.emitter.kafka_emitter.DatahubKafkaEmitter(sink_config) - - def emit_mces(self, mces: List[MetadataChangeEvent]) -> None: - emitter = self.make_emitter() - errors = [] - - def callback(exc, msg): - if exc: - errors.append(exc) - - for mce in mces: - emitter.emit_mce_async(mce, callback) - - emitter.flush() - - if errors: - raise AirflowException(f"failed to push some MCEs: {errors}") - - def emit_mcps(self, mcps: List[MetadataChangeProposal]) -> None: - emitter = self.make_emitter() - errors = [] - - def callback(exc, msg): - if exc: - errors.append(exc) - - for mcp in mcps: - emitter.emit_mcp_async(mcp, callback) - - emitter.flush() - - if errors: - raise AirflowException(f"failed to push some MCPs: {errors}") - - -class DatahubGenericHook(BaseHook): - """ - Emits Metadata Change Events using either the DatahubRestHook or the - DatahubKafkaHook. Set up a DataHub Rest or Kafka connection to use. - - :param datahub_conn_id: Reference to the DataHub connection. - :type datahub_conn_id: str - """ - - def __init__(self, datahub_conn_id: str) -> None: - super().__init__() - self.datahub_conn_id = datahub_conn_id - - def get_underlying_hook(self) -> Union[DatahubRestHook, DatahubKafkaHook]: - conn = self.get_connection(self.datahub_conn_id) - - # We need to figure out the underlying hook type. First check the - # conn_type. If that fails, attempt to guess using the conn id name. 
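# Illustrative sketch of how callers use DatahubGenericHook: the resolution to a REST or
# Kafka hook happens via the conn_type / conn-id-name logic below, after which emit_mces()
# and make_emitter() behave the same regardless of transport. The connection id is a
# placeholder; the import uses the new datahub_airflow_plugin location introduced here.
from datahub_airflow_plugin.hooks.datahub import DatahubGenericHook


def emit_events(mces: list, conn_id: str = "datahub_rest_default") -> None:
    hook = DatahubGenericHook(conn_id)
    hook.emit_mces(mces)  # delegates to the resolved DatahubRestHook or DatahubKafkaHook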
- if conn.conn_type == DatahubRestHook.conn_type: - return DatahubRestHook(self.datahub_conn_id) - elif conn.conn_type == DatahubKafkaHook.conn_type: - return DatahubKafkaHook(self.datahub_conn_id) - elif "rest" in self.datahub_conn_id: - return DatahubRestHook(self.datahub_conn_id) - elif "kafka" in self.datahub_conn_id: - return DatahubKafkaHook(self.datahub_conn_id) - else: - raise AirflowException( - f"DataHub cannot handle conn_type {conn.conn_type} in {conn}" - ) - - def make_emitter(self) -> Union["DatahubRestEmitter", "DatahubKafkaEmitter"]: - return self.get_underlying_hook().make_emitter() - - def emit_mces(self, mces: List[MetadataChangeEvent]) -> None: - return self.get_underlying_hook().emit_mces(mces) +__all__ = ["DatahubRestHook", "DatahubKafkaHook", "DatahubGenericHook", "BaseHook"] diff --git a/metadata-ingestion/src/datahub_provider/lineage/datahub.py b/metadata-ingestion/src/datahub_provider/lineage/datahub.py index 009ce4bb29a97..ffe1adb8255b2 100644 --- a/metadata-ingestion/src/datahub_provider/lineage/datahub.py +++ b/metadata-ingestion/src/datahub_provider/lineage/datahub.py @@ -1,91 +1,6 @@ -import json -from typing import TYPE_CHECKING, Dict, List, Optional - -from airflow.configuration import conf -from airflow.lineage.backend import LineageBackend - -from datahub_provider._lineage_core import ( - DatahubBasicLineageConfig, - send_lineage_to_datahub, +from datahub_airflow_plugin.lineage.datahub import ( + DatahubLineageBackend, + DatahubLineageConfig, ) -if TYPE_CHECKING: - from airflow.models.baseoperator import BaseOperator - - -class DatahubLineageConfig(DatahubBasicLineageConfig): - # If set to true, most runtime errors in the lineage backend will be - # suppressed and will not cause the overall task to fail. Note that - # configuration issues will still throw exceptions. - graceful_exceptions: bool = True - - -def get_lineage_config() -> DatahubLineageConfig: - """Load the lineage config from airflow.cfg.""" - - # The kwargs pattern is also used for secret backends. - kwargs_str = conf.get("lineage", "datahub_kwargs", fallback="{}") - kwargs = json.loads(kwargs_str) - - # Continue to support top-level datahub_conn_id config. - datahub_conn_id = conf.get("lineage", "datahub_conn_id", fallback=None) - if datahub_conn_id: - kwargs["datahub_conn_id"] = datahub_conn_id - - return DatahubLineageConfig.parse_obj(kwargs) - - -class DatahubLineageBackend(LineageBackend): - """ - Sends lineage data from tasks to DataHub. - - Configurable via ``airflow.cfg`` as follows: :: - - # For REST-based: - airflow connections add --conn-type 'datahub_rest' 'datahub_rest_default' --conn-host 'http://localhost:8080' - # For Kafka-based (standard Kafka sink config can be passed via extras): - airflow connections add --conn-type 'datahub_kafka' 'datahub_kafka_default' --conn-host 'broker:9092' --conn-extra '{}' - - [lineage] - backend = datahub_provider.lineage.datahub.DatahubLineageBackend - datahub_kwargs = { - "datahub_conn_id": "datahub_rest_default", - "capture_ownership_info": true, - "capture_tags_info": true, - "graceful_exceptions": true } - # The above indentation is important! - """ - - def __init__(self) -> None: - super().__init__() - - # By attempting to get and parse the config, we can detect configuration errors - # ahead of time. The init method is only called in Airflow 2.x. - _ = get_lineage_config() - - # With Airflow 2.0, this can be an instance method. 
However, with Airflow 1.10.x, this - # method is used statically, even though LineageBackend declares it as an instance variable. - @staticmethod - def send_lineage( - operator: "BaseOperator", - inlets: Optional[List] = None, # unused - outlets: Optional[List] = None, # unused - context: Optional[Dict] = None, - ) -> None: - config = get_lineage_config() - if not config.enabled: - return - - try: - context = context or {} # ensure not None to satisfy mypy - send_lineage_to_datahub( - config, operator, operator.inlets, operator.outlets, context - ) - except Exception as e: - if config.graceful_exceptions: - operator.log.error(e) - operator.log.info( - "Suppressing error because graceful_exceptions is set" - ) - else: - raise +__all__ = ["DatahubLineageBackend", "DatahubLineageConfig"] diff --git a/metadata-ingestion/src/datahub_provider/operators/datahub.py b/metadata-ingestion/src/datahub_provider/operators/datahub.py index cd1d5187e6d85..08b1807cd4614 100644 --- a/metadata-ingestion/src/datahub_provider/operators/datahub.py +++ b/metadata-ingestion/src/datahub_provider/operators/datahub.py @@ -1,63 +1,6 @@ -from typing import List, Union - -from airflow.models import BaseOperator -from airflow.utils.decorators import apply_defaults - -from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent -from datahub_provider.hooks.datahub import ( - DatahubGenericHook, - DatahubKafkaHook, - DatahubRestHook, +from datahub_airflow_plugin.operators.datahub import ( + DatahubBaseOperator, + DatahubEmitterOperator, ) - -class DatahubBaseOperator(BaseOperator): - """ - The DatahubBaseOperator is used as a base operator all DataHub operators. - """ - - ui_color = "#4398c8" - - hook: Union[DatahubRestHook, DatahubKafkaHook] - - # mypy is not a fan of this. Newer versions of Airflow support proper typing for the decorator - # using PEP 612. However, there is not yet a good way to inherit the types of the kwargs from - # the superclass. - @apply_defaults # type: ignore[misc] - def __init__( # type: ignore[no-untyped-def] - self, - *, - datahub_conn_id: str, - **kwargs, - ): - super().__init__(**kwargs) - - self.datahub_conn_id = datahub_conn_id - self.generic_hook = DatahubGenericHook(datahub_conn_id) - - -class DatahubEmitterOperator(DatahubBaseOperator): - """ - Emits a Metadata Change Event to DataHub using either a DataHub - Rest or Kafka connection. - - :param datahub_conn_id: Reference to the DataHub Rest or Kafka Connection. - :type datahub_conn_id: str - """ - - # See above for why these mypy type issues are ignored here. 
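# Illustrative usage sketch for DatahubEmitterOperator: emit a hand-built MCE from a DAG
# using a configured DataHub connection. Assumes the mce_builder helpers
# (make_dataset_urn/make_lineage_mce) from the core datahub package; task_id, connection id,
# and dataset names are placeholders, and the import uses the new plugin location.
import datahub.emitter.mce_builder as builder
from datahub_airflow_plugin.operators.datahub import DatahubEmitterOperator

emit_lineage = DatahubEmitterOperator(
    task_id="emit_lineage",
    datahub_conn_id="datahub_rest_default",
    mces=[
        builder.make_lineage_mce(
            upstream_urns=[builder.make_dataset_urn("snowflake", "mydb.schema.table_a")],
            downstream_urn=builder.make_dataset_urn("snowflake", "mydb.schema.table_b"),
        )
    ],
)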
- @apply_defaults # type: ignore[misc] - def __init__( # type: ignore[no-untyped-def] - self, - mces: List[MetadataChangeEvent], - datahub_conn_id: str, - **kwargs, - ): - super().__init__( - datahub_conn_id=datahub_conn_id, - **kwargs, - ) - self.mces = mces - - def execute(self, context): - self.generic_hook.get_underlying_hook().emit_mces(self.mces) +__all__ = ["DatahubEmitterOperator", "DatahubBaseOperator"] diff --git a/metadata-ingestion/src/datahub_provider/operators/datahub_assertion_operator.py b/metadata-ingestion/src/datahub_provider/operators/datahub_assertion_operator.py index 28be8ad860179..85469c10f271c 100644 --- a/metadata-ingestion/src/datahub_provider/operators/datahub_assertion_operator.py +++ b/metadata-ingestion/src/datahub_provider/operators/datahub_assertion_operator.py @@ -1,78 +1,5 @@ -import datetime -from typing import Any, List, Optional, Sequence, Union - -from airflow.models import BaseOperator - -from datahub.api.circuit_breaker import ( - AssertionCircuitBreaker, - AssertionCircuitBreakerConfig, +from datahub_airflow_plugin.operators.datahub_assertion_operator import ( + DataHubAssertionOperator, ) -from datahub_provider.hooks.datahub import DatahubRestHook - - -class DataHubAssertionOperator(BaseOperator): - r""" - DataHub Assertion Circuit Breaker Operator. - - :param urn: The DataHub dataset unique identifier. (templated) - :param datahub_rest_conn_id: The REST datahub connection id to communicate with DataHub - which is set as Airflow connection. - :param check_last_assertion_time: If set it checks assertions after the last operation was set on the dataset. - By default it is True. - :param time_delta: If verify_after_last_update is False it checks for assertion within the time delta. - """ - - template_fields: Sequence[str] = ("urn",) - circuit_breaker: AssertionCircuitBreaker - urn: Union[List[str], str] - - def __init__( # type: ignore[no-untyped-def] - self, - *, - urn: Union[List[str], str], - datahub_rest_conn_id: Optional[str] = None, - check_last_assertion_time: bool = True, - time_delta: Optional[datetime.timedelta] = None, - **kwargs, - ) -> None: - super().__init__(**kwargs) - hook: DatahubRestHook - if datahub_rest_conn_id is not None: - hook = DatahubRestHook(datahub_rest_conn_id=datahub_rest_conn_id) - else: - hook = DatahubRestHook() - - host, password, timeout_sec = hook._get_config() - self.urn = urn - config: AssertionCircuitBreakerConfig = AssertionCircuitBreakerConfig( - datahub_host=host, - datahub_token=password, - timeout=timeout_sec, - verify_after_last_update=check_last_assertion_time, - time_delta=time_delta if time_delta else datetime.timedelta(days=1), - ) - - self.circuit_breaker = AssertionCircuitBreaker(config=config) - - def execute(self, context: Any) -> bool: - if "datahub_silence_circuit_breakers" in context["dag_run"].conf: - self.log.info( - "Circuit breaker is silenced because datahub_silence_circuit_breakers config is set" - ) - return True - - self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") - if isinstance(self.urn, str): - urns = [self.urn] - elif isinstance(self.urn, list): - urns = self.urn - else: - raise Exception(f"urn parameter has invalid type {type(self.urn)}") - - for urn in urns: - self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") - ret = self.circuit_breaker.is_circuit_breaker_active(urn=urn) - if ret: - raise Exception(f"Dataset {self.urn} is not in consumable state") - return True +__all__ = ["DataHubAssertionOperator"] diff --git 
a/metadata-ingestion/src/datahub_provider/operators/datahub_assertion_sensor.py b/metadata-ingestion/src/datahub_provider/operators/datahub_assertion_sensor.py index ceb970dd8dc7f..e560ecb6145e0 100644 --- a/metadata-ingestion/src/datahub_provider/operators/datahub_assertion_sensor.py +++ b/metadata-ingestion/src/datahub_provider/operators/datahub_assertion_sensor.py @@ -1,78 +1,5 @@ -import datetime -from typing import Any, List, Optional, Sequence, Union - -from airflow.sensors.base import BaseSensorOperator - -from datahub.api.circuit_breaker import ( - AssertionCircuitBreaker, - AssertionCircuitBreakerConfig, +from datahub_airflow_plugin.operators.datahub_assertion_sensor import ( + DataHubAssertionSensor, ) -from datahub_provider.hooks.datahub import DatahubRestHook - - -class DataHubAssertionSensor(BaseSensorOperator): - r""" - DataHub Assertion Circuit Breaker Sensor. - - :param urn: The DataHub dataset unique identifier. (templated) - :param datahub_rest_conn_id: The REST datahub connection id to communicate with DataHub - which is set as Airflow connection. - :param check_last_assertion_time: If set it checks assertions after the last operation was set on the dataset. - By default it is True. - :param time_delta: If verify_after_last_update is False it checks for assertion within the time delta. - """ - - template_fields: Sequence[str] = ("urn",) - circuit_breaker: AssertionCircuitBreaker - urn: Union[List[str], str] - - def __init__( # type: ignore[no-untyped-def] - self, - *, - urn: Union[List[str], str], - datahub_rest_conn_id: Optional[str] = None, - check_last_assertion_time: bool = True, - time_delta: datetime.timedelta = datetime.timedelta(days=1), - **kwargs, - ) -> None: - super().__init__(**kwargs) - hook: DatahubRestHook - if datahub_rest_conn_id is not None: - hook = DatahubRestHook(datahub_rest_conn_id=datahub_rest_conn_id) - else: - hook = DatahubRestHook() - - host, password, timeout_sec = hook._get_config() - self.urn = urn - config: AssertionCircuitBreakerConfig = AssertionCircuitBreakerConfig( - datahub_host=host, - datahub_token=password, - timeout=timeout_sec, - verify_after_last_update=check_last_assertion_time, - time_delta=time_delta, - ) - self.circuit_breaker = AssertionCircuitBreaker(config=config) - - def poke(self, context: Any) -> bool: - if "datahub_silence_circuit_breakers" in context["dag_run"].conf: - self.log.info( - "Circuit breaker is silenced because datahub_silence_circuit_breakers config is set" - ) - return True - - self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") - if isinstance(self.urn, str): - urns = [self.urn] - elif isinstance(self.urn, list): - urns = self.urn - else: - raise Exception(f"urn parameter has invalid type {type(self.urn)}") - - for urn in urns: - self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") - ret = self.circuit_breaker.is_circuit_breaker_active(urn=urn) - if ret: - self.log.info(f"Dataset {self.urn} is not in consumable state") - return False - return True +__all__ = ["DataHubAssertionSensor"] diff --git a/metadata-ingestion/src/datahub_provider/operators/datahub_operation_operator.py b/metadata-ingestion/src/datahub_provider/operators/datahub_operation_operator.py index 6b2535994c101..6107e70c9eddd 100644 --- a/metadata-ingestion/src/datahub_provider/operators/datahub_operation_operator.py +++ b/metadata-ingestion/src/datahub_provider/operators/datahub_operation_operator.py @@ -1,97 +1,5 @@ -import datetime -from typing import Any, List, Optional, Sequence, Union - 
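# Illustrative usage sketch for the circuit-breaker operator/sensor pair being re-homed here:
# both take a dataset URN (or list of URNs) plus a DataHub REST connection id, and either
# fail the task or keep poking until the dataset's assertions are in a consumable state.
# A run can opt out by setting datahub_silence_circuit_breakers in dag_run.conf. task_id
# values and the URN are placeholders; instantiation assumes the referenced Airflow
# connection exists.
import datetime

from datahub_airflow_plugin.operators.datahub_assertion_operator import DataHubAssertionOperator
from datahub_airflow_plugin.operators.datahub_assertion_sensor import DataHubAssertionSensor

gate = DataHubAssertionOperator(
    task_id="assertion_circuit_breaker",
    urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.table,PROD)",
    datahub_rest_conn_id="datahub_rest_default",
    check_last_assertion_time=True,
)

wait = DataHubAssertionSensor(
    task_id="wait_for_assertions",
    urn="urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.table,PROD)",
    datahub_rest_conn_id="datahub_rest_default",
    time_delta=datetime.timedelta(days=1),
)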
-from airflow.sensors.base import BaseSensorOperator - -from datahub.api.circuit_breaker import ( - OperationCircuitBreaker, - OperationCircuitBreakerConfig, +from datahub_airflow_plugin.operators.datahub_operation_operator import ( + DataHubOperationCircuitBreakerOperator, ) -from datahub_provider.hooks.datahub import DatahubRestHook - - -class DataHubOperationCircuitBreakerOperator(BaseSensorOperator): - r""" - DataHub Operation Circuit Breaker Operator. - - :param urn: The DataHub dataset unique identifier. (templated) - :param datahub_rest_conn_id: The REST datahub connection id to communicate with DataHub - which is set as Airflow connection. - :param partition: The partition to check the operation. - :param source_type: The partition to check the operation. :ref:`https://datahubproject.io/docs/graphql/enums#operationsourcetype` - - """ - - template_fields: Sequence[str] = ( - "urn", - "partition", - "source_type", - "operation_type", - ) - circuit_breaker: OperationCircuitBreaker - urn: Union[List[str], str] - partition: Optional[str] - source_type: Optional[str] - operation_type: Optional[str] - - def __init__( # type: ignore[no-untyped-def] - self, - *, - urn: Union[List[str], str], - datahub_rest_conn_id: Optional[str] = None, - time_delta: Optional[datetime.timedelta] = datetime.timedelta(days=1), - partition: Optional[str] = None, - source_type: Optional[str] = None, - operation_type: Optional[str] = None, - **kwargs, - ) -> None: - super().__init__(**kwargs) - hook: DatahubRestHook - if datahub_rest_conn_id is not None: - hook = DatahubRestHook(datahub_rest_conn_id=datahub_rest_conn_id) - else: - hook = DatahubRestHook() - - host, password, timeout_sec = hook._get_config() - - self.urn = urn - self.partition = partition - self.operation_type = operation_type - self.source_type = source_type - - config: OperationCircuitBreakerConfig = OperationCircuitBreakerConfig( - datahub_host=host, - datahub_token=password, - timeout=timeout_sec, - time_delta=time_delta, - ) - - self.circuit_breaker = OperationCircuitBreaker(config=config) - - def execute(self, context: Any) -> bool: - if "datahub_silence_circuit_breakers" in context["dag_run"].conf: - self.log.info( - "Circuit breaker is silenced because datahub_silence_circuit_breakers config is set" - ) - return True - - self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") - if isinstance(self.urn, str): - urns = [self.urn] - elif isinstance(self.urn, list): - urns = self.urn - else: - raise Exception(f"urn parameter has invalid type {type(self.urn)}") - - for urn in urns: - self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") - ret = self.circuit_breaker.is_circuit_breaker_active( - urn=urn, - partition=self.partition, - operation_type=self.operation_type, - source_type=self.source_type, - ) - if ret: - raise Exception(f"Dataset {self.urn} is not in consumable state") - return True +__all__ = ["DataHubOperationCircuitBreakerOperator"] diff --git a/metadata-ingestion/src/datahub_provider/operators/datahub_operation_sensor.py b/metadata-ingestion/src/datahub_provider/operators/datahub_operation_sensor.py index 8796215453500..902a342081490 100644 --- a/metadata-ingestion/src/datahub_provider/operators/datahub_operation_sensor.py +++ b/metadata-ingestion/src/datahub_provider/operators/datahub_operation_sensor.py @@ -1,100 +1,5 @@ -import datetime -from typing import Any, List, Optional, Sequence, Union - -from airflow.sensors.base import BaseSensorOperator - -from datahub.api.circuit_breaker import ( - 
OperationCircuitBreaker, - OperationCircuitBreakerConfig, +from datahub_airflow_plugin.operators.datahub_operation_sensor import ( + DataHubOperationCircuitBreakerSensor, ) -from datahub_provider.hooks.datahub import DatahubRestHook - - -class DataHubOperationCircuitBreakerSensor(BaseSensorOperator): - r""" - DataHub Operation Circuit Breaker Sensor. - - :param urn: The DataHub dataset unique identifier. (templated) - :param datahub_rest_conn_id: The REST datahub connection id to communicate with DataHub - which is set as Airflow connection. - :param partition: The partition to check the operation. - :param source_type: The source type to filter on. If not set it will accept any source type. - See valid values at: https://datahubproject.io/docs/graphql/enums#operationsourcetype - :param operation_type: The operation type to filter on. If not set it will accept any source type. - See valid values at: https://datahubproject.io/docs/graphql/enums/#operationtype - """ - - template_fields: Sequence[str] = ( - "urn", - "partition", - "source_type", - "operation_type", - ) - circuit_breaker: OperationCircuitBreaker - urn: Union[List[str], str] - partition: Optional[str] - source_type: Optional[str] - operation_type: Optional[str] - - def __init__( # type: ignore[no-untyped-def] - self, - *, - urn: Union[List[str], str], - datahub_rest_conn_id: Optional[str] = None, - time_delta: Optional[datetime.timedelta] = datetime.timedelta(days=1), - partition: Optional[str] = None, - source_type: Optional[str] = None, - operation_type: Optional[str] = None, - **kwargs, - ) -> None: - super().__init__(**kwargs) - hook: DatahubRestHook - if datahub_rest_conn_id is not None: - hook = DatahubRestHook(datahub_rest_conn_id=datahub_rest_conn_id) - else: - hook = DatahubRestHook() - - host, password, timeout_sec = hook._get_config() - - self.urn = urn - self.partition = partition - self.operation_type = operation_type - self.source_type = source_type - - config: OperationCircuitBreakerConfig = OperationCircuitBreakerConfig( - datahub_host=host, - datahub_token=password, - timeout=timeout_sec, - time_delta=time_delta, - ) - - self.circuit_breaker = OperationCircuitBreaker(config=config) - - def poke(self, context: Any) -> bool: - if "datahub_silence_circuit_breakers" in context["dag_run"].conf: - self.log.info( - "Circuit breaker is silenced because datahub_silence_circuit_breakers config is set" - ) - return True - - self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") - if isinstance(self.urn, str): - urns = [self.urn] - elif isinstance(self.urn, list): - urns = self.urn - else: - raise Exception(f"urn parameter has invalid type {type(self.urn)}") - - for urn in urns: - self.log.info(f"Checking if dataset {self.urn} is ready to be consumed") - ret = self.circuit_breaker.is_circuit_breaker_active( - urn=urn, - partition=self.partition, - operation_type=self.operation_type, - source_type=self.source_type, - ) - if ret: - self.log.info(f"Dataset {self.urn} is not in consumable state") - return False - return True +__all__ = ["DataHubOperationCircuitBreakerSensor"] diff --git a/metadata-ingestion/tests/integration/iceberg/.gitignore b/metadata-ingestion/tests/integration/iceberg/.gitignore new file mode 100644 index 0000000000000..a7dfcf56788b4 --- /dev/null +++ b/metadata-ingestion/tests/integration/iceberg/.gitignore @@ -0,0 +1,3 @@ +# Folders created by Iceberg's docker-compose +notebooks/ +warehouse/ \ No newline at end of file diff --git 
a/metadata-ingestion/tests/integration/iceberg/docker-compose.yml b/metadata-ingestion/tests/integration/iceberg/docker-compose.yml new file mode 100644 index 0000000000000..ab5c534e7289b --- /dev/null +++ b/metadata-ingestion/tests/integration/iceberg/docker-compose.yml @@ -0,0 +1,74 @@ +version: "3" + +services: + spark-iceberg: + image: tabulario/spark-iceberg:3.3.2_1.3.0 + container_name: spark-iceberg + networks: + iceberg_net: + depends_on: + - rest + - minio + volumes: + - ./warehouse:/home/iceberg/warehouse + - ./notebooks:/home/iceberg/notebooks/notebooks + - ./setup:/home/iceberg/setup + environment: + - AWS_ACCESS_KEY_ID=admin + - AWS_SECRET_ACCESS_KEY=password + - AWS_REGION=us-east-1 + ports: + - 8888:8888 + - 8080:8080 + - 10000:10000 + - 10001:10001 + rest: + image: tabulario/iceberg-rest:0.5.0 + container_name: iceberg-rest + networks: + iceberg_net: + ports: + - 8181:8181 + environment: + - AWS_ACCESS_KEY_ID=admin + - AWS_SECRET_ACCESS_KEY=password + - AWS_REGION=us-east-1 + - CATALOG_WAREHOUSE=s3a://warehouse/wh/ + - CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO + - CATALOG_S3_ENDPOINT=http://minio:9000 + minio: + image: minio/minio + container_name: minio + environment: + - MINIO_ROOT_USER=admin + - MINIO_ROOT_PASSWORD=password + - MINIO_DOMAIN=minio + networks: + iceberg_net: + aliases: + - warehouse.minio + ports: + - 9001:9001 + - 9000:9000 + command: ["server", "/data", "--console-address", ":9001"] + mc: + depends_on: + - minio + image: minio/mc + container_name: mc + networks: + iceberg_net: + environment: + - AWS_ACCESS_KEY_ID=admin + - AWS_SECRET_ACCESS_KEY=password + - AWS_REGION=us-east-1 + entrypoint: > + /bin/sh -c " + until (/usr/bin/mc config host add minio http://minio:9000 admin password) do echo '...waiting...' 
&& sleep 1; done; + /usr/bin/mc rm -r --force minio/warehouse; + /usr/bin/mc mb minio/warehouse; + /usr/bin/mc policy set public minio/warehouse; + exit 0; + " +networks: + iceberg_net: diff --git a/metadata-ingestion/tests/integration/iceberg/iceberg_deleted_table_mces_golden.json b/metadata-ingestion/tests/integration/iceberg/iceberg_deleted_table_mces_golden.json new file mode 100644 index 0000000000000..cc94625560a43 --- /dev/null +++ b/metadata-ingestion/tests/integration/iceberg/iceberg_deleted_table_mces_golden.json @@ -0,0 +1,184 @@ +[ + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,test_platform_instance.nyc.another_taxis,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "owner": "root", + "created-at": "2023-07-04T14:23:10.457317300Z", + "write.format.default": "parquet", + "location": "s3a://warehouse/wh/nyc/another_taxis", + "format-version": "1", + "snapshot-id": "6904764113937987369", + "manifest-list": "s3a://warehouse/wh/nyc/another_taxis/metadata/snap-6904764113937987369-1-f18ce54a-d59c-461a-a066-9d3085ccf2f2.avro" + }, + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:root", + "type": "TECHNICAL_OWNER" + }, + { + "owner": "urn:li:corpGroup:root", + "type": "TECHNICAL_OWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "nyc.another_taxis", + "platform": "urn:li:dataPlatform:iceberg", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "table {\n 1: vendor_id: optional long\n 2: trip_date: optional timestamptz\n 3: trip_id: optional long\n 4: trip_distance: optional float\n 5: fare_amount: optional double\n 6: store_and_fwd_flag: optional string\n}" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=struct].[type=long].vendor_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=long].trip_date", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + "nativeDataType": "timestamptz", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"logicalType\": \"timestamp-micros\", \"native_data_type\": \"timestamptz\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=long].trip_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=float].trip_distance", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "float", + "recursive": false, + "isPartOfKey": false, + 
"jsonProps": "{\"native_data_type\": \"float\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=double].fare_amount", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=string].store_and_fwd_flag", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "iceberg-2020_04_14-07_00_00" + } + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,test_platform_instance.nyc.another_taxis,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:iceberg", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:iceberg,test_platform_instance)" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "iceberg-2020_04_14-07_00_00" + } + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,test_platform_instance.nyc.taxis,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": true + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "iceberg-2020_04_14-07_00_00" + } + } + ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/iceberg_ingest_mces_golden.json b/metadata-ingestion/tests/integration/iceberg/iceberg_ingest_mces_golden.json new file mode 100644 index 0000000000000..163911623470e --- /dev/null +++ b/metadata-ingestion/tests/integration/iceberg/iceberg_ingest_mces_golden.json @@ -0,0 +1,153 @@ +[ + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,nyc.taxis,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "owner": "root", + "created-at": "2023-06-12T17:32:17.227545005Z", + "write.format.default": "parquet", + "location": "s3a://warehouse/wh/nyc/taxis", + "format-version": "1", + "snapshot-id": "2505818429184337337", + "manifest-list": "s3a://warehouse/wh/nyc/taxis/metadata/snap-2505818429184337337-1-a64915c4-afc8-40e3-97a7-98b072b42e10.avro" + }, + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:root", + "type": "TECHNICAL_OWNER" + }, + { + "owner": "urn:li:corpGroup:root", + "type": "TECHNICAL_OWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "nyc.taxis", + "platform": "urn:li:dataPlatform:iceberg", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "table {\n 1: vendor_id: optional long\n 2: 
trip_date: optional timestamptz\n 3: trip_id: optional long\n 4: trip_distance: optional float\n 5: fare_amount: optional double\n 6: store_and_fwd_flag: optional string\n}" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=struct].[type=long].vendor_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=long].trip_date", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + "nativeDataType": "timestamptz", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"logicalType\": \"timestamp-micros\", \"native_data_type\": \"timestamptz\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=long].trip_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=float].trip_distance", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "float", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"float\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=double].fare_amount", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=string].store_and_fwd_flag", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "iceberg-test" + } + } + ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/iceberg_profile_mces_golden.json b/metadata-ingestion/tests/integration/iceberg/iceberg_profile_mces_golden.json new file mode 100644 index 0000000000000..bdb7091014626 --- /dev/null +++ b/metadata-ingestion/tests/integration/iceberg/iceberg_profile_mces_golden.json @@ -0,0 +1,216 @@ +[ + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,nyc.taxis,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "owner": "root", + "created-at": "2023-06-12T17:33:25.422993540Z", + "write.format.default": "parquet", + "location": "s3a://warehouse/wh/nyc/taxis", + "format-version": "1", + "snapshot-id": "2585047006374307840", + "manifest-list": "s3a://warehouse/wh/nyc/taxis/metadata/snap-2585047006374307840-1-2e2bef19-40d1-4ad1-8fad-e57783477710.avro" + }, + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": 
"urn:li:corpuser:root", + "type": "TECHNICAL_OWNER" + }, + { + "owner": "urn:li:corpGroup:root", + "type": "TECHNICAL_OWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "nyc.taxis", + "platform": "urn:li:dataPlatform:iceberg", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "table {\n 1: vendor_id: optional long\n 2: trip_date: optional timestamptz\n 3: trip_id: optional long\n 4: trip_distance: optional float\n 5: fare_amount: optional double\n 6: store_and_fwd_flag: optional string\n}" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=struct].[type=long].vendor_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=long].trip_date", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + "nativeDataType": "timestamptz", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"logicalType\": \"timestamp-micros\", \"native_data_type\": \"timestamptz\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=long].trip_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "long", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=float].trip_distance", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "float", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"float\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=double].fare_amount", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "double", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"double\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=string].store_and_fwd_flag", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "iceberg-test" + } + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,nyc.taxis,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProfile", + "aspect": { + "json": { + "timestampMillis": 1586847600000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "rowCount": 5, + "columnCount": 6, + "fieldProfiles": [ + { + "fieldPath": "vendor_id", + "nullCount": 0, + "nullProportion": 0.0, + "min": "1", + "max": "3" + }, + { + "fieldPath": "trip_date", + "nullCount": 0, + 
"nullProportion": 0.0, + "min": "2000-01-01T12:00:00+00:00", + "max": "2000-01-04T12:00:00+00:00" + }, + { + "fieldPath": "trip_id", + "nullCount": 0, + "nullProportion": 0.0, + "min": "1000371", + "max": "1000375" + }, + { + "fieldPath": "trip_distance", + "nullCount": 0, + "nullProportion": 0.0, + "min": "0.0", + "max": "8.399999618530273" + }, + { + "fieldPath": "fare_amount", + "nullCount": 0, + "nullProportion": 0.0, + "min": "0.0", + "max": "42.13" + }, + { + "fieldPath": "store_and_fwd_flag", + "nullCount": 0, + "nullProportion": 0.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "iceberg-test" + } + } + ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/iceberg_profile_to_file.yml b/metadata-ingestion/tests/integration/iceberg/iceberg_profile_to_file.yml new file mode 100644 index 0000000000000..197c03bf2ee8d --- /dev/null +++ b/metadata-ingestion/tests/integration/iceberg/iceberg_profile_to_file.yml @@ -0,0 +1,25 @@ +run_id: iceberg-test + +source: + type: iceberg + config: + catalog: + name: default + type: rest + config: + uri: http://localhost:8181 + s3.access-key-id: admin + s3.secret-access-key: password + s3.region: us-east-1 + warehouse: s3a://warehouse/wh/ + py-io-impl: pyiceberg.io.pyarrow.PyArrowFileIO + s3.endpoint: http://localhost:9000 + user_ownership_property: owner + group_ownership_property: owner + profiling: + enabled: true + +sink: + type: file + config: + filename: "./iceberg_mces.json" diff --git a/metadata-ingestion/tests/integration/iceberg/iceberg_to_file.yml b/metadata-ingestion/tests/integration/iceberg/iceberg_to_file.yml new file mode 100644 index 0000000000000..8b5d035aed259 --- /dev/null +++ b/metadata-ingestion/tests/integration/iceberg/iceberg_to_file.yml @@ -0,0 +1,22 @@ +run_id: iceberg-test + +source: + type: iceberg + config: + catalog: + name: default + type: rest + config: + uri: http://localhost:8181 + s3.access-key-id: admin + s3.secret-access-key: password + s3.region: us-east-1 + warehouse: s3a://warehouse/wh/ + s3.endpoint: http://localhost:9000 + user_ownership_property: owner + group_ownership_property: owner + +sink: + type: file + config: + filename: "./iceberg_mces.json" diff --git a/metadata-ingestion/tests/integration/iceberg/setup/create.py b/metadata-ingestion/tests/integration/iceberg/setup/create.py new file mode 100644 index 0000000000000..0799ce9c93916 --- /dev/null +++ b/metadata-ingestion/tests/integration/iceberg/setup/create.py @@ -0,0 +1,46 @@ +import sys +from datetime import datetime + +from pyspark.sql import SparkSession +from pyspark.sql.types import ( + DoubleType, + FloatType, + LongType, + StringType, + StructField, + StructType, + TimestampType, +) + + +def main(table_name: str) -> None: + spark = SparkSession.builder.getOrCreate() + + schema = StructType( + [ + StructField("vendor_id", LongType(), True), + StructField("trip_date", TimestampType(), True), + StructField("trip_id", LongType(), True), + StructField("trip_distance", FloatType(), True), + StructField("fare_amount", DoubleType(), True), + StructField("store_and_fwd_flag", StringType(), True), + ] + ) + + data = [ + (1, datetime(2000, 1, 1, 12, 0), 1000371, 1.8, 15.32, "N"), + (2, datetime(2000, 1, 2, 12, 0), 1000372, 2.5, 22.15, "N"), + (2, datetime(2000, 1, 3, 12, 0), 1000373, 0.9, 9.01, "N"), + (1, datetime(2000, 1, 4, 12, 0), 1000374, 8.4, 42.13, "Y"), + # Following entry will test profiling values at 0 + (3, datetime(2000, 1, 4, 12, 0), 1000375, 0.0, 0.0, "Y"), + ] + + df = 
spark.createDataFrame(data, schema) + df.write.partitionBy("trip_date").saveAsTable(table_name) + + +if __name__ == "__main__": + if len(sys.argv) != 2: + raise ValueError("Missing required parameter ") + main(sys.argv[1]) diff --git a/metadata-ingestion/tests/integration/iceberg/setup/delete.py b/metadata-ingestion/tests/integration/iceberg/setup/delete.py new file mode 100644 index 0000000000000..b00306982f517 --- /dev/null +++ b/metadata-ingestion/tests/integration/iceberg/setup/delete.py @@ -0,0 +1,5 @@ +from pyspark.sql import SparkSession + +spark = SparkSession.builder.getOrCreate() + +spark.sql("DROP TABLE nyc.taxis PURGE") diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/iceberg_mces_golden.json b/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/iceberg_mces_golden.json deleted file mode 100644 index b106b91275835..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/iceberg_mces_golden.json +++ /dev/null @@ -1,131 +0,0 @@ -[ -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,namespace.iceberg_test,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "owner": "new_owner", - "provider": "ICEBERG", - "location": "/namespace/iceberg_test" - }, - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:new_owner", - "type": "TECHNICAL_OWNER" - }, - { - "owner": "urn:li:corpGroup:new_owner", - "type": "TECHNICAL_OWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "namespace.iceberg_test", - "platform": "urn:li:dataPlatform:iceberg", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "Schema(1: level: required string(level documentation),2: event_time: required timestamptz(event_time documentation),3: message: required string(message documentation),4: call_stack: optional list(call_stack documentation))" - } - }, - "fields": [ - { - "fieldPath": "[version=2.0].[type=struct].[type=string].level", - "nullable": false, - "description": "level documentation", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": false}" - }, - { - "fieldPath": "[version=2.0].[type=struct].[type=long].event_time", - "nullable": false, - "description": "event_time documentation", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.TimeType": {} - } - }, - "nativeDataType": "timestamptz", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"logicalType\": \"timestamp-micros\", \"native_data_type\": \"timestamptz\", \"_nullable\": false}" - }, - { - "fieldPath": "[version=2.0].[type=struct].[type=string].message", - "nullable": false, - "description": "message documentation", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - 
"isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": false}" - }, - { - "fieldPath": "[version=2.0].[type=struct].[type=array].[type=string].call_stack", - "nullable": true, - "description": "call_stack documentation", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.ArrayType": { - "nestedType": [ - "string" - ] - } - } - }, - "nativeDataType": "list", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"list\", \"_nullable\": true}" - } - ] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "iceberg-test" - } -} -] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/namespace/iceberg_test/metadata/v1.metadata.json b/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/namespace/iceberg_test/metadata/v1.metadata.json deleted file mode 100644 index e4ac0b9685ddc..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/namespace/iceberg_test/metadata/v1.metadata.json +++ /dev/null @@ -1,105 +0,0 @@ -{ - "format-version" : 1, - "table-uuid" : "11bbe5de-5ef6-4074-80db-f041065f9862", - "location" : "/namespace/iceberg_test", - "last-updated-ms" : 1648729616724, - "last-column-id" : 5, - "schema" : { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - } - } ] - }, - "current-schema-id" : 0, - "schemas" : [ { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - } - } ] - } ], - "partition-spec" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ], - "default-spec-id" : 0, - "partition-specs" : [ { - "spec-id" : 0, - "fields" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ] - } ], - "last-partition-id" : 1001, - "default-sort-order-id" : 0, - "sort-orders" : [ { - "order-id" : 0, - "fields" : [ ] - } ], - "properties" : { }, - "current-snapshot-id" : -1, - "snapshots" : [ ], - "snapshot-log" : [ ], - "metadata-log" : [ ] -} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/namespace/iceberg_test/metadata/v2.metadata.json b/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/namespace/iceberg_test/metadata/v2.metadata.json deleted file mode 100644 index 02221330b0665..0000000000000 --- 
a/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/namespace/iceberg_test/metadata/v2.metadata.json +++ /dev/null @@ -1,118 +0,0 @@ -{ - "format-version" : 1, - "table-uuid" : "16e6ecee-cd5d-470f-a7a6-a197944fa4db", - "location" : "/namespace/iceberg_test", - "last-updated-ms" : 1649086837695, - "last-column-id" : 5, - "schema" : { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string", - "doc" : "level documentation" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz", - "doc" : "event_time documentation" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string", - "doc" : "message documentation" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - }, - "doc" : "call_stack documentation" - } ] - }, - "current-schema-id" : 0, - "schemas" : [ { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string", - "doc" : "level documentation" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz", - "doc" : "event_time documentation" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string", - "doc" : "message documentation" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - }, - "doc" : "call_stack documentation" - } ] - } ], - "partition-spec" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ], - "default-spec-id" : 0, - "partition-specs" : [ { - "spec-id" : 0, - "fields" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ] - } ], - "last-partition-id" : 1001, - "default-sort-order-id" : 0, - "sort-orders" : [ { - "order-id" : 0, - "fields" : [ ] - } ], - "properties" : { - "owner" : "new_owner" - }, - "current-snapshot-id" : -1, - "snapshots" : [ ], - "snapshot-log" : [ ], - "metadata-log" : [ { - "timestamp-ms" : 1649086837511, - "metadata-file" : "/namespace/iceberg_test/metadata/v1.metadata.json" - } ] -} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/namespace/iceberg_test/metadata/version-hint.text b/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/namespace/iceberg_test/metadata/version-hint.text deleted file mode 100644 index d8263ee986059..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/ingest_test/namespace/iceberg_test/metadata/version-hint.text +++ /dev/null @@ -1 +0,0 @@ -2 \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00000-0-72133c37-bb5c-4ffd-8ead-08f33fa2675d-00001.parquet b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00000-0-72133c37-bb5c-4ffd-8ead-08f33fa2675d-00001.parquet deleted file mode 100644 index 48e75a030f1ca..0000000000000 Binary files 
a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00000-0-72133c37-bb5c-4ffd-8ead-08f33fa2675d-00001.parquet and /dev/null differ diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00000-3-c638dd0f-498a-4ce9-b525-8242758d18f8-00001.parquet b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00000-3-c638dd0f-498a-4ce9-b525-8242758d18f8-00001.parquet deleted file mode 100644 index c70b94612db64..0000000000000 Binary files a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00000-3-c638dd0f-498a-4ce9-b525-8242758d18f8-00001.parquet and /dev/null differ diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00001-1-5f69f6ed-191f-4a11-9953-09435ffce01d-00001.parquet b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00001-1-5f69f6ed-191f-4a11-9953-09435ffce01d-00001.parquet deleted file mode 100644 index 4c95fceed72e6..0000000000000 Binary files a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00001-1-5f69f6ed-191f-4a11-9953-09435ffce01d-00001.parquet and /dev/null differ diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00001-4-b21a5375-b547-40b9-89ca-caf4fcfe6685-00001.parquet b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00001-4-b21a5375-b547-40b9-89ca-caf4fcfe6685-00001.parquet deleted file mode 100644 index d33a3fd0d8a07..0000000000000 Binary files a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/data/00001-4-b21a5375-b547-40b9-89ca-caf4fcfe6685-00001.parquet and /dev/null differ diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/00000-331b9f67-e02b-44b1-8ec8-4dfa287c3bd5.metadata.json b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/00000-331b9f67-e02b-44b1-8ec8-4dfa287c3bd5.metadata.json deleted file mode 100644 index b6ffcfdc55daf..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/00000-331b9f67-e02b-44b1-8ec8-4dfa287c3bd5.metadata.json +++ /dev/null @@ -1,73 +0,0 @@ -{ - "format-version" : 1, - "table-uuid" : "e54626bf-c7ab-4f36-a3d0-3e13eec0824f", - "location" : "/home/iceberg/warehouse/datahub/integration/profiling", - "last-updated-ms" : 1651614148692, - "last-column-id" : 3, - "schema" : { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "field_int", - "required" : false, - "type" : "long", - "doc" : "An integer field" - }, { - "id" : 2, - "name" : "field_str", - "required" : false, - "type" : "string", - "doc" : "A string field" - }, { - "id" : 3, - "name" : "field_timestamp", - "required" : false, - "type" : "timestamptz", - "doc" : "A timestamp field" - } ] - }, - "current-schema-id" : 0, - "schemas" : [ { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "field_int", - "required" : false, - "type" : "long", - "doc" : "An integer field" - }, { - "id" : 2, - "name" : "field_str", - "required" : false, - "type" : "string", - "doc" 
: "A string field" - }, { - "id" : 3, - "name" : "field_timestamp", - "required" : false, - "type" : "timestamptz", - "doc" : "A timestamp field" - } ] - } ], - "partition-spec" : [ ], - "default-spec-id" : 0, - "partition-specs" : [ { - "spec-id" : 0, - "fields" : [ ] - } ], - "last-partition-id" : 999, - "default-sort-order-id" : 0, - "sort-orders" : [ { - "order-id" : 0, - "fields" : [ ] - } ], - "properties" : { - "owner" : "root" - }, - "current-snapshot-id" : -1, - "snapshots" : [ ], - "snapshot-log" : [ ], - "metadata-log" : [ ] -} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/00001-fb50681e-5f25-4180-99e2-065ef0b9791b.metadata.json b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/00001-fb50681e-5f25-4180-99e2-065ef0b9791b.metadata.json deleted file mode 100644 index da2afa6569f11..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/00001-fb50681e-5f25-4180-99e2-065ef0b9791b.metadata.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "format-version" : 1, - "table-uuid" : "e54626bf-c7ab-4f36-a3d0-3e13eec0824f", - "location" : "/home/iceberg/warehouse/datahub/integration/profiling", - "last-updated-ms" : 1651614151056, - "last-column-id" : 3, - "schema" : { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "field_int", - "required" : false, - "type" : "long", - "doc" : "An integer field" - }, { - "id" : 2, - "name" : "field_str", - "required" : false, - "type" : "string", - "doc" : "A string field" - }, { - "id" : 3, - "name" : "field_timestamp", - "required" : false, - "type" : "timestamptz", - "doc" : "A timestamp field" - } ] - }, - "current-schema-id" : 0, - "schemas" : [ { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "field_int", - "required" : false, - "type" : "long", - "doc" : "An integer field" - }, { - "id" : 2, - "name" : "field_str", - "required" : false, - "type" : "string", - "doc" : "A string field" - }, { - "id" : 3, - "name" : "field_timestamp", - "required" : false, - "type" : "timestamptz", - "doc" : "A timestamp field" - } ] - } ], - "partition-spec" : [ ], - "default-spec-id" : 0, - "partition-specs" : [ { - "spec-id" : 0, - "fields" : [ ] - } ], - "last-partition-id" : 999, - "default-sort-order-id" : 0, - "sort-orders" : [ { - "order-id" : 0, - "fields" : [ ] - } ], - "properties" : { - "owner" : "root" - }, - "current-snapshot-id" : 4437197002876030991, - "snapshots" : [ { - "snapshot-id" : 4437197002876030991, - "timestamp-ms" : 1651614151056, - "summary" : { - "operation" : "append", - "spark.app.id" : "local-1651614127284", - "added-data-files" : "2", - "added-records" : "2", - "added-files-size" : "2114", - "changed-partition-count" : "1", - "total-records" : "2", - "total-files-size" : "2114", - "total-data-files" : "2", - "total-delete-files" : "0", - "total-position-deletes" : "0", - "total-equality-deletes" : "0" - }, - "manifest-list" : "/home/iceberg/warehouse/datahub/integration/profiling/metadata/snap-4437197002876030991-1-23acaffc-9bed-4d97-8ddd-0ea1ea15a2b8.avro", - "schema-id" : 0 - } ], - "snapshot-log" : [ { - "timestamp-ms" : 1651614151056, - "snapshot-id" : 4437197002876030991 - } ], - "metadata-log" : [ { - "timestamp-ms" : 1651614148692, - "metadata-file" : 
"/home/iceberg/warehouse/datahub/integration/profiling/metadata/00000-331b9f67-e02b-44b1-8ec8-4dfa287c3bd5.metadata.json" - } ] -} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/00002-cc241948-4c12-46d0-9a75-ce3578ec03d4.metadata.json b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/00002-cc241948-4c12-46d0-9a75-ce3578ec03d4.metadata.json deleted file mode 100644 index ab028a647de4c..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/00002-cc241948-4c12-46d0-9a75-ce3578ec03d4.metadata.json +++ /dev/null @@ -1,124 +0,0 @@ -{ - "format-version" : 1, - "table-uuid" : "e54626bf-c7ab-4f36-a3d0-3e13eec0824f", - "location" : "/home/iceberg/warehouse/datahub/integration/profiling", - "last-updated-ms" : 1651614244732, - "last-column-id" : 3, - "schema" : { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "field_int", - "required" : false, - "type" : "long", - "doc" : "An integer field" - }, { - "id" : 2, - "name" : "field_str", - "required" : false, - "type" : "string", - "doc" : "A string field" - }, { - "id" : 3, - "name" : "field_timestamp", - "required" : false, - "type" : "timestamptz", - "doc" : "A timestamp field" - } ] - }, - "current-schema-id" : 0, - "schemas" : [ { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "field_int", - "required" : false, - "type" : "long", - "doc" : "An integer field" - }, { - "id" : 2, - "name" : "field_str", - "required" : false, - "type" : "string", - "doc" : "A string field" - }, { - "id" : 3, - "name" : "field_timestamp", - "required" : false, - "type" : "timestamptz", - "doc" : "A timestamp field" - } ] - } ], - "partition-spec" : [ ], - "default-spec-id" : 0, - "partition-specs" : [ { - "spec-id" : 0, - "fields" : [ ] - } ], - "last-partition-id" : 999, - "default-sort-order-id" : 0, - "sort-orders" : [ { - "order-id" : 0, - "fields" : [ ] - } ], - "properties" : { - "owner" : "root" - }, - "current-snapshot-id" : 4220723025353071767, - "snapshots" : [ { - "snapshot-id" : 4437197002876030991, - "timestamp-ms" : 1651614151056, - "summary" : { - "operation" : "append", - "spark.app.id" : "local-1651614127284", - "added-data-files" : "2", - "added-records" : "2", - "added-files-size" : "2114", - "changed-partition-count" : "1", - "total-records" : "2", - "total-files-size" : "2114", - "total-data-files" : "2", - "total-delete-files" : "0", - "total-position-deletes" : "0", - "total-equality-deletes" : "0" - }, - "manifest-list" : "/home/iceberg/warehouse/datahub/integration/profiling/metadata/snap-4437197002876030991-1-23acaffc-9bed-4d97-8ddd-0ea1ea15a2b8.avro", - "schema-id" : 0 - }, { - "snapshot-id" : 4220723025353071767, - "parent-snapshot-id" : 4437197002876030991, - "timestamp-ms" : 1651614244732, - "summary" : { - "operation" : "append", - "spark.app.id" : "local-1651614127284", - "added-data-files" : "2", - "added-records" : "2", - "added-files-size" : "2111", - "changed-partition-count" : "1", - "total-records" : "4", - "total-files-size" : "4225", - "total-data-files" : "4", - "total-delete-files" : "0", - "total-position-deletes" : "0", - "total-equality-deletes" : "0" - }, - "manifest-list" : "/home/iceberg/warehouse/datahub/integration/profiling/metadata/snap-4220723025353071767-1-ec0bd970-e5ef-4843-abcb-e96a35a8f14d.avro", - "schema-id" : 0 - 
} ], - "snapshot-log" : [ { - "timestamp-ms" : 1651614151056, - "snapshot-id" : 4437197002876030991 - }, { - "timestamp-ms" : 1651614244732, - "snapshot-id" : 4220723025353071767 - } ], - "metadata-log" : [ { - "timestamp-ms" : 1651614148692, - "metadata-file" : "/home/iceberg/warehouse/datahub/integration/profiling/metadata/00000-331b9f67-e02b-44b1-8ec8-4dfa287c3bd5.metadata.json" - }, { - "timestamp-ms" : 1651614151056, - "metadata-file" : "/home/iceberg/warehouse/datahub/integration/profiling/metadata/00001-fb50681e-5f25-4180-99e2-065ef0b9791b.metadata.json" - } ] -} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/23acaffc-9bed-4d97-8ddd-0ea1ea15a2b8-m0.avro b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/23acaffc-9bed-4d97-8ddd-0ea1ea15a2b8-m0.avro deleted file mode 100644 index 3019df4adae30..0000000000000 Binary files a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/23acaffc-9bed-4d97-8ddd-0ea1ea15a2b8-m0.avro and /dev/null differ diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/ec0bd970-e5ef-4843-abcb-e96a35a8f14d-m0.avro b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/ec0bd970-e5ef-4843-abcb-e96a35a8f14d-m0.avro deleted file mode 100644 index 1b51cd60d136a..0000000000000 Binary files a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/ec0bd970-e5ef-4843-abcb-e96a35a8f14d-m0.avro and /dev/null differ diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/snap-4220723025353071767-1-ec0bd970-e5ef-4843-abcb-e96a35a8f14d.avro b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/snap-4220723025353071767-1-ec0bd970-e5ef-4843-abcb-e96a35a8f14d.avro deleted file mode 100644 index 0dd50d23037e9..0000000000000 Binary files a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/snap-4220723025353071767-1-ec0bd970-e5ef-4843-abcb-e96a35a8f14d.avro and /dev/null differ diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/snap-4437197002876030991-1-23acaffc-9bed-4d97-8ddd-0ea1ea15a2b8.avro b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/snap-4437197002876030991-1-23acaffc-9bed-4d97-8ddd-0ea1ea15a2b8.avro deleted file mode 100644 index 93f69f0ac1540..0000000000000 Binary files a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/snap-4437197002876030991-1-23acaffc-9bed-4d97-8ddd-0ea1ea15a2b8.avro and /dev/null differ diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/version-hint.text b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/version-hint.text deleted file mode 100755 index d8263ee986059..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/datahub/integration/profiling/metadata/version-hint.text +++ /dev/null @@ -1 +0,0 @@ -2 \ No newline at end 
of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/iceberg_mces_golden.json b/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/iceberg_mces_golden.json deleted file mode 100644 index edfa8f80670cf..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/profiling_test/iceberg_mces_golden.json +++ /dev/null @@ -1,129 +0,0 @@ -[ -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,datahub.integration.profiling,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "owner": "root", - "provider": "ICEBERG", - "location": "/home/iceberg/warehouse/datahub/integration/profiling", - "snapshot-id": "4220723025353071767", - "manifest-list": "/home/iceberg/warehouse/datahub/integration/profiling/metadata/snap-4220723025353071767-1-ec0bd970-e5ef-4843-abcb-e96a35a8f14d.avro" - }, - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:root", - "type": "TECHNICAL_OWNER" - }, - { - "owner": "urn:li:corpGroup:root", - "type": "TECHNICAL_OWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "datahub.integration.profiling", - "platform": "urn:li:dataPlatform:iceberg", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "Schema(1: field_int: optional long(An integer field),2: field_str: optional string(A string field),3: field_timestamp: optional timestamptz(A timestamp field))" - } - }, - "fields": [ - { - "fieldPath": "[version=2.0].[type=struct].[type=long].field_int", - "nullable": true, - "description": "An integer field", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "long", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"long\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=struct].[type=string].field_str", - "nullable": true, - "description": "A string field", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=struct].[type=long].field_timestamp", - "nullable": true, - "description": "A timestamp field", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.TimeType": {} - } - }, - "nativeDataType": "timestamptz", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"logicalType\": \"timestamp-micros\", \"native_data_type\": \"timestamptz\", \"_nullable\": true}" - } - ] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "iceberg-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,datahub.integration.profiling,PROD)", - "changeType": "UPSERT", - "aspectName": "datasetProfile", - "aspect": { - "value": "{\"timestampMillis\": 1586847600000, \"partitionSpec\": 
{\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 4, \"columnCount\": 3, \"fieldProfiles\": [{\"fieldPath\": \"field_int\", \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1\", \"max\": \"4\"}, {\"fieldPath\": \"field_str\", \"nullCount\": 0, \"nullProportion\": 0.0}, {\"fieldPath\": \"field_timestamp\", \"nullCount\": 2, \"nullProportion\": 0.5, \"min\": \"2022-05-03 21:42:29\", \"max\": \"2022-05-03 21:44:04\"}]}", - "contentType": "application/json" - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "iceberg-test" - } -} -] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/iceberg_deleted_table_mces_golden.json b/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/iceberg_deleted_table_mces_golden.json deleted file mode 100644 index d376d8b645d66..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/iceberg_deleted_table_mces_golden.json +++ /dev/null @@ -1,159 +0,0 @@ -[ -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,test_platform_instance.namespace.iceberg_test,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "owner": "new_owner", - "provider": "ICEBERG", - "location": "/namespace/iceberg_test" - }, - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:new_owner", - "type": "TECHNICAL_OWNER" - }, - { - "owner": "urn:li:corpGroup:new_owner", - "type": "TECHNICAL_OWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "namespace.iceberg_test", - "platform": "urn:li:dataPlatform:iceberg", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "Schema(1: level: required string(level documentation),2: event_time: required timestamptz(event_time documentation),3: message: required string(message documentation),4: call_stack: optional list(call_stack documentation))" - } - }, - "fields": [ - { - "fieldPath": "[version=2.0].[type=struct].[type=string].level", - "nullable": false, - "description": "level documentation", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": false}" - }, - { - "fieldPath": "[version=2.0].[type=struct].[type=long].event_time", - "nullable": false, - "description": "event_time documentation", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.TimeType": {} - } - }, - "nativeDataType": "timestamptz", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"logicalType\": \"timestamp-micros\", \"native_data_type\": \"timestamptz\", \"_nullable\": false}" - }, - { - "fieldPath": "[version=2.0].[type=struct].[type=string].message", - "nullable": false, - "description": "message documentation", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - 
"nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": false}" - }, - { - "fieldPath": "[version=2.0].[type=struct].[type=array].[type=string].call_stack", - "nullable": true, - "description": "call_stack documentation", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.ArrayType": { - "nestedType": [ - "string" - ] - } - } - }, - "nativeDataType": "list", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"list\", \"_nullable\": true}" - } - ] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "iceberg-2020_04_14-07_00_00" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,test_platform_instance.namespace.iceberg_test,PROD)", - "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", - "aspect": { - "value": "{\"platform\": \"urn:li:dataPlatform:iceberg\", \"instance\": \"urn:li:dataPlatformInstance:(urn:li:dataPlatform:iceberg,test_platform_instance)\"}", - "contentType": "application/json" - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "iceberg-2020_04_14-07_00_00" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:iceberg,test_platform_instance.namespace.iceberg_test_2,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "value": "{\"removed\": true}", - "contentType": "application/json" - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "iceberg-2020_04_14-07_00_00" - } -} -] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test/metadata/v1.metadata.json b/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test/metadata/v1.metadata.json deleted file mode 100644 index e4ac0b9685ddc..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test/metadata/v1.metadata.json +++ /dev/null @@ -1,105 +0,0 @@ -{ - "format-version" : 1, - "table-uuid" : "11bbe5de-5ef6-4074-80db-f041065f9862", - "location" : "/namespace/iceberg_test", - "last-updated-ms" : 1648729616724, - "last-column-id" : 5, - "schema" : { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - } - } ] - }, - "current-schema-id" : 0, - "schemas" : [ { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - } - } ] - } ], - "partition-spec" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - 
"field-id" : 1001 - } ], - "default-spec-id" : 0, - "partition-specs" : [ { - "spec-id" : 0, - "fields" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ] - } ], - "last-partition-id" : 1001, - "default-sort-order-id" : 0, - "sort-orders" : [ { - "order-id" : 0, - "fields" : [ ] - } ], - "properties" : { }, - "current-snapshot-id" : -1, - "snapshots" : [ ], - "snapshot-log" : [ ], - "metadata-log" : [ ] -} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test/metadata/v2.metadata.json b/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test/metadata/v2.metadata.json deleted file mode 100644 index 02221330b0665..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test/metadata/v2.metadata.json +++ /dev/null @@ -1,118 +0,0 @@ -{ - "format-version" : 1, - "table-uuid" : "16e6ecee-cd5d-470f-a7a6-a197944fa4db", - "location" : "/namespace/iceberg_test", - "last-updated-ms" : 1649086837695, - "last-column-id" : 5, - "schema" : { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string", - "doc" : "level documentation" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz", - "doc" : "event_time documentation" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string", - "doc" : "message documentation" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - }, - "doc" : "call_stack documentation" - } ] - }, - "current-schema-id" : 0, - "schemas" : [ { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string", - "doc" : "level documentation" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz", - "doc" : "event_time documentation" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string", - "doc" : "message documentation" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - }, - "doc" : "call_stack documentation" - } ] - } ], - "partition-spec" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ], - "default-spec-id" : 0, - "partition-specs" : [ { - "spec-id" : 0, - "fields" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ] - } ], - "last-partition-id" : 1001, - "default-sort-order-id" : 0, - "sort-orders" : [ { - "order-id" : 0, - "fields" : [ ] - } ], - "properties" : { - "owner" : "new_owner" - }, - "current-snapshot-id" : -1, - "snapshots" : [ ], - "snapshot-log" : [ ], - "metadata-log" : [ { - "timestamp-ms" : 1649086837511, - "metadata-file" : "/namespace/iceberg_test/metadata/v1.metadata.json" - } ] -} \ No newline at end of file diff --git 
a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test/metadata/version-hint.text b/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test/metadata/version-hint.text deleted file mode 100644 index d8263ee986059..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test/metadata/version-hint.text +++ /dev/null @@ -1 +0,0 @@ -2 \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test_2/metadata/v1.metadata.json b/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test_2/metadata/v1.metadata.json deleted file mode 100644 index e4ac0b9685ddc..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test_2/metadata/v1.metadata.json +++ /dev/null @@ -1,105 +0,0 @@ -{ - "format-version" : 1, - "table-uuid" : "11bbe5de-5ef6-4074-80db-f041065f9862", - "location" : "/namespace/iceberg_test", - "last-updated-ms" : 1648729616724, - "last-column-id" : 5, - "schema" : { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - } - } ] - }, - "current-schema-id" : 0, - "schemas" : [ { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - } - } ] - } ], - "partition-spec" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ], - "default-spec-id" : 0, - "partition-specs" : [ { - "spec-id" : 0, - "fields" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ] - } ], - "last-partition-id" : 1001, - "default-sort-order-id" : 0, - "sort-orders" : [ { - "order-id" : 0, - "fields" : [ ] - } ], - "properties" : { }, - "current-snapshot-id" : -1, - "snapshots" : [ ], - "snapshot-log" : [ ], - "metadata-log" : [ ] -} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test_2/metadata/v2.metadata.json b/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test_2/metadata/v2.metadata.json deleted file mode 100644 index 02221330b0665..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test_2/metadata/v2.metadata.json +++ /dev/null @@ -1,118 +0,0 @@ -{ - "format-version" : 1, - "table-uuid" : 
"16e6ecee-cd5d-470f-a7a6-a197944fa4db", - "location" : "/namespace/iceberg_test", - "last-updated-ms" : 1649086837695, - "last-column-id" : 5, - "schema" : { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string", - "doc" : "level documentation" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz", - "doc" : "event_time documentation" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string", - "doc" : "message documentation" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - }, - "doc" : "call_stack documentation" - } ] - }, - "current-schema-id" : 0, - "schemas" : [ { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string", - "doc" : "level documentation" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz", - "doc" : "event_time documentation" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string", - "doc" : "message documentation" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - }, - "doc" : "call_stack documentation" - } ] - } ], - "partition-spec" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ], - "default-spec-id" : 0, - "partition-specs" : [ { - "spec-id" : 0, - "fields" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ] - } ], - "last-partition-id" : 1001, - "default-sort-order-id" : 0, - "sort-orders" : [ { - "order-id" : 0, - "fields" : [ ] - } ], - "properties" : { - "owner" : "new_owner" - }, - "current-snapshot-id" : -1, - "snapshots" : [ ], - "snapshot-log" : [ ], - "metadata-log" : [ { - "timestamp-ms" : 1649086837511, - "metadata-file" : "/namespace/iceberg_test/metadata/v1.metadata.json" - } ] -} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test_2/metadata/version-hint.text b/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test_2/metadata/version-hint.text deleted file mode 100644 index d8263ee986059..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run1/namespace/iceberg_test_2/metadata/version-hint.text +++ /dev/null @@ -1 +0,0 @@ -2 \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run2/namespace/iceberg_test/metadata/v1.metadata.json b/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run2/namespace/iceberg_test/metadata/v1.metadata.json deleted file mode 100644 index e4ac0b9685ddc..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run2/namespace/iceberg_test/metadata/v1.metadata.json +++ /dev/null @@ -1,105 +0,0 @@ -{ - "format-version" : 1, - "table-uuid" : "11bbe5de-5ef6-4074-80db-f041065f9862", - "location" : "/namespace/iceberg_test", - "last-updated-ms" : 1648729616724, - 
"last-column-id" : 5, - "schema" : { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - } - } ] - }, - "current-schema-id" : 0, - "schemas" : [ { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - } - } ] - } ], - "partition-spec" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ], - "default-spec-id" : 0, - "partition-specs" : [ { - "spec-id" : 0, - "fields" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ] - } ], - "last-partition-id" : 1001, - "default-sort-order-id" : 0, - "sort-orders" : [ { - "order-id" : 0, - "fields" : [ ] - } ], - "properties" : { }, - "current-snapshot-id" : -1, - "snapshots" : [ ], - "snapshot-log" : [ ], - "metadata-log" : [ ] -} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run2/namespace/iceberg_test/metadata/v2.metadata.json b/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run2/namespace/iceberg_test/metadata/v2.metadata.json deleted file mode 100644 index 02221330b0665..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run2/namespace/iceberg_test/metadata/v2.metadata.json +++ /dev/null @@ -1,118 +0,0 @@ -{ - "format-version" : 1, - "table-uuid" : "16e6ecee-cd5d-470f-a7a6-a197944fa4db", - "location" : "/namespace/iceberg_test", - "last-updated-ms" : 1649086837695, - "last-column-id" : 5, - "schema" : { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string", - "doc" : "level documentation" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz", - "doc" : "event_time documentation" - }, { - "id" : 3, - "name" : "message", - "required" : true, - "type" : "string", - "doc" : "message documentation" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - }, - "doc" : "call_stack documentation" - } ] - }, - "current-schema-id" : 0, - "schemas" : [ { - "type" : "struct", - "schema-id" : 0, - "fields" : [ { - "id" : 1, - "name" : "level", - "required" : true, - "type" : "string", - "doc" : "level documentation" - }, { - "id" : 2, - "name" : "event_time", - "required" : true, - "type" : "timestamptz", - "doc" : "event_time documentation" - }, { - "id" : 3, - "name" : "message", - "required" : true, - 
"type" : "string", - "doc" : "message documentation" - }, { - "id" : 4, - "name" : "call_stack", - "required" : false, - "type" : { - "type" : "list", - "element-id" : 5, - "element" : "string", - "element-required" : true - }, - "doc" : "call_stack documentation" - } ] - } ], - "partition-spec" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ], - "default-spec-id" : 0, - "partition-specs" : [ { - "spec-id" : 0, - "fields" : [ { - "name" : "event_time_hour", - "transform" : "hour", - "source-id" : 2, - "field-id" : 1000 - }, { - "name" : "level", - "transform" : "identity", - "source-id" : 1, - "field-id" : 1001 - } ] - } ], - "last-partition-id" : 1001, - "default-sort-order-id" : 0, - "sort-orders" : [ { - "order-id" : 0, - "fields" : [ ] - } ], - "properties" : { - "owner" : "new_owner" - }, - "current-snapshot-id" : -1, - "snapshots" : [ ], - "snapshot-log" : [ ], - "metadata-log" : [ { - "timestamp-ms" : 1649086837511, - "metadata-file" : "/namespace/iceberg_test/metadata/v1.metadata.json" - } ] -} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run2/namespace/iceberg_test/metadata/version-hint.text b/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run2/namespace/iceberg_test/metadata/version-hint.text deleted file mode 100644 index d8263ee986059..0000000000000 --- a/metadata-ingestion/tests/integration/iceberg/test_data/stateful_test/run2/namespace/iceberg_test/metadata/version-hint.text +++ /dev/null @@ -1 +0,0 @@ -2 \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/iceberg/test_iceberg.py b/metadata-ingestion/tests/integration/iceberg/test_iceberg.py index b26b574e54c47..e2a86480672e5 100644 --- a/metadata-ingestion/tests/integration/iceberg/test_iceberg.py +++ b/metadata-ingestion/tests/integration/iceberg/test_iceberg.py @@ -1,14 +1,14 @@ -from pathlib import PosixPath -from typing import Any, Dict, Union +import subprocess +import sys +from typing import Any, Dict, List from unittest.mock import patch import pytest from freezegun import freeze_time -from iceberg.core.filesystem.file_status import FileStatus -from iceberg.core.filesystem.local_filesystem import LocalFileSystem -from datahub.ingestion.run.pipeline import Pipeline from tests.test_helpers import mce_helpers +from tests.test_helpers.click_helpers import run_datahub_cmd +from tests.test_helpers.docker_helpers import wait_for_port from tests.test_helpers.state_helpers import ( get_current_checkpoint_from_pipeline, run_and_get_pipeline, @@ -20,89 +20,92 @@ GMS_SERVER = f"http://localhost:{GMS_PORT}" +@pytest.fixture(autouse=True) +def skip_tests_if_python_before_3_8(): + if sys.version_info < (3, 8): + pytest.skip("Requires python 3.8 or higher") + + +def spark_submit(file_path: str, args: str = "") -> None: + docker = "docker" + command = f"{docker} exec spark-iceberg spark-submit {file_path} {args}" + ret = subprocess.run( + command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + assert ret.returncode == 0 + + @freeze_time(FROZEN_TIME) @pytest.mark.integration -def test_iceberg_ingest(pytestconfig, tmp_path, mock_time): +def test_iceberg_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time): test_resources_dir = pytestconfig.rootpath / "tests/integration/iceberg/" - # Run the metadata ingestion pipeline. 
- pipeline = Pipeline.create( - { - "run_id": "iceberg-test", - "source": { - "type": "iceberg", - "config": { - "localfs": str(test_resources_dir / "test_data/ingest_test"), - "user_ownership_property": "owner", - "group_ownership_property": "owner", - }, - }, - "sink": { - "type": "file", - "config": { - "filename": f"{tmp_path}/iceberg_mces.json", - }, - }, - } - ) - pipeline.run() - pipeline.raise_from_status() - - # Verify the output. - mce_helpers.check_golden_file( - pytestconfig, - output_path=tmp_path / "iceberg_mces.json", - golden_path=test_resources_dir - / "test_data/ingest_test/iceberg_mces_golden.json", - ) + with docker_compose_runner( + test_resources_dir / "docker-compose.yml", "iceberg" + ) as docker_services: + wait_for_port(docker_services, "spark-iceberg", 8888, timeout=120) + + # Run the create.py pyspark file to populate the table. + spark_submit("/home/iceberg/setup/create.py", "nyc.taxis") + + # Run the metadata ingestion pipeline. + config_file = (test_resources_dir / "iceberg_to_file.yml").resolve() + run_datahub_cmd( + ["ingest", "--strict-warnings", "-c", f"{config_file}"], tmp_path=tmp_path + ) + # These paths change from one instance run of the clickhouse docker to the other, and the FROZEN_TIME does not apply to these. + ignore_paths: List[str] = [ + r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['created-at'\]", + r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['snapshot-id'\]", + r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['manifest-list'\]", + ] + # Verify the output. + mce_helpers.check_golden_file( + pytestconfig, + ignore_paths=ignore_paths, + output_path=tmp_path / "iceberg_mces.json", + golden_path=test_resources_dir / "iceberg_ingest_mces_golden.json", + ) @freeze_time(FROZEN_TIME) @pytest.mark.integration -def test_iceberg_stateful_ingest(pytestconfig, tmp_path, mock_time, mock_datahub_graph): - test_resources_dir = ( - pytestconfig.rootpath / "tests/integration/iceberg/test_data/stateful_test" - ) +def test_iceberg_stateful_ingest( + docker_compose_runner, pytestconfig, tmp_path, mock_time, mock_datahub_graph +): + test_resources_dir = pytestconfig.rootpath / "tests/integration/iceberg" platform_instance = "test_platform_instance" - scd_before_deletion: Dict[str, Any] = { - "localfs": str(test_resources_dir / "run1"), - "user_ownership_property": "owner", - "group_ownership_property": "owner", - "platform_instance": f"{platform_instance}", - # enable stateful ingestion - "stateful_ingestion": { - "enabled": True, - "remove_stale_metadata": True, - "fail_safe_threshold": 100.0, - "state_provider": { - "type": "datahub", - "config": {"datahub_api": {"server": GMS_SERVER}}, - }, - }, - } - - scd_after_deletion: Dict[str, Any] = { - "localfs": str(test_resources_dir / "run2"), - "user_ownership_property": "owner", - "group_ownership_property": "owner", - "platform_instance": f"{platform_instance}", - # enable stateful ingestion - "stateful_ingestion": { - "enabled": True, - "remove_stale_metadata": True, - "fail_safe_threshold": 100.0, - "state_provider": { - "type": "datahub", - "config": {"datahub_api": {"server": GMS_SERVER}}, - }, - }, - } - pipeline_config_dict: Dict[str, Any] = { "source": { "type": "iceberg", - "config": scd_before_deletion, + "config": { + "catalog": { + "name": "default", + "type": "rest", + "config": { + "uri": "http://localhost:8181", + "s3.access-key-id": "admin", + "s3.secret-access-key": "password", + "s3.region": "us-east-1", + 
"warehouse": "s3a://warehouse/wh/", + "s3.endpoint": "http://localhost:9000", + }, + }, + "user_ownership_property": "owner", + "group_ownership_property": "owner", + "platform_instance": f"{platform_instance}", + # enable stateful ingestion + "stateful_ingestion": { + "enabled": True, + "remove_stale_metadata": True, + "fail_safe_threshold": 100.0, + "state_provider": { + "type": "datahub", + "config": {"datahub_api": {"server": GMS_SERVER}}, + }, + }, + }, }, "sink": { # we are not really interested in the resulting events for this test @@ -111,10 +114,18 @@ def test_iceberg_stateful_ingest(pytestconfig, tmp_path, mock_time, mock_datahub "pipeline_name": "test_pipeline", } - with patch( + with docker_compose_runner( + test_resources_dir / "docker-compose.yml", "iceberg" + ) as docker_services, patch( "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", mock_datahub_graph, ) as mock_checkpoint: + wait_for_port(docker_services, "spark-iceberg", 8888, timeout=120) + + # Run the create.py pyspark file to populate two tables. + spark_submit("/home/iceberg/setup/create.py", "nyc.taxis") + spark_submit("/home/iceberg/setup/create.py", "nyc.another_taxis") + # Both checkpoint and reporting will use the same mocked graph instance. mock_checkpoint.return_value = mock_datahub_graph @@ -125,13 +136,14 @@ def test_iceberg_stateful_ingest(pytestconfig, tmp_path, mock_time, mock_datahub assert checkpoint1 assert checkpoint1.state - # Set iceberg config where a table is deleted. - pipeline_config_dict["source"]["config"] = scd_after_deletion # Capture MCEs of second run to validate Status(removed=true) deleted_mces_path = f"{tmp_path}/iceberg_deleted_mces.json" pipeline_config_dict["sink"]["type"] = "file" pipeline_config_dict["sink"]["config"] = {"filename": deleted_mces_path} + # Run the delete.py pyspark file to delete the table. + spark_submit("/home/iceberg/setup/delete.py") + # Do the second run of the pipeline. pipeline_run2 = run_and_get_pipeline(pipeline_config_dict) checkpoint2 = get_current_checkpoint_from_pipeline(pipeline_run2) @@ -149,7 +161,7 @@ def test_iceberg_stateful_ingest(pytestconfig, tmp_path, mock_time, mock_datahub assert len(difference_urns) == 1 - urn1 = "urn:li:dataset:(urn:li:dataPlatform:iceberg,test_platform_instance.namespace.iceberg_test_2,PROD)" + urn1 = "urn:li:dataset:(urn:li:dataPlatform:iceberg,test_platform_instance.nyc.taxis,PROD)" assert urn1 in difference_urns @@ -161,9 +173,16 @@ def test_iceberg_stateful_ingest(pytestconfig, tmp_path, mock_time, mock_datahub pipeline=pipeline_run2, expected_providers=1 ) + ignore_paths: List[str] = [ + r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['created-at'\]", + r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['snapshot-id'\]", + r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['manifest-list'\]", + ] + # Verify the output. mce_helpers.check_golden_file( pytestconfig, + ignore_paths=ignore_paths, output_path=deleted_mces_path, golden_path=test_resources_dir / "iceberg_deleted_table_mces_golden.json", ) @@ -171,117 +190,32 @@ def test_iceberg_stateful_ingest(pytestconfig, tmp_path, mock_time, mock_datahub @freeze_time(FROZEN_TIME) @pytest.mark.integration -def test_iceberg_profiling(pytestconfig, tmp_path, mock_time): - """ - This test is using a table created using https://github.com/tabular-io/docker-spark-iceberg. 
- Here are the DDL statements that you can execute with `spark-sql`: - ```SQL - CREATE TABLE datahub.integration.profiling ( - field_int bigint COMMENT 'An integer field', - field_str string COMMENT 'A string field', - field_timestamp timestamp COMMENT 'A timestamp field') - USING iceberg; - - INSERT INTO datahub.integration.profiling VALUES (1, 'row1', current_timestamp()), (2, 'row2', null); - INSERT INTO datahub.integration.profiling VALUES (3, 'row3', current_timestamp()), (4, 'row4', null); - ``` - - When importing the metadata files into this test, we need to create a `version-hint.text` with a value that - reflects the version of the table, and then change the code in `TestLocalFileSystem._replace_path()` accordingly. - """ - test_resources_dir = ( - pytestconfig.rootpath / "tests/integration/iceberg/test_data/profiling_test" - ) +def test_iceberg_profiling(docker_compose_runner, pytestconfig, tmp_path, mock_time): + test_resources_dir = pytestconfig.rootpath / "tests/integration/iceberg/" - # Run the metadata ingestion pipeline. - pipeline = Pipeline.create( - { - "run_id": "iceberg-test", - "source": { - "type": "iceberg", - "config": { - "localfs": str(test_resources_dir), - "user_ownership_property": "owner", - "group_ownership_property": "owner", - "max_path_depth": 3, - "profiling": { - "enabled": True, - }, - "table_pattern": {"allow": ["datahub.integration.profiling"]}, - }, - }, - "sink": { - "type": "file", - "config": { - "filename": f"{tmp_path}/iceberg_mces.json", - }, - }, - } - ) + with docker_compose_runner( + test_resources_dir / "docker-compose.yml", "iceberg" + ) as docker_services: + wait_for_port(docker_services, "spark-iceberg", 8888, timeout=120) - class TestLocalFileSystem(LocalFileSystem): - # This class acts as a wrapper on LocalFileSystem to intercept calls using a path location. - # The wrapper will normalize those paths to be usable by the test. - fs: LocalFileSystem - - @staticmethod - def _replace_path(path: Union[str, PosixPath]) -> str: - # When the Iceberg table was created, its warehouse folder was '/home/iceberg/warehouse'. Iceberg tables - # are not portable, so we need to replace the warehouse folder by the test location at runtime. - normalized_path: str = str(path).replace( - "/home/iceberg/warehouse", str(test_resources_dir) - ) - - # When the Iceberg table was created, a postgres catalog was used instead of a HadoopCatalog. The HadoopCatalog - # expects a file named 'v{}.metadata.json' where {} is the version number from 'version-hint.text'. Since - # 'v2.metadata.json' does not exist, we will redirect the call to '00002-02782173-8364-4caf-a3c4-9567c1d6608f.metadata.json'. 
- if normalized_path.endswith("v2.metadata.json"): - return normalized_path.replace( - "v2.metadata.json", - "00002-cc241948-4c12-46d0-9a75-ce3578ec03d4.metadata.json", - ) - return normalized_path - - def __init__(self, fs: LocalFileSystem) -> None: - self.fs = fs - - def open(self, path: str, mode: str = "rb") -> object: - return self.fs.open(TestLocalFileSystem._replace_path(path), mode) - - def delete(self, path: str) -> None: - self.fs.delete(TestLocalFileSystem._replace_path(path)) - - def stat(self, path: str) -> FileStatus: - return self.fs.stat(TestLocalFileSystem._replace_path(path)) - - @staticmethod - def fix_path(path: str) -> str: - return TestLocalFileSystem.fs.fix_path( - TestLocalFileSystem._replace_path(path) - ) - - def create(self, path: str, overwrite: bool = False) -> object: - return self.fs.create(TestLocalFileSystem._replace_path(path), overwrite) - - def rename(self, src: str, dest: str) -> bool: - return self.fs.rename( - TestLocalFileSystem._replace_path(src), - TestLocalFileSystem._replace_path(dest), - ) - - def exists(self, path: str) -> bool: - return self.fs.exists(TestLocalFileSystem._replace_path(path)) - - local_fs_wrapper: TestLocalFileSystem = TestLocalFileSystem( - LocalFileSystem.get_instance() - ) - with patch.object(LocalFileSystem, "get_instance", return_value=local_fs_wrapper): - pipeline.run() - pipeline.raise_from_status() - - # Verify the output. - mce_helpers.check_golden_file( - pytestconfig, - output_path=tmp_path / "iceberg_mces.json", - golden_path=test_resources_dir / "iceberg_mces_golden.json", - ) + # Run the create.py pyspark file to populate the table. + spark_submit("/home/iceberg/setup/create.py", "nyc.taxis") + + # Run the metadata ingestion pipeline. + config_file = (test_resources_dir / "iceberg_profile_to_file.yml").resolve() + run_datahub_cmd( + ["ingest", "--strict-warnings", "-c", f"{config_file}"], tmp_path=tmp_path + ) + # These paths change from one instance run of the clickhouse docker to the other, and the FROZEN_TIME does not apply to these. + ignore_paths: List[str] = [ + r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['created-at'\]", + r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['snapshot-id'\]", + r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['manifest-list'\]", + ] + # Verify the output. 
+ mce_helpers.check_golden_file( + pytestconfig, + ignore_paths=ignore_paths, + output_path=tmp_path / "iceberg_mces.json", + golden_path=test_resources_dir / "iceberg_profile_mces_golden.json", + ) diff --git a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json index ceec764bfbc86..d59fce788c95e 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json @@ -2782,7 +2782,7 @@ "customProperties": { "schema_inferred_from": "tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -2820,62 +2820,62 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "FourthChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } @@ -2939,7 +2939,58 @@ "columnCount": 5, "fieldProfiles": [ { - "fieldPath": "1st chord", + "fieldPath": "FirstChord", + "uniqueCount": 5, + "uniqueProportion": 0.17857142857142858, + "nullCount": 0, + "nullProportion": 0.0, + "distinctValueFrequencies": [ + { + "value": "1", + "frequency": 19 + }, + { + "value": "2", + "frequency": 3 + }, + { + "value": "4", + "frequency": 2 + }, + { + "value": "5", + "frequency": 1 + }, + { + "value": "6", + "frequency": 3 + } + ], + "sampleValues": [ + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "2", + "4", + "5", + "6", + "6", + "6" + ] + }, + { + "fieldPath": "SecondChord", "uniqueCount": 5, "uniqueProportion": 0.17857142857142858, "nullCount": 0, @@ -2990,7 +3041,7 @@ ] }, { - "fieldPath": "2nd 
chord", + "fieldPath": "ThirdChord", "uniqueCount": 7, "uniqueProportion": 0.25, "nullCount": 0, @@ -3049,7 +3100,7 @@ ] }, { - "fieldPath": "3rd chord", + "fieldPath": "FourthChord", "uniqueCount": 6, "uniqueProportion": 0.21428571428571427, "nullCount": 0, @@ -3104,7 +3155,7 @@ ] }, { - "fieldPath": "4th chord", + "fieldPath": "ProgressionQuality", "uniqueCount": 20, "uniqueProportion": 0.7142857142857143, "nullCount": 0, @@ -3213,41 +3264,6 @@ "Sweet", "Wistful" ] - }, - { - "fieldPath": "Progression Quality", - "uniqueCount": 1, - "uniqueProportion": 0.03571428571428571, - "nullCount": 0, - "nullProportion": 0.0, - "distinctValueFrequencies": [ - { - "value": "NaN", - "frequency": 28 - } - ], - "sampleValues": [ - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan" - ] } ] } diff --git a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_spec_for_files.json b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_spec_for_files.json index 1bd75ae457cb4..ed2c992655a89 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_spec_for_files.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_spec_for_files.json @@ -9,7 +9,7 @@ "customProperties": { "schema_inferred_from": "tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -47,62 +47,62 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "FourthChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } @@ -1046,7 +1046,58 @@ "columnCount": 5, "fieldProfiles": [ { 
- "fieldPath": "1st chord", + "fieldPath": "FirstChord", + "uniqueCount": 5, + "uniqueProportion": 0.17857142857142858, + "nullCount": 0, + "nullProportion": 0.0, + "distinctValueFrequencies": [ + { + "value": "1", + "frequency": 19 + }, + { + "value": "2", + "frequency": 3 + }, + { + "value": "4", + "frequency": 2 + }, + { + "value": "5", + "frequency": 1 + }, + { + "value": "6", + "frequency": 3 + } + ], + "sampleValues": [ + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "2", + "4", + "5", + "6", + "6", + "6" + ] + }, + { + "fieldPath": "SecondChord", "uniqueCount": 5, "uniqueProportion": 0.17857142857142858, "nullCount": 0, @@ -1097,7 +1148,7 @@ ] }, { - "fieldPath": "2nd chord", + "fieldPath": "ThirdChord", "uniqueCount": 7, "uniqueProportion": 0.25, "nullCount": 0, @@ -1156,7 +1207,7 @@ ] }, { - "fieldPath": "3rd chord", + "fieldPath": "FourthChord", "uniqueCount": 6, "uniqueProportion": 0.21428571428571427, "nullCount": 0, @@ -1211,7 +1262,7 @@ ] }, { - "fieldPath": "4th chord", + "fieldPath": "ProgressionQuality", "uniqueCount": 20, "uniqueProportion": 0.7142857142857143, "nullCount": 0, @@ -1320,41 +1371,6 @@ "Sweet", "Wistful" ] - }, - { - "fieldPath": "Progression Quality", - "uniqueCount": 1, - "uniqueProportion": 0.03571428571428571, - "nullCount": 0, - "nullProportion": 0.0, - "distinctValueFrequencies": [ - { - "value": "NaN", - "frequency": 28 - } - ], - "sampleValues": [ - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan" - ] } ] } diff --git a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_specs_of_different_buckets.json b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_specs_of_different_buckets.json index b9687b97571cb..f7793140fe033 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_specs_of_different_buckets.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_specs_of_different_buckets.json @@ -9,7 +9,7 @@ "customProperties": { "schema_inferred_from": "tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -47,62 +47,62 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "FourthChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": 
false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } @@ -1046,7 +1046,58 @@ "columnCount": 5, "fieldProfiles": [ { - "fieldPath": "1st chord", + "fieldPath": "FirstChord", + "uniqueCount": 5, + "uniqueProportion": 0.17857142857142858, + "nullCount": 0, + "nullProportion": 0.0, + "distinctValueFrequencies": [ + { + "value": "1", + "frequency": 19 + }, + { + "value": "2", + "frequency": 3 + }, + { + "value": "4", + "frequency": 2 + }, + { + "value": "5", + "frequency": 1 + }, + { + "value": "6", + "frequency": 3 + } + ], + "sampleValues": [ + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "2", + "4", + "5", + "6", + "6", + "6" + ] + }, + { + "fieldPath": "SecondChord", "uniqueCount": 5, "uniqueProportion": 0.17857142857142858, "nullCount": 0, @@ -1097,7 +1148,7 @@ ] }, { - "fieldPath": "2nd chord", + "fieldPath": "ThirdChord", "uniqueCount": 7, "uniqueProportion": 0.25, "nullCount": 0, @@ -1156,7 +1207,7 @@ ] }, { - "fieldPath": "3rd chord", + "fieldPath": "FourthChord", "uniqueCount": 6, "uniqueProportion": 0.21428571428571427, "nullCount": 0, @@ -1211,7 +1262,7 @@ ] }, { - "fieldPath": "4th chord", + "fieldPath": "ProgressionQuality", "uniqueCount": 20, "uniqueProportion": 0.7142857142857143, "nullCount": 0, @@ -1320,41 +1371,6 @@ "Sweet", "Wistful" ] - }, - { - "fieldPath": "Progression Quality", - "uniqueCount": 1, - "uniqueProportion": 0.03571428571428571, - "nullCount": 0, - "nullProportion": 0.0, - "distinctValueFrequencies": [ - { - "value": "NaN", - "frequency": 28 - } - ], - "sampleValues": [ - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan" - ] } ] } diff --git a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_single_file.json b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_single_file.json index a5a68777cad5c..f54c62865bcde 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_single_file.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_single_file.json @@ -9,7 +9,7 @@ "customProperties": { "schema_inferred_from": "tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -47,62 +47,62 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", "recursive": false, "isPartOfKey": false 
}, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "FourthChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } @@ -1046,7 +1046,58 @@ "columnCount": 5, "fieldProfiles": [ { - "fieldPath": "1st chord", + "fieldPath": "FirstChord", + "uniqueCount": 5, + "uniqueProportion": 0.17857142857142858, + "nullCount": 0, + "nullProportion": 0.0, + "distinctValueFrequencies": [ + { + "value": "1", + "frequency": 19 + }, + { + "value": "2", + "frequency": 3 + }, + { + "value": "4", + "frequency": 2 + }, + { + "value": "5", + "frequency": 1 + }, + { + "value": "6", + "frequency": 3 + } + ], + "sampleValues": [ + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "2", + "4", + "5", + "6", + "6", + "6" + ] + }, + { + "fieldPath": "SecondChord", "uniqueCount": 5, "uniqueProportion": 0.17857142857142858, "nullCount": 0, @@ -1097,7 +1148,7 @@ ] }, { - "fieldPath": "2nd chord", + "fieldPath": "ThirdChord", "uniqueCount": 7, "uniqueProportion": 0.25, "nullCount": 0, @@ -1156,7 +1207,7 @@ ] }, { - "fieldPath": "3rd chord", + "fieldPath": "FourthChord", "uniqueCount": 6, "uniqueProportion": 0.21428571428571427, "nullCount": 0, @@ -1211,7 +1262,7 @@ ] }, { - "fieldPath": "4th chord", + "fieldPath": "ProgressionQuality", "uniqueCount": 20, "uniqueProportion": 0.7142857142857143, "nullCount": 0, @@ -1320,41 +1371,6 @@ "Sweet", "Wistful" ] - }, - { - "fieldPath": "Progression Quality", - "uniqueCount": 1, - "uniqueProportion": 0.03571428571428571, - "nullCount": 0, - "nullProportion": 0.0, - "distinctValueFrequencies": [ - { - "value": "NaN", - "frequency": 28 - } - ], - "sampleValues": [ - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan" - ] } ] } diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json index 36d3ba1b3510d..58c225e1ec4c9 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json @@ 
-949,7 +949,7 @@ "customProperties": { "schema_inferred_from": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -1003,62 +1003,62 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "FourthChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_spec_for_files.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_spec_for_files.json index 84ace7d673676..9c41bbdc80c49 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_spec_for_files.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_spec_for_files.json @@ -9,7 +9,7 @@ "customProperties": { "schema_inferred_from": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -47,62 +47,62 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "FourthChord", "recursive": false, 
"isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_specs_of_different_buckets.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_specs_of_different_buckets.json index f7f3cb8fb743e..985140f774ab4 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_specs_of_different_buckets.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_specs_of_different_buckets.json @@ -9,7 +9,7 @@ "customProperties": { "schema_inferred_from": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -47,62 +47,62 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "FourthChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} 
} }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_single_file.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_single_file.json index 5353d95ada8f7..5d87d423a6a67 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_single_file.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_single_file.json @@ -9,7 +9,7 @@ "customProperties": { "schema_inferred_from": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -47,62 +47,62 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "FourthChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } diff --git a/metadata-ingestion/tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro b/metadata-ingestion/tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro index 8a6d9df66bb79..79c329b3f8dca 100644 Binary files a/metadata-ingestion/tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro and b/metadata-ingestion/tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro differ diff --git a/metadata-ingestion/tests/integration/s3/test_s3.py b/metadata-ingestion/tests/integration/s3/test_s3.py index 98ae2eaa393ab..462ca88b7c123 100644 --- a/metadata-ingestion/tests/integration/s3/test_s3.py +++ b/metadata-ingestion/tests/integration/s3/test_s3.py @@ -140,7 +140,7 @@ def test_data_lake_s3_ingest( def test_data_lake_local_ingest( 
pytestconfig, touch_local_files, source_file, tmp_path, mock_time ): - os.environ["SPARK_VERSION"] = "3.0.3" + os.environ["SPARK_VERSION"] = "3.3.2" test_resources_dir = pytestconfig.rootpath / "tests/integration/s3/" f = open(os.path.join(SOURCE_FILES_PATH, source_file)) source = json.load(f) diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_expand_select_star_basic.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_expand_select_star_basic.json index e456e4450c50a..e241bdd08e243 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_expand_select_star_basic.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_expand_select_star_basic.json @@ -8,7 +8,7 @@ { "downstream": { "table": null, - "column": "TOTAL_AGG" + "column": "total_agg" }, "upstreams": [ { @@ -20,7 +20,7 @@ { "downstream": { "table": null, - "column": "ORDERKEY" + "column": "orderkey" }, "upstreams": [ { @@ -32,7 +32,7 @@ { "downstream": { "table": null, - "column": "CUSTKEY" + "column": "custkey" }, "upstreams": [ { @@ -44,7 +44,7 @@ { "downstream": { "table": null, - "column": "ORDERSTATUS" + "column": "orderstatus" }, "upstreams": [ { @@ -56,7 +56,7 @@ { "downstream": { "table": null, - "column": "TOTALPRICE" + "column": "totalprice" }, "upstreams": [ { @@ -68,7 +68,7 @@ { "downstream": { "table": null, - "column": "ORDERDATE" + "column": "orderdate" }, "upstreams": [ { @@ -80,7 +80,7 @@ { "downstream": { "table": null, - "column": "ORDERPRIORITY" + "column": "orderpriority" }, "upstreams": [ { @@ -92,7 +92,7 @@ { "downstream": { "table": null, - "column": "CLERK" + "column": "clerk" }, "upstreams": [ { @@ -104,7 +104,7 @@ { "downstream": { "table": null, - "column": "SHIPPRIORITY" + "column": "shippriority" }, "upstreams": [ { @@ -116,7 +116,7 @@ { "downstream": { "table": null, - "column": "COMMENT" + "column": "comment" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json index 8e1fd453ce09d..2340b2e95b0d0 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json @@ -9,14 +9,14 @@ { "downstream": { "table": null, - "column": "LABEL" + "column": "label" }, "upstreams": [] }, { "downstream": { "table": null, - "column": "TOTAL_AGG" + "column": "total_agg" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_case_statement.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_case_statement.json index 7d1a4f2039b10..64cd80e9a2d69 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_case_statement.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_case_statement.json @@ -8,7 +8,7 @@ { "downstream": { "table": null, - "column": "TOTAL_PRICE_CATEGORY" + "column": "total_price_category" }, "upstreams": [ { @@ -20,7 +20,7 @@ { "downstream": { "table": null, - "column": "TOTAL_PRICE_SUCCESS" + "column": "total_price_success" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_column_normalization.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_column_normalization.json index 694bec3800dbf..7b22a46757e39 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_column_normalization.json +++ 
b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_column_normalization.json @@ -8,7 +8,7 @@ { "downstream": { "table": null, - "column": "TOTAL_AGG" + "column": "total_agg" }, "upstreams": [ { @@ -20,7 +20,7 @@ { "downstream": { "table": null, - "column": "TOTAL_AVG" + "column": "total_avg" }, "upstreams": [ { @@ -32,7 +32,7 @@ { "downstream": { "table": null, - "column": "TOTAL_MIN" + "column": "total_min" }, "upstreams": [ { @@ -44,7 +44,7 @@ { "downstream": { "table": null, - "column": "TOTAL_MAX" + "column": "total_max" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_ctas_column_normalization.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_ctas_column_normalization.json new file mode 100644 index 0000000000000..c912d99a3a8a3 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_ctas_column_normalization.json @@ -0,0 +1,59 @@ +{ + "query_type": "CREATE", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)" + ], + "out_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders_normalized,PROD)" + ], + "column_lineage": [ + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders_normalized,PROD)", + "column": "Total_Agg" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)", + "column": "TotalPrice" + } + ] + }, + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders_normalized,PROD)", + "column": "total_avg" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)", + "column": "TotalPrice" + } + ] + }, + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders_normalized,PROD)", + "column": "TOTAL_MIN" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)", + "column": "TotalPrice" + } + ] + }, + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders_normalized,PROD)", + "column": "total_max" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)", + "column": "TotalPrice" + } + ] + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_default_normalization.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_default_normalization.json index 157745854128f..2af308ec60623 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_default_normalization.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_default_normalization.json @@ -11,7 +11,7 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": "USER_FK" + "column": "user_fk" }, "upstreams": [ { @@ -23,7 +23,7 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": "EMAIL" + "column": "email" }, "upstreams": [ { @@ -35,7 +35,7 @@ { "downstream": { "table": 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": "LAST_PURCHASE_DATE" + "column": "last_purchase_date" }, "upstreams": [ { @@ -47,7 +47,7 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": "LIFETIME_PURCHASE_AMOUNT" + "column": "lifetime_purchase_amount" }, "upstreams": [ { @@ -59,7 +59,7 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": "LIFETIME_PURCHASE_COUNT" + "column": "lifetime_purchase_count" }, "upstreams": [ { @@ -71,7 +71,7 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": "AVERAGE_PURCHASE_AMOUNT" + "column": "average_purchase_amount" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py index 5a294be150fa0..7581d3bac010e 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py @@ -287,6 +287,40 @@ def test_snowflake_column_normalization(): ) +def test_snowflake_ctas_column_normalization(): + # For CTAS statements, we also should try to match the output table's + # column name casing. This is technically incorrect since we have the + # exact column names from the query, but necessary to match our column + # name normalization behavior in the Snowflake source. + + assert_sql_result( + """ +CREATE TABLE snowflake_sample_data.tpch_sf1.orders_normalized +AS +SELECT + SUM(o."totalprice") as Total_Agg, + AVG("TotalPrice") as TOTAL_AVG, + MIN("TOTALPRICE") as TOTAL_MIN, + MAX(TotalPrice) as Total_Max +FROM snowflake_sample_data.tpch_sf1.orders o +""", + dialect="snowflake", + schemas={ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)": { + "orderkey": "NUMBER", + "TotalPrice": "FLOAT", + }, + "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders_normalized,PROD)": { + "Total_Agg": "FLOAT", + "total_avg": "FLOAT", + "TOTAL_MIN": "FLOAT", + # Purposely excluding total_max to test out the fallback behavior. + }, + }, + expected_file=RESOURCE_DIR / "test_snowflake_ctas_column_normalization.json", + ) + + def test_snowflake_case_statement(): assert_sql_result( """ diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_checkpoint.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_checkpoint.py index 532ab69d1c6b1..712ae2066b728 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_checkpoint.py +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_checkpoint.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import datetime, timezone from typing import Dict, List import pydantic @@ -29,7 +29,7 @@ def _assert_checkpoint_deserialization( ) -> Checkpoint: # Serialize a checkpoint aspect with the previous state. 
checkpoint_aspect = DatahubIngestionCheckpointClass( - timestampMillis=int(datetime.now().timestamp() * 1000), + timestampMillis=int(datetime.now(tz=timezone.utc).timestamp() * 1000), pipelineName=test_pipeline_name, platformInstanceId="this-can-be-anything-and-will-be-ignored", config="this-is-also-ignored", diff --git a/metadata-ingestion/tests/unit/test_bigquery_lineage.py b/metadata-ingestion/tests/unit/test_bigquery_lineage.py index c9308fd89ef72..9b09fa36ba586 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_lineage.py +++ b/metadata-ingestion/tests/unit/test_bigquery_lineage.py @@ -1,6 +1,8 @@ import datetime from typing import Dict, List, Set +import pytest + from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( BigQueryTableRef, QueryEvent, @@ -14,15 +16,17 @@ from datahub.utilities.sqlglot_lineage import SchemaResolver -def test_lineage_with_timestamps(): - config = BigQueryV2Config() - report = BigQueryV2Report() - extractor: BigqueryLineageExtractor = BigqueryLineageExtractor(config, report) - lineage_entries: List[QueryEvent] = [ +@pytest.fixture +def lineage_entries() -> List[QueryEvent]: + return [ QueryEvent( timestamp=datetime.datetime.now(tz=datetime.timezone.utc), actor_email="bla@bla.com", - query="testQuery", + query=""" + INSERT INTO `my_project.my_dataset.my_table` + SELECT first.a, second.b FROM `my_project.my_dataset.my_source_table1` first + LEFT JOIN `my_project.my_dataset.my_source_table2` second ON first.id = second.id + """, statementType="SELECT", project_id="proj_12344", end_time=None, @@ -73,6 +77,12 @@ def test_lineage_with_timestamps(): ), ] + +def test_lineage_with_timestamps(lineage_entries: List[QueryEvent]) -> None: + config = BigQueryV2Config() + report = BigQueryV2Report() + extractor: BigqueryLineageExtractor = BigqueryLineageExtractor(config, report) + bq_table = BigQueryTableRef.from_string_name( "projects/my_project/datasets/my_dataset/tables/my_table" ) @@ -90,3 +100,31 @@ def test_lineage_with_timestamps(): ) assert upstream_lineage assert len(upstream_lineage.upstreams) == 4 + + +def test_column_level_lineage(lineage_entries: List[QueryEvent]) -> None: + config = BigQueryV2Config(extract_column_lineage=True, incremental_lineage=False) + report = BigQueryV2Report() + extractor: BigqueryLineageExtractor = BigqueryLineageExtractor(config, report) + + bq_table = BigQueryTableRef.from_string_name( + "projects/my_project/datasets/my_dataset/tables/my_table" + ) + + lineage_map: Dict[str, Set[LineageEdge]] = extractor._create_lineage_map( + lineage_entries[:1], + sql_parser_schema_resolver=SchemaResolver(platform="bigquery"), + ) + + upstream_lineage = extractor.get_lineage_for_table( + bq_table=bq_table, + bq_table_urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,my_project.my_dataset.my_table,PROD)", + lineage_metadata=lineage_map, + platform="bigquery", + ) + assert upstream_lineage + assert len(upstream_lineage.upstreams) == 2 + assert ( + upstream_lineage.fineGrainedLineages + and len(upstream_lineage.fineGrainedLineages) == 2 + ) diff --git a/metadata-ingestion/tests/unit/test_iceberg.py b/metadata-ingestion/tests/unit/test_iceberg.py index f3ea071d76400..768d4f958af1f 100644 --- a/metadata-ingestion/tests/unit/test_iceberg.py +++ b/metadata-ingestion/tests/unit/test_iceberg.py @@ -1,405 +1,482 @@ +import sys +import uuid +from decimal import Decimal from typing import Any, Optional import pytest -from iceberg.api import types as IcebergTypes -from iceberg.api.types.types import NestedField - -from 
datahub.configuration.common import ConfigurationError -from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.source.azure.azure_common import AdlsSourceConfig -from datahub.ingestion.source.iceberg.iceberg import IcebergSource, IcebergSourceConfig -from datahub.metadata.com.linkedin.pegasus2avro.schema import ArrayType, SchemaField -from datahub.metadata.schema_classes import ( - ArrayTypeClass, - BooleanTypeClass, - BytesTypeClass, - DateTypeClass, - FixedTypeClass, - NumberTypeClass, - RecordTypeClass, - StringTypeClass, - TimeTypeClass, -) - +from pydantic import ValidationError + +if sys.version_info >= (3, 8): + from pyiceberg.schema import Schema + from pyiceberg.types import ( + BinaryType, + BooleanType, + DateType, + DecimalType, + DoubleType, + FixedType, + FloatType, + IcebergType, + IntegerType, + ListType, + LongType, + MapType, + NestedField, + PrimitiveType, + StringType, + StructType, + TimestampType, + TimestamptzType, + TimeType, + UUIDType, + ) -def with_iceberg_source() -> IcebergSource: - adls: AdlsSourceConfig = AdlsSourceConfig( - account_name="test", account_key="test", container_name="test" + from datahub.ingestion.api.common import PipelineContext + from datahub.ingestion.source.iceberg.iceberg import ( + IcebergProfiler, + IcebergSource, + IcebergSourceConfig, ) - return IcebergSource( - ctx=PipelineContext(run_id="iceberg-source-test"), - config=IcebergSourceConfig(adls=adls), + from datahub.ingestion.source.iceberg.iceberg_common import IcebergCatalogConfig + from datahub.metadata.com.linkedin.pegasus2avro.schema import ArrayType, SchemaField + from datahub.metadata.schema_classes import ( + ArrayTypeClass, + BooleanTypeClass, + BytesTypeClass, + DateTypeClass, + FixedTypeClass, + NumberTypeClass, + RecordTypeClass, + StringTypeClass, + TimeTypeClass, ) - -def assert_field( - schema_field: SchemaField, - expected_description: Optional[str], - expected_nullable: bool, - expected_type: Any, -) -> None: - assert ( - schema_field.description == expected_description - ), f"Field description '{schema_field.description}' is different from expected description '{expected_description}'" - assert ( - schema_field.nullable == expected_nullable - ), f"Field nullable '{schema_field.nullable}' is different from expected nullable '{expected_nullable}'" - assert isinstance( - schema_field.type.type, expected_type - ), f"Field type {schema_field.type.type} is different from expected type {expected_type}" - - -def test_adls_config_no_credential(): - """ - Test when no ADLS credential information is provided (SAS token, Account key). - """ - with pytest.raises(ConfigurationError): - AdlsSourceConfig(account_name="test", container_name="test") - - -def test_adls_config_with_sas_credential(): - """ - Test when a SAS token is used as an ADLS credential. - """ - AdlsSourceConfig(account_name="test", sas_token="test", container_name="test") - - -def test_adls_config_with_key_credential(): - """ - Test when an account key is used as an ADLS credential. - """ - AdlsSourceConfig(account_name="test", account_key="test", container_name="test") - - -def test_adls_config_with_client_secret_credential(): - """ - Test when a client secret is used as an ADLS credential. 
- """ - AdlsSourceConfig( - account_name="test", - tenant_id="test", - client_id="test", - client_secret="test", - container_name="test", + pytestmark = pytest.mark.skipif( + sys.version_info < (3, 8), reason="requires python 3.8 or higher" ) - # Test when tenant_id is missing - with pytest.raises(ConfigurationError): - AdlsSourceConfig( - account_name="test", - client_id="test", - client_secret="test", - container_name="test", + def with_iceberg_source() -> IcebergSource: + catalog: IcebergCatalogConfig = IcebergCatalogConfig( + name="test", type="rest", config={} ) - - # Test when client_id is missing - with pytest.raises(ConfigurationError): - AdlsSourceConfig( - account_name="test", - tenant_id="test", - client_secret="test", - container_name="test", - ) - - # Test when client_secret is missing - with pytest.raises(ConfigurationError): - AdlsSourceConfig( - account_name="test", - tenant_id="test", - client_id="test", - container_name="test", - ) - - -def test_config_for_tests(): - """ - Test valid iceberg source that will be used in unit tests. - """ - with_iceberg_source() - - -def test_config_no_filesystem(): - """ - Test when a SAS token is used as an ADLS credential. - """ - with pytest.raises(ConfigurationError): - IcebergSource( + return IcebergSource( ctx=PipelineContext(run_id="iceberg-source-test"), - config=IcebergSourceConfig(), + config=IcebergSourceConfig(catalog=catalog), ) - -def test_config_multiple_filesystems(): - """ - Test when more than 1 filesystem is configured. - """ - with pytest.raises(ConfigurationError): - adls: AdlsSourceConfig = AdlsSourceConfig( - account_name="test", container_name="test" - ) - IcebergSource( - ctx=PipelineContext(run_id="iceberg-source-test"), - config=IcebergSourceConfig(adls=adls, localfs="/tmp"), + def with_iceberg_profiler() -> IcebergProfiler: + iceberg_source_instance = with_iceberg_source() + return IcebergProfiler( + iceberg_source_instance.report, iceberg_source_instance.config.profiling ) - -@pytest.mark.parametrize( - "iceberg_type, expected_schema_field_type", - [ - (IcebergTypes.BinaryType.get(), BytesTypeClass), - (IcebergTypes.BooleanType.get(), BooleanTypeClass), - (IcebergTypes.DateType.get(), DateTypeClass), - ( - IcebergTypes.DecimalType.of(3, 2), - NumberTypeClass, - ), - (IcebergTypes.DoubleType.get(), NumberTypeClass), - (IcebergTypes.FixedType.of_length(4), FixedTypeClass), - (IcebergTypes.FloatType.get(), NumberTypeClass), - (IcebergTypes.IntegerType.get(), NumberTypeClass), - (IcebergTypes.LongType.get(), NumberTypeClass), - (IcebergTypes.StringType.get(), StringTypeClass), - ( - IcebergTypes.TimestampType.with_timezone(), - TimeTypeClass, - ), - ( - IcebergTypes.TimestampType.without_timezone(), - TimeTypeClass, - ), - (IcebergTypes.TimeType.get(), TimeTypeClass), - ( - IcebergTypes.UUIDType.get(), - StringTypeClass, - ), - ], -) -def test_iceberg_primitive_type_to_schema_field( - iceberg_type: IcebergTypes.PrimitiveType, expected_schema_field_type: Any -) -> None: - """ - Test converting a primitive typed Iceberg field to a SchemaField - """ - iceberg_source_instance = with_iceberg_source() - for column in [ - NestedField.required( - 1, "required_field", iceberg_type, "required field documentation" - ), - NestedField.optional( - 1, "optional_field", iceberg_type, "optional field documentation" - ), - ]: - schema_fields = iceberg_source_instance._get_schema_fields_for_column(column) + def assert_field( + schema_field: SchemaField, + expected_description: Optional[str], + expected_nullable: bool, + 
expected_type: Any, + ) -> None: assert ( - len(schema_fields) == 1 - ), f"Expected 1 field, but got {len(schema_fields)}" - assert_field( - schema_fields[0], column.doc, column.is_optional, expected_schema_field_type - ) - - -@pytest.mark.parametrize( - "iceberg_type, expected_array_nested_type", - [ - (IcebergTypes.BinaryType.get(), "bytes"), - (IcebergTypes.BooleanType.get(), "boolean"), - (IcebergTypes.DateType.get(), "date"), - ( - IcebergTypes.DecimalType.of(3, 2), - "decimal", - ), - (IcebergTypes.DoubleType.get(), "double"), - (IcebergTypes.FixedType.of_length(4), "fixed"), - (IcebergTypes.FloatType.get(), "float"), - (IcebergTypes.IntegerType.get(), "int"), - (IcebergTypes.LongType.get(), "long"), - (IcebergTypes.StringType.get(), "string"), - ( - IcebergTypes.TimestampType.with_timezone(), - "timestamp-micros", - ), - ( - IcebergTypes.TimestampType.without_timezone(), - "timestamp-micros", - ), - (IcebergTypes.TimeType.get(), "time-micros"), - ( - IcebergTypes.UUIDType.get(), - "uuid", - ), - ], -) -def test_iceberg_list_to_schema_field( - iceberg_type: IcebergTypes.PrimitiveType, expected_array_nested_type: Any -) -> None: - """ - Test converting a list typed Iceberg field to an ArrayType SchemaField, including the list nested type. - """ - list_column: NestedField = NestedField.required( - 1, - "listField", - IcebergTypes.ListType.of_required(2, iceberg_type), - "documentation", + schema_field.description == expected_description + ), f"Field description '{schema_field.description}' is different from expected description '{expected_description}'" + assert ( + schema_field.nullable == expected_nullable + ), f"Field nullable '{schema_field.nullable}' is different from expected nullable '{expected_nullable}'" + assert isinstance( + schema_field.type.type, expected_type + ), f"Field type {schema_field.type.type} is different from expected type {expected_type}" + + def test_config_no_catalog(): + """ + Test when no Iceberg catalog is provided. + """ + with pytest.raises(ValidationError, match="catalog"): + IcebergSourceConfig() # type: ignore + + def test_config_catalog_not_configured(): + """ + Test when an Iceberg catalog is provided, but not properly configured. + """ + with pytest.raises(ValidationError): + IcebergCatalogConfig() # type: ignore + + with pytest.raises(ValidationError, match="conf"): + IcebergCatalogConfig(type="a type") # type: ignore + + with pytest.raises(ValidationError, match="type"): + IcebergCatalogConfig(conf={}) # type: ignore + + def test_config_for_tests(): + """ + Test valid iceberg source that will be used in unit tests. 
+ """ + with_iceberg_source() + + @pytest.mark.parametrize( + "iceberg_type, expected_schema_field_type", + [ + (BinaryType(), BytesTypeClass), + (BooleanType(), BooleanTypeClass), + (DateType(), DateTypeClass), + ( + DecimalType(3, 2), + NumberTypeClass, + ), + (DoubleType(), NumberTypeClass), + (FixedType(4), FixedTypeClass), + (FloatType(), NumberTypeClass), + (IntegerType(), NumberTypeClass), + (LongType(), NumberTypeClass), + (StringType(), StringTypeClass), + ( + TimestampType(), + TimeTypeClass, + ), + ( + TimestamptzType(), + TimeTypeClass, + ), + (TimeType(), TimeTypeClass), + ( + UUIDType(), + StringTypeClass, + ), + ], ) - iceberg_source_instance = with_iceberg_source() - schema_fields = iceberg_source_instance._get_schema_fields_for_column(list_column) - assert len(schema_fields) == 1, f"Expected 1 field, but got {len(schema_fields)}" - assert_field( - schema_fields[0], list_column.doc, list_column.is_optional, ArrayTypeClass + def test_iceberg_primitive_type_to_schema_field( + iceberg_type: PrimitiveType, expected_schema_field_type: Any + ) -> None: + """ + Test converting a primitive typed Iceberg field to a SchemaField + """ + iceberg_source_instance = with_iceberg_source() + for column in [ + NestedField( + 1, "required_field", iceberg_type, True, "required field documentation" + ), + NestedField( + 1, "optional_field", iceberg_type, False, "optional field documentation" + ), + ]: + schema = Schema(column) + schema_fields = iceberg_source_instance._get_schema_fields_for_schema( + schema + ) + assert ( + len(schema_fields) == 1 + ), f"Expected 1 field, but got {len(schema_fields)}" + assert_field( + schema_fields[0], + column.doc, + column.optional, + expected_schema_field_type, + ) + + @pytest.mark.parametrize( + "iceberg_type, expected_array_nested_type", + [ + (BinaryType(), "bytes"), + (BooleanType(), "boolean"), + (DateType(), "date"), + ( + DecimalType(3, 2), + "decimal", + ), + (DoubleType(), "double"), + (FixedType(4), "fixed"), + (FloatType(), "float"), + (IntegerType(), "int"), + (LongType(), "long"), + (StringType(), "string"), + ( + TimestampType(), + "timestamp-micros", + ), + ( + TimestamptzType(), + "timestamp-micros", + ), + (TimeType(), "time-micros"), + ( + UUIDType(), + "uuid", + ), + ], ) - assert isinstance( - schema_fields[0].type.type, ArrayType - ), f"Field type {schema_fields[0].type.type} was expected to be {ArrayType}" - arrayType: ArrayType = schema_fields[0].type.type - assert arrayType.nestedType == [ - expected_array_nested_type - ], f"List Field nested type {arrayType.nestedType} was expected to be {expected_array_nested_type}" - - -@pytest.mark.parametrize( - "iceberg_type, expected_map_type", - [ - (IcebergTypes.BinaryType.get(), BytesTypeClass), - (IcebergTypes.BooleanType.get(), BooleanTypeClass), - (IcebergTypes.DateType.get(), DateTypeClass), - ( - IcebergTypes.DecimalType.of(3, 2), - NumberTypeClass, - ), - (IcebergTypes.DoubleType.get(), NumberTypeClass), - (IcebergTypes.FixedType.of_length(4), FixedTypeClass), - (IcebergTypes.FloatType.get(), NumberTypeClass), - (IcebergTypes.IntegerType.get(), NumberTypeClass), - (IcebergTypes.LongType.get(), NumberTypeClass), - (IcebergTypes.StringType.get(), StringTypeClass), - ( - IcebergTypes.TimestampType.with_timezone(), - TimeTypeClass, - ), - ( - IcebergTypes.TimestampType.without_timezone(), - TimeTypeClass, - ), - (IcebergTypes.TimeType.get(), TimeTypeClass), - ( - IcebergTypes.UUIDType.get(), - StringTypeClass, - ), - ], -) -def test_iceberg_map_to_schema_field( - iceberg_type: 
IcebergTypes.PrimitiveType, expected_map_type: Any -) -> None: - """ - Test converting a map typed Iceberg field to a MapType SchemaField, where the key is the same type as the value. - """ - map_column: NestedField = NestedField.required( - 1, - "mapField", - IcebergTypes.MapType.of_required(11, 12, iceberg_type, iceberg_type), - "documentation", + def test_iceberg_list_to_schema_field( + iceberg_type: PrimitiveType, expected_array_nested_type: Any + ) -> None: + """ + Test converting a list typed Iceberg field to an ArrayType SchemaField, including the list nested type. + """ + for list_column in [ + NestedField( + 1, + "listField", + ListType(2, iceberg_type, True), + True, + "required field, required element documentation", + ), + NestedField( + 1, + "listField", + ListType(2, iceberg_type, False), + True, + "required field, optional element documentation", + ), + NestedField( + 1, + "listField", + ListType(2, iceberg_type, True), + False, + "optional field, required element documentation", + ), + NestedField( + 1, + "listField", + ListType(2, iceberg_type, False), + False, + "optional field, optional element documentation", + ), + ]: + iceberg_source_instance = with_iceberg_source() + schema = Schema(list_column) + schema_fields = iceberg_source_instance._get_schema_fields_for_schema( + schema + ) + assert ( + len(schema_fields) == 1 + ), f"Expected 1 field, but got {len(schema_fields)}" + assert_field( + schema_fields[0], list_column.doc, list_column.optional, ArrayTypeClass + ) + assert isinstance( + schema_fields[0].type.type, ArrayType + ), f"Field type {schema_fields[0].type.type} was expected to be {ArrayType}" + arrayType: ArrayType = schema_fields[0].type.type + assert arrayType.nestedType == [ + expected_array_nested_type + ], f"List Field nested type {arrayType.nestedType} was expected to be {expected_array_nested_type}" + + @pytest.mark.parametrize( + "iceberg_type, expected_map_type", + [ + (BinaryType(), BytesTypeClass), + (BooleanType(), BooleanTypeClass), + (DateType(), DateTypeClass), + ( + DecimalType(3, 2), + NumberTypeClass, + ), + (DoubleType(), NumberTypeClass), + (FixedType(4), FixedTypeClass), + (FloatType(), NumberTypeClass), + (IntegerType(), NumberTypeClass), + (LongType(), NumberTypeClass), + (StringType(), StringTypeClass), + ( + TimestampType(), + TimeTypeClass, + ), + ( + TimestamptzType(), + TimeTypeClass, + ), + (TimeType(), TimeTypeClass), + ( + UUIDType(), + StringTypeClass, + ), + ], ) - iceberg_source_instance = with_iceberg_source() - schema_fields = iceberg_source_instance._get_schema_fields_for_column(map_column) - # Converting an Iceberg Map type will be done by creating an array of struct(key, value) records. - # The first field will be the array. - assert len(schema_fields) == 3, f"Expected 3 fields, but got {len(schema_fields)}" - assert_field( - schema_fields[0], map_column.doc, map_column.is_optional, ArrayTypeClass + def test_iceberg_map_to_schema_field( + iceberg_type: PrimitiveType, expected_map_type: Any + ) -> None: + """ + Test converting a map typed Iceberg field to a MapType SchemaField, where the key is the same type as the value. 
+ """ + for map_column in [ + NestedField( + 1, + "mapField", + MapType(11, iceberg_type, 12, iceberg_type, True), + True, + "required field, required value documentation", + ), + NestedField( + 1, + "mapField", + MapType(11, iceberg_type, 12, iceberg_type, False), + True, + "required field, optional value documentation", + ), + NestedField( + 1, + "mapField", + MapType(11, iceberg_type, 12, iceberg_type, True), + False, + "optional field, required value documentation", + ), + NestedField( + 1, + "mapField", + MapType(11, iceberg_type, 12, iceberg_type, False), + False, + "optional field, optional value documentation", + ), + ]: + iceberg_source_instance = with_iceberg_source() + schema = Schema(map_column) + schema_fields = iceberg_source_instance._get_schema_fields_for_schema( + schema + ) + # Converting an Iceberg Map type will be done by creating an array of struct(key, value) records. + # The first field will be the array. + assert ( + len(schema_fields) == 3 + ), f"Expected 3 fields, but got {len(schema_fields)}" + assert_field( + schema_fields[0], map_column.doc, map_column.optional, ArrayTypeClass + ) + + # The second field will be the key type + assert_field(schema_fields[1], None, False, expected_map_type) + + # The third field will be the value type + assert_field( + schema_fields[2], + None, + not map_column.field_type.value_required, + expected_map_type, + ) + + @pytest.mark.parametrize( + "iceberg_type, expected_schema_field_type", + [ + (BinaryType(), BytesTypeClass), + (BooleanType(), BooleanTypeClass), + (DateType(), DateTypeClass), + ( + DecimalType(3, 2), + NumberTypeClass, + ), + (DoubleType(), NumberTypeClass), + (FixedType(4), FixedTypeClass), + (FloatType(), NumberTypeClass), + (IntegerType(), NumberTypeClass), + (LongType(), NumberTypeClass), + (StringType(), StringTypeClass), + ( + TimestampType(), + TimeTypeClass, + ), + ( + TimestamptzType(), + TimeTypeClass, + ), + (TimeType(), TimeTypeClass), + ( + UUIDType(), + StringTypeClass, + ), + ], ) + def test_iceberg_struct_to_schema_field( + iceberg_type: PrimitiveType, expected_schema_field_type: Any + ) -> None: + """ + Test converting a struct typed Iceberg field to a RecordType SchemaField. 
+ """ + field1 = NestedField(11, "field1", iceberg_type, True, "field documentation") + struct_column = NestedField( + 1, "structField", StructType(field1), True, "struct documentation" + ) + iceberg_source_instance = with_iceberg_source() + schema = Schema(struct_column) + schema_fields = iceberg_source_instance._get_schema_fields_for_schema(schema) + assert ( + len(schema_fields) == 2 + ), f"Expected 2 fields, but got {len(schema_fields)}" + assert_field( + schema_fields[0], struct_column.doc, struct_column.optional, RecordTypeClass + ) + assert_field( + schema_fields[1], field1.doc, field1.optional, expected_schema_field_type + ) - # The second field will be the key type - assert_field(schema_fields[1], None, False, expected_map_type) - - # The third field will be the value type - assert_field(schema_fields[2], None, True, expected_map_type) - - -@pytest.mark.parametrize( - "iceberg_type, expected_schema_field_type", - [ - (IcebergTypes.BinaryType.get(), BytesTypeClass), - (IcebergTypes.BooleanType.get(), BooleanTypeClass), - (IcebergTypes.DateType.get(), DateTypeClass), - ( - IcebergTypes.DecimalType.of(3, 2), - NumberTypeClass, - ), - (IcebergTypes.DoubleType.get(), NumberTypeClass), - (IcebergTypes.FixedType.of_length(4), FixedTypeClass), - (IcebergTypes.FloatType.get(), NumberTypeClass), - (IcebergTypes.IntegerType.get(), NumberTypeClass), - (IcebergTypes.LongType.get(), NumberTypeClass), - (IcebergTypes.StringType.get(), StringTypeClass), - ( - IcebergTypes.TimestampType.with_timezone(), - TimeTypeClass, - ), - ( - IcebergTypes.TimestampType.without_timezone(), - TimeTypeClass, - ), - (IcebergTypes.TimeType.get(), TimeTypeClass), - ( - IcebergTypes.UUIDType.get(), - StringTypeClass, - ), - ], -) -def test_iceberg_struct_to_schema_field( - iceberg_type: IcebergTypes.PrimitiveType, expected_schema_field_type: Any -) -> None: - """ - Test converting a struct typed Iceberg field to a RecordType SchemaField. 
- """ - field1: NestedField = NestedField.required( - 11, "field1", iceberg_type, "field documentation" - ) - struct_column: NestedField = NestedField.required( - 1, "structField", IcebergTypes.StructType.of([field1]), "struct documentation" + @pytest.mark.parametrize( + "value_type, value, expected_value", + [ + (BinaryType(), bytes([1, 2, 3, 4, 5]), "b'\\x01\\x02\\x03\\x04\\x05'"), + (BooleanType(), True, "True"), + (DateType(), 19543, "2023-07-05"), + (DecimalType(3, 2), Decimal((0, (3, 1, 4), -2)), "3.14"), + (DoubleType(), 3.4, "3.4"), + (FixedType(4), bytes([1, 2, 3, 4]), "b'\\x01\\x02\\x03\\x04'"), + (FloatType(), 3.4, "3.4"), + (IntegerType(), 3, "3"), + (LongType(), 4294967295000, "4294967295000"), + (StringType(), "a string", "a string"), + ( + TimestampType(), + 1688559488157000, + "2023-07-05T12:18:08.157000", + ), + ( + TimestamptzType(), + 1688559488157000, + "2023-07-05T12:18:08.157000+00:00", + ), + (TimeType(), 40400000000, "11:13:20"), + ( + UUIDType(), + uuid.UUID("00010203-0405-0607-0809-0a0b0c0d0e0f"), + "00010203-0405-0607-0809-0a0b0c0d0e0f", + ), + ], ) - iceberg_source_instance = with_iceberg_source() - schema_fields = iceberg_source_instance._get_schema_fields_for_column(struct_column) - assert len(schema_fields) == 2, f"Expected 2 fields, but got {len(schema_fields)}" - assert_field( - schema_fields[0], struct_column.doc, struct_column.is_optional, RecordTypeClass - ) - assert_field( - schema_fields[1], field1.doc, field1.is_optional, expected_schema_field_type - ) - - -def test_avro_decimal_bytes_nullable(): - """ - The following test exposes a problem with decimal (bytes) not preserving extra attributes like _nullable. Decimal (fixed) and Boolean for example do. - NOTE: This bug was by-passed by mapping the Decimal type to fixed instead of bytes. - """ - import avro.schema + def test_iceberg_profiler_value_render( + value_type: IcebergType, value: Any, expected_value: Optional[str] + ) -> None: + iceberg_profiler_instance = with_iceberg_profiler() + assert ( + iceberg_profiler_instance._render_value("a.dataset", value_type, value) + == expected_value + ) - decimal_avro_schema_string = """{"type": "record", "name": "__struct_", "fields": [{"type": {"type": "bytes", "precision": 3, "scale": 2, "logicalType": "decimal", "native_data_type": "decimal(3, 2)", "_nullable": false}, "name": "required_field", "doc": "required field documentation"}]}""" - decimal_avro_schema = avro.schema.parse(decimal_avro_schema_string) - print("\nDecimal (bytes)") - print( - f"Original avro schema string: {decimal_avro_schema_string}" - ) - print(f"After avro parsing, _nullable attribute is missing: {decimal_avro_schema}") + def test_avro_decimal_bytes_nullable() -> None: + """ + The following test exposes a problem with decimal (bytes) not preserving extra attributes like _nullable. Decimal (fixed) and Boolean for example do. + NOTE: This bug was by-passed by mapping the Decimal type to fixed instead of bytes. 
+ """ + import avro.schema + + decimal_avro_schema_string = """{"type": "record", "name": "__struct_", "fields": [{"type": {"type": "bytes", "precision": 3, "scale": 2, "logicalType": "decimal", "native_data_type": "decimal(3, 2)", "_nullable": false}, "name": "required_field", "doc": "required field documentation"}]}""" + decimal_avro_schema = avro.schema.parse(decimal_avro_schema_string) + print("\nDecimal (bytes)") + print( + f"Original avro schema string: {decimal_avro_schema_string}" + ) + print( + f"After avro parsing, _nullable attribute is missing: {decimal_avro_schema}" + ) - decimal_fixed_avro_schema_string = """{"type": "record", "name": "__struct_", "fields": [{"type": {"type": "fixed", "logicalType": "decimal", "precision": 3, "scale": 2, "native_data_type": "decimal(3, 2)", "_nullable": false, "name": "bogusName", "size": 16}, "name": "required_field", "doc": "required field documentation"}]}""" - decimal_fixed_avro_schema = avro.schema.parse(decimal_fixed_avro_schema_string) - print("\nDecimal (fixed)") - print( - f"Original avro schema string: {decimal_fixed_avro_schema_string}" - ) - print( - f"After avro parsing, _nullable attribute is preserved: {decimal_fixed_avro_schema}" - ) + decimal_fixed_avro_schema_string = """{"type": "record", "name": "__struct_", "fields": [{"type": {"type": "fixed", "logicalType": "decimal", "precision": 3, "scale": 2, "native_data_type": "decimal(3, 2)", "_nullable": false, "name": "bogusName", "size": 16}, "name": "required_field", "doc": "required field documentation"}]}""" + decimal_fixed_avro_schema = avro.schema.parse(decimal_fixed_avro_schema_string) + print("\nDecimal (fixed)") + print( + f"Original avro schema string: {decimal_fixed_avro_schema_string}" + ) + print( + f"After avro parsing, _nullable attribute is preserved: {decimal_fixed_avro_schema}" + ) - boolean_avro_schema_string = """{"type": "record", "name": "__struct_", "fields": [{"type": {"type": "boolean", "native_data_type": "boolean", "_nullable": false}, "name": "required_field", "doc": "required field documentation"}]}""" - boolean_avro_schema = avro.schema.parse(boolean_avro_schema_string) - print("\nBoolean") - print( - f"Original avro schema string: {boolean_avro_schema_string}" - ) - print( - f"After avro parsing, _nullable attribute is preserved: {boolean_avro_schema}" - ) + boolean_avro_schema_string = """{"type": "record", "name": "__struct_", "fields": [{"type": {"type": "boolean", "native_data_type": "boolean", "_nullable": false}, "name": "required_field", "doc": "required field documentation"}]}""" + boolean_avro_schema = avro.schema.parse(boolean_avro_schema_string) + print("\nBoolean") + print( + f"Original avro schema string: {boolean_avro_schema_string}" + ) + print( + f"After avro parsing, _nullable attribute is preserved: {boolean_avro_schema}" + ) diff --git a/metadata-ingestion/tests/unit/test_usage_common.py b/metadata-ingestion/tests/unit/test_usage_common.py index 8c9c25593afa8..1e2b2b6999177 100644 --- a/metadata-ingestion/tests/unit/test_usage_common.py +++ b/metadata-ingestion/tests/unit/test_usage_common.py @@ -1,6 +1,5 @@ import time from datetime import datetime -from unittest import mock import pytest from freezegun import freeze_time @@ -12,6 +11,7 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.usage.usage_common import ( + DEFAULT_QUERIES_CHARACTER_LIMIT, BaseUsageConfig, GenericAggregatedDataset, convert_usage_aggregation_class, @@ 
-183,6 +183,7 @@ def test_make_usage_workunit(): top_n_queries=10, format_sql_queries=False, include_top_n_queries=True, + queries_character_limit=DEFAULT_QUERIES_CHARACTER_LIMIT, ) ts_timestamp = int(floored_ts.timestamp() * 1000) @@ -218,6 +219,7 @@ def test_query_formatting(): top_n_queries=10, format_sql_queries=True, include_top_n_queries=True, + queries_character_limit=DEFAULT_QUERIES_CHARACTER_LIMIT, ) ts_timestamp = int(floored_ts.timestamp() * 1000) assert ( @@ -234,7 +236,7 @@ def test_query_trimming(): test_email: str = "test_email@test.com" test_query: str = "select * from test where a > 10 and b > 20 order by a asc" top_n_queries: int = 10 - total_budget_for_query_list: int = 200 + queries_character_limit: int = 200 event_time = datetime(2020, 1, 1) floored_ts = get_time_bucket(event_time, BucketDuration.DAY) resource = "test_db.test_schema.test_table" @@ -251,7 +253,7 @@ def test_query_trimming(): top_n_queries=top_n_queries, format_sql_queries=False, include_top_n_queries=True, - total_budget_for_query_list=total_budget_for_query_list, + queries_character_limit=queries_character_limit, ) ts_timestamp = int(floored_ts.timestamp() * 1000) @@ -267,11 +269,7 @@ def test_query_trimming(): def test_top_n_queries_validator_fails(): with pytest.raises(ValidationError) as excinfo: - with mock.patch( - "datahub.ingestion.source.usage.usage_common.TOTAL_BUDGET_FOR_QUERY_LIST", - 20, - ): - BaseUsageConfig(top_n_queries=2) + BaseUsageConfig(top_n_queries=2, queries_character_limit=20) assert "top_n_queries is set to 2 but it can be maximum 1" in str(excinfo.value) @@ -294,6 +292,7 @@ def test_make_usage_workunit_include_top_n_queries(): top_n_queries=10, format_sql_queries=False, include_top_n_queries=False, + queries_character_limit=DEFAULT_QUERIES_CHARACTER_LIMIT, ) ts_timestamp = int(floored_ts.timestamp() * 1000) diff --git a/metadata-integration/java/datahub-client/build.gradle b/metadata-integration/java/datahub-client/build.gradle index 82273427974af..e304bb5329c62 100644 --- a/metadata-integration/java/datahub-client/build.gradle +++ b/metadata-integration/java/datahub-client/build.gradle @@ -26,12 +26,11 @@ tasks.withType(Test).configureEach { } dependencies { - implementation project(':metadata-models') implementation(externalDependency.kafkaAvroSerializer) { exclude group: "org.apache.avro" } - compile externalDependency.avro_1_7 + implementation externalDependency.avro_1_7 constraints { implementation('commons-collections:commons-collections:3.2.2') { because 'Vulnerability Issue' @@ -48,12 +47,14 @@ dependencies { implementation externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok - testCompile externalDependency.mockito - testCompile externalDependency.mockServer - testCompile externalDependency.mockServerClient - testCompile externalDependency.testContainers - testCompile externalDependency.httpAsyncClient - testRuntime externalDependency.logbackClassic + // VisibleForTesting + compileOnly externalDependency.guava + testImplementation externalDependency.mockito + testImplementation externalDependency.mockServer + testImplementation externalDependency.mockServerClient + testImplementation externalDependency.testContainers + testImplementation externalDependency.httpAsyncClient + testRuntimeOnly externalDependency.logbackClassic swaggerCodegen 'io.swagger.codegen.v3:swagger-codegen-cli:3.0.33' } @@ -139,6 +140,7 @@ checkShadowJar { assemble { dependsOn shadowJar } +compileTestJava.dependsOn shadowJar task 
sourcesJar(type: Jar) { archiveClassifier = 'sources' @@ -231,6 +233,7 @@ tasks.register('generateOpenApiPojos', GenerateSwaggerCode) { } compileJava.dependsOn generateOpenApiPojos +processResources.dependsOn generateOpenApiPojos sourceSets.main.java.srcDir "${generateOpenApiPojos.outputDir}/src/main/java" sourceSets.main.resources.srcDir "${generateOpenApiPojos.outputDir}/src/main/resources" diff --git a/metadata-integration/java/datahub-protobuf/build.gradle b/metadata-integration/java/datahub-protobuf/build.gradle index fa33e6baab534..bc919119f8fac 100644 --- a/metadata-integration/java/datahub-protobuf/build.gradle +++ b/metadata-integration/java/datahub-protobuf/build.gradle @@ -30,6 +30,7 @@ dependencies { implementation project(':metadata-models') implementation project(path: ':metadata-integration:java:datahub-client', configuration: 'shadow') + implementation externalDependency.guava implementation externalDependency.protobuf implementation externalDependency.jgrapht implementation externalDependency.gson diff --git a/metadata-integration/java/examples/build.gradle b/metadata-integration/java/examples/build.gradle index b9e8e253dc359..581e9f82da0dc 100644 --- a/metadata-integration/java/examples/build.gradle +++ b/metadata-integration/java/examples/build.gradle @@ -24,7 +24,7 @@ dependencies { implementation project(path: ':li-utils') implementation project(path: ':metadata-models') - compile project(path: ':metadata-integration:java:datahub-client', configuration: 'shadow') + implementation project(path: ':metadata-integration:java:datahub-client', configuration: 'shadow') implementation externalDependency.httpAsyncClient // Tests need a concrete log4j available. Providing it here diff --git a/metadata-integration/java/spark-lineage/build.gradle b/metadata-integration/java/spark-lineage/build.gradle index 7257cf0aabc35..7143ac4833143 100644 --- a/metadata-integration/java/spark-lineage/build.gradle +++ b/metadata-integration/java/spark-lineage/build.gradle @@ -145,7 +145,7 @@ assemble { dependsOn shadowJar } -task integrationTest(type: Exec, dependsOn: [shadowJar, ':docker:quickstart'] ) { +task integrationTest(type: Exec, dependsOn: [shadowJar, ':docker:quickstartSlim'] ) { environment "RUN_QUICKSTART", "false" commandLine "spark-smoke-test/smoke.sh" } diff --git a/metadata-io/build.gradle b/metadata-io/build.gradle index 245edc6f55c3a..d2b584ceb6745 100644 --- a/metadata-io/build.gradle +++ b/metadata-io/build.gradle @@ -1,4 +1,4 @@ -apply plugin: 'java' +apply plugin: 'java-library' apply plugin: 'org.hidetake.swagger.generator' configurations { @@ -6,65 +6,70 @@ configurations { } dependencies { + implementation project(':entity-registry') + api project(':metadata-utils') + api project(':metadata-events:mxe-avro-1.7') + api project(':metadata-events:mxe-registration') + api project(':metadata-events:mxe-utils-avro-1.7') + api project(':metadata-models') + api project(':metadata-service:restli-client') + api project(':metadata-service:configuration') + api project(':metadata-service:services') + + implementation spec.product.pegasus.data + implementation spec.product.pegasus.generator + + implementation externalDependency.guava + implementation externalDependency.reflections implementation externalDependency.jsonPatch - compile project(':entity-registry') - compile project(':metadata-utils') - compile project(':metadata-events:mxe-avro-1.7') - compile project(':metadata-events:mxe-registration') - compile project(':metadata-events:mxe-utils-avro-1.7') - compile 
project(':metadata-models') - compile project(':metadata-service:restli-client') - compile project(':metadata-service:configuration') - compile project(':metadata-service:services') - - compile spec.product.pegasus.data - compile spec.product.pegasus.generator - - compile externalDependency.dgraph4j exclude group: 'com.google.guava', module: 'guava' + api externalDependency.dgraph4j exclude group: 'com.google.guava', module: 'guava' implementation externalDependency.slf4jApi - testImplementation project(':metadata-integration:java:datahub-client') - runtime externalDependency.logbackClassic + runtimeOnly externalDependency.logbackClassic compileOnly externalDependency.lombok implementation externalDependency.commonsCollections - compile externalDependency.datastaxOssNativeProtocol - compile externalDependency.datastaxOssCore - compile externalDependency.datastaxOssQueryBuilder - compile externalDependency.elasticSearchRest - compile externalDependency.elasticSearchTransport - compile externalDependency.javatuples - compile externalDependency.javaxValidation + api externalDependency.datastaxOssNativeProtocol + api externalDependency.datastaxOssCore + api externalDependency.datastaxOssQueryBuilder + api externalDependency.elasticSearchRest + api externalDependency.elasticSearchTransport + implementation externalDependency.javatuples + api externalDependency.javaxValidation runtimeOnly externalDependency.jna - compile externalDependency.kafkaClients - compile externalDependency.ebean + api externalDependency.kafkaClients + api externalDependency.ebean enhance externalDependency.ebeanAgent implementation externalDependency.ebeanDdl - compile externalDependency.opentelemetryAnnotations - compile externalDependency.resilience4j - compile externalDependency.springContext - compile externalDependency.swaggerAnnotations + implementation externalDependency.opentelemetryAnnotations + implementation externalDependency.resilience4j + api externalDependency.springContext + implementation externalDependency.swaggerAnnotations swaggerCodegen 'io.swagger.codegen.v3:swagger-codegen-cli:3.0.33' - compile (externalDependency.mixpanel) { + implementation(externalDependency.mixpanel) { exclude group: 'org.json', module: 'json' } annotationProcessor externalDependency.lombok - testCompile externalDependency.testng - testCompile externalDependency.h2 - testCompile externalDependency.mysqlConnector - testCompile externalDependency.neo4jHarness - testCompile externalDependency.mockito - testCompile externalDependency.mockitoInline - testCompile externalDependency.iStackCommons - testCompile externalDependency.resilience4j - testCompile externalDependency.testContainers - testCompile externalDependency.testContainersJunit - testCompile externalDependency.testContainersElasticsearch - testCompile externalDependency.testContainersCassandra - testCompile externalDependency.lombok - testCompile project(':test-models') - testImplementation externalDependency.springBootTest + testImplementation project(':test-models') + testImplementation project(path: ':test-models', configuration: 'testDataTemplate') testImplementation project(':datahub-graphql-core') + testImplementation project(path: ':metadata-integration:java:datahub-client', configuration: 'shadow') + testImplementation externalDependency.testng + testImplementation externalDependency.h2 + testImplementation externalDependency.mysqlConnector + testImplementation externalDependency.neo4jHarness + testImplementation externalDependency.mockito + testImplementation 
externalDependency.mockitoInline + testImplementation externalDependency.iStackCommons + testImplementation externalDependency.resilience4j + testImplementation externalDependency.testContainers + testImplementation externalDependency.testContainersJunit + testImplementation externalDependency.testContainersElasticsearch + testImplementation externalDependency.testContainersCassandra + testImplementation externalDependency.lombok + testImplementation externalDependency.springBootTest + testImplementation spec.product.pegasus.restliServer + // logback >=1.3 required due to `testcontainers` only testImplementation 'ch.qos.logback:logback-classic:1.4.7' testImplementation 'net.datafaker:datafaker:1.9.0' @@ -137,6 +142,7 @@ tasks.register('generateOpenApiPojos', GenerateSwaggerCode) { } compileJava.dependsOn generateOpenApiPojos +processResources.dependsOn generateOpenApiPojos sourceSets.main.java.srcDir "${generateOpenApiPojos.outputDir}/src/main/java" sourceSets.main.resources.srcDir "${generateOpenApiPojos.outputDir}/src/main/resources" diff --git a/metadata-jobs/mae-consumer-job/build.gradle b/metadata-jobs/mae-consumer-job/build.gradle index d50da3059e9cb..51c758f434328 100644 --- a/metadata-jobs/mae-consumer-job/build.gradle +++ b/metadata-jobs/mae-consumer-job/build.gradle @@ -11,22 +11,27 @@ ext { } dependencies { + implementation project(':metadata-service:factories') implementation project(':metadata-jobs:mae-consumer') // TODO: Extract PE consumer into separate pod. implementation project(':metadata-jobs:pe-consumer') + implementation(externalDependency.springBootStarterWeb) { exclude module: "spring-boot-starter-tomcat" } implementation externalDependency.springBootStarterJetty implementation externalDependency.springKafka + implementation externalDependency.springBootAutoconfigure + implementation externalDependency.springActuator implementation externalDependency.slf4jApi implementation externalDependency.log4j2Api compileOnly externalDependency.lombok implementation externalDependency.logbackClassic + testImplementation project(':metadata-dao-impl:kafka-producer') testImplementation externalDependency.springBootTest - testCompile externalDependency.mockito - testCompile externalDependency.testng + testImplementation externalDependency.mockito + testImplementation externalDependency.testng } bootJar { diff --git a/metadata-jobs/mae-consumer/build.gradle b/metadata-jobs/mae-consumer/build.gradle index 26b3d82b8570a..69fe2255a6916 100644 --- a/metadata-jobs/mae-consumer/build.gradle +++ b/metadata-jobs/mae-consumer/build.gradle @@ -11,40 +11,41 @@ configurations { dependencies { avro project(path: ':metadata-models', configuration: 'avroSchema') - compile project(':li-utils') - compile (project(':metadata-service:factories')) { + implementation project(':li-utils') + implementation(project(':metadata-service:factories')) { exclude group: 'org.neo4j.test' } - compile project(':metadata-service:auth-config') - compile project(':metadata-service:restli-client') - compile project(':metadata-io') - compile project(':ingestion-scheduler') - compile project(':metadata-utils') - compile project(":entity-registry") - compile project(':metadata-events:mxe-avro-1.7') - compile project(':metadata-events:mxe-registration') - compile project(':metadata-events:mxe-utils-avro-1.7') + implementation project(':metadata-service:auth-config') + implementation project(':metadata-service:restli-client') + implementation project(':metadata-io') + implementation project(':ingestion-scheduler') + 
implementation project(':metadata-utils') + implementation project(":entity-registry") + implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-events:mxe-registration') + implementation project(':metadata-events:mxe-utils-avro-1.7') + implementation project(':datahub-graphql-core') - compile externalDependency.elasticSearchRest - compile externalDependency.kafkaAvroSerde + implementation externalDependency.elasticSearchRest + implementation externalDependency.kafkaAvroSerde implementation externalDependency.protobuf - compile externalDependency.neo4jJavaDriver + implementation externalDependency.neo4jJavaDriver - compile externalDependency.springKafka - compile externalDependency.springActuator + implementation externalDependency.springKafka + implementation externalDependency.springActuator implementation externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok - runtime externalDependency.logbackClassic + runtimeOnly externalDependency.logbackClassic - testCompile externalDependency.mockito + testImplementation externalDependency.mockito implementation externalDependency.awsMskIamAuth testImplementation externalDependency.springBootTest - testRuntime externalDependency.logbackClassic + testRuntimeOnly externalDependency.logbackClassic } task avroSchemaSources(type: Copy) { diff --git a/metadata-jobs/mce-consumer-job/build.gradle b/metadata-jobs/mce-consumer-job/build.gradle index a5d8d0ce49eb9..daf41a1e0303e 100644 --- a/metadata-jobs/mce-consumer-job/build.gradle +++ b/metadata-jobs/mce-consumer-job/build.gradle @@ -21,6 +21,8 @@ dependencies { } implementation externalDependency.springBootStarterJetty implementation externalDependency.springKafka + implementation externalDependency.springBootAutoconfigure + implementation externalDependency.springActuator implementation spec.product.pegasus.restliDocgen implementation spec.product.pegasus.restliSpringBridge implementation externalDependency.slf4jApi @@ -28,15 +30,16 @@ dependencies { compileOnly externalDependency.lombok implementation externalDependency.logbackClassic - runtime externalDependency.mariadbConnector - runtime externalDependency.mysqlConnector - runtime externalDependency.postgresql + runtimeOnly externalDependency.mariadbConnector + runtimeOnly externalDependency.mysqlConnector + runtimeOnly externalDependency.postgresql annotationProcessor externalDependency.lombok + testImplementation project(':metadata-dao-impl:kafka-producer') testImplementation externalDependency.springBootTest - testCompile externalDependency.mockito - testCompile externalDependency.testng + testImplementation externalDependency.mockito + testImplementation externalDependency.testng } bootJar { diff --git a/metadata-jobs/mce-consumer/build.gradle b/metadata-jobs/mce-consumer/build.gradle index 467d1dbdd3717..0bca55e0e5f92 100644 --- a/metadata-jobs/mce-consumer/build.gradle +++ b/metadata-jobs/mce-consumer/build.gradle @@ -11,24 +11,24 @@ configurations { dependencies { avro project(path: ':metadata-models', configuration: 'avroSchema') - compile project(':li-utils') - compile (project(':metadata-service:factories')) { + implementation project(':li-utils') + implementation(project(':metadata-service:factories')) { exclude group: 'org.neo4j.test' } - compile project(':metadata-utils') - compile project(':metadata-events:mxe-schemas') - compile project(':metadata-events:mxe-avro-1.7') - compile project(':metadata-events:mxe-registration') - compile 
project(':metadata-events:mxe-utils-avro-1.7') - compile project(':metadata-io') - compile project(':metadata-service:restli-client') - compile spec.product.pegasus.restliClient - compile spec.product.pegasus.restliCommon - compile externalDependency.elasticSearchRest + implementation project(':metadata-utils') + implementation project(':metadata-events:mxe-schemas') + implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-events:mxe-registration') + implementation project(':metadata-events:mxe-utils-avro-1.7') + implementation project(':metadata-io') + implementation project(':metadata-service:restli-client') + implementation spec.product.pegasus.restliClient + implementation spec.product.pegasus.restliCommon + implementation externalDependency.elasticSearchRest implementation externalDependency.protobuf - compile externalDependency.springKafka - compile externalDependency.springActuator + implementation externalDependency.springKafka + implementation externalDependency.springActuator implementation externalDependency.slf4jApi compileOnly externalDependency.lombok diff --git a/metadata-jobs/pe-consumer/build.gradle b/metadata-jobs/pe-consumer/build.gradle index 517b021353f9d..1899a4de15635 100644 --- a/metadata-jobs/pe-consumer/build.gradle +++ b/metadata-jobs/pe-consumer/build.gradle @@ -9,21 +9,21 @@ configurations { dependencies { avro project(path: ':metadata-models', configuration: 'avroSchema') - compile project(':li-utils') - compile project(':metadata-events:mxe-avro-1.7') - compile project(':metadata-events:mxe-registration') - compile project(':metadata-events:mxe-utils-avro-1.7') - compile (project(':metadata-service:factories')) { + implementation project(':li-utils') + implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-events:mxe-registration') + implementation project(':metadata-events:mxe-utils-avro-1.7') + implementation(project(':metadata-service:factories')) { exclude group: 'org.neo4j.test' } - compile externalDependency.springKafka - compile externalDependency.springActuator + implementation externalDependency.springKafka + implementation externalDependency.springActuator implementation externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok - runtime externalDependency.logbackClassic - testCompile externalDependency.mockito - testRuntime externalDependency.logbackClassic + runtimeOnly externalDependency.logbackClassic + testImplementation externalDependency.mockito + testRuntimeOnly externalDependency.logbackClassic } task avroSchemaSources(type: Copy) { diff --git a/metadata-models-custom/build.gradle b/metadata-models-custom/build.gradle index 4af866502f5dc..95a00766039a8 100644 --- a/metadata-models-custom/build.gradle +++ b/metadata-models-custom/build.gradle @@ -11,10 +11,10 @@ buildscript { plugins { id 'base' + id 'maven-publish' } apply plugin: 'pegasus' - if (project.hasProperty('projVersion')) { project.version = project.projVersion } else { @@ -23,11 +23,11 @@ if (project.hasProperty('projVersion')) { dependencies { - compile spec.product.pegasus.data + implementation spec.product.pegasus.data // Uncomment these if you want to depend on models defined in core datahub - //compile project(':li-utils') + //implementation project(':li-utils') //dataModel project(':li-utils') - //compile project(':metadata-models') + //implementation project(':metadata-models') //dataModel project(':metadata-models') } @@ -69,6 +69,6 @@ task 
modelDeploy(type: Copy) { modelDeploy.dependsOn modelArtifact -install.dependsOn modelDeploy +publish.dependsOn modelDeploy diff --git a/metadata-models-validator/build.gradle b/metadata-models-validator/build.gradle index bd1ec9449fb19..c8d1d2e6651d6 100644 --- a/metadata-models-validator/build.gradle +++ b/metadata-models-validator/build.gradle @@ -1,13 +1,13 @@ apply plugin: 'java' dependencies { - compile project(":entity-registry") - compile spec.product.pegasus.data - compile spec.product.pegasus.generator + implementation project(":entity-registry") + implementation spec.product.pegasus.data + implementation spec.product.pegasus.generator - compile externalDependency.commonsIo - compile externalDependency.findbugsAnnotations - compile externalDependency.guava + implementation externalDependency.commonsIo + implementation externalDependency.findbugsAnnotations + implementation externalDependency.guava implementation externalDependency.slf4jApi runtimeOnly externalDependency.logbackClassic diff --git a/metadata-models/build.gradle b/metadata-models/build.gradle index 432823852a263..2e8efae9b7bce 100644 --- a/metadata-models/build.gradle +++ b/metadata-models/build.gradle @@ -1,6 +1,6 @@ import io.datahubproject.GenerateJsonSchemaTask - +apply plugin: 'java-library' apply plugin: 'pegasus' tasks.withType(JavaCompile).configureEach { @@ -15,16 +15,16 @@ tasks.withType(Test).configureEach { } dependencies { - compile spec.product.pegasus.data + api spec.product.pegasus.data constraints { implementation('org.apache.commons:commons-text:1.10.0') { because 'Vulnerability Issue' } } - compile project(':li-utils') + api project(':li-utils') dataModel project(':li-utils') - testCompile externalDependency.guava + testImplementation externalDependency.guava } mainAvroSchemaJar.dependsOn generateAvroSchema diff --git a/metadata-models/src/main/pegasus/com/linkedin/mxe/SystemMetadata.pdl b/metadata-models/src/main/pegasus/com/linkedin/mxe/SystemMetadata.pdl index b9cf7d58d434e..e0f355229c912 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/mxe/SystemMetadata.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/mxe/SystemMetadata.pdl @@ -14,6 +14,11 @@ record SystemMetadata { */ runId: optional string = "no-run-id-provided" + /** + * The ingestion pipeline id that produced the metadata. Populated in case of batch ingestion. 
+ */ + pipelineName: optional string + /** * The model registry name that was used to process this event */ diff --git a/metadata-service/auth-config/build.gradle b/metadata-service/auth-config/build.gradle index 2e9210804bed9..c7a1128897dd5 100644 --- a/metadata-service/auth-config/build.gradle +++ b/metadata-service/auth-config/build.gradle @@ -1,9 +1,9 @@ apply plugin: 'java' dependencies { - compile project(path: ':metadata-models') - compile project(path: ':metadata-auth:auth-api') - compile externalDependency.guava + implementation project(path: ':metadata-models') + implementation project(path: ':metadata-auth:auth-api') + implementation externalDependency.guava implementation externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok diff --git a/metadata-service/auth-filter/build.gradle b/metadata-service/auth-filter/build.gradle index 2c77850209205..2dd07ef10274c 100644 --- a/metadata-service/auth-filter/build.gradle +++ b/metadata-service/auth-filter/build.gradle @@ -1,15 +1,17 @@ apply plugin: 'java' dependencies { - compile project(':metadata-auth:auth-api'); - compile project(path: ':metadata-service:auth-config') - compile project(path: ':metadata-service:factories') + implementation project(':metadata-auth:auth-api') + implementation project(':metadata-service:auth-impl') + implementation project(path: ':metadata-service:auth-config') + implementation project(path: ':metadata-service:factories') - compile externalDependency.servletApi + implementation externalDependency.servletApi implementation externalDependency.slf4jApi compileOnly externalDependency.lombok - compile externalDependency.springWeb + implementation externalDependency.springWeb + implementation externalDependency.guice annotationProcessor externalDependency.lombok - testCompile externalDependency.mockito + testImplementation externalDependency.mockito } \ No newline at end of file diff --git a/metadata-service/auth-impl/build.gradle b/metadata-service/auth-impl/build.gradle index aefbf81577a9b..1ffeb99e7ad4a 100644 --- a/metadata-service/auth-impl/build.gradle +++ b/metadata-service/auth-impl/build.gradle @@ -6,11 +6,14 @@ compileJava { } dependencies { - compile project(path: ':metadata-models') - compile project(path: ':metadata-auth:auth-api') - compile project(path: ':metadata-service:auth-config') - compile project(path: ':metadata-io') - + implementation project(path: ':metadata-models') + implementation project(path: ':metadata-auth:auth-api') + implementation project(path: ':metadata-service:auth-config') + implementation project(path: ':metadata-io') + + implementation(externalDependency.mixpanel) { + exclude group: 'org.json', module: 'json' + } implementation 'io.jsonwebtoken:jjwt-api:0.11.2' runtimeOnly 'io.jsonwebtoken:jjwt-impl:0.11.2', 'io.jsonwebtoken:jjwt-jackson:0.11.2' @@ -20,6 +23,5 @@ dependencies { annotationProcessor externalDependency.lombok - testCompile externalDependency.mockito - + testImplementation externalDependency.mockito } \ No newline at end of file diff --git a/metadata-service/auth-servlet-impl/build.gradle b/metadata-service/auth-servlet-impl/build.gradle index 3338f3a5c6b94..7945b3b4e9a06 100644 --- a/metadata-service/auth-servlet-impl/build.gradle +++ b/metadata-service/auth-servlet-impl/build.gradle @@ -1,15 +1,17 @@ apply plugin: 'java' dependencies { - compile project(':metadata-auth:auth-api') - compile project(':metadata-service:factories') + implementation project(':metadata-auth:auth-api') + implementation 
project(':metadata-service:auth-impl') + implementation project(':metadata-service:factories') - compile externalDependency.springCore - compile externalDependency.springWeb - compile externalDependency.springWebMVC - compile externalDependency.graphqlJava - compile externalDependency.springBeans - compile externalDependency.springContext + implementation externalDependency.springCore + implementation externalDependency.springWeb + implementation externalDependency.springWebMVC + implementation externalDependency.graphqlJava + implementation externalDependency.springBeans + implementation externalDependency.springContext + implementation externalDependency.guice implementation externalDependency.slf4jApi compileOnly externalDependency.lombok diff --git a/metadata-service/configuration/build.gradle b/metadata-service/configuration/build.gradle index 8623e53d2554a..30fa3079d29a4 100644 --- a/metadata-service/configuration/build.gradle +++ b/metadata-service/configuration/build.gradle @@ -3,7 +3,7 @@ plugins { } dependencies { - compile externalDependency.jacksonDataBind + implementation externalDependency.jacksonDataBind implementation externalDependency.slf4jApi implementation externalDependency.springCore diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml index d21442d0bf5c8..f49498bfa2325 100644 --- a/metadata-service/configuration/src/main/resources/application.yml +++ b/metadata-service/configuration/src/main/resources/application.yml @@ -294,8 +294,8 @@ featureFlags: alwaysEmitChangeLog: ${ALWAYS_EMIT_CHANGE_LOG:false} # Enables always emitting a MCL even when no changes are detected. Used for Time Based Lineage when no changes occur. searchServiceDiffModeEnabled: ${SEARCH_SERVICE_DIFF_MODE_ENABLED:true} # Enables diff mode for search document writes, reduces amount of writes to ElasticSearch documents for no-ops readOnlyModeEnabled: ${READ_ONLY_MODE_ENABLED:false} # Enables read only mode for an instance. Right now this only affects ability to edit user profile image URL but can be extended - showSearchFiltersV2: ${SHOW_SEARCH_FILTERS_V2:false} # Enables showing the search filters V2 experience. - showBrowseV2: ${SHOW_BROWSE_V2:false} # Enables showing the browse v2 sidebar experience. + showSearchFiltersV2: ${SHOW_SEARCH_FILTERS_V2:true} # Enables showing the search filters V2 experience. + showBrowseV2: ${SHOW_BROWSE_V2:true} # Enables showing the browse v2 sidebar experience. preProcessHooks: uiEnabled: ${PRE_PROCESS_HOOKS_UI_ENABLED:true} # Circumvents Kafka for processing index updates for UI changes sourced from GraphQL to avoid processing delays showAcrylInfo: ${SHOW_ACRYL_INFO:false} # Show different CTAs within DataHub around moving to Managed DataHub. Set to true for the demo site. 
diff --git a/metadata-service/factories/build.gradle b/metadata-service/factories/build.gradle index 8e9b859e3b136..f848a5e339781 100644 --- a/metadata-service/factories/build.gradle +++ b/metadata-service/factories/build.gradle @@ -1,54 +1,58 @@ -apply plugin: 'java' +apply plugin: 'java-library' apply from: "../../gradle/versioning/versioning.gradle" dependencies { - compile project(':metadata-io') - compile project(':metadata-utils') - compile project(':metadata-service:auth-impl') - compile project(':metadata-service:auth-config') - compile project(':metadata-service:plugin') - compile project(':metadata-service:configuration') - compile project(':datahub-graphql-core') - compile project(':metadata-service:restli-servlet-impl') - compile project(':metadata-dao-impl:kafka-producer') - compile project(':ingestion-scheduler') + api project(':metadata-io') + api project(':metadata-utils') + implementation project(':metadata-service:auth-impl') + api project(':metadata-service:auth-config') + api project(':metadata-service:plugin') + api project(':metadata-service:configuration') + implementation project(':datahub-graphql-core') + implementation project(':metadata-service:restli-servlet-impl') + implementation project(':metadata-dao-impl:kafka-producer') + implementation project(':ingestion-scheduler') - compile (externalDependency.awsGlueSchemaRegistrySerde) { + implementation (externalDependency.awsGlueSchemaRegistrySerde) { exclude group: 'org.json', module: 'json' } - compile externalDependency.elasticSearchRest - compile externalDependency.httpClient - compile externalDependency.gson + implementation externalDependency.elasticSearchRest + implementation externalDependency.httpClient + implementation externalDependency.gson implementation (externalDependency.hazelcast) { exclude group: 'org.json', module: 'json' } - compile externalDependency.hazelcastSpring - compile externalDependency.kafkaClients - compile externalDependency.kafkaAvroSerde + implementation externalDependency.hazelcastSpring + implementation externalDependency.kafkaClients + implementation externalDependency.kafkaAvroSerde compileOnly externalDependency.lombok - compile externalDependency.servletApi - compile externalDependency.springBeans - compile externalDependency.springBootAutoconfigure - compile externalDependency.springBootStarterCache - compile externalDependency.springContext - compile externalDependency.springCore - compile externalDependency.springKafka - compile externalDependency.springWeb + implementation externalDependency.servletApi + api externalDependency.springBeans + implementation externalDependency.springBootAutoconfigure + implementation externalDependency.springBootStarterCache + api externalDependency.springContext + api externalDependency.springCore + api externalDependency.springKafka + api externalDependency.springWeb implementation externalDependency.awsPostgresIamAuth implementation externalDependency.awsRds + implementation(externalDependency.mixpanel) { + exclude group: 'org.json', module: 'json' + } annotationProcessor externalDependency.lombok - compile spec.product.pegasus.restliSpringBridge + implementation spec.product.pegasus.restliSpringBridge implementation spec.product.pegasus.restliDocgen + implementation externalDependency.jline + implementation externalDependency.common testImplementation externalDependency.springBootTest + testImplementation externalDependency.mockito + testImplementation externalDependency.testng + testImplementation externalDependency.hazelcastTest + 
testImplementation externalDependency.javatuples - testCompile externalDependency.mockito - testCompile externalDependency.testng - testCompile externalDependency.hazelcastTest - implementation externalDependency.jline - implementation externalDependency.common constraints { implementation(externalDependency.snappy) { diff --git a/metadata-service/graphql-servlet-impl/build.gradle b/metadata-service/graphql-servlet-impl/build.gradle index ff64f9a8a8233..52fd20ef32389 100644 --- a/metadata-service/graphql-servlet-impl/build.gradle +++ b/metadata-service/graphql-servlet-impl/build.gradle @@ -1,16 +1,19 @@ apply plugin: 'java' dependencies { - compile project(':datahub-graphql-core') - compile project(':metadata-auth:auth-api') - compile project(':metadata-service:factories') + implementation project(':datahub-graphql-core') + implementation project(':metadata-auth:auth-api') + implementation project(':metadata-service:auth-impl') + implementation project(':metadata-service:factories') - compile externalDependency.springCore - compile externalDependency.springWeb - compile externalDependency.springWebMVC - compile externalDependency.graphqlJava - compile externalDependency.springBeans - compile externalDependency.springContext + implementation externalDependency.servletApi + implementation externalDependency.springCore + implementation externalDependency.springWeb + implementation externalDependency.springWebMVC + implementation externalDependency.graphqlJava + implementation externalDependency.springBeans + implementation externalDependency.springContext + implementation externalDependency.guice implementation externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok diff --git a/metadata-service/health-servlet/build.gradle b/metadata-service/health-servlet/build.gradle index 3237c56779ada..6095f724b3cd4 100644 --- a/metadata-service/health-servlet/build.gradle +++ b/metadata-service/health-servlet/build.gradle @@ -2,16 +2,17 @@ apply plugin: 'java' dependencies { - compile project(':metadata-service:factories') + implementation project(':metadata-service:factories') - compile externalDependency.reflections - compile externalDependency.springBoot - compile externalDependency.springCore - compile externalDependency.springDocUI - compile externalDependency.springWeb - compile externalDependency.springWebMVC - compile externalDependency.springBeans - compile externalDependency.springContext + implementation externalDependency.guava + implementation externalDependency.reflections + implementation externalDependency.springBoot + implementation externalDependency.springCore + implementation externalDependency.springDocUI + implementation externalDependency.springWeb + implementation externalDependency.springWebMVC + implementation externalDependency.springBeans + implementation externalDependency.springContext implementation externalDependency.slf4jApi compileOnly externalDependency.lombok implementation externalDependency.antlr4Runtime diff --git a/metadata-service/health-servlet/src/main/java/com/datahub/health/controller/HealthCheckController.java b/metadata-service/health-servlet/src/main/java/com/datahub/health/controller/HealthCheckController.java index 45edcb2a6a5d9..02ca5182cd2be 100644 --- a/metadata-service/health-servlet/src/main/java/com/datahub/health/controller/HealthCheckController.java +++ b/metadata-service/health-servlet/src/main/java/com/datahub/health/controller/HealthCheckController.java @@ -10,6 +10,7 @@ import 
java.util.Map; import java.util.concurrent.TimeUnit; import java.util.function.Supplier; + import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequest; import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse; import org.elasticsearch.client.RequestOptions; diff --git a/metadata-service/openapi-servlet/build.gradle b/metadata-service/openapi-servlet/build.gradle index 7cd022f97247c..1909b4862d294 100644 --- a/metadata-service/openapi-servlet/build.gradle +++ b/metadata-service/openapi-servlet/build.gradle @@ -2,36 +2,38 @@ apply plugin: 'java' dependencies { - compile project(':metadata-auth:auth-api') - compile project(':metadata-service:factories') - compile project(':metadata-service:schema-registry-api') + implementation project(':metadata-auth:auth-api') + implementation project(':metadata-service:auth-impl') + implementation project(':metadata-service:factories') + implementation project(':metadata-service:schema-registry-api') - compile externalDependency.reflections - compile externalDependency.springBoot - compile externalDependency.springCore - compile(externalDependency.springDocUI) { + implementation externalDependency.reflections + implementation externalDependency.springBoot + implementation externalDependency.springCore + implementation(externalDependency.springDocUI) { exclude group: 'org.springframework.boot' } - compile externalDependency.springWeb - compile externalDependency.springWebMVC - compile externalDependency.springBeans - compile externalDependency.springContext + implementation externalDependency.springWeb + implementation externalDependency.springWebMVC + implementation externalDependency.springBeans + implementation externalDependency.springContext implementation externalDependency.slf4jApi compileOnly externalDependency.lombok implementation externalDependency.antlr4Runtime implementation externalDependency.antlr4 + implementation externalDependency.swaggerAnnotations annotationProcessor externalDependency.lombok testImplementation externalDependency.springBootTest testImplementation project(':mock-entity-registry') - testCompile externalDependency.springBoot - testCompile externalDependency.testContainers - testCompile externalDependency.springKafka - testCompile externalDependency.testng - testCompile externalDependency.mockito - testCompile externalDependency.logbackClassic - testCompile externalDependency.jacksonCore - testCompile externalDependency.jacksonDataBind - testCompile externalDependency.springBootStarterWeb + testImplementation externalDependency.springBoot + testImplementation externalDependency.testContainers + testImplementation externalDependency.springKafka + testImplementation externalDependency.testng + testImplementation externalDependency.mockito + testImplementation externalDependency.logbackClassic + testImplementation externalDependency.jacksonCore + testImplementation externalDependency.jacksonDataBind + testImplementation externalDependency.springBootStarterWeb } \ No newline at end of file diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/relationships/RelationshipsController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/relationships/RelationshipsController.java index 796a7774da303..1e37170f37b3b 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/relationships/RelationshipsController.java +++ 
b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/relationships/RelationshipsController.java @@ -18,8 +18,11 @@ import com.linkedin.metadata.search.utils.QueryUtils; import com.linkedin.metadata.utils.metrics.MetricUtils; import io.datahubproject.openapi.exception.UnauthorizedException; -import io.swagger.annotations.ApiOperation; +import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.Parameter; +import io.swagger.v3.oas.annotations.media.Content; +import io.swagger.v3.oas.annotations.media.Schema; +import io.swagger.v3.oas.annotations.responses.ApiResponse; import io.swagger.v3.oas.annotations.tags.Tag; import java.net.URLDecoder; import java.nio.charset.Charset; @@ -94,7 +97,8 @@ private RelatedEntitiesResult getRelatedEntities(String rawUrn, List rel } @GetMapping(value = "/", produces = MediaType.APPLICATION_JSON_VALUE) - @ApiOperation(code = 0, response = RelatedEntitiesResult.class, value = "") + @Operation(responses = { @ApiResponse(responseCode = "0", description = "", + content = @Content(schema = @Schema(implementation = RelatedEntitiesResult.class)))}) public ResponseEntity getRelationships( @Parameter(name = "urn", required = true, description = "The urn for the entity whose relationships are being queried") diff --git a/metadata-service/plugin/src/test/sample-test-plugins/build.gradle b/metadata-service/plugin/src/test/sample-test-plugins/build.gradle index 7d4b43402a586..f299a35db0f64 100644 --- a/metadata-service/plugin/src/test/sample-test-plugins/build.gradle +++ b/metadata-service/plugin/src/test/sample-test-plugins/build.gradle @@ -7,6 +7,7 @@ dependencies { implementation project(path: ':metadata-auth:auth-api') implementation externalDependency.lombok implementation externalDependency.logbackClassic; + implementation 'com.google.code.findbugs:jsr305:3.0.2' testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.1' testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine:5.8.1' diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json index e3beef5ac4871..ee6318026e27d 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json @@ -72,6 +72,11 @@ "doc" : "The run id that produced the metadata. Populated in case of batch-ingestion.", "default" : "no-run-id-provided", "optional" : true + }, { + "name" : "pipelineName", + "type" : "string", + "doc" : "The ingestion pipeline id that produced the metadata. 
Populated in case of batch ingestion.", + "optional" : true }, { "name" : "registryName", "type" : "string", @@ -342,7 +347,7 @@ "Searchable" : { "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1281,7 +1286,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1408,7 +1413,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1468,7 +1473,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1870,7 +1875,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "qualifiedName", @@ -1881,7 +1886,7 @@ "addToFilters" : false, "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -2067,7 +2072,7 @@ "enableAutocomplete" : true, "fieldName" : "displayName", "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "id", @@ -2104,7 +2109,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "definition", @@ -2296,7 +2301,7 @@ "optional" : true, "Searchable" : { "boostScore" : 10.0, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2349,7 +2354,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2412,7 +2417,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2505,7 +2510,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } } ], @@ -2525,7 +2530,7 @@ "boostScore" : 2.0, "enableAutocomplete" : true, "fieldName" : "ldap", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2571,7 +2576,7 @@ "doc" : "Unique Identifier of the data flow", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "cluster", @@ -2608,7 +2613,7 @@ "doc" : "Unique Identifier of the data job", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -3183,7 +3188,7 @@ "type" : "string", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -3201,7 +3206,7 @@ "Searchable" : { "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -3227,7 +3232,7 @@ "boostScore" : 8.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -3293,7 +3298,7 @@ "boostScore" 
: 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "origin", @@ -3860,7 +3865,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -3879,7 +3884,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json index 0c9b49649bf1e..d63a938bbce9d 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json @@ -95,7 +95,7 @@ "Searchable" : { "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1328,7 +1328,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1474,7 +1474,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1534,7 +1534,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1927,7 +1927,7 @@ "boostScore" : 10.0, "enableAutocomplete" : false, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" }, "validate" : { "strlen" : { @@ -1942,7 +1942,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "type", @@ -2117,7 +2117,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "qualifiedName", @@ -2128,7 +2128,7 @@ "addToFilters" : false, "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -2423,7 +2423,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } } ], @@ -2562,7 +2562,7 @@ "boostScore" : 2.0, "enableAutocomplete" : true, "fieldName" : "ldap", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2593,7 +2593,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2656,7 +2656,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2717,7 +2717,7 @@ "optional" : true, "Searchable" : { "boostScore" : 10.0, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2878,7 +2878,7 @@ "doc" : "Unique Identifier of the data flow", "Searchable" : { "enableAutocomplete" : true, - "fieldType" 
: "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "cluster", @@ -2941,7 +2941,7 @@ "doc" : "Unique Identifier of the data job", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2994,7 +2994,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "origin", @@ -3607,7 +3607,7 @@ "Searchable" : { "boostScore" : 4.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "orchestrator", @@ -3713,7 +3713,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "origin", @@ -4312,7 +4312,7 @@ "boostScore" : 8.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -4401,7 +4401,7 @@ "boostScore" : 8.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -4496,7 +4496,7 @@ "boostScore" : 8.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -4603,7 +4603,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "origin", @@ -4710,7 +4710,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "origin", @@ -4792,7 +4792,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -4811,7 +4811,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -4866,7 +4866,7 @@ "Searchable" : { "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -4895,7 +4895,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "definition", @@ -5073,7 +5073,7 @@ "type" : "string", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -5113,7 +5113,7 @@ "enableAutocomplete" : true, "fieldName" : "displayName", "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "id", @@ -6252,6 +6252,11 @@ "doc" : "The run id that produced the metadata. Populated in case of batch-ingestion.", "default" : "no-run-id-provided", "optional" : true + }, { + "name" : "pipelineName", + "type" : "string", + "doc" : "The ingestion pipeline id that produced the metadata. 
Populated in case of batch ingestion.", + "optional" : true }, { "name" : "registryName", "type" : "string", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json index de65aa841876f..0b31bf9683d0c 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json @@ -117,6 +117,11 @@ "doc" : "The run id that produced the metadata. Populated in case of batch-ingestion.", "default" : "no-run-id-provided", "optional" : true + }, { + "name" : "pipelineName", + "type" : "string", + "doc" : "The ingestion pipeline id that produced the metadata. Populated in case of batch ingestion.", + "optional" : true }, { "name" : "registryName", "type" : "string", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesVersionedV2.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesVersionedV2.snapshot.json index b7bcd8db99691..24a4ec2cc6802 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesVersionedV2.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesVersionedV2.snapshot.json @@ -126,6 +126,11 @@ "doc" : "The run id that produced the metadata. Populated in case of batch-ingestion.", "default" : "no-run-id-provided", "optional" : true + }, { + "name" : "pipelineName", + "type" : "string", + "doc" : "The ingestion pipeline id that produced the metadata. Populated in case of batch ingestion.", + "optional" : true }, { "name" : "registryName", "type" : "string", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json index ffaefc8232e83..b20953749ac35 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json @@ -95,7 +95,7 @@ "Searchable" : { "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1034,7 +1034,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1161,7 +1161,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1221,7 +1221,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1623,7 +1623,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "qualifiedName", @@ -1634,7 +1634,7 @@ "addToFilters" : false, "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1812,7 +1812,7 @@ "enableAutocomplete" : true, "fieldName" : "displayName", "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : 
"WORD_GRAM" } }, { "name" : "id", @@ -1849,7 +1849,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "definition", @@ -2041,7 +2041,7 @@ "optional" : true, "Searchable" : { "boostScore" : 10.0, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2094,7 +2094,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2157,7 +2157,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2250,7 +2250,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } } ], @@ -2270,7 +2270,7 @@ "boostScore" : 2.0, "enableAutocomplete" : true, "fieldName" : "ldap", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2316,7 +2316,7 @@ "doc" : "Unique Identifier of the data flow", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "cluster", @@ -2353,7 +2353,7 @@ "doc" : "Unique Identifier of the data job", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2928,7 +2928,7 @@ "type" : "string", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2946,7 +2946,7 @@ "Searchable" : { "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2972,7 +2972,7 @@ "boostScore" : 8.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -3038,7 +3038,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "origin", @@ -3605,7 +3605,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -3624,7 +3624,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json index e385c7c30b21a..e29dd6809b968 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json @@ -95,7 +95,7 @@ "Searchable" : { "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1034,7 +1034,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1161,7 +1161,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, 
"fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1221,7 +1221,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1623,7 +1623,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "qualifiedName", @@ -1634,7 +1634,7 @@ "addToFilters" : false, "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1806,7 +1806,7 @@ "enableAutocomplete" : true, "fieldName" : "displayName", "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "id", @@ -1843,7 +1843,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "definition", @@ -2035,7 +2035,7 @@ "optional" : true, "Searchable" : { "boostScore" : 10.0, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2088,7 +2088,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2151,7 +2151,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2244,7 +2244,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } } ], @@ -2264,7 +2264,7 @@ "boostScore" : 2.0, "enableAutocomplete" : true, "fieldName" : "ldap", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2310,7 +2310,7 @@ "doc" : "Unique Identifier of the data flow", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "cluster", @@ -2347,7 +2347,7 @@ "doc" : "Unique Identifier of the data job", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2922,7 +2922,7 @@ "type" : "string", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2940,7 +2940,7 @@ "Searchable" : { "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2966,7 +2966,7 @@ "boostScore" : 8.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -3032,7 +3032,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "origin", @@ -3599,7 +3599,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -3618,7 +3618,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", diff --git 
a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json index b85c84be23795..8391af60f8ece 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json @@ -95,7 +95,7 @@ "Searchable" : { "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1328,7 +1328,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1474,7 +1474,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1534,7 +1534,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -1927,7 +1927,7 @@ "boostScore" : 10.0, "enableAutocomplete" : false, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" }, "validate" : { "strlen" : { @@ -1942,7 +1942,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "type", @@ -2117,7 +2117,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "qualifiedName", @@ -2128,7 +2128,7 @@ "addToFilters" : false, "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -2417,7 +2417,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } } ], @@ -2556,7 +2556,7 @@ "boostScore" : 2.0, "enableAutocomplete" : true, "fieldName" : "ldap", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2587,7 +2587,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2650,7 +2650,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2711,7 +2711,7 @@ "optional" : true, "Searchable" : { "boostScore" : 10.0, - "fieldType" : "TEXT_PARTIAL", + "fieldType" : "WORD_GRAM", "queryByDefault" : true } }, { @@ -2872,7 +2872,7 @@ "doc" : "Unique Identifier of the data flow", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "cluster", @@ -2935,7 +2935,7 @@ "doc" : "Unique Identifier of the data job", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -2988,7 +2988,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "origin", @@ -3601,7 +3601,7 @@ "Searchable" : { "boostScore" : 4.0, "enableAutocomplete" : true, - 
"fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "orchestrator", @@ -3707,7 +3707,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "origin", @@ -4306,7 +4306,7 @@ "boostScore" : 8.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -4395,7 +4395,7 @@ "boostScore" : 8.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -4490,7 +4490,7 @@ "boostScore" : 8.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -4597,7 +4597,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "origin", @@ -4704,7 +4704,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "origin", @@ -4786,7 +4786,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -4805,7 +4805,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "description", @@ -4860,7 +4860,7 @@ "Searchable" : { "enableAutocomplete" : true, "fieldName" : "id", - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -4889,7 +4889,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "definition", @@ -5067,7 +5067,7 @@ "type" : "string", "Searchable" : { "enableAutocomplete" : true, - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } } ], "Aspect" : { @@ -5107,7 +5107,7 @@ "enableAutocomplete" : true, "fieldName" : "displayName", "fieldNameAliases" : [ "_entityName" ], - "fieldType" : "TEXT_PARTIAL" + "fieldType" : "WORD_GRAM" } }, { "name" : "id", diff --git a/metadata-service/restli-client/build.gradle b/metadata-service/restli-client/build.gradle index 263d4b49197f4..45cf008d3ca7d 100644 --- a/metadata-service/restli-client/build.gradle +++ b/metadata-service/restli-client/build.gradle @@ -1,18 +1,19 @@ apply plugin: 'pegasus' +apply plugin: 'java-library' dependencies { - compile project(':metadata-service:restli-api') - compile project(':metadata-auth:auth-api') - compile project(path: ':metadata-service:restli-api', configuration: 'restClient') - compile project(':metadata-events:mxe-schemas') - compile project(':metadata-utils') + api project(':metadata-service:restli-api') + api project(':metadata-auth:auth-api') + api project(path: ':metadata-service:restli-api', configuration: 'restClient') + api project(':metadata-events:mxe-schemas') + api project(':metadata-utils') implementation externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok - compile spec.product.pegasus.restliClient + implementation spec.product.pegasus.restliClient - testCompile externalDependency.mockito - testCompile externalDependency.testng + testImplementation externalDependency.mockito 
+ testImplementation externalDependency.testng } diff --git a/metadata-service/restli-servlet-impl/build.gradle b/metadata-service/restli-servlet-impl/build.gradle index 1028f7c3dcce4..cb307863748c3 100644 --- a/metadata-service/restli-servlet-impl/build.gradle +++ b/metadata-service/restli-servlet-impl/build.gradle @@ -11,7 +11,7 @@ sourceSets { idea { module { testSourceDirs += file('src/integTest/java') - scopes.TEST.plus += [ configurations.integTestCompile ] + scopes.TEST.plus += [ configurations.integTestCompileOnly ] } } @@ -19,6 +19,10 @@ idea { configurations { integTestImplementation.extendsFrom implementation integTestRuntimeOnly.extendsFrom runtimeOnly + integTestCompileOnly { + extendsFrom compileOnly + canBeResolved = true + } modelValidation } @@ -32,34 +36,37 @@ dependencies { } } - compile project(':metadata-service:restli-api') - compile project(':metadata-auth:auth-api') - compile project(path: ':metadata-service:restli-api', configuration: 'dataTemplate') - compile project(':li-utils') - compile project(':metadata-models') - compile project(':metadata-utils') - compile project(':metadata-io') - compile spec.product.pegasus.restliServer + implementation project(':metadata-service:restli-api') + implementation project(':metadata-auth:auth-api') + implementation project(path: ':metadata-service:restli-api', configuration: 'dataTemplate') + implementation project(':li-utils') + implementation project(':metadata-models') + implementation project(':metadata-utils') + implementation project(':metadata-io') + implementation spec.product.pegasus.restliServer implementation externalDependency.slf4jApi - // This is compile and not compileOnly because of restli - compile externalDependency.lombok - compile externalDependency.neo4jJavaDriver - compile externalDependency.opentelemetryAnnotations + implementation externalDependency.dropwizardMetricsCore + implementation externalDependency.dropwizardMetricsJmx + + compileOnly externalDependency.lombok + implementation externalDependency.neo4jJavaDriver + implementation externalDependency.opentelemetryAnnotations runtimeOnly externalDependency.logbackClassic annotationProcessor externalDependency.lombok - testCompile project(':test-models') + testImplementation project(':test-models') + testImplementation project(path: ':test-models', configuration: 'testDataTemplate') testImplementation project(':mock-entity-registry') - testCompile externalDependency.mockito - testCompile externalDependency.testng + testImplementation externalDependency.mockito + testImplementation externalDependency.testng integTestImplementation externalDependency.junitJupiterApi integTestRuntimeOnly externalDependency.junitJupiterEngine - integTestCompile externalDependency.junitJupiterApi - integTestCompile externalDependency.junitJupiterParams + integTestCompileOnly externalDependency.junitJupiterApi + integTestCompileOnly externalDependency.junitJupiterParams modelValidation project(path: ':metadata-models-validator') dataModel project(path: ':metadata-models', configuration: 'dataTemplate') diff --git a/metadata-service/schema-registry-api/build.gradle b/metadata-service/schema-registry-api/build.gradle index e60ca7d348b5c..7bf1e558c8906 100644 --- a/metadata-service/schema-registry-api/build.gradle +++ b/metadata-service/schema-registry-api/build.gradle @@ -3,26 +3,26 @@ apply plugin: 'org.hidetake.swagger.generator' dependencies { // Dependencies for open api - compile externalDependency.reflections - compile externalDependency.springBoot - compile 
externalDependency.springCore - compile externalDependency.springWeb - compile externalDependency.springWebMVC - compile externalDependency.springBeans - compile externalDependency.springContext + implementation externalDependency.reflections + implementation externalDependency.springBoot + implementation externalDependency.springCore + implementation externalDependency.springWeb + implementation externalDependency.springWebMVC + implementation externalDependency.springBeans + implementation externalDependency.springContext implementation externalDependency.antlr4Runtime implementation externalDependency.antlr4 - compile externalDependency.javaxValidation - compile externalDependency.servletApi - compile group: 'javax.annotation', name: 'javax.annotation-api', version: '1.3.2' - compile externalDependency.jacksonDataBind - compile externalDependency.slf4jApi + implementation externalDependency.javaxValidation + implementation externalDependency.servletApi + implementation group: 'javax.annotation', name: 'javax.annotation-api', version: '1.3.2' + implementation externalDependency.jacksonDataBind + implementation externalDependency.slf4jApi // End of dependencies - compile externalDependency.swaggerAnnotations - swaggerCodegen 'io.swagger.codegen.v3:swagger-codegen-cli:3.0.33' + implementation externalDependency.swaggerAnnotations + swaggerCodegen 'io.swagger.codegen.v3:swagger-codegen-cli:3.0.46' - testCompile externalDependency.assertJ + testImplementation externalDependency.assertJ } tasks.register('generateOpenApiPojos', GenerateSwaggerCode) { diff --git a/metadata-service/schema-registry-servlet/build.gradle b/metadata-service/schema-registry-servlet/build.gradle index ec62203ddf0c5..554ac696c94fd 100644 --- a/metadata-service/schema-registry-servlet/build.gradle +++ b/metadata-service/schema-registry-servlet/build.gradle @@ -1,19 +1,20 @@ apply plugin: 'java' dependencies { - compile project(':metadata-service:factories') - compile project(':metadata-service:schema-registry-api') + implementation project(':metadata-service:factories') + implementation project(':metadata-service:schema-registry-api') - compile externalDependency.reflections - compile externalDependency.springBoot - compile externalDependency.springCore - compile(externalDependency.springDocUI) { + implementation externalDependency.reflections + implementation externalDependency.springBoot + implementation externalDependency.springCore + implementation(externalDependency.springDocUI) { exclude group: 'org.springframework.boot' } - compile externalDependency.springWeb - compile externalDependency.springWebMVC - compile externalDependency.springBeans - compile externalDependency.springContext + implementation externalDependency.springWeb + implementation externalDependency.springWebMVC + implementation externalDependency.springBeans + implementation externalDependency.springContext + implementation externalDependency.springBootAutoconfigure implementation externalDependency.slf4jApi compileOnly externalDependency.lombok implementation externalDependency.antlr4Runtime @@ -23,14 +24,14 @@ dependencies { testImplementation externalDependency.springBootTest testImplementation project(':mock-entity-registry') - testCompile externalDependency.springBoot - testCompile externalDependency.testContainers - testCompile externalDependency.testContainersKafka - testCompile externalDependency.springKafka - testCompile externalDependency.testng - testCompile externalDependency.mockito - testCompile externalDependency.logbackClassic - 
testCompile externalDependency.jacksonCore - testCompile externalDependency.jacksonDataBind - testCompile externalDependency.springBootStarterWeb + testImplementation externalDependency.springBoot + testImplementation externalDependency.testContainers + testImplementation externalDependency.testContainersKafka + testImplementation externalDependency.springKafka + testImplementation externalDependency.testng + testImplementation externalDependency.mockito + testImplementation externalDependency.logbackClassic + testImplementation externalDependency.jacksonCore + testImplementation externalDependency.jacksonDataBind + testImplementation externalDependency.springBootStarterWeb } \ No newline at end of file diff --git a/metadata-service/services/build.gradle b/metadata-service/services/build.gradle index adc7b7bf09d99..99345d6f6bc3f 100644 --- a/metadata-service/services/build.gradle +++ b/metadata-service/services/build.gradle @@ -7,32 +7,33 @@ configurations { dependencies { implementation externalDependency.jsonPatch - compile project(':entity-registry') - compile project(':metadata-utils') - compile project(':metadata-events:mxe-avro-1.7') - compile project(':metadata-events:mxe-registration') - compile project(':metadata-events:mxe-utils-avro-1.7') - compile project(':metadata-models') - compile project(':metadata-service:restli-client') - compile project(':metadata-service:configuration') + implementation project(':entity-registry') + implementation project(':metadata-utils') + implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-events:mxe-registration') + implementation project(':metadata-events:mxe-utils-avro-1.7') + implementation project(':metadata-models') + implementation project(':metadata-service:restli-client') + implementation project(':metadata-service:configuration') implementation externalDependency.slf4jApi implementation externalDependency.swaggerAnnotations - runtime externalDependency.logbackClassic + runtimeOnly externalDependency.logbackClassic compileOnly externalDependency.lombok implementation externalDependency.commonsCollections - compile externalDependency.javatuples - compile externalDependency.javaxValidation - compile externalDependency.opentelemetryAnnotations + implementation externalDependency.javatuples + implementation externalDependency.javaxValidation + implementation externalDependency.opentelemetryAnnotations annotationProcessor externalDependency.lombok - testCompile externalDependency.testng - testCompile externalDependency.junit - testCompile externalDependency.mockito - testCompile externalDependency.mockitoInline + testImplementation externalDependency.testng + testImplementation externalDependency.junit + testImplementation externalDependency.mockito + testImplementation externalDependency.mockitoInline testCompileOnly externalDependency.lombok - testCompile project(':test-models') + testImplementation project(':test-models') + testImplementation project(path: ':test-models', configuration: 'testDataTemplate') testImplementation project(':datahub-graphql-core') // logback >=1.3 required due to `testcontainers` only testImplementation 'ch.qos.logback:logback-classic:1.4.7' diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/service/TagService.java b/metadata-service/services/src/main/java/com/linkedin/metadata/service/TagService.java index b52d68e2e75ee..9e12fc80a3cdb 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/service/TagService.java +++ 
b/metadata-service/services/src/main/java/com/linkedin/metadata/service/TagService.java @@ -20,7 +20,7 @@ import java.util.Map; import java.util.Optional; import java.util.stream.Collectors; -import com.linkedin.entity.client.EntityClient; +import com.linkedin.entity.client.EntityClient; import com.datahub.authentication.Authentication; import javax.annotation.Nonnull; import lombok.extern.slf4j.Slf4j; diff --git a/metadata-service/servlet/build.gradle b/metadata-service/servlet/build.gradle index 9242d21201886..eb2cd9c2d3de7 100644 --- a/metadata-service/servlet/build.gradle +++ b/metadata-service/servlet/build.gradle @@ -1,13 +1,16 @@ apply plugin: 'java' dependencies { - compile project(':metadata-io') - compile externalDependency.httpClient - compile externalDependency.servletApi - compile externalDependency.gson - compile externalDependency.jacksonDataBind - compile externalDependency.springWebMVC + implementation project(':metadata-io') + implementation project(':datahub-graphql-core') + implementation project(':entity-registry') + implementation project(':metadata-service:factories') + + implementation externalDependency.httpClient + implementation externalDependency.servletApi + implementation externalDependency.gson + implementation externalDependency.jacksonDataBind + implementation externalDependency.springWebMVC + compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok - compile project(':entity-registry') - compile project(':metadata-service:factories') } diff --git a/metadata-service/war/build.gradle b/metadata-service/war/build.gradle index 7103116ca6322..3bd2695c927a7 100644 --- a/metadata-service/war/build.gradle +++ b/metadata-service/war/build.gradle @@ -12,33 +12,33 @@ ext { ext.apiProject = project(':metadata-service:restli-api') dependencies { - runtime project(':metadata-service:factories') - runtime project(':metadata-service:auth-filter') - runtime project(':metadata-service:servlet') - runtime project(':metadata-service:auth-servlet-impl') - runtime project(':metadata-service:graphql-servlet-impl') - runtime project(':metadata-service:health-servlet') - runtime project(':metadata-service:openapi-servlet') - runtime project(':metadata-service:schema-registry-servlet') - runtime project(':metadata-jobs:mce-consumer') - runtime project(':metadata-jobs:mae-consumer') - runtime project(':metadata-jobs:pe-consumer') + runtimeOnly project(':metadata-service:factories') + runtimeOnly project(':metadata-service:auth-filter') + runtimeOnly project(':metadata-service:servlet') + runtimeOnly project(':metadata-service:auth-servlet-impl') + runtimeOnly project(':metadata-service:graphql-servlet-impl') + runtimeOnly project(':metadata-service:health-servlet') + runtimeOnly project(':metadata-service:openapi-servlet') + runtimeOnly project(':metadata-service:schema-registry-servlet') + runtimeOnly project(':metadata-jobs:mce-consumer') + runtimeOnly project(':metadata-jobs:mae-consumer') + runtimeOnly project(':metadata-jobs:pe-consumer') - runtime externalDependency.awsSecretsManagerJdbc - runtime externalDependency.h2 - runtime externalDependency.mariadbConnector - runtime externalDependency.mysqlConnector - runtime externalDependency.postgresql - runtime externalDependency.springWebMVC + runtimeOnly externalDependency.awsSecretsManagerJdbc + runtimeOnly externalDependency.h2 + runtimeOnly externalDependency.mariadbConnector + runtimeOnly externalDependency.mysqlConnector + runtimeOnly externalDependency.postgresql + runtimeOnly 
externalDependency.springWebMVC - runtime spec.product.pegasus.restliDocgen - runtime spec.product.pegasus.restliSpringBridge + runtimeOnly spec.product.pegasus.restliDocgen + runtimeOnly spec.product.pegasus.restliSpringBridge - runtime externalDependency.log4jCore - runtime externalDependency.log4j2Api - runtime externalDependency.logbackClassic + runtimeOnly externalDependency.log4jCore + runtimeOnly externalDependency.log4j2Api + runtimeOnly externalDependency.logbackClassic implementation externalDependency.awsMskIamAuth - testRuntime externalDependency.logbackClassic + testRuntimeOnly externalDependency.logbackClassic implementation externalDependency.charle } configurations.all{ diff --git a/metadata-utils/build.gradle b/metadata-utils/build.gradle index 3b04a5dc53d75..9f8ef70a0e728 100644 --- a/metadata-utils/build.gradle +++ b/metadata-utils/build.gradle @@ -1,30 +1,31 @@ -apply plugin: 'java' +apply plugin: 'java-library' dependencies { - compile externalDependency.avro_1_7 - compile externalDependency.commonsLang - compile externalDependency.dropwizardMetricsCore - compile externalDependency.dropwizardMetricsJmx - compile externalDependency.elasticSearchRest - compile externalDependency.httpClient - compile externalDependency.neo4jJavaDriver - compile externalDependency.json - - compile spec.product.pegasus.restliClient - compile spec.product.pegasus.restliCommon - compile spec.product.pegasus.restliServer - - compile project(':li-utils') - compile project(':entity-registry') - compile project(':metadata-events:mxe-avro-1.7') - compile project(':metadata-events:mxe-utils-avro-1.7') + api externalDependency.avro_1_7 + implementation externalDependency.commonsLang + api externalDependency.dropwizardMetricsCore + implementation externalDependency.dropwizardMetricsJmx + api externalDependency.elasticSearchRest + implementation externalDependency.httpClient + api externalDependency.neo4jJavaDriver + api externalDependency.json + + implementation spec.product.pegasus.restliClient + implementation spec.product.pegasus.restliCommon + implementation spec.product.pegasus.restliServer + + api project(':li-utils') + api project(':entity-registry') + api project(':metadata-events:mxe-avro-1.7') + api project(':metadata-events:mxe-utils-avro-1.7') implementation externalDependency.slf4jApi compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok - testCompile project(':test-models') + testImplementation project(':test-models') + testImplementation project(path: ':test-models', configuration: 'testDataTemplate') constraints { implementation(externalDependency.log4jCore) { diff --git a/test-models/build.gradle b/test-models/build.gradle index 4cfbcc1399e7d..c74f7249fa1d9 100644 --- a/test-models/build.gradle +++ b/test-models/build.gradle @@ -1,5 +1,5 @@ apply plugin: 'pegasus' -apply plugin: 'java' +apply plugin: 'java-library' tasks.withType(JavaCompile).configureEach { javaCompiler = javaToolchains.compilerFor { @@ -13,8 +13,8 @@ tasks.withType(Test).configureEach { } dependencies { - compile spec.product.pegasus.data - compile externalDependency.commonsIo + implementation spec.product.pegasus.data + implementation externalDependency.commonsIo dataModel project(':metadata-models') dataModel project(':li-utils') }
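
For context on the final test-models hunk: under the rest.li pegasus Gradle plugin, upstream PDL schemas are declared on the plugin's dataModel configuration, which is consumed by the schema and data-template generation tasks rather than by javac, so it is untouched by the compile-to-implementation migration; only the configurations for the generated Java code change. A minimal sketch of such a module, essentially a restatement of the hunk above using the spec.product.pegasus and externalDependency aliases defined elsewhere in this build:

    apply plugin: 'pegasus'
    apply plugin: 'java-library'

    dependencies {
        // Compile-time support for the Java data templates generated from the PDL schemas.
        implementation spec.product.pegasus.data
        implementation externalDependency.commonsIo

        // Upstream schemas referenced by this module's own models; resolved by the
        // pegasus codegen tasks, not exported to Java consumers.
        dataModel project(':metadata-models')
        dataModel project(':li-utils')
    }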