diff --git a/.github/actions/docker-custom-build-and-push/action.yml b/.github/actions/docker-custom-build-and-push/action.yml index 96d4d759dbb84..bd6bb842b1fb8 100644 --- a/.github/actions/docker-custom-build-and-push/action.yml +++ b/.github/actions/docker-custom-build-and-push/action.yml @@ -30,6 +30,9 @@ inputs: # e.g. latest,head,sha12345 description: "List of tags to use for the Docker image" required: true + target: + description: "Sets the target stage to build" + required: false outputs: image_tag: description: "Docker image tags" @@ -62,6 +65,7 @@ runs: platforms: linux/amd64 build-args: ${{ inputs.build-args }} tags: ${{ steps.docker_meta.outputs.tags }} + target: ${{ inputs.target }} load: true push: false cache-from: type=registry,ref=${{ steps.docker_meta.outputs.tags }} @@ -94,6 +98,7 @@ runs: platforms: ${{ inputs.platforms }} build-args: ${{ inputs.build-args }} tags: ${{ steps.docker_meta.outputs.tags }} + target: ${{ inputs.target }} push: true cache-from: type=registry,ref=${{ steps.docker_meta.outputs.tags }} cache-to: type=inline diff --git a/.github/workflows/docker-ingestion-base.yml b/.github/workflows/docker-ingestion-base.yml index 0d29f79aa5f6c..e69de29bb2d1d 100644 --- a/.github/workflows/docker-ingestion-base.yml +++ b/.github/workflows/docker-ingestion-base.yml @@ -1,45 +0,0 @@ -name: ingestion base -on: - release: - types: [published] - push: - branches: - - master - paths: - - ".github/workflows/docker-ingestion-base.yml" - - "docker/datahub-ingestion-base/**" - - "gradle*" - pull_request: - branches: - - master - paths: - - ".github/workflows/docker-ingestion-base.yml" - - "docker/datahub-ingestion-base/**" - - "gradle*" - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - build-base: - name: Build and Push Docker Image to Docker Hub - runs-on: ubuntu-latest - steps: - - name: Check out the repo - uses: actions/checkout@v3 - with: - fetch-depth: 800 - - name: Build and Push image - uses: ./.github/actions/docker-custom-build-and-push - with: - images: | - acryldata/datahub-ingestion-base - tags: latest - username: ${{ secrets.ACRYL_DOCKER_USERNAME }} - password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} - publish: ${{ github.ref == 'refs/heads/master' }} - context: . 
- file: ./docker/datahub-ingestion-base/Dockerfile - platforms: linux/amd64,linux/arm64/v8 diff --git a/.github/workflows/docker-ingestion.yml b/.github/workflows/docker-ingestion.yml deleted file mode 100644 index 26e85cfcb4a2d..0000000000000 --- a/.github/workflows/docker-ingestion.yml +++ /dev/null @@ -1,118 +0,0 @@ -name: datahub-ingestion docker acryl -on: - push: - branches: - - master - paths-ignore: - - "docs/**" - - "**.md" - pull_request: - branches: - - master - paths: - - "metadata-ingestion/**" - - "metadata-models/**" - - "docker/datahub-ingestion/**" - - "docker/datahub-ingestion-slim/**" - - ".github/workflows/docker-ingestion.yml" - release: - types: [published] - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - setup: - runs-on: ubuntu-latest - outputs: - tag: ${{ steps.tag.outputs.tag }} - publish: ${{ steps.publish.outputs.publish }} - python_release_version: ${{ steps.python_release_version.outputs.release_version }} - steps: - - name: Checkout - uses: actions/checkout@v3 - - name: Compute Tag - id: tag - run: | - source .github/scripts/docker_helpers.sh - echo "tag=$(get_tag)" >> $GITHUB_OUTPUT - - name: Compute Python Release Version - id: python_release_version - run: | - source .github/scripts/docker_helpers.sh - echo "release_version=$(get_python_docker_release_v)" >> $GITHUB_OUTPUT - - name: Check whether publishing enabled - id: publish - env: - ENABLE_PUBLISH: ${{ secrets.ORG_DOCKER_PASSWORD }} - run: | - echo "Enable publish: ${{ env.ENABLE_PUBLISH != '' }}" - echo "publish=${{ env.ENABLE_PUBLISH != '' }}" >> $GITHUB_OUTPUT - push_to_registries: - name: Build and Push Docker Image to Docker Hub - runs-on: ubuntu-latest - needs: setup - steps: - - name: Check out the repo - uses: actions/checkout@v3 - with: - fetch-depth: 800 - - name: Build and push - uses: ./.github/actions/docker-custom-build-and-push - with: - images: | - acryldata/datahub-ingestion - tags: ${{ needs.setup.outputs.tag }} - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.ORG_DOCKER_PASSWORD }} - publish: ${{ needs.setup.outputs.publish == 'true' }} - context: . - file: ./docker/datahub-ingestion/Dockerfile - platforms: linux/amd64,linux/arm64/v8 - build-args: | - RELEASE_VERSION=${{ needs.setup.outputs.python_release_version }} - - name: Build and Push image (slim) - uses: ./.github/actions/docker-custom-build-and-push - with: - images: | - acryldata/datahub-ingestion-slim - tags: ${{ needs.setup.outputs.tag }} - username: ${{ secrets.ACRYL_DOCKER_USERNAME }} - password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} - publish: ${{ needs.setup.outputs.publish == 'true' }} - context: . 
- file: ./docker/datahub-ingestion-slim/Dockerfile - platforms: linux/amd64,linux/arm64/v8 - ingestion-slim_scan: - permissions: - contents: read # for actions/checkout to fetch code - security-events: write # for github/codeql-action/upload-sarif to upload SARIF results - actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status - name: "[Monitoring] Scan datahub-ingestion-slim images for vulnerabilities" - if: ${{ github.ref == 'refs/heads/master' }} - runs-on: ubuntu-latest - needs: [push_to_registries] - steps: - - name: Checkout # adding checkout step just to make trivy upload happy - uses: actions/checkout@v3 - - name: Download image - uses: ishworkh/docker-image-artifact-download@v1 - with: - image: acryldata/datahub-ingestion-slim:latest - - name: Run Trivy vulnerability scanner - uses: aquasecurity/trivy-action@0.8.0 - env: - TRIVY_OFFLINE_SCAN: true - with: - image-ref: acryldata/datahub-ingestion-slim:latest - format: "template" - template: "@/contrib/sarif.tpl" - output: "trivy-results.sarif" - severity: "CRITICAL,HIGH" - ignore-unfixed: true - vuln-type: "os,library" - - name: Upload Trivy scan results to GitHub Security tab - uses: github/codeql-action/upload-sarif@v2 - with: - sarif_file: "trivy-results.sarif" diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index 537f9cbf31d2a..c695fd98c1b46 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -29,6 +29,8 @@ env: DATAHUB_MCE_CONSUMER_IMAGE: "acryldata/datahub-mce-consumer" DATAHUB_KAFKA_SETUP_IMAGE: "acryldata/datahub-kafka-setup" DATAHUB_ELASTIC_SETUP_IMAGE: "acryldata/datahub-elasticsearch-setup" + DATAHUB_INGESTION_BASE_IMAGE: "acryldata/datahub-ingestion-base" + DATAHUB_INGESTION_IMAGE: "acryldata/datahub-ingestion" #### IMPORTANT #### #### THIS IS A CHANGE TO PREVENT OSS QUICKSTART INSTABILITY, DO NOT OVERWRITE THIS CHANGE IN MERGES #### #### IMPORTANT #### @@ -43,7 +45,11 @@ jobs: runs-on: ubuntu-latest outputs: tag: ${{ steps.tag.outputs.tag }} + slim_tag: ${{ steps.tag.outputs.slim_tag }} + full_tag: ${{ steps.tag.outputs.full_tag }} unique_tag: ${{ steps.tag.outputs.unique_tag }} + unique_slim_tag: ${{ steps.tag.outputs.unique_slim_tag }} + unique_full_tag: ${{ steps.tag.outputs.unique_full_tag }} publish: ${{ steps.publish.outputs.publish }} steps: - name: Checkout @@ -53,14 +59,18 @@ jobs: run: | source .github/scripts/docker_helpers.sh echo "tag=$(get_tag)" >> $GITHUB_OUTPUT + echo "slim_tag=$(get_tag)-slim" >> $GITHUB_OUTPUT + echo "full_tag=$(get_tag)-full" >> $GITHUB_OUTPUT echo "unique_tag=$(get_unique_tag)" >> $GITHUB_OUTPUT + echo "unique_slim_tag=$(get_unique_tag)-slim" >> $GITHUB_OUTPUT + echo "unique_full_tag=$(get_unique_tag)-full" >> $GITHUB_OUTPUT - name: Check whether publishing enabled id: publish env: - ENABLE_PUBLISH: ${{ secrets.ORG_DOCKER_PASSWORD }} + ENABLE_PUBLISH: ${{ secrets.ORG_DOCKER_PASSWORD != '' && secrets.ACRYL_DOCKER_PASSWORD != '' }} run: | - echo "Enable publish: ${{ env.ENABLE_PUBLISH != '' }}" - echo "publish=${{ env.ENABLE_PUBLISH != '' }}" >> $GITHUB_OUTPUT + echo "Enable publish: ${{ env.ENABLE_PUBLISH }}" + echo "publish=${{ env.ENABLE_PUBLISH }}" >> $GITHUB_OUTPUT gms_build: name: Build and Push DataHub GMS Docker Image @@ -420,6 +430,289 @@ jobs: file: ./docker/elasticsearch-setup/Dockerfile platforms: linux/amd64,linux/arm64/v8 + datahub_ingestion_base_build: + name: Build and Push DataHub Ingestion (Base) Docker Image + 
runs-on: ubuntu-latest + outputs: + tag: ${{ steps.tag.outputs.tag }} + needs: setup + steps: + - name: Check out the repo + uses: actions/checkout@v3 + with: + fetch-depth: 800 + - uses: dorny/paths-filter@v2 + id: filter + with: + filters: | + datahub-ingestion-base: + - 'docker/datahub-ingestion-base/**' + - name: Build and push Base Image + if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' }} + uses: ./.github/actions/docker-custom-build-and-push + with: + target: base + images: | + ${{ env.DATAHUB_INGESTION_BASE_IMAGE }} + tags: ${{ needs.setup.outputs.tag }} + username: ${{ secrets.ACRYL_DOCKER_USERNAME }} + password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} + publish: ${{ needs.setup.outputs.publish }} + context: . + file: ./docker/datahub-ingestion-base/Dockerfile + platforms: linux/amd64,linux/arm64/v8 + - name: Compute DataHub Ingestion (Base) Tag + id: tag + run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }}" >> $GITHUB_OUTPUT + datahub_ingestion_base_slim_build: + name: Build and Push DataHub Ingestion (Base-Slim) Docker Image + runs-on: ubuntu-latest + outputs: + tag: ${{ steps.tag.outputs.tag }} + needs: [setup, datahub_ingestion_base_build] + steps: + - name: Check out the repo + uses: actions/checkout@v3 + with: + fetch-depth: 800 + - uses: dorny/paths-filter@v2 + id: filter + with: + filters: | + datahub-ingestion-base: + - 'docker/datahub-ingestion-base/**' + - name: Download Base Image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }} + - name: Build and push Base-Slim Image + if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' }} + uses: ./.github/actions/docker-custom-build-and-push + with: + target: slim-install + images: | + ${{ env.DATAHUB_INGESTION_BASE_IMAGE }} + tags: ${{ needs.setup.outputs.slim_tag }} + username: ${{ secrets.ACRYL_DOCKER_USERNAME }} + password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} + build-args: | + APP_ENV=slim + BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }} + publish: ${{ needs.setup.outputs.publish }} + context: . 
+ file: ./docker/datahub-ingestion-base/Dockerfile + platforms: linux/amd64,linux/arm64/v8 + - name: Compute DataHub Ingestion (Base-Slim) Tag + id: tag + run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }}" >> $GITHUB_OUTPUT + datahub_ingestion_base_full_build: + name: Build and Push DataHub Ingestion (Base-Full) Docker Image + runs-on: ubuntu-latest + outputs: + tag: ${{ steps.tag.outputs.tag }} + needs: [setup, datahub_ingestion_base_build] + steps: + - name: Check out the repo + uses: actions/checkout@v3 + with: + fetch-depth: 800 + - uses: dorny/paths-filter@v2 + id: filter + with: + filters: | + datahub-ingestion-base: + - 'docker/datahub-ingestion-base/**' + - name: Download Base Image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }} + - name: Build and push Base-Full Image + if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' }} + uses: ./.github/actions/docker-custom-build-and-push + with: + target: full-install + images: | + ${{ env.DATAHUB_INGESTION_BASE_IMAGE }} + tags: ${{ needs.setup.outputs.unique_full_tag }} + username: ${{ secrets.ACRYL_DOCKER_USERNAME }} + password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} + build-args: | + APP_ENV=full + BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }} + publish: ${{ needs.setup.outputs.publish }} + context: . + file: ./docker/datahub-ingestion-base/Dockerfile + platforms: linux/amd64,linux/arm64/v8 + - name: Compute DataHub Ingestion (Base-Full) Tag + id: tag + run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> $GITHUB_OUTPUT + + + datahub_ingestion_slim_build: + name: Build and Push DataHub Ingestion Docker Images + runs-on: ubuntu-latest + outputs: + tag: ${{ steps.tag.outputs.tag }} + needs_artifact_download: ${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.publish != 'true' }} + needs: [setup, datahub_ingestion_base_slim_build] + steps: + - name: Check out the repo + uses: actions/checkout@v3 + with: + fetch-depth: 800 + - uses: dorny/paths-filter@v2 + id: filter + with: + filters: | + datahub-ingestion-base: + - 'docker/datahub-ingestion-base/**' + datahub-ingestion: + - 'docker/datahub-ingestion/**' + - name: Build codegen + if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' }} + run: ./gradlew :metadata-ingestion:codegen + - name: Download Base Image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }} + - name: Build and push Slim Image + if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' }} + uses: ./.github/actions/docker-custom-build-and-push + with: + target: final + images: | + ${{ env.DATAHUB_INGESTION_IMAGE }} + 
build-args: | + BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }} + DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }} + APP_ENV=slim + tags: ${{ needs.setup.outputs.slim_tag }} + username: ${{ secrets.ACRYL_DOCKER_USERNAME }} + password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} + publish: ${{ needs.setup.outputs.publish }} + context: . + file: ./docker/datahub-ingestion/Dockerfile + platforms: linux/amd64,linux/arm64/v8 + - name: Compute Tag + id: tag + run: echo "tag=${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.unique_slim_tag || 'head' }}" >> $GITHUB_OUTPUT + datahub_ingestion_slim_scan: + permissions: + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status + name: "[Monitoring] Scan Datahub Ingestion Slim images for vulnerabilities" + runs-on: ubuntu-latest + needs: [setup, datahub_ingestion_slim_build] + steps: + - name: Checkout # adding checkout step just to make trivy upload happy + uses: actions/checkout@v3 + - name: Download image Slim Image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.datahub_ingestion_slim_build.outputs.needs_artifact_download == 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} + - name: Run Trivy vulnerability scanner Slim Image + uses: aquasecurity/trivy-action@0.8.0 + env: + TRIVY_OFFLINE_SCAN: true + with: + image-ref: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} + format: "template" + template: "@/contrib/sarif.tpl" + output: "trivy-results.sarif" + severity: "CRITICAL,HIGH" + ignore-unfixed: true + vuln-type: "os,library" + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v2 + with: + sarif_file: "trivy-results.sarif" + + datahub_ingestion_full_build: + name: Build and Push DataHub Ingestion (Full) Docker Images + runs-on: ubuntu-latest + outputs: + tag: ${{ steps.tag.outputs.tag }} + needs_artifact_download: ${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.publish != 'true' }} + needs: [setup, datahub_ingestion_base_full_build] + steps: + - name: Check out the repo + uses: actions/checkout@v3 + with: + fetch-depth: 800 + - uses: dorny/paths-filter@v2 + id: filter + with: + filters: | + datahub-ingestion-base: + - 'docker/datahub-ingestion-base/**' + datahub-ingestion: + - 'docker/datahub-ingestion/**' + - name: Build codegen + if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' }} + run: ./gradlew :metadata-ingestion:codegen + - name: Download Base Image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }} + - name: Build and push Full Image + if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' }} + uses: 
./.github/actions/docker-custom-build-and-push + with: + target: final + images: | + ${{ env.DATAHUB_INGESTION_IMAGE }} + build-args: | + BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }} + DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }} + tags: ${{ needs.setup.outputs.unique_full_tag }} + username: ${{ secrets.ACRYL_DOCKER_USERNAME }} + password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} + publish: ${{ needs.setup.outputs.publish }} + context: . + file: ./docker/datahub-ingestion/Dockerfile + platforms: linux/amd64,linux/arm64/v8 + - name: Compute Tag (Full) + id: tag + run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> $GITHUB_OUTPUT + datahub_ingestion_full_scan: + permissions: + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status + name: "[Monitoring] Scan Datahub Ingestion images for vulnerabilities" + runs-on: ubuntu-latest + needs: [setup, datahub_ingestion_full_build] + steps: + - name: Checkout # adding checkout step just to make trivy upload happy + uses: actions/checkout@v3 + - name: Download image Full Image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.datahub_ingestion_full_build.outputs.needs_artifact_download == 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.tag }} + - name: Run Trivy vulnerability scanner Full Image + uses: aquasecurity/trivy-action@0.8.0 + env: + TRIVY_OFFLINE_SCAN: true + with: + image-ref: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.tag }} + format: "template" + template: "@/contrib/sarif.tpl" + output: "trivy-results.sarif" + severity: "CRITICAL,HIGH" + ignore-unfixed: true + vuln-type: "os,library" + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v2 + with: + sarif_file: "trivy-results.sarif" + smoke_test: name: Run Smoke Tests runs-on: ubuntu-latest @@ -438,8 +731,11 @@ jobs: mae_consumer_build, mce_consumer_build, datahub_upgrade_build, + datahub_ingestion_slim_build, ] steps: + - name: Disk Check + run: df -h . && docker images - name: Check out the repo uses: actions/checkout@v3 - name: Set up JDK 11 @@ -456,6 +752,12 @@ jobs: - name: Build datahub cli run: | ./gradlew :metadata-ingestion:install + - name: Disk Check + run: df -h . && docker images + - name: Remove images + run: docker image prune -a -f || true + - name: Disk Check + run: df -h . && docker images - name: Download GMS image uses: ishworkh/docker-image-artifact-download@v1 if: ${{ needs.setup.outputs.publish != 'true' }} @@ -496,13 +798,21 @@ jobs: if: ${{ needs.setup.outputs.publish != 'true' }} with: image: ${{ env.DATAHUB_UPGRADE_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - - name: Disable datahub-actions - run: | - yq -i 'del(.services.datahub-actions)' docker/quickstart/docker-compose-without-neo4j.quickstart.yml + - name: Download datahub-ingestion-slim image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.datahub_ingestion_slim_build.outputs.needs_artifact_download == 'true' }} + with: + image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} + - name: Disk Check + run: df -h . 
&& docker images - name: run quickstart env: DATAHUB_TELEMETRY_ENABLED: false DATAHUB_VERSION: ${{ needs.setup.outputs.unique_tag }} + DATAHUB_ACTIONS_IMAGE: ${{ env.DATAHUB_INGESTION_IMAGE }} + ACTIONS_VERSION: ${{ needs.datahub_ingestion_slim_build.outputs.tag }} + ACTIONS_EXTRA_PACKAGES: 'acryl-datahub-actions[executor] acryl-datahub-actions' + ACTIONS_CONFIG: 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml' run: | ./smoke-test/run-quickstart.sh - name: sleep 60s @@ -510,6 +820,8 @@ jobs: # we are doing this because gms takes time to get ready # and we don't have a better readiness check when bootstrap is done sleep 60s + - name: Disk Check + run: df -h . && docker images - name: Disable ES Disk Threshold run: | curl -XPUT "http://localhost:9200/_cluster/settings" \ @@ -524,6 +836,8 @@ jobs: }' - name: Remove Source Code run: find ./*/* ! -path "./metadata-ingestion*" ! -path "./smoke-test*" ! -path "./gradle*" -delete + - name: Disk Check + run: df -h . && docker images - name: Smoke test env: RUN_QUICKSTART: false @@ -534,11 +848,14 @@ jobs: run: | echo "$DATAHUB_VERSION" ./smoke-test/smoke.sh + - name: Disk Check + run: df -h . && docker images - name: store logs if: failure() run: | docker ps -a docker logs datahub-gms >& gms-${{ matrix.test_strategy }}.log + docker logs datahub-actions >& actions-${{ matrix.test_strategy }}.log - name: Upload logs uses: actions/upload-artifact@v3 if: failure() diff --git a/build.gradle b/build.gradle index 605b4fcc050e7..ae54de07cb81c 100644 --- a/build.gradle +++ b/build.gradle @@ -3,8 +3,8 @@ buildscript { // Releases: https://github.com/linkedin/rest.li/blob/master/CHANGELOG.md ext.pegasusVersion = '29.22.16' ext.mavenVersion = '3.6.3' - ext.springVersion = '5.3.27' - ext.springBootVersion = '2.7.11' + ext.springVersion = '5.3.29' + ext.springBootVersion = '2.7.14' ext.openTelemetryVersion = '1.18.0' ext.neo4jVersion = '4.4.9' ext.testContainersVersion = '1.17.4' @@ -18,6 +18,7 @@ buildscript { ext.logbackClassic = '1.2.12' ext.hadoop3Version = '3.3.5' ext.kafkaVersion = '2.3.0' + ext.hazelcastVersion = '5.3.1' ext.docker_registry = 'linkedin' @@ -38,7 +39,7 @@ buildscript { plugins { id 'com.gorylenko.gradle-git-properties' version '2.4.0-rc2' id 'com.github.johnrengelman.shadow' version '6.1.0' - id "com.palantir.docker" version "0.34.0" + id "com.palantir.docker" version "0.35.0" // https://blog.ltgt.net/javax-jakarta-mess-and-gradle-solution/ // TODO id "org.gradlex.java-ecosystem-capabilities" version "1.0" } @@ -101,9 +102,9 @@ project.ext.externalDependency = [ 'hadoopMapreduceClient':'org.apache.hadoop:hadoop-mapreduce-client-core:2.7.2', "hadoopClient": "org.apache.hadoop:hadoop-client:$hadoop3Version", "hadoopCommon3":"org.apache.hadoop:hadoop-common:$hadoop3Version", - 'hazelcast':'com.hazelcast:hazelcast:5.2.3', - 'hazelcastSpring':'com.hazelcast:hazelcast-spring:5.2.1', - 'hazelcastTest':'com.hazelcast:hazelcast:5.2.1:tests', + 'hazelcast':"com.hazelcast:hazelcast:$hazelcastVersion", + 'hazelcastSpring':"com.hazelcast:hazelcast-spring:$hazelcastVersion", + 'hazelcastTest':"com.hazelcast:hazelcast:$hazelcastVersion:tests", 'hibernateCore': 'org.hibernate:hibernate-core:5.2.16.Final', 'httpClient': 'org.apache.httpcomponents:httpclient:4.5.9', 'httpAsyncClient': 'org.apache.httpcomponents:httpasyncclient:4.1.5', @@ -137,6 +138,7 @@ project.ext.externalDependency = [ 'kafkaAvroSerde': 'io.confluent:kafka-streams-avro-serde:5.5.1', 'kafkaAvroSerializer': 
'io.confluent:kafka-avro-serializer:5.1.4', 'kafkaClients': "org.apache.kafka:kafka-clients:$kafkaVersion", + 'snappy': 'org.xerial.snappy:snappy-java:1.1.10.3', 'logbackClassic': "ch.qos.logback:logback-classic:$logbackClassic", 'slf4jApi': "org.slf4j:slf4j-api:$slf4jVersion", 'log4jCore': "org.apache.logging.log4j:log4j-core:$log4jVersion", diff --git a/datahub-frontend/build.gradle b/datahub-frontend/build.gradle index f21d10d8f3842..fda33e4a9a3c6 100644 --- a/datahub-frontend/build.gradle +++ b/datahub-frontend/build.gradle @@ -79,6 +79,8 @@ docker { files fileTree(rootProject.projectDir) { include 'docker/monitoring/*' include "docker/${docker_dir}/*" + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -98,7 +100,7 @@ tasks.getByName("docker").dependsOn(unversionZip) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) \ No newline at end of file diff --git a/datahub-frontend/play.gradle b/datahub-frontend/play.gradle index 57f64960033aa..e7121d277926d 100644 --- a/datahub-frontend/play.gradle +++ b/datahub-frontend/play.gradle @@ -28,6 +28,9 @@ dependencies { implementation(externalDependency.commonsText) { because("previous versions are vulnerable to CVE-2022-42889") } + implementation(externalDependency.snappy) { + because("previous versions are vulnerable to CVE-2023-34453 through CVE-2023-34455") + } } compile project(":metadata-service:restli-client") diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java index d6dd2de6d31e3..682710ad5d539 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java @@ -68,6 +68,7 @@ import com.linkedin.datahub.graphql.generated.ListQueriesResult; import com.linkedin.datahub.graphql.generated.ListTestsResult; import com.linkedin.datahub.graphql.generated.ListViewsResult; +import com.linkedin.datahub.graphql.generated.MatchedField; import com.linkedin.datahub.graphql.generated.MLFeature; import com.linkedin.datahub.graphql.generated.MLFeatureProperties; import com.linkedin.datahub.graphql.generated.MLFeatureTable; @@ -1008,6 +1009,10 @@ private void configureGenericEntityResolvers(final RuntimeWiring.Builder builder .dataFetcher("entity", new EntityTypeResolver(entityTypes, (env) -> ((SearchResult) env.getSource()).getEntity())) ) + .type("MatchedField", typeWiring -> typeWiring + .dataFetcher("entity", new EntityTypeResolver(entityTypes, + (env) -> ((MatchedField) env.getSource()).getEntity())) + ) .type("SearchAcrossLineageResult", typeWiring -> typeWiring .dataFetcher("entity", new EntityTypeResolver(entityTypes, (env) -> ((SearchAcrossLineageResult) env.getSource()).getEntity())) diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java index 94880c77d74bc..3089b8c8fc2db 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java +++ 
b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java @@ -107,7 +107,31 @@ public static boolean canEditGroupMembers(@Nonnull String groupUrnStr, @Nonnull } public static boolean canCreateGlobalAnnouncements(@Nonnull QueryContext context) { - return isAuthorized(context, Optional.empty(), PoliciesConfig.CREATE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE); + final DisjunctivePrivilegeGroup orPrivilegeGroups = new DisjunctivePrivilegeGroup( + ImmutableList.of( + new ConjunctivePrivilegeGroup(ImmutableList.of( + PoliciesConfig.CREATE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE.getType())), + new ConjunctivePrivilegeGroup(ImmutableList.of( + PoliciesConfig.MANAGE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE.getType())) + )); + + return AuthorizationUtils.isAuthorized( + context.getAuthorizer(), + context.getActorUrn(), + orPrivilegeGroups); + } + + public static boolean canManageGlobalAnnouncements(@Nonnull QueryContext context) { + final DisjunctivePrivilegeGroup orPrivilegeGroups = new DisjunctivePrivilegeGroup( + ImmutableList.of( + new ConjunctivePrivilegeGroup(ImmutableList.of( + PoliciesConfig.MANAGE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE.getType())) + )); + + return AuthorizationUtils.isAuthorized( + context.getAuthorizer(), + context.getActorUrn(), + orPrivilegeGroups); } public static boolean canManageGlobalViews(@Nonnull QueryContext context) { diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/MeResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/MeResolver.java index d2a7b19857f95..02921b453e315 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/MeResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/MeResolver.java @@ -74,6 +74,7 @@ public CompletableFuture get(DataFetchingEnvironment environm platformPrivileges.setManageTags(AuthorizationUtils.canManageTags(context)); platformPrivileges.setManageGlobalViews(AuthorizationUtils.canManageGlobalViews(context)); platformPrivileges.setManageOwnershipTypes(AuthorizationUtils.canManageOwnershipTypes(context)); + platformPrivileges.setManageGlobalAnnouncements(AuthorizationUtils.canManageGlobalAnnouncements(context)); // Construct and return authenticated user object. 
final AuthenticatedUser authUser = new AuthenticatedUser(); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java index 2c55bc79fe501..90017f7b87997 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java @@ -18,6 +18,7 @@ import com.linkedin.datahub.graphql.generated.Privilege; import com.linkedin.datahub.graphql.generated.QueriesTabConfig; import com.linkedin.datahub.graphql.generated.ResourcePrivileges; +import com.linkedin.datahub.graphql.generated.SearchResultsVisualConfig; import com.linkedin.datahub.graphql.generated.TelemetryConfig; import com.linkedin.datahub.graphql.generated.TestsConfig; import com.linkedin.datahub.graphql.generated.ViewsConfig; @@ -144,6 +145,13 @@ public CompletableFuture get(final DataFetchingEnvironment environmen } visualConfig.setEntityProfiles(entityProfilesConfig); } + if (_visualConfiguration != null && _visualConfiguration.getSearchResult() != null) { + SearchResultsVisualConfig searchResultsVisualConfig = new SearchResultsVisualConfig(); + if (_visualConfiguration.getSearchResult().getEnableNameHighlight() != null) { + searchResultsVisualConfig.setEnableNameHighlight(_visualConfiguration.getSearchResult().getEnableNameHighlight()); + } + visualConfig.setSearchResult(searchResultsVisualConfig); + } appConfig.setVisualConfig(visualConfig); final TelemetryConfig telemetryConfig = new TelemetryConfig(); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/post/DeletePostResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/post/DeletePostResolver.java index cd2a3dda70033..d3cd0126fb852 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/post/DeletePostResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/post/DeletePostResolver.java @@ -23,7 +23,7 @@ public class DeletePostResolver implements DataFetcher get(final DataFetchingEnvironment environment) throws Exception { final QueryContext context = environment.getContext(); - if (!AuthorizationUtils.canCreateGlobalAnnouncements(context)) { + if (!AuthorizationUtils.canManageGlobalAnnouncements(context)) { throw new AuthorizationException( "Unauthorized to delete posts. 
Please contact your DataHub administrator if this needs corrective action."); } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java index e40bbca56b416..fe5b79ba2ea3d 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java @@ -73,7 +73,6 @@ private SearchUtils() { EntityType.CONTAINER, EntityType.DOMAIN, EntityType.DATA_PRODUCT, - EntityType.ROLE, EntityType.NOTEBOOK); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/SearchFlagsInputMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/SearchFlagsInputMapper.java index 6435d6ee4c8e5..f3ac008734339 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/SearchFlagsInputMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/SearchFlagsInputMapper.java @@ -39,6 +39,9 @@ public com.linkedin.metadata.query.SearchFlags apply(@Nonnull final SearchFlags if (searchFlags.getSkipAggregates() != null) { result.setSkipAggregates(searchFlags.getSkipAggregates()); } + if (searchFlags.getGetSuggestions() != null) { + result.setGetSuggestions(searchFlags.getGetSuggestions()); + } return result; } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java index 0b292a373ea40..5ba32b0c2a77c 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/MapperUtils.java @@ -1,12 +1,18 @@ package com.linkedin.datahub.graphql.types.mappers; +import com.linkedin.common.urn.Urn; import com.linkedin.datahub.graphql.generated.AggregationMetadata; import com.linkedin.datahub.graphql.generated.FacetMetadata; import com.linkedin.datahub.graphql.generated.MatchedField; import com.linkedin.datahub.graphql.generated.SearchResult; +import com.linkedin.datahub.graphql.generated.SearchSuggestion; import com.linkedin.datahub.graphql.resolvers.EntityTypeMapper; import com.linkedin.datahub.graphql.types.common.mappers.UrnToEntityMapper; import com.linkedin.metadata.search.SearchEntity; +import com.linkedin.metadata.search.utils.SearchUtils; +import lombok.extern.slf4j.Slf4j; + +import java.net.URISyntaxException; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; @@ -16,6 +22,7 @@ import static com.linkedin.metadata.utils.SearchUtil.*; +@Slf4j public class MapperUtils { private MapperUtils() { @@ -54,7 +61,24 @@ public static String convertFilterValue(String filterValue, List isEnti public static List getMatchedFieldEntry(List highlightMetadata) { return highlightMetadata.stream() - .map(field -> new MatchedField(field.getName(), field.getValue())) + .map(field -> { + MatchedField matchedField = new MatchedField(); + matchedField.setName(field.getName()); + matchedField.setValue(field.getValue()); + if (SearchUtils.isUrn(field.getValue())) { + try { + Urn urn = Urn.createFromString(field.getValue()); + matchedField.setEntity(UrnToEntityMapper.map(urn)); + } catch 
(URISyntaxException e) { + log.warn("Failed to create urn from MatchedField value: {}", field.getValue(), e); + } + } + return matchedField; + }) .collect(Collectors.toList()); } + + public static SearchSuggestion mapSearchSuggestion(com.linkedin.metadata.search.SearchSuggestion suggestion) { + return new SearchSuggestion(suggestion.getText(), suggestion.getScore(), Math.toIntExact(suggestion.getFrequency())); + } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchResultsMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchResultsMapper.java index 9f750820e3093..b16e2f10d1df7 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchResultsMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/UrnSearchResultsMapper.java @@ -27,6 +27,7 @@ public SearchResults apply(com.linkedin.metadata.search.SearchResult input) { final SearchResultMetadata searchResultMetadata = input.getMetadata(); result.setSearchResults(input.getEntities().stream().map(MapperUtils::mapResult).collect(Collectors.toList())); result.setFacets(searchResultMetadata.getAggregations().stream().map(MapperUtils::mapFacet).collect(Collectors.toList())); + result.setSuggestions(searchResultMetadata.getSuggestions().stream().map(MapperUtils::mapSearchSuggestion).collect(Collectors.toList())); return result; } diff --git a/datahub-graphql-core/src/main/resources/app.graphql b/datahub-graphql-core/src/main/resources/app.graphql index 37183bac13f0e..dbee24b4bf6f7 100644 --- a/datahub-graphql-core/src/main/resources/app.graphql +++ b/datahub-graphql-core/src/main/resources/app.graphql @@ -125,6 +125,11 @@ type PlatformPrivileges { Whether the user should be able to create, update, and delete ownership types. """ manageOwnershipTypes: Boolean! + + """ + Whether the user can create and delete posts pinned to the home page. + """ + manageGlobalAnnouncements: Boolean! } """ @@ -216,6 +221,11 @@ type VisualConfig { Configuration for the queries tab """ entityProfiles: EntityProfilesConfig + + """ + Configuration for search results + """ + searchResult: SearchResultsVisualConfig } """ @@ -250,6 +260,16 @@ type EntityProfileConfig { defaultTab: String } +""" +Configuration for a search result +""" +type SearchResultsVisualConfig { + """ + Whether a search result should highlight the name/description if it was matched on those fields. + """ + enableNameHighlight: Boolean +} + """ Configurations related to tracking users in the app """ diff --git a/datahub-graphql-core/src/main/resources/search.graphql b/datahub-graphql-core/src/main/resources/search.graphql index f15535bfb4eb8..4cabdb04afe77 100644 --- a/datahub-graphql-core/src/main/resources/search.graphql +++ b/datahub-graphql-core/src/main/resources/search.graphql @@ -138,6 +138,11 @@ input SearchFlags { Whether to skip aggregates/facets """ skipAggregates: Boolean + + """ + Whether to request for search suggestions on the _entityName virtualized field + """ + getSuggestions: Boolean } """ @@ -448,6 +453,11 @@ enum FilterOperator { * Represent the relation: String field is one of the array values to, e.g. name in ["Profile", "Event"] """ IN + + """ + Represents the relation: The field exists. If the field is an array, the field is either not present or empty. + """ + EXISTS } """ @@ -478,6 +488,11 @@ type SearchResults { Candidate facet aggregations used for search filtering """ facets: [FacetMetadata!] 
+ + """ + Search suggestions based on the query provided for alternate query texts + """ + suggestions: [SearchSuggestion!] } """ @@ -660,6 +675,11 @@ type MatchedField { Value of the field that matched """ value: String! + + """ + Entity if the value is an urn + """ + entity: Entity } """ @@ -717,6 +737,31 @@ type AggregationMetadata { entity: Entity } +""" +A suggestion for an alternate search query given an original query compared to all +of the entity names in our search index. +""" +type SearchSuggestion { + """ + The suggested text based on the provided query text compared to + the entity name field in the search index. + """ + text: String! + + """ + The "edit distance" for this suggestion. The closer this number is to 1, the + closer the suggested text is to the original text. The closer it is to 0, the + further from the original text it is. + """ + score: Float + + """ + The number of entities that would match on the name field given the suggested text + """ + frequency: Int +} + + """ Input for performing an auto completion query against a single Metadata Entity """ diff --git a/datahub-upgrade/build.gradle b/datahub-upgrade/build.gradle index ad2bf02bfdcc7..78d9f6a09948d 100644 --- a/datahub-upgrade/build.gradle +++ b/datahub-upgrade/build.gradle @@ -89,6 +89,8 @@ docker { files fileTree(rootProject.projectDir) { include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -101,7 +103,7 @@ tasks.getByName("docker").dependsOn([bootJar]) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) diff --git a/datahub-web-react/src/Mocks.tsx b/datahub-web-react/src/Mocks.tsx index dcefc7f70d785..a2e14308e8cee 100644 --- a/datahub-web-react/src/Mocks.tsx +++ b/datahub-web-react/src/Mocks.tsx @@ -1973,6 +1973,7 @@ export const mocks = [ count: 10, filters: [], orFilters: [], + searchFlags: { getSuggestions: true }, }, }, }, @@ -2033,6 +2034,7 @@ export const mocks = [ ], }, ], + suggestions: [], }, } as GetSearchResultsQuery, }, @@ -2059,6 +2061,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -2112,6 +2115,7 @@ export const mocks = [ ], }, ], + suggestions: [], }, } as GetSearchResultsQuery, }, @@ -2230,6 +2234,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -2251,6 +2256,7 @@ export const mocks = [ insights: [], }, ], + suggestions: [], facets: [ { field: 'origin', @@ -2772,6 +2778,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -2794,6 +2801,7 @@ export const mocks = [ insights: [], }, ], + suggestions: [], facets: [ { __typename: 'FacetMetadata', @@ -2886,6 +2894,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -2908,6 +2917,7 @@ export const mocks = [ }, ], facets: [], + suggestions: [], }, } as GetSearchResultsForMultipleQuery, }, @@ -2934,6 +2944,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -2955,6 +2966,7 @@ export const mocks = [ insights: [], }, ], + suggestions: [], facets: [ { field: 'origin', @@ -3007,6 +3019,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -3028,6 +3041,7 
@@ export const mocks = [ insights: [], }, ], + suggestions: [], facets: [ { field: 'origin', @@ -3084,6 +3098,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -3113,6 +3128,7 @@ export const mocks = [ insights: [], }, ], + suggestions: [], facets: [ { field: 'origin', @@ -3175,6 +3191,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -3196,6 +3213,7 @@ export const mocks = [ insights: [], }, ], + suggestions: [], facets: [ { field: 'origin', @@ -3258,6 +3276,7 @@ export const mocks = [ ], }, ], + searchFlags: { getSuggestions: true }, }, }, }, @@ -3279,6 +3298,7 @@ export const mocks = [ insights: [], }, ], + suggestions: [], facets: [ { field: 'origin', @@ -3363,6 +3383,7 @@ export const mocks = [ generatePersonalAccessTokens: true, manageGlobalViews: true, manageOwnershipTypes: true, + manageGlobalAnnouncements: true, }, }, }, @@ -3450,6 +3471,7 @@ export const mocks = [ count: 10, filters: [], orFilters: [], + searchFlags: { getSuggestions: true }, }, }, }, @@ -3461,6 +3483,7 @@ export const mocks = [ total: 0, searchResults: [], facets: [], + suggestions: [], }, }, }, @@ -3609,4 +3632,5 @@ export const platformPrivileges: PlatformPrivileges = { createDomains: true, manageGlobalViews: true, manageOwnershipTypes: true, + manageGlobalAnnouncements: true, }; diff --git a/datahub-web-react/src/app/entity/EntityRegistry.tsx b/datahub-web-react/src/app/entity/EntityRegistry.tsx index a07fd02841197..56b085cf69f4a 100644 --- a/datahub-web-react/src/app/entity/EntityRegistry.tsx +++ b/datahub-web-react/src/app/entity/EntityRegistry.tsx @@ -1,5 +1,7 @@ +import React from 'react'; import { Entity as EntityInterface, EntityType, SearchResult } from '../../types.generated'; import { FetchedEntity } from '../lineage/types'; +import { SearchResultProvider } from '../search/context/SearchResultContext'; import { Entity, EntityCapabilityType, IconStyleType, PreviewType } from './Entity'; import { GLOSSARY_ENTITY_TYPES } from './shared/constants'; import { GenericEntityProperties } from './shared/types'; @@ -119,7 +121,9 @@ export default class EntityRegistry { renderSearchResult(type: EntityType, searchResult: SearchResult): JSX.Element { const entity = validatedGet(type, this.entityTypeToEntity); - return entity.renderSearch(searchResult); + return ( + {entity.renderSearch(searchResult)} + ); } renderBrowse(type: EntityType, data: T): JSX.Element { diff --git a/datahub-web-react/src/app/entity/chart/ChartEntity.tsx b/datahub-web-react/src/app/entity/chart/ChartEntity.tsx index b5ebcbef80379..0f1b6dbf3d660 100644 --- a/datahub-web-react/src/app/entity/chart/ChartEntity.tsx +++ b/datahub-web-react/src/app/entity/chart/ChartEntity.tsx @@ -19,13 +19,14 @@ import { EntityMenuItems } from '../shared/EntityDropdown/EntityDropdown'; import { LineageTab } from '../shared/tabs/Lineage/LineageTab'; import { ChartStatsSummarySubHeader } from './profile/stats/ChartStatsSummarySubHeader'; import { InputFieldsTab } from '../shared/tabs/Entity/InputFieldsTab'; -import { ChartSnippet } from './ChartSnippet'; import { EmbedTab } from '../shared/tabs/Embed/EmbedTab'; import { capitalizeFirstLetterOnly } from '../../shared/textUtil'; import DataProductSection from '../shared/containers/profile/sidebar/DataProduct/DataProductSection'; import { getDataProduct } from '../shared/utils'; import EmbeddedProfile from '../shared/embed/EmbeddedProfile'; import { LOOKER_URN } from '../../ingest/source/builder/constants'; +import { MatchedFieldList } 
from '../../search/matches/MatchedFieldList'; +import { matchedInputFieldRenderer } from '../../search/matches/matchedInputFieldRenderer'; /** * Definition of the DataHub Chart entity. @@ -203,7 +204,11 @@ export class ChartEntity implements Entity { lastUpdatedMs={data.properties?.lastModified?.time} createdMs={data.properties?.created?.time} externalUrl={data.properties?.externalUrl} - snippet={} + snippet={ + matchedInputFieldRenderer(matchedField, data)} + /> + } degree={(result as any).degree} paths={(result as any).paths} /> diff --git a/datahub-web-react/src/app/entity/chart/ChartSnippet.tsx b/datahub-web-react/src/app/entity/chart/ChartSnippet.tsx deleted file mode 100644 index 27982d3037207..0000000000000 --- a/datahub-web-react/src/app/entity/chart/ChartSnippet.tsx +++ /dev/null @@ -1,53 +0,0 @@ -import React from 'react'; - -import { Typography } from 'antd'; -import { InputFields, MatchedField, Maybe } from '../../../types.generated'; -import TagTermGroup from '../../shared/tags/TagTermGroup'; -import { FIELDS_TO_HIGHLIGHT } from '../dataset/search/highlights'; -import { getMatchPrioritizingPrimary } from '../shared/utils'; - -type Props = { - matchedFields: MatchedField[]; - inputFields: Maybe | undefined; - isMatchingDashboard?: boolean; -}; - -const LABEL_INDEX_NAME = 'fieldLabels'; -const TYPE_PROPERTY_KEY_NAME = 'type'; - -export const ChartSnippet = ({ matchedFields, inputFields, isMatchingDashboard = false }: Props) => { - const matchedField = getMatchPrioritizingPrimary(matchedFields, 'fieldLabels'); - - if (matchedField?.name === LABEL_INDEX_NAME) { - const matchedSchemaField = inputFields?.fields?.find( - (field) => field?.schemaField?.label === matchedField.value, - ); - const matchedGlossaryTerm = matchedSchemaField?.schemaField?.glossaryTerms?.terms?.find( - (term) => term?.term?.name === matchedField.value, - ); - - if (matchedGlossaryTerm) { - let termType = 'term'; - const typeProperty = matchedGlossaryTerm.term.properties?.customProperties?.find( - (property) => property.key === TYPE_PROPERTY_KEY_NAME, - ); - if (typeProperty) { - termType = typeProperty.value || termType; - } - - return ( - - Matches {termType} {' '} - {isMatchingDashboard && 'on a contained Chart'} - - ); - } - } - - return matchedField ? 
( - - Matches {FIELDS_TO_HIGHLIGHT.get(matchedField.name)} {matchedField.value}{' '} - {isMatchingDashboard && 'on a contained Chart'} - - ) : null; -}; diff --git a/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx b/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx index a64e437265262..0a36d0e5f1bfa 100644 --- a/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx +++ b/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx @@ -24,12 +24,13 @@ import { EntityMenuItems } from '../shared/EntityDropdown/EntityDropdown'; import { LineageTab } from '../shared/tabs/Lineage/LineageTab'; import { capitalizeFirstLetterOnly } from '../../shared/textUtil'; import { DashboardStatsSummarySubHeader } from './profile/DashboardStatsSummarySubHeader'; -import { ChartSnippet } from '../chart/ChartSnippet'; import { EmbedTab } from '../shared/tabs/Embed/EmbedTab'; import EmbeddedProfile from '../shared/embed/EmbeddedProfile'; import DataProductSection from '../shared/containers/profile/sidebar/DataProduct/DataProductSection'; import { getDataProduct } from '../shared/utils'; import { LOOKER_URN } from '../../ingest/source/builder/constants'; +import { MatchedFieldList } from '../../search/matches/MatchedFieldList'; +import { matchedInputFieldRenderer } from '../../search/matches/matchedInputFieldRenderer'; /** * Definition of the DataHub Dashboard entity. @@ -227,10 +228,9 @@ export class DashboardEntity implements Entity { lastUpdatedMs={data.properties?.lastModified?.time} createdMs={data.properties?.created?.time} snippet={ - matchedInputFieldRenderer(matchedField, data)} + matchSuffix="on a contained chart" /> } subtype={data.subTypes?.typeNames?.[0]} diff --git a/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx b/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx index cb4239872045f..ed3904bcf4e2d 100644 --- a/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx +++ b/datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx @@ -25,11 +25,12 @@ import { OperationsTab } from './profile/OperationsTab'; import { EntityMenuItems } from '../shared/EntityDropdown/EntityDropdown'; import { SidebarSiblingsSection } from '../shared/containers/profile/sidebar/SidebarSiblingsSection'; import { DatasetStatsSummarySubHeader } from './profile/stats/stats/DatasetStatsSummarySubHeader'; -import { DatasetSearchSnippet } from './DatasetSearchSnippet'; +import { MatchedFieldList } from '../../search/matches/MatchedFieldList'; import { EmbedTab } from '../shared/tabs/Embed/EmbedTab'; import EmbeddedProfile from '../shared/embed/EmbeddedProfile'; import DataProductSection from '../shared/containers/profile/sidebar/DataProduct/DataProductSection'; import { getDataProduct } from '../shared/utils'; +import { matchedFieldPathsRenderer } from '../../search/matches/matchedFieldPathsRenderer'; const SUBTYPES = { VIEW: 'view', @@ -290,7 +291,7 @@ export class DatasetEntity implements Entity { subtype={data.subTypes?.typeNames?.[0]} container={data.container} parentContainers={data.parentContainers} - snippet={} + snippet={} insights={result.insights} externalUrl={data.properties?.externalUrl} statsSummary={data.statsSummary} diff --git a/datahub-web-react/src/app/entity/dataset/DatasetSearchSnippet.tsx b/datahub-web-react/src/app/entity/dataset/DatasetSearchSnippet.tsx deleted file mode 100644 index e4f88eb0fbbfa..0000000000000 --- a/datahub-web-react/src/app/entity/dataset/DatasetSearchSnippet.tsx +++ /dev/null @@ -1,39 +0,0 @@ -import React from 'react'; - 
-import { Typography } from 'antd'; -import { MatchedField } from '../../../types.generated'; -import { TagSummary } from './shared/TagSummary'; -import { TermSummary } from './shared/TermSummary'; -import { FIELDS_TO_HIGHLIGHT } from './search/highlights'; -import { getMatchPrioritizingPrimary } from '../shared/utils'; -import { downgradeV2FieldPath } from './profile/schema/utils/utils'; - -type Props = { - matchedFields: MatchedField[]; -}; - -const LABEL_INDEX_NAME = 'fieldLabels'; - -export const DatasetSearchSnippet = ({ matchedFields }: Props) => { - const matchedField = getMatchPrioritizingPrimary(matchedFields, LABEL_INDEX_NAME); - - let snippet: React.ReactNode; - - if (matchedField) { - if (matchedField.value.includes('urn:li:tag')) { - snippet = ; - } else if (matchedField.value.includes('urn:li:glossaryTerm')) { - snippet = ; - } else if (matchedField.name === 'fieldPaths') { - snippet = {downgradeV2FieldPath(matchedField.value)}; - } else { - snippet = {matchedField.value}; - } - } - - return matchedField ? ( - - Matches {FIELDS_TO_HIGHLIGHT.get(matchedField.name)} {snippet}{' '} - - ) : null; -}; diff --git a/datahub-web-react/src/app/entity/dataset/search/highlights.ts b/datahub-web-react/src/app/entity/dataset/search/highlights.ts deleted file mode 100644 index 64505e0709c7b..0000000000000 --- a/datahub-web-react/src/app/entity/dataset/search/highlights.ts +++ /dev/null @@ -1,7 +0,0 @@ -export const FIELDS_TO_HIGHLIGHT = new Map(); -FIELDS_TO_HIGHLIGHT.set('fieldPaths', 'column'); -FIELDS_TO_HIGHLIGHT.set('fieldDescriptions', 'column description'); -FIELDS_TO_HIGHLIGHT.set('fieldTags', 'column tag'); -FIELDS_TO_HIGHLIGHT.set('editedFieldDescriptions', 'column description'); -FIELDS_TO_HIGHLIGHT.set('editedFieldTags', 'column tag'); -FIELDS_TO_HIGHLIGHT.set('fieldLabels', 'label'); diff --git a/datahub-web-react/src/app/entity/dataset/shared/TagSummary.tsx b/datahub-web-react/src/app/entity/dataset/shared/TagSummary.tsx deleted file mode 100644 index 106cc298fb58c..0000000000000 --- a/datahub-web-react/src/app/entity/dataset/shared/TagSummary.tsx +++ /dev/null @@ -1,38 +0,0 @@ -import React from 'react'; -import styled from 'styled-components'; -import { useGetTagQuery } from '../../../../graphql/tag.generated'; -import { EntityType, Tag } from '../../../../types.generated'; -import { HoverEntityTooltip } from '../../../recommendations/renderer/component/HoverEntityTooltip'; -import { useEntityRegistry } from '../../../useEntityRegistry'; -import { StyledTag } from '../../shared/components/styled/StyledTag'; - -const TagLink = styled.span` - display: inline-block; -`; - -type Props = { - urn: string; -}; - -export const TagSummary = ({ urn }: Props) => { - const entityRegistry = useEntityRegistry(); - const { data } = useGetTagQuery({ variables: { urn } }); - return ( - <> - {data && ( - - - - {entityRegistry.getDisplayName(EntityType.Tag, data?.tag)} - - - - )} - - ); -}; diff --git a/datahub-web-react/src/app/entity/dataset/shared/TermSummary.tsx b/datahub-web-react/src/app/entity/dataset/shared/TermSummary.tsx deleted file mode 100644 index cc1274693a342..0000000000000 --- a/datahub-web-react/src/app/entity/dataset/shared/TermSummary.tsx +++ /dev/null @@ -1,36 +0,0 @@ -import React from 'react'; -import { Tag } from 'antd'; -import { BookOutlined } from '@ant-design/icons'; -import styled from 'styled-components'; -import { useGetGlossaryTermQuery } from '../../../../graphql/glossaryTerm.generated'; -import { HoverEntityTooltip } from 
'../../../recommendations/renderer/component/HoverEntityTooltip'; -import { EntityType, GlossaryTerm } from '../../../../types.generated'; -import { useEntityRegistry } from '../../../useEntityRegistry'; - -const TermLink = styled.span` - display: inline-block; -`; - -type Props = { - urn: string; -}; - -export const TermSummary = ({ urn }: Props) => { - const entityRegistry = useEntityRegistry(); - const { data } = useGetGlossaryTermQuery({ variables: { urn } }); - - return ( - <> - {data && ( - - - - - {entityRegistry.getDisplayName(EntityType.GlossaryTerm, data?.glossaryTerm)} - - - - )} - - ); -}; diff --git a/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx b/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx index 26d3cf456ab7a..b6802e37652cb 100644 --- a/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/glossaryTerm/preview/Preview.tsx @@ -4,6 +4,8 @@ import { Deprecation, Domain, EntityType, Owner, ParentNodesResult } from '../.. import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; import { useEntityRegistry } from '../../../useEntityRegistry'; import { IconStyleType, PreviewType } from '../../Entity'; +import UrlButton from '../../shared/UrlButton'; +import { getRelatedEntitiesUrl } from '../utils'; export const Preview = ({ urn, @@ -39,6 +41,9 @@ export const Preview = ({ deprecation={deprecation} parentNodes={parentNodes} domain={domain} + entityTitleSuffix={ + View Related Entities + } /> ); }; diff --git a/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryRelatedEntity.tsx b/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryRelatedEntity.tsx index d0e8de0928b48..098e97e526fd8 100644 --- a/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryRelatedEntity.tsx +++ b/datahub-web-react/src/app/entity/glossaryTerm/profile/GlossaryRelatedEntity.tsx @@ -5,7 +5,7 @@ import { EmbeddedListSearchSection } from '../../shared/components/styled/search import { useEntityData } from '../../shared/EntityContext'; export default function GlossaryRelatedEntity() { - const { entityData }: any = useEntityData(); + const { entityData } = useEntityData(); const entityUrn = entityData?.urn; diff --git a/datahub-web-react/src/app/entity/glossaryTerm/utils.ts b/datahub-web-react/src/app/entity/glossaryTerm/utils.ts index 3a2a3d35a8126..cbfa76fa34866 100644 --- a/datahub-web-react/src/app/entity/glossaryTerm/utils.ts +++ b/datahub-web-react/src/app/entity/glossaryTerm/utils.ts @@ -6,3 +6,7 @@ export function sortGlossaryTerms(entityRegistry: EntityRegistry, nodeA?: Entity const nodeBName = entityRegistry.getDisplayName(EntityType.GlossaryTerm, nodeB) || ''; return nodeAName.localeCompare(nodeBName); } + +export function getRelatedEntitiesUrl(entityRegistry: EntityRegistry, urn: string) { + return `${entityRegistry.getEntityUrl(EntityType.GlossaryTerm, urn)}/${encodeURIComponent('Related Entities')}`; +} diff --git a/datahub-web-react/src/app/entity/group/preview/Preview.tsx b/datahub-web-react/src/app/entity/group/preview/Preview.tsx index dc83f6fe4f840..67449b9a481f0 100644 --- a/datahub-web-react/src/app/entity/group/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/group/preview/Preview.tsx @@ -8,6 +8,7 @@ import { useEntityRegistry } from '../../../useEntityRegistry'; import { ANTD_GRAY } from '../../shared/constants'; import { IconStyleType } from '../../Entity'; import NoMarkdownViewer from '../../shared/components/styled/StripMarkdownText'; +import 
SearchTextHighlighter from '../../../search/matches/SearchTextHighlighter'; const PreviewContainer = styled.div` margin-bottom: 4px; @@ -87,7 +88,9 @@ export const Preview = ({ {entityRegistry.getEntityName(EntityType.CorpGroup)} - {name || urn} + + {name ? : urn} + {membersCount} members @@ -96,7 +99,12 @@ export const Preview = ({ {description && description.length > 0 && ( - {description} + } + > + {description} + )} diff --git a/datahub-web-react/src/app/entity/shared/ExternalUrlButton.tsx b/datahub-web-react/src/app/entity/shared/ExternalUrlButton.tsx index 9677af0776604..dce74c02cdb34 100644 --- a/datahub-web-react/src/app/entity/shared/ExternalUrlButton.tsx +++ b/datahub-web-react/src/app/entity/shared/ExternalUrlButton.tsx @@ -1,28 +1,11 @@ -import { ArrowRightOutlined } from '@ant-design/icons'; -import { Button } from 'antd'; import React from 'react'; -import styled from 'styled-components/macro'; import { EntityType } from '../../../types.generated'; import analytics, { EventType, EntityActionType } from '../../analytics'; +import UrlButton from './UrlButton'; const GITHUB_LINK = 'github.com'; const GITHUB = 'GitHub'; -const ExternalUrlWrapper = styled.span` - font-size: 12px; -`; - -const StyledButton = styled(Button)` - > :hover { - text-decoration: underline; - } - &&& { - padding-bottom: 0px; - } - padding-left: 12px; - padding-right: 12px; -`; - interface Props { externalUrl: string; platformName?: string; @@ -46,17 +29,8 @@ export default function ExternalUrlButton({ externalUrl, platformName, entityTyp } return ( - - - {displayedName ? `View in ${displayedName}` : 'View link'}{' '} - - - + + {displayedName ? `View in ${displayedName}` : 'View link'} + ); } diff --git a/datahub-web-react/src/app/entity/shared/UrlButton.tsx b/datahub-web-react/src/app/entity/shared/UrlButton.tsx new file mode 100644 index 0000000000000..a6f6da4a60ad5 --- /dev/null +++ b/datahub-web-react/src/app/entity/shared/UrlButton.tsx @@ -0,0 +1,37 @@ +import React, { ReactNode } from 'react'; +import { ArrowRightOutlined } from '@ant-design/icons'; +import { Button } from 'antd'; +import styled from 'styled-components/macro'; + +const UrlButtonContainer = styled.span` + font-size: 12px; +`; + +const StyledButton = styled(Button)` + > :hover { + text-decoration: underline; + } + &&& { + padding-bottom: 0px; + } + padding-left: 12px; + padding-right: 12px; +`; + +interface Props { + href: string; + children: ReactNode; + onClick?: () => void; +} + +const NOOP = () => {}; + +export default function UrlButton({ href, children, onClick = NOOP }: Props) { + return ( + + + {children} + + + ); +} diff --git a/datahub-web-react/src/app/entity/shared/__tests__/siblingsUtils.test.ts b/datahub-web-react/src/app/entity/shared/__tests__/siblingsUtils.test.ts index 6e23d5400ab77..00e89e5943c17 100644 --- a/datahub-web-react/src/app/entity/shared/__tests__/siblingsUtils.test.ts +++ b/datahub-web-react/src/app/entity/shared/__tests__/siblingsUtils.test.ts @@ -1,10 +1,6 @@ import { dataset3WithLineage, dataset3WithSchema, dataset4WithLineage } from '../../../../Mocks'; import { EntityType, SchemaFieldDataType } from '../../../../types.generated'; -import { - combineEntityDataWithSiblings, - combineSiblingsInSearchResults, - shouldEntityBeTreatedAsPrimary, -} from '../siblingUtils'; +import { combineEntityDataWithSiblings, shouldEntityBeTreatedAsPrimary } from '../siblingUtils'; const usageStats = { buckets: [ @@ -191,494 +187,6 @@ const datasetUnprimaryWithNoPrimarySiblings = { }, }; -const searchResultWithSiblings = 
[ - { - entity: { - urn: 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', - exists: true, - type: 'DATASET', - name: 'cypress_project.jaffle_shop.raw_orders', - origin: 'PROD', - uri: null, - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - dataPlatformInstance: null, - editableProperties: null, - platformNativeType: null, - properties: { - name: 'raw_orders', - description: null, - qualifiedName: null, - customProperties: [], - __typename: 'DatasetProperties', - }, - ownership: null, - globalTags: null, - glossaryTerms: null, - subTypes: { - typeNames: ['table'], - __typename: 'SubTypes', - }, - domain: null, - container: { - urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - properties: { - name: 'jaffle_shop', - __typename: 'ContainerProperties', - }, - subTypes: { - typeNames: ['Dataset'], - __typename: 'SubTypes', - }, - deprecation: null, - __typename: 'Container', - }, - parentContainers: { - count: 2, - containers: [ - { - urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - properties: { - name: 'jaffle_shop', - __typename: 'ContainerProperties', - }, - subTypes: { - typeNames: ['Dataset'], - __typename: 'SubTypes', - }, - deprecation: null, - __typename: 'Container', - }, - { - urn: 'urn:li:container:b5e95fce839e7d78151ed7e0a7420d84', - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - properties: { - name: 'cypress_project', - __typename: 'ContainerProperties', - }, - subTypes: { - typeNames: ['Project'], - __typename: 'SubTypes', - }, - deprecation: null, - __typename: 'Container', - }, - ], - __typename: 'ParentContainersResult', - }, - deprecation: null, - siblings: { - isPrimary: false, - siblings: [ - { - urn: 'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', - exists: true, - type: 'DATASET', - platform: { - urn: 'urn:li:dataPlatform:dbt', - type: 'DATA_PLATFORM', - name: 'dbt', - properties: { - type: 'OTHERS', - displayName: 'dbt', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/dbtlogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - name: 
'cypress_project.jaffle_shop.raw_orders', - properties: { - name: 'raw_orders', - description: '', - qualifiedName: null, - __typename: 'DatasetProperties', - }, - __typename: 'Dataset', - }, - ], - __typename: 'SiblingProperties', - }, - __typename: 'Dataset', - }, - matchedFields: [ - { - name: 'name', - value: 'raw_orders', - __typename: 'MatchedField', - }, - { - name: 'id', - value: 'cypress_project.jaffle_shop.raw_orders', - __typename: 'MatchedField', - }, - ], - insights: [], - __typename: 'SearchResult', - }, - { - entity: { - urn: 'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', - exists: true, - type: 'DATASET', - name: 'cypress_project.jaffle_shop.raw_orders', - origin: 'PROD', - uri: null, - platform: { - urn: 'urn:li:dataPlatform:dbt', - type: 'DATA_PLATFORM', - name: 'dbt', - properties: { - type: 'OTHERS', - displayName: 'dbt', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/dbtlogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - dataPlatformInstance: null, - editableProperties: null, - platformNativeType: null, - properties: { - name: 'raw_orders', - description: '', - qualifiedName: null, - customProperties: [ - { - key: 'catalog_version', - value: '1.0.4', - __typename: 'StringMapEntry', - }, - { - key: 'node_type', - value: 'seed', - __typename: 'StringMapEntry', - }, - { - key: 'materialization', - value: 'seed', - __typename: 'StringMapEntry', - }, - { - key: 'dbt_file_path', - value: 'data/raw_orders.csv', - __typename: 'StringMapEntry', - }, - { - key: 'catalog_schema', - value: 'https://schemas.getdbt.com/dbt/catalog/v1.json', - __typename: 'StringMapEntry', - }, - { - key: 'catalog_type', - value: 'table', - __typename: 'StringMapEntry', - }, - { - key: 'manifest_version', - value: '1.0.4', - __typename: 'StringMapEntry', - }, - { - key: 'manifest_schema', - value: 'https://schemas.getdbt.com/dbt/manifest/v4.json', - __typename: 'StringMapEntry', - }, - ], - __typename: 'DatasetProperties', - }, - ownership: null, - globalTags: null, - glossaryTerms: null, - subTypes: { - typeNames: ['seed'], - __typename: 'SubTypes', - }, - domain: null, - container: null, - parentContainers: { - count: 0, - containers: [], - __typename: 'ParentContainersResult', - }, - deprecation: null, - siblings: { - isPrimary: true, - siblings: [ - { - urn: 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', - type: 'DATASET', - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - name: 'cypress_project.jaffle_shop.raw_orders', - properties: { - name: 'raw_orders', - description: null, - qualifiedName: null, - __typename: 'DatasetProperties', - }, - __typename: 'Dataset', - }, - ], - __typename: 'SiblingProperties', - }, - __typename: 'Dataset', - }, - matchedFields: [ - { - name: 'name', - value: 'raw_orders', - __typename: 'MatchedField', - }, - { - name: 'id', - value: 'cypress_project.jaffle_shop.raw_orders', - __typename: 'MatchedField', - }, - ], - insights: [], - __typename: 'SearchResult', - }, -]; - -const searchResultWithGhostSiblings = [ - { - entity: { - urn: 
'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', - exists: true, - type: 'DATASET', - name: 'cypress_project.jaffle_shop.raw_orders', - origin: 'PROD', - uri: null, - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - dataPlatformInstance: null, - editableProperties: null, - platformNativeType: null, - properties: { - name: 'raw_orders', - description: null, - qualifiedName: null, - customProperties: [], - __typename: 'DatasetProperties', - }, - ownership: null, - globalTags: null, - glossaryTerms: null, - subTypes: { - typeNames: ['table'], - __typename: 'SubTypes', - }, - domain: null, - container: { - urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - properties: { - name: 'jaffle_shop', - __typename: 'ContainerProperties', - }, - subTypes: { - typeNames: ['Dataset'], - __typename: 'SubTypes', - }, - deprecation: null, - __typename: 'Container', - }, - parentContainers: { - count: 2, - containers: [ - { - urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - properties: { - name: 'jaffle_shop', - __typename: 'ContainerProperties', - }, - subTypes: { - typeNames: ['Dataset'], - __typename: 'SubTypes', - }, - deprecation: null, - __typename: 'Container', - }, - { - urn: 'urn:li:container:b5e95fce839e7d78151ed7e0a7420d84', - platform: { - urn: 'urn:li:dataPlatform:bigquery', - type: 'DATA_PLATFORM', - name: 'bigquery', - properties: { - type: 'RELATIONAL_DB', - displayName: 'BigQuery', - datasetNameDelimiter: '.', - logoUrl: '/assets/platforms/bigquerylogo.png', - __typename: 'DataPlatformProperties', - }, - displayName: null, - info: null, - __typename: 'DataPlatform', - }, - properties: { - name: 'cypress_project', - __typename: 'ContainerProperties', - }, - subTypes: { - typeNames: ['Project'], - __typename: 'SubTypes', - }, - deprecation: null, - __typename: 'Container', - }, - ], - __typename: 'ParentContainersResult', - }, - deprecation: null, - siblings: { - isPrimary: false, - siblings: [ - { - urn: 'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', - exists: false, - type: 'DATASET', - }, - ], - __typename: 'SiblingProperties', - }, - __typename: 'Dataset', - }, - matchedFields: [ - { - name: 'name', - value: 'raw_orders', - __typename: 'MatchedField', - }, - { - name: 'id', - value: 'cypress_project.jaffle_shop.raw_orders', - __typename: 'MatchedField', - }, - ], - insights: [], - __typename: 'SearchResult', - }, -]; - describe('siblingUtils', () => { 
describe('combineEntityDataWithSiblings', () => { it('combines my metadata with my siblings as primary', () => { @@ -719,32 +227,6 @@ describe('siblingUtils', () => { }); }); - describe('combineSiblingsInSearchResults', () => { - it('combines search results to deduplicate siblings', () => { - const result = combineSiblingsInSearchResults(searchResultWithSiblings as any); - - expect(result).toHaveLength(1); - expect(result?.[0]?.matchedEntities?.[0]?.urn).toEqual( - 'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', - ); - expect(result?.[0]?.matchedEntities?.[1]?.urn).toEqual( - 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', - ); - - expect(result?.[0]?.matchedEntities).toHaveLength(2); - }); - - it('will not combine an entity with a ghost node', () => { - const result = combineSiblingsInSearchResults(searchResultWithGhostSiblings as any); - - expect(result).toHaveLength(1); - expect(result?.[0]?.matchedEntities?.[0]?.urn).toEqual( - 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', - ); - expect(result?.[0]?.matchedEntities).toHaveLength(1); - }); - }); - describe('shouldEntityBeTreatedAsPrimary', () => { it('will say a primary entity is primary', () => { expect(shouldEntityBeTreatedAsPrimary(datasetPrimaryWithSiblings)).toBeTruthy(); diff --git a/datahub-web-react/src/app/entity/shared/__tests__/utils.test.ts b/datahub-web-react/src/app/entity/shared/__tests__/utils.test.ts deleted file mode 100644 index 86dec46528b49..0000000000000 --- a/datahub-web-react/src/app/entity/shared/__tests__/utils.test.ts +++ /dev/null @@ -1,37 +0,0 @@ -import { getMatchPrioritizingPrimary } from '../utils'; - -const MOCK_MATCHED_FIELDS = [ - { - name: 'fieldPaths', - value: 'rain', - }, - { - name: 'description', - value: 'rainbow', - }, - { - name: 'fieldPaths', - value: 'rainbow', - }, - { - name: 'fieldPaths', - value: 'rainbows', - }, -]; - -describe('utils', () => { - describe('getMatchPrioritizingPrimary', () => { - it('prioritizes exact match', () => { - global.window.location.search = 'query=rainbow'; - const match = getMatchPrioritizingPrimary(MOCK_MATCHED_FIELDS, 'fieldPaths'); - expect(match?.value).toEqual('rainbow'); - expect(match?.name).toEqual('fieldPaths'); - }); - it('will accept first contains match', () => { - global.window.location.search = 'query=bow'; - const match = getMatchPrioritizingPrimary(MOCK_MATCHED_FIELDS, 'fieldPaths'); - expect(match?.value).toEqual('rainbow'); - expect(match?.name).toEqual('fieldPaths'); - }); - }); -}); diff --git a/datahub-web-react/src/app/entity/shared/components/styled/StripMarkdownText.tsx b/datahub-web-react/src/app/entity/shared/components/styled/StripMarkdownText.tsx index 59293c2b0eee5..212813ffcb643 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/StripMarkdownText.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/StripMarkdownText.tsx @@ -17,6 +17,7 @@ export type Props = { suffix?: JSX.Element; limit?: number; shouldWrap?: boolean; + customRender?: (text: string) => JSX.Element; }; export const removeMarkdown = (text: string) => { @@ -29,7 +30,7 @@ export const removeMarkdown = (text: string) => { .replace(/^•/, ''); // remove first • }; -export default function NoMarkdownViewer({ children, readMore, suffix, limit, shouldWrap }: Props) { +export default function NoMarkdownViewer({ children, customRender, readMore, suffix, limit, shouldWrap }: Props) { let plainText = 
removeMarkdown(children || ''); if (limit) { @@ -44,7 +45,8 @@ export default function NoMarkdownViewer({ children, readMore, suffix, limit, sh return ( - {plainText} {showReadMore && <>{readMore}} {suffix} + {customRender ? customRender(plainText) : plainText} + {showReadMore && <>{readMore}} {suffix} ); } diff --git a/datahub-web-react/src/app/entity/shared/components/styled/StyledTag.tsx b/datahub-web-react/src/app/entity/shared/components/styled/StyledTag.tsx index c1a23811fdd7e..08087bfd79b8e 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/StyledTag.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/StyledTag.tsx @@ -6,7 +6,15 @@ export const generateColor = new ColorHash({ saturation: 0.9, }); -export const StyledTag = styled(Tag)<{ $color: any; $colorHash?: string; fontSize?: number }>` +export const StyledTag = styled(Tag)<{ $color: any; $colorHash?: string; fontSize?: number; highlightTag?: boolean }>` + &&& { + ${(props) => + props.highlightTag && + ` + background: ${props.theme.styles['highlight-color']}; + border: 1px solid ${props.theme.styles['highlight-border-color']}; + `} + } ${(props) => props.fontSize && `font-size: ${props.fontSize}px;`} ${(props) => props.$colorHash && diff --git a/datahub-web-react/src/app/entity/shared/constants.ts b/datahub-web-react/src/app/entity/shared/constants.ts index e14affc95b6f9..447780fb0d641 100644 --- a/datahub-web-react/src/app/entity/shared/constants.ts +++ b/datahub-web-react/src/app/entity/shared/constants.ts @@ -23,6 +23,7 @@ export const ANTD_GRAY = { export const ANTD_GRAY_V2 = { 2: '#F3F5F6', 5: '#DDE0E4', + 6: '#B2B8BD', 8: '#5E666E', 10: '#1B1E22', }; diff --git a/datahub-web-react/src/app/entity/shared/siblingUtils.ts b/datahub-web-react/src/app/entity/shared/siblingUtils.ts index 977d9fb9a9bf3..66481051055ec 100644 --- a/datahub-web-react/src/app/entity/shared/siblingUtils.ts +++ b/datahub-web-react/src/app/entity/shared/siblingUtils.ts @@ -2,7 +2,7 @@ import merge from 'deepmerge'; import { unionBy, keyBy, values } from 'lodash'; import { useLocation } from 'react-router-dom'; import * as QueryString from 'query-string'; -import { Dataset, Entity, MatchedField, Maybe, SiblingProperties } from '../../../types.generated'; +import { Dataset, Entity, Maybe, SiblingProperties } from '../../../types.generated'; import { GenericEntityProperties } from './types'; export function stripSiblingsFromEntity(entity: any) { @@ -215,54 +215,48 @@ export const combineEntityDataWithSiblings = (baseEntity: T): T => { return { [baseEntityKey]: combinedBaseEntity } as unknown as T; }; -export type CombinedSearchResult = { +export type CombinedEntity = { entity: Entity; - matchedFields: MatchedField[]; - matchedEntities?: Entity[]; + matchedEntities?: Array; }; -export function combineSiblingsInSearchResults( - results: - | { - entity: Entity; - matchedFields: MatchedField[]; - }[] - | undefined, -) { - const combinedResults: CombinedSearchResult[] | undefined = []; - const siblingsToPair: Record = {}; - - // set sibling associations - results?.forEach((result) => { - if (result.entity.urn in siblingsToPair) { - // filter from repeating - // const siblingsCombinedResult = siblingsToPair[result.entity.urn]; - // siblingsCombinedResult.matchedEntities?.push(result.entity); - return; - } +type CombinedEntityResult = + | { + skipped: true; + } + | { + skipped: false; + combinedEntity: CombinedEntity; + }; + +export function combineSiblingsForEntity(entity: Entity, visitedSiblingUrns: Set): CombinedEntityResult 
{ + if (visitedSiblingUrns.has(entity.urn)) return { skipped: true }; + + const combinedEntity: CombinedEntity = { entity: combineEntityWithSiblings({ ...entity }) }; + const siblings = (combinedEntity.entity as GenericEntityProperties).siblings?.siblings ?? []; + const isPrimary = (combinedEntity.entity as GenericEntityProperties).siblings?.isPrimary; + const siblingUrns = siblings.map((sibling) => sibling?.urn); + + if (siblingUrns.length > 0) { + combinedEntity.matchedEntities = isPrimary + ? [stripSiblingsFromEntity(combinedEntity.entity), ...siblings] + : [...siblings, stripSiblingsFromEntity(combinedEntity.entity)]; + + combinedEntity.matchedEntities = combinedEntity.matchedEntities.filter( + (resultToFilter) => (resultToFilter as Dataset).exists, + ); + + siblingUrns.forEach((urn) => urn && visitedSiblingUrns.add(urn)); + } - const combinedResult: CombinedSearchResult = result; - combinedResult.entity = combineEntityWithSiblings({ ...result.entity }); - const { entity }: { entity: any } = result; - const siblingUrns = entity?.siblings?.siblings?.map((sibling) => sibling.urn) || []; - if (siblingUrns.length > 0) { - combinedResult.matchedEntities = entity.siblings.isPrimary - ? [stripSiblingsFromEntity(entity), ...entity.siblings.siblings] - : [...entity.siblings.siblings, stripSiblingsFromEntity(entity)]; - - combinedResult.matchedEntities = combinedResult.matchedEntities.filter( - (resultToFilter) => (resultToFilter as Dataset).exists, - ); - - siblingUrns.forEach((urn) => { - siblingsToPair[urn] = combinedResult; - }); - } - combinedResults.push(combinedResult); - }); + return { combinedEntity, skipped: false }; +} - return combinedResults; +export function createSiblingEntityCombiner() { + const visitedSiblingUrns: Set = new Set(); + return (entity: Entity) => combineSiblingsForEntity(entity, visitedSiblingUrns); } + // used to determine whether sibling entities should be shown merged or not export const SEPARATE_SIBLINGS_URL_PARAM = 'separate_siblings'; diff --git a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx index 1aef497ced57b..bcce994c3f0f8 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx +++ b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx @@ -33,7 +33,7 @@ type LinkListProps = { }; export const LinkList = ({ refetch }: LinkListProps) => { - const { entityData } = useEntityData(); + const { urn: entityUrn, entityData } = useEntityData(); const entityRegistry = useEntityRegistry(); const [removeLinkMutation] = useRemoveLinkMutation(); const links = entityData?.institutionalMemory?.elements || []; @@ -41,7 +41,7 @@ export const LinkList = ({ refetch }: LinkListProps) => { const handleDeleteLink = async (metadata: InstitutionalMemoryMetadata) => { try { await removeLinkMutation({ - variables: { input: { linkUrl: metadata.url, resourceUrn: metadata.associatedUrn } }, + variables: { input: { linkUrl: metadata.url, resourceUrn: metadata.associatedUrn || entityUrn } }, }); message.success({ content: 'Link Removed', duration: 2 }); } catch (e: unknown) { diff --git a/datahub-web-react/src/app/entity/shared/utils.ts b/datahub-web-react/src/app/entity/shared/utils.ts index 7ec604785d1ff..a158cc9b7c119 100644 --- a/datahub-web-react/src/app/entity/shared/utils.ts +++ b/datahub-web-react/src/app/entity/shared/utils.ts @@ -1,9 +1,7 @@ -import * as QueryString from 'query-string'; 
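Editorial aside: the rewritten sibling logic above replaces the one-shot combineSiblingsInSearchResults with a stateful combiner. createSiblingEntityCombiner closes over a set of visited sibling URNs, so each call to the returned function either folds an entity together with its siblings or reports that the entity was already covered by an earlier result. A minimal usage sketch, assuming an illustrative searchResults array (the variable names are not from this change):

// Sketch only -- illustrative usage of the combiner defined above.
const combine = createSiblingEntityCombiner();
const combinedEntities: CombinedEntity[] = [];

searchResults.forEach((result) => {
    const outcome = combine(result.entity);
    if (!outcome.skipped) {
        // Primary entities list themselves first in matchedEntities, followed by
        // their existing siblings; non-primary entities list siblings first.
        combinedEntities.push(outcome.combinedEntity);
    }
    // Entities whose URN was already recorded as a sibling are skipped entirely.
});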
import { Maybe } from 'graphql/jsutils/Maybe'; -import { Entity, EntityType, MatchedField, EntityRelationshipsResult, DataProduct } from '../../../types.generated'; +import { Entity, EntityType, EntityRelationshipsResult, DataProduct } from '../../../types.generated'; import { capitalizeFirstLetterOnly } from '../../shared/textUtil'; -import { FIELDS_TO_HIGHLIGHT } from '../dataset/search/highlights'; import { GenericEntityProperties } from './types'; export function dictToQueryStringParams(params: Record) { @@ -87,46 +85,6 @@ export const isListSubset = (l1, l2): boolean => { return l1.every((result) => l2.indexOf(result) >= 0); }; -function normalize(value: string) { - return value.trim().toLowerCase(); -} - -function fromQueryGetBestMatch(selectedMatchedFields: MatchedField[], rawQuery: string) { - const query = normalize(rawQuery); - // first lets see if there's an exact match between a field value and the query - const exactMatch = selectedMatchedFields.find((field) => normalize(field.value) === query); - if (exactMatch) { - return exactMatch; - } - - // if no exact match exists, we'll see if the entire query is contained in any of the values - const containedMatch = selectedMatchedFields.find((field) => normalize(field.value).includes(query)); - if (containedMatch) { - return containedMatch; - } - - // otherwise, just return whichever is first - return selectedMatchedFields[0]; -} - -export const getMatchPrioritizingPrimary = ( - matchedFields: MatchedField[], - primaryField: string, -): MatchedField | undefined => { - const { location } = window; - const params = QueryString.parse(location.search, { arrayFormat: 'comma' }); - const query: string = decodeURIComponent(params.query ? (params.query as string) : ''); - - const primaryMatches = matchedFields.filter((field) => field.name === primaryField); - if (primaryMatches.length > 0) { - return fromQueryGetBestMatch(primaryMatches, query); - } - - const matchesThatShouldBeShownOnFE = matchedFields.filter((field) => FIELDS_TO_HIGHLIGHT.has(field.name)); - - return fromQueryGetBestMatch(matchesThatShouldBeShownOnFE, query); -}; - function getGraphqlErrorCode(e) { if (e.graphQLErrors && e.graphQLErrors.length) { const firstError = e.graphQLErrors[0]; diff --git a/datahub-web-react/src/app/entity/user/preview/Preview.tsx b/datahub-web-react/src/app/entity/user/preview/Preview.tsx index 01f68d9065523..8893d4ab86786 100644 --- a/datahub-web-react/src/app/entity/user/preview/Preview.tsx +++ b/datahub-web-react/src/app/entity/user/preview/Preview.tsx @@ -7,6 +7,7 @@ import { useEntityRegistry } from '../../../useEntityRegistry'; import { ANTD_GRAY } from '../../shared/constants'; import { IconStyleType } from '../../Entity'; import { CustomAvatar } from '../../../shared/avatar'; +import SearchTextHighlighter from '../../../search/matches/SearchTextHighlighter'; const PreviewContainer = styled.div` display: flex; @@ -80,11 +81,17 @@ export const Preview = ({ {entityRegistry.getEntityName(EntityType.CorpUser)} - {name || urn} + + {name ? 
: urn} + - {title && {title}} + {title && ( + + + + )} diff --git a/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx b/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx index 03689460eb02b..eda9b7d7fe2a4 100644 --- a/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx +++ b/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx @@ -1,4 +1,4 @@ -import React, { useEffect, useRef, useState } from 'react'; +import React, { CSSProperties, useEffect, useRef, useState } from 'react'; import { useHistory } from 'react-router'; import { Select } from 'antd'; import styled from 'styled-components'; @@ -55,11 +55,21 @@ const ViewSelectContainer = styled.div` .ant-select-selection-item { font-weight: 700; font-size: 14px; + text-align: left; } } } `; +const SelectStyled = styled(Select)` + min-width: 90px; + max-width: 200px; +`; + +type Props = { + dropdownStyle?: CSSProperties; +}; + /** * The View Select component allows you to select a View to apply to query on the current page. For example, * search, recommendations, and browse. @@ -69,7 +79,7 @@ const ViewSelectContainer = styled.div` * * In the event that a user refreshes their browser, the state of the view should be saved as well. */ -export const ViewSelect = () => { +export const ViewSelect = ({ dropdownStyle = {} }: Props) => { const history = useHistory(); const userContext = useUserContext(); const [isOpen, setIsOpen] = useState(false); @@ -188,12 +198,11 @@ export const ViewSelect = () => { return ( - + {viewBuilderDisplayState.visible && ( { ref={clearButtonRef} onClick={onHandleClickClear} > - All Entities + View all ); diff --git a/datahub-web-react/src/app/home/HomePageHeader.tsx b/datahub-web-react/src/app/home/HomePageHeader.tsx index def413e13213f..5919d2dbf5b7e 100644 --- a/datahub-web-react/src/app/home/HomePageHeader.tsx +++ b/datahub-web-react/src/app/home/HomePageHeader.tsx @@ -273,6 +273,7 @@ export const HomePageHeader = () => { autoCompleteStyle={styles.searchBox} entityRegistry={entityRegistry} viewsEnabled={viewsEnabled} + combineSiblings showQuickFilters /> {searchResultsToShow && searchResultsToShow.length > 0 && ( diff --git a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx index 0d8f9ddae82d1..0d0a32f7750a8 100644 --- a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx +++ b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx @@ -34,6 +34,7 @@ import ExternalUrlButton from '../entity/shared/ExternalUrlButton'; import EntityPaths from './EntityPaths/EntityPaths'; import { DataProductLink } from '../shared/tags/DataProductLink'; import { EntityHealth } from '../entity/shared/containers/profile/header/EntityHealth'; +import SearchTextHighlighter from '../search/matches/SearchTextHighlighter'; import { getUniqueOwners } from './utils'; const PreviewContainer = styled.div` @@ -173,6 +174,7 @@ interface Props { deprecation?: Deprecation | null; topUsers?: Array | null; externalUrl?: string | null; + entityTitleSuffix?: React.ReactNode; subHeader?: React.ReactNode; snippet?: React.ReactNode; insights?: Array | null; @@ -225,6 +227,7 @@ export default function DefaultPreviewCard({ titleSizePx, dataTestID, externalUrl, + entityTitleSuffix, onClick, degree, parentContainers, @@ -289,14 +292,14 @@ export default function DefaultPreviewCard({ ) : ( - {name || ' '} + )} {deprecation?.deprecated && ( )} - {health && health.length > 0 && } + {health && health.length > 0 ? 
: null} {externalUrl && ( )} + {entityTitleSuffix} - {degree !== undefined && degree !== null && ( ) : undefined } + customRender={(text) => } > {description} diff --git a/datahub-web-react/src/app/search/AdvancedFilterSelectValueModal.tsx b/datahub-web-react/src/app/search/AdvancedFilterSelectValueModal.tsx index e5f58a8662acc..c562fc6e8349a 100644 --- a/datahub-web-react/src/app/search/AdvancedFilterSelectValueModal.tsx +++ b/datahub-web-react/src/app/search/AdvancedFilterSelectValueModal.tsx @@ -23,9 +23,7 @@ import { REMOVED_FILTER_NAME, TAGS_FILTER_NAME, TYPE_NAMES_FILTER_NAME, - DATA_PRODUCTS_FILTER_NAME, } from './utils/constants'; -import SetDataProductModal from '../entity/shared/containers/profile/sidebar/DataProduct/SetDataProductModal'; type Props = { facet?: FacetMetadata | null; @@ -80,23 +78,6 @@ export const AdvancedFilterSelectValueModal = ({ ); } - if (filterField === DATA_PRODUCTS_FILTER_NAME) { - return ( - initialValues?.includes(agg?.entity?.urn || ''))?.entity || null - } - onModalClose={onCloseModal} - onOkOverride={(dataProductUrn) => { - onSelect([dataProductUrn]); - onCloseModal(); - }} - /> - ); - } - if (filterField === CONTAINER_FILTER_NAME) { return ( 0 ? suggestions[0].text : ''; + const refineSearchText = getRefineSearchText(filters, viewUrn); + + const onClickExploreAll = useCallback(() => { + analytics.event({ type: EventType.SearchResultsExploreAllClickEvent }); + navigateToSearchUrl({ query: '*', history }); + }, [history]); + + const searchForSuggestion = () => { + navigateToSearchUrl({ query: suggestText, history }); + }; + + const clearFiltersAndView = () => { + navigateToSearchUrl({ query, history }); + userContext.updateLocalState({ + ...userContext.localState, + selectedViewUrn: undefined, + }); + }; + + return ( + +
No results found for "{query}"
+ {refineSearchText && ( + <> + Try {refineSearchText}{' '} + {suggestText && ( + <> + or searching for {suggestText} + + )} + + )} + {!refineSearchText && suggestText && ( + <> + Did you mean {suggestText} + + )} + {!refineSearchText && !suggestText && ( + + )} +
+ ); +} diff --git a/datahub-web-react/src/app/search/EntityGroupSearchResults.tsx b/datahub-web-react/src/app/search/EntityGroupSearchResults.tsx deleted file mode 100644 index 9b577048145c5..0000000000000 --- a/datahub-web-react/src/app/search/EntityGroupSearchResults.tsx +++ /dev/null @@ -1,98 +0,0 @@ -import { ArrowRightOutlined } from '@ant-design/icons'; -import { Button, Card, Divider, List, Space, Typography } from 'antd'; -import { ListProps } from 'antd/lib/list'; -import * as React from 'react'; -import { useHistory } from 'react-router-dom'; -import styled from 'styled-components'; -import { EntityType, SearchResult } from '../../types.generated'; -import { IconStyleType } from '../entity/Entity'; -import { useEntityRegistry } from '../useEntityRegistry'; -import { navigateToSearchUrl } from './utils/navigateToSearchUrl'; -import analytics, { EventType } from '../analytics'; - -const styles = { - header: { marginBottom: 20 }, - resultHeaderCardBody: { padding: '16px 24px' }, - resultHeaderCard: { right: '52px', top: '-40px', position: 'absolute' }, - seeAllButton: { fontSize: 18 }, - resultsContainer: { width: '100%', padding: '40px 132px' }, -}; - -const ResultList = styled(List)` - &&& { - width: 100%; - border-color: ${(props) => props.theme.styles['border-color-base']}; - margin-top: 8px; - padding: 16px 48px; - box-shadow: ${(props) => props.theme.styles['box-shadow']}; - } -`; - -interface Props { - type: EntityType; - query: string; - searchResults: Array; -} - -export const EntityGroupSearchResults = ({ type, query, searchResults }: Props) => { - const history = useHistory(); - const entityRegistry = useEntityRegistry(); - - const onResultClick = (result: SearchResult, index: number) => { - analytics.event({ - type: EventType.SearchResultClickEvent, - query, - entityUrn: result.entity.urn, - entityType: result.entity.type, - index, - total: searchResults.length, - }); - }; - - return ( - - >> - header={ - - {entityRegistry.getCollectionName(type)} - - {entityRegistry.getIcon(type, 36, IconStyleType.ACCENT)} - - - } - footer={ - searchResults.length > 0 && ( - - ) - } - dataSource={searchResults as SearchResult[]} - split={false} - renderItem={(searchResult, index) => ( - <> - onResultClick(searchResult, index)}> - {entityRegistry.renderSearchResult(type, searchResult)} - - {index < searchResults.length - 1 && } - - )} - bordered - /> - - ); -}; diff --git a/datahub-web-react/src/app/search/PostLinkCard.tsx b/datahub-web-react/src/app/search/PostLinkCard.tsx index 04308632c61c9..2111c0b25ad84 100644 --- a/datahub-web-react/src/app/search/PostLinkCard.tsx +++ b/datahub-web-react/src/app/search/PostLinkCard.tsx @@ -39,12 +39,17 @@ const TextContainer = styled.div` flex: 2; `; -const TextWrapper = styled.div` - text-align: left; +const FlexWrapper = styled.div<{ alignCenter?: boolean }>` display: flex; flex-direction: column; justify-content: center; flex: 2; + ${(props) => props.alignCenter && 'align-items: center;'} +`; + +const TextWrapper = styled.div` + display: flex; + flex-direction: column; `; const HeaderText = styled(Typography.Text)` @@ -74,19 +79,21 @@ export const PostLinkCard = ({ linkPost }: Props) => { const link = linkPost?.content?.link || ''; return ( - + {hasMedia && ( )} - - Link - - {linkPost?.content?.title} - - + + + Link + + {linkPost?.content?.title} + + + diff --git a/datahub-web-react/src/app/search/PostTextCard.tsx b/datahub-web-react/src/app/search/PostTextCard.tsx index 1bba55425fe0d..15b34e37fc01c 100644 --- 
a/datahub-web-react/src/app/search/PostTextCard.tsx +++ b/datahub-web-react/src/app/search/PostTextCard.tsx @@ -7,7 +7,6 @@ import { Post } from '../../types.generated'; const CardContainer = styled.div` display: flex; flex-direction: row; - min-height: 140px; border: 1px solid ${ANTD_GRAY[4]}; border-radius: 12px; box-shadow: ${(props) => props.theme.styles['box-shadow']}; @@ -15,6 +14,7 @@ const CardContainer = styled.div` box-shadow: ${(props) => props.theme.styles['box-shadow-hover']}; } white-space: unset; + padding-bottom: 4px; `; const TextContainer = styled.div` @@ -28,6 +28,9 @@ const TextContainer = styled.div` const TitleText = styled(Typography.Title)` word-break: break-word; min-height: 20px; + &&& { + margin-top: 8px; + } `; const HeaderText = styled(Typography.Text)` diff --git a/datahub-web-react/src/app/search/SearchBar.tsx b/datahub-web-react/src/app/search/SearchBar.tsx index 97be6ab6b65e3..fb10e1ca0026e 100644 --- a/datahub-web-react/src/app/search/SearchBar.tsx +++ b/datahub-web-react/src/app/search/SearchBar.tsx @@ -3,7 +3,7 @@ import { Input, AutoComplete, Button } from 'antd'; import { CloseCircleFilled, SearchOutlined } from '@ant-design/icons'; import styled from 'styled-components/macro'; import { useHistory } from 'react-router'; -import { AutoCompleteResultForEntity, Entity, EntityType, FacetFilterInput, ScenarioType } from '../../types.generated'; +import { AutoCompleteResultForEntity, EntityType, FacetFilterInput, ScenarioType } from '../../types.generated'; import EntityRegistry from '../entity/EntityRegistry'; import filterSearchQuery from './utils/filterSearchQuery'; import { ANTD_GRAY, ANTD_GRAY_V2 } from '../entity/shared/constants'; @@ -23,6 +23,7 @@ import { navigateToSearchUrl } from './utils/navigateToSearchUrl'; import { getQuickFilterDetails } from './autoComplete/quickFilters/utils'; import ViewAllSearchItem from './ViewAllSearchItem'; import { ViewSelect } from '../entity/view/select/ViewSelect'; +import { combineSiblingsInAutoComplete } from './utils/combineSiblingsInAutoComplete'; const StyledAutoComplete = styled(AutoComplete)` width: 100%; @@ -88,15 +89,6 @@ const QUICK_FILTER_AUTO_COMPLETE_OPTION = { ], }; -const renderItem = (query: string, entity: Entity) => { - return { - value: entity.urn, - label: , - type: entity.type, - style: { padding: '12px 12px 12px 16px' }, - }; -}; - const renderRecommendedQuery = (query: string) => { return { value: query, @@ -123,6 +115,7 @@ interface Props { hideRecommendations?: boolean; showQuickFilters?: boolean; viewsEnabled?: boolean; + combineSiblings?: boolean; setIsSearchBarFocused?: (isSearchBarFocused: boolean) => void; onFocus?: () => void; onBlur?: () => void; @@ -149,6 +142,7 @@ export const SearchBar = ({ hideRecommendations, showQuickFilters, viewsEnabled = false, + combineSiblings = false, setIsSearchBarFocused, onFocus, onBlur, @@ -227,14 +221,26 @@ export const SearchBar = ({ ]; }, [showQuickFilters, suggestions.length, effectiveQuery, selectedQuickFilter, entityRegistry]); - const autoCompleteEntityOptions = useMemo( - () => - suggestions.map((entity: AutoCompleteResultForEntity) => ({ - label: , - options: [...entity.entities.map((e: Entity) => renderItem(effectiveQuery, e))], - })), - [effectiveQuery, suggestions], - ); + const autoCompleteEntityOptions = useMemo(() => { + return suggestions.map((suggestion: AutoCompleteResultForEntity) => { + const combinedSuggestion = combineSiblingsInAutoComplete(suggestion, { combineSiblings }); + return { + label: , + options: 
combinedSuggestion.combinedEntities.map((combinedEntity) => ({ + value: combinedEntity.entity.urn, + label: ( + + ), + type: combinedEntity.entity.type, + style: { padding: '12px 12px 12px 16px' }, + })), + }; + }); + }, [combineSiblings, effectiveQuery, suggestions]); const previousSelectedQuickFilterValue = usePrevious(selectedQuickFilter?.value); useEffect(() => { @@ -371,7 +377,15 @@ export const SearchBar = ({ onKeyUp={handleStopPropagation} onKeyDown={handleStopPropagation} > - +
)} diff --git a/datahub-web-react/src/app/search/SearchPage.tsx b/datahub-web-react/src/app/search/SearchPage.tsx index ce353640d8179..6387f0ef8c05e 100644 --- a/datahub-web-react/src/app/search/SearchPage.tsx +++ b/datahub-web-react/src/app/search/SearchPage.tsx @@ -59,6 +59,7 @@ export const SearchPage = () => { orFilters, viewUrn, sortInput, + searchFlags: { getSuggestions: true }, }, }, }); @@ -235,6 +236,7 @@ export const SearchPage = () => { error={error} searchResponse={data?.searchAcrossEntities} facets={data?.searchAcrossEntities?.facets} + suggestions={data?.searchAcrossEntities?.suggestions || []} selectedFilters={filters} loading={loading} onChangeFilters={onChangeFilters} diff --git a/datahub-web-react/src/app/search/SearchResultList.tsx b/datahub-web-react/src/app/search/SearchResultList.tsx index c15aa15990009..386b22f34602b 100644 --- a/datahub-web-react/src/app/search/SearchResultList.tsx +++ b/datahub-web-react/src/app/search/SearchResultList.tsx @@ -1,17 +1,16 @@ -import React, { useCallback } from 'react'; -import { Button, Checkbox, Divider, Empty, List, ListProps } from 'antd'; +import React from 'react'; +import { Checkbox, Divider, List, ListProps } from 'antd'; import styled from 'styled-components'; -import { useHistory } from 'react-router'; -import { RocketOutlined } from '@ant-design/icons'; -import { navigateToSearchUrl } from './utils/navigateToSearchUrl'; import { ANTD_GRAY } from '../entity/shared/constants'; -import { CombinedSearchResult, SEPARATE_SIBLINGS_URL_PARAM } from '../entity/shared/siblingUtils'; +import { SEPARATE_SIBLINGS_URL_PARAM } from '../entity/shared/siblingUtils'; import { CompactEntityNameList } from '../recommendations/renderer/component/CompactEntityNameList'; import { useEntityRegistry } from '../useEntityRegistry'; -import { SearchResult } from '../../types.generated'; +import { SearchResult, SearchSuggestion } from '../../types.generated'; import analytics, { EventType } from '../analytics'; import { EntityAndType } from '../entity/shared/types'; import { useIsSearchV2 } from './useSearchAndBrowseVersion'; +import { CombinedSearchResult } from './utils/combineSiblingsInSearchResults'; +import EmptySearchResults from './EmptySearchResults'; const ResultList = styled(List)` &&& { @@ -27,13 +26,6 @@ const StyledCheckbox = styled(Checkbox)` margin-right: 12px; `; -const NoDataContainer = styled.div` - > div { - margin-top: 28px; - margin-bottom: 28px; - } -`; - const ThinDivider = styled(Divider)` margin-top: 16px; margin-bottom: 16px; @@ -69,6 +61,7 @@ type Props = { isSelectMode: boolean; selectedEntities: EntityAndType[]; setSelectedEntities: (entities: EntityAndType[]) => any; + suggestions: SearchSuggestion[]; }; export const SearchResultList = ({ @@ -78,17 +71,12 @@ export const SearchResultList = ({ isSelectMode, selectedEntities, setSelectedEntities, + suggestions, }: Props) => { - const history = useHistory(); const entityRegistry = useEntityRegistry(); const selectedEntityUrns = selectedEntities.map((entity) => entity.urn); const showSearchFiltersV2 = useIsSearchV2(); - const onClickExploreAll = useCallback(() => { - analytics.event({ type: EventType.SearchResultsExploreAllClickEvent }); - navigateToSearchUrl({ query: '*', history }); - }, [history]); - const onClickResult = (result: SearchResult, index: number) => { analytics.event({ type: EventType.SearchResultClickEvent, @@ -117,19 +105,7 @@ export const SearchResultList = ({ id="search-result-list" dataSource={searchResults} split={false} - locale={{ - emptyText: ( - - - 
- - ), - }} + locale={{ emptyText: }} renderItem={(item, index) => ( ` display: flex; @@ -131,6 +132,7 @@ interface Props { setNumResultsPerPage: (numResults: number) => void; isSelectMode: boolean; selectedEntities: EntityAndType[]; + suggestions: SearchSuggestion[]; setSelectedEntities: (entities: EntityAndType[]) => void; setIsSelectMode: (showSelectMode: boolean) => any; onChangeSelectAll: (selected: boolean) => void; @@ -155,6 +157,7 @@ export const SearchResults = ({ setNumResultsPerPage, isSelectMode, selectedEntities, + suggestions, setIsSelectMode, setSelectedEntities, onChangeSelectAll, @@ -238,6 +241,7 @@ export const SearchResults = ({ {(error && ) || (!loading && ( + {totalResults > 0 && } - - SearchCfg.RESULTS_PER_PAGE} - onShowSizeChange={(_currNum, newNum) => setNumResultsPerPage(newNum)} - pageSizeOptions={['10', '20', '50', '100']} - /> - + {totalResults > 0 && ( + + SearchCfg.RESULTS_PER_PAGE} + onShowSizeChange={(_currNum, newNum) => setNumResultsPerPage(newNum)} + pageSizeOptions={['10', '20', '50', '100']} + /> + + )} {authenticatedUserUrn && ( ; hasParentTooltip: boolean; } -export default function AutoCompleteEntity({ query, entity, hasParentTooltip }: Props) { +export default function AutoCompleteEntity({ query, entity, siblings, hasParentTooltip }: Props) { const entityRegistry = useEntityRegistry(); const genericEntityProps = entityRegistry.getGenericEntityProperties(entity.type, entity); - const platformName = getPlatformName(genericEntityProps); - const platformLogoUrl = genericEntityProps?.platform?.properties?.logoUrl; const displayName = entityRegistry.getDisplayName(entity.type, entity); - const icon = - (platformLogoUrl && ) || - entityRegistry.getIcon(entity.type, 12, IconStyleType.ACCENT); const { matchedText, unmatchedText } = getAutoCompleteEntityText(displayName, query); + const entities = siblings?.length ? siblings : [entity]; + const platforms = + genericEntityProps?.siblingPlatforms + ?.map( + (platform) => + getPlatformName(entityRegistry.getGenericEntityProperties(EntityType.DataPlatform, platform)) || '', + ) + .filter(Boolean) ?? []; + const parentContainers = genericEntityProps?.parentContainers?.containers || []; // Need to reverse parentContainers since it returns direct parent first. 
const orderedParentContainers = [...parentContainers].reverse(); const subtype = genericEntityProps?.subTypes?.typeNames?.[0]; + const showPlatforms = !!platforms.length; + const showPlatformDivider = !!platforms.length && !!parentContainers.length; + const showParentContainers = !!parentContainers.length; + const showHeader = showPlatforms || showParentContainers; + return ( - {icon} - + {showHeader && ( + + + {entities.map((ent) => ( + + ))} + + {showPlatforms && } + {showPlatformDivider && } + {showParentContainers && } + + )} { + const entityRegistry = useEntityRegistry(); + + const genericEntityProps = entityRegistry.getGenericEntityProperties(entity.type, entity); + const platformLogoUrl = genericEntityProps?.platform?.properties?.logoUrl; + const platformName = getPlatformName(genericEntityProps); + return ( + (platformLogoUrl && ) || + entityRegistry.getIcon(entity.type, 12, IconStyleType.ACCENT) + ); +}; + +export default AutoCompleteEntityIcon; diff --git a/datahub-web-react/src/app/search/autoComplete/AutoCompleteItem.tsx b/datahub-web-react/src/app/search/autoComplete/AutoCompleteItem.tsx index c97d171b4c931..b8f5a2c7e4081 100644 --- a/datahub-web-react/src/app/search/autoComplete/AutoCompleteItem.tsx +++ b/datahub-web-react/src/app/search/autoComplete/AutoCompleteItem.tsx @@ -18,9 +18,10 @@ export const SuggestionContainer = styled.div` interface Props { query: string; entity: Entity; + siblings?: Array; } -export default function AutoCompleteItem({ query, entity }: Props) { +export default function AutoCompleteItem({ query, entity, siblings }: Props) { const entityRegistry = useEntityRegistry(); const displayTooltip = getShouldDisplayTooltip(entity, entityRegistry); let componentToRender: React.ReactNode = null; @@ -33,7 +34,14 @@ export default function AutoCompleteItem({ query, entity }: Props) { componentToRender = ; break; default: - componentToRender = ; + componentToRender = ( + + ); break; } diff --git a/datahub-web-react/src/app/search/autoComplete/AutoCompletePlatformNames.tsx b/datahub-web-react/src/app/search/autoComplete/AutoCompletePlatformNames.tsx new file mode 100644 index 0000000000000..61fe6bcae71d0 --- /dev/null +++ b/datahub-web-react/src/app/search/autoComplete/AutoCompletePlatformNames.tsx @@ -0,0 +1,22 @@ +import { Typography } from 'antd'; +import React from 'react'; +import styled from 'styled-components'; +import { ANTD_GRAY_V2 } from '../../entity/shared/constants'; + +const PlatformText = styled(Typography.Text)` + font-size: 12px; + line-height: 20px; + font-weight: 500; + color: ${ANTD_GRAY_V2[8]}; + white-space: nowrap; +`; + +type Props = { + platforms: Array; +}; + +const AutoCompletePlatformNames = ({ platforms }: Props) => { + return {platforms.join(' & ')}; +}; + +export default AutoCompletePlatformNames; diff --git a/datahub-web-react/src/app/search/autoComplete/AutoCompleteUser.tsx b/datahub-web-react/src/app/search/autoComplete/AutoCompleteUser.tsx index 1f88b94bb0cc7..53b4d53ef46d4 100644 --- a/datahub-web-react/src/app/search/autoComplete/AutoCompleteUser.tsx +++ b/datahub-web-react/src/app/search/autoComplete/AutoCompleteUser.tsx @@ -1,20 +1,10 @@ import { Typography } from 'antd'; import React from 'react'; -import styled from 'styled-components'; import { CorpUser, EntityType } from '../../../types.generated'; -import { ANTD_GRAY } from '../../entity/shared/constants'; import { CustomAvatar } from '../../shared/avatar'; import { useEntityRegistry } from '../../useEntityRegistry'; import { getAutoCompleteEntityText } from './utils'; 
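Editorial aside: two small usage notes on the autocomplete pieces above. AutoCompletePlatformNames simply joins the supplied display names with ' & '. And the default branch of AutoCompleteItem, whose JSX tags were lost in extraction, presumably forwards the new siblings prop into AutoCompleteEntity; the sketch below uses only the prop names that appear in the two Props interfaces shown and is not a verbatim quote of the patch.

// Sketch only -- hypothetical platform names.
<AutoCompletePlatformNames platforms={['dbt', 'BigQuery']} />
// renders a Typography.Text reading "dbt & BigQuery"

// Sketch only -- likely shape of AutoCompleteItem's default branch above.
componentToRender = (
    <AutoCompleteEntity
        query={query}
        entity={entity}
        siblings={siblings}
        hasParentTooltip={displayTooltip}
    />
);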
- -export const SuggestionText = styled.div` - margin-left: 12px; - margin-top: 2px; - margin-bottom: 2px; - color: ${ANTD_GRAY[9]}; - font-size: 16px; - overflow: hidden; -`; +import { SuggestionText } from './styledComponents'; interface Props { query: string; diff --git a/datahub-web-react/src/app/search/autoComplete/ParentContainers.tsx b/datahub-web-react/src/app/search/autoComplete/ParentContainers.tsx index 77ccde06172c9..98a4f5aa214bb 100644 --- a/datahub-web-react/src/app/search/autoComplete/ParentContainers.tsx +++ b/datahub-web-react/src/app/search/autoComplete/ParentContainers.tsx @@ -4,20 +4,21 @@ import React, { Fragment } from 'react'; import styled from 'styled-components/macro'; import { Container, EntityType } from '../../../types.generated'; import { useEntityRegistry } from '../../useEntityRegistry'; -import { ANTD_GRAY } from '../../entity/shared/constants'; +import { ANTD_GRAY_V2 } from '../../entity/shared/constants'; const NUM_VISIBLE_CONTAINERS = 2; const ParentContainersWrapper = styled.div` font-size: 12px; - color: ${ANTD_GRAY[9]}; + color: ${ANTD_GRAY_V2[8]}; display: flex; align-items: center; - margin-bottom: 3px; `; const ParentContainer = styled(Typography.Text)` + color: ${ANTD_GRAY_V2[8]}; margin-left: 4px; + font-weight: 500; `; export const ArrowWrapper = styled.span` diff --git a/datahub-web-react/src/app/search/autoComplete/RecommendedOption.tsx b/datahub-web-react/src/app/search/autoComplete/RecommendedOption.tsx index 79743858b06d9..f4c31b18c99b2 100644 --- a/datahub-web-react/src/app/search/autoComplete/RecommendedOption.tsx +++ b/datahub-web-react/src/app/search/autoComplete/RecommendedOption.tsx @@ -1,7 +1,7 @@ import { SearchOutlined } from '@ant-design/icons'; import React from 'react'; import styled from 'styled-components/macro'; -import { SuggestionText } from './AutoCompleteUser'; +import { SuggestionText } from './styledComponents'; const TextWrapper = styled.span``; diff --git a/datahub-web-react/src/app/search/autoComplete/styledComponents.tsx b/datahub-web-react/src/app/search/autoComplete/styledComponents.tsx new file mode 100644 index 0000000000000..9e4b084ab3889 --- /dev/null +++ b/datahub-web-react/src/app/search/autoComplete/styledComponents.tsx @@ -0,0 +1,11 @@ +import styled from 'styled-components'; +import { ANTD_GRAY } from '../../entity/shared/constants'; + +export const SuggestionText = styled.div` + margin-left: 12px; + margin-top: 2px; + margin-bottom: 2px; + color: ${ANTD_GRAY[9]}; + font-size: 16px; + overflow: hidden; +`; diff --git a/datahub-web-react/src/app/search/context/SearchContext.tsx b/datahub-web-react/src/app/search/context/SearchContext.tsx index ec9a0c895e876..656c57b0b22d0 100644 --- a/datahub-web-react/src/app/search/context/SearchContext.tsx +++ b/datahub-web-react/src/app/search/context/SearchContext.tsx @@ -1,11 +1,13 @@ import React, { useContext } from 'react'; export type SearchContextType = { + query: string | undefined; selectedSortOption: string | undefined; setSelectedSortOption: (sortOption: string) => void; }; export const DEFAULT_CONTEXT = { + query: undefined, selectedSortOption: undefined, setSelectedSortOption: (_: string) => null, }; @@ -21,3 +23,7 @@ export function useSearchContext() { export function useSelectedSortOption() { return useSearchContext().selectedSortOption; } + +export function useSearchQuery() { + return useSearchContext().query; +} diff --git a/datahub-web-react/src/app/search/context/SearchContextProvider.tsx 
b/datahub-web-react/src/app/search/context/SearchContextProvider.tsx
index bfb65c1d74d3e..5ad9667ab1fc0 100644
--- a/datahub-web-react/src/app/search/context/SearchContextProvider.tsx
+++ b/datahub-web-react/src/app/search/context/SearchContextProvider.tsx
@@ -8,6 +8,7 @@ export default function SearchContextProvider({ children }: { children: React.Re
     const history = useHistory();
     const location = useLocation();
     const params = useMemo(() => QueryString.parse(location.search, { arrayFormat: 'comma' }), [location.search]);
+    const query = (params.query ? decodeURIComponent(params.query as string) : undefined) as string | undefined;
     const selectedSortOption = params.sortOption as string | undefined;

     function setSelectedSortOption(selectedOption: string) {
@@ -15,7 +16,7 @@ export default function SearchContextProvider({ children }: { children: React.Re
     }

     return (
-        <SearchContext.Provider value={{ selectedSortOption, setSelectedSortOption }}>
+        <SearchContext.Provider value={{ query, selectedSortOption, setSelectedSortOption }}>
             {children}
         </SearchContext.Provider>
     );
diff --git a/datahub-web-react/src/app/search/context/SearchResultContext.tsx b/datahub-web-react/src/app/search/context/SearchResultContext.tsx
new file mode 100644
index 0000000000000..68adead005149
--- /dev/null
+++ b/datahub-web-react/src/app/search/context/SearchResultContext.tsx
@@ -0,0 +1,72 @@
+import React, { ReactNode, createContext, useContext, useMemo } from 'react';
+import { SearchResult } from '../../../types.generated';
+import {
+    getMatchedFieldsByUrn,
+    getMatchedFieldNames,
+    getMatchedFieldsByNames,
+    shouldShowInMatchedFieldList,
+    getMatchedFieldLabel,
+    getMatchesPrioritized,
+} from '../matches/utils';
+import { MatchedFieldName } from '../matches/constants';
+
+type SearchResultContextValue = {
+    searchResult: SearchResult;
+} | null;
+
+const SearchResultContext = createContext<SearchResultContextValue>(null);
+
+type Props = {
+    children: ReactNode;
+    searchResult: SearchResult;
+};
+
+export const SearchResultProvider = ({ children, searchResult }: Props) => {
+    const value = useMemo(
+        () => ({
+            searchResult,
+        }),
+        [searchResult],
+    );
+    return <SearchResultContext.Provider value={value}>{children}</SearchResultContext.Provider>;
+};
+
+const useSearchResultContext = () => {
+    return useContext(SearchResultContext);
+};
+
+export const useSearchResult = () => {
+    return useSearchResultContext()?.searchResult;
+};
+
+export const useEntityType = () => {
+    return useSearchResultContext()?.searchResult.entity.type;
+};
+
+export const useMatchedFields = () => {
+    return useSearchResult()?.matchedFields ?? [];
+};
+
+export const useMatchedFieldsForList = (primaryField: MatchedFieldName) => {
+    const entityType = useEntityType();
+    const matchedFields = useMatchedFields();
+    const showableFields = matchedFields.filter((field) => shouldShowInMatchedFieldList(entityType, field));
+    return entityType ?
getMatchesPrioritized(entityType, showableFields, primaryField) : []; +}; + +export const useMatchedFieldsByGroup = (fieldName: MatchedFieldName) => { + const entityType = useEntityType(); + const matchedFields = useMatchedFields(); + const matchedFieldNames = getMatchedFieldNames(entityType, fieldName); + return getMatchedFieldsByNames(matchedFields, matchedFieldNames); +}; + +export const useHasMatchedFieldByUrn = (urn: string, fieldName: MatchedFieldName) => { + const matchedFields = useMatchedFieldsByGroup(fieldName); + return getMatchedFieldsByUrn(matchedFields, urn).length > 0; +}; + +export const useMatchedFieldLabel = (fieldName: string) => { + const entityType = useEntityType(); + return getMatchedFieldLabel(entityType, fieldName); +}; diff --git a/datahub-web-react/src/app/search/context/constants.ts b/datahub-web-react/src/app/search/context/constants.ts index 372230db023e9..5f841b8536e19 100644 --- a/datahub-web-react/src/app/search/context/constants.ts +++ b/datahub-web-react/src/app/search/context/constants.ts @@ -1,15 +1,23 @@ import { SortOrder } from '../../../types.generated'; export const RELEVANCE = 'relevance'; -export const NAME_FIELD = 'name'; +export const ENTITY_NAME_FIELD = '_entityName'; export const LAST_OPERATION_TIME_FIELD = 'lastOperationTime'; export const DEFAULT_SORT_OPTION = RELEVANCE; export const SORT_OPTIONS = { [RELEVANCE]: { label: 'Relevance', field: RELEVANCE, sortOrder: SortOrder.Descending }, - [`${NAME_FIELD}_${SortOrder.Ascending}`]: { label: 'A to Z', field: NAME_FIELD, sortOrder: SortOrder.Ascending }, - [`${NAME_FIELD}_${SortOrder.Descending}`]: { label: 'Z to A', field: NAME_FIELD, sortOrder: SortOrder.Descending }, + [`${ENTITY_NAME_FIELD}_${SortOrder.Ascending}`]: { + label: 'A to Z', + field: ENTITY_NAME_FIELD, + sortOrder: SortOrder.Ascending, + }, + [`${ENTITY_NAME_FIELD}_${SortOrder.Descending}`]: { + label: 'Z to A', + field: ENTITY_NAME_FIELD, + sortOrder: SortOrder.Descending, + }, [`${LAST_OPERATION_TIME_FIELD}_${SortOrder.Descending}`]: { label: 'Last Modified in Platform', field: LAST_OPERATION_TIME_FIELD, diff --git a/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx b/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx new file mode 100644 index 0000000000000..0bfe000dea366 --- /dev/null +++ b/datahub-web-react/src/app/search/matches/MatchedFieldList.tsx @@ -0,0 +1,133 @@ +import React from 'react'; + +import { Tooltip, Typography } from 'antd'; +import styled from 'styled-components'; +import { useMatchedFieldLabel, useMatchedFieldsForList } from '../context/SearchResultContext'; +import { MatchedField } from '../../../types.generated'; +import { ANTD_GRAY_V2 } from '../../entity/shared/constants'; +import { useSearchQuery } from '../context/SearchContext'; +import { MatchesGroupedByFieldName } from './constants'; +import { useEntityRegistry } from '../../useEntityRegistry'; +import { getDescriptionSlice, isDescriptionField, isHighlightableEntityField } from './utils'; + +const MatchesContainer = styled.div` + display: flex; + flex-wrap: wrap; + gap: 8px; +`; + +const MatchText = styled(Typography.Text)` + color: ${ANTD_GRAY_V2[8]}; + background: ${(props) => props.theme.styles['highlight-color']}; + border-radius: 4px; + padding: 2px 4px 2px 4px; + padding-right: 4px; +`; + +const MATCH_GROUP_LIMIT = 3; +const TOOLTIP_MATCH_GROUP_LIMIT = 10; + +type CustomFieldRenderer = (field: MatchedField) => JSX.Element | null; + +type Props = { + customFieldRenderer?: CustomFieldRenderer; + matchSuffix?: string; 
+}; + +const RenderedField = ({ + customFieldRenderer, + field, +}: { + customFieldRenderer?: CustomFieldRenderer; + field: MatchedField; +}) => { + const entityRegistry = useEntityRegistry(); + const query = useSearchQuery()?.trim().toLowerCase(); + const customRenderedField = customFieldRenderer?.(field); + if (customRenderedField) return {customRenderedField}; + if (isHighlightableEntityField(field)) { + return field.entity ? <>{entityRegistry.getDisplayName(field.entity.type, field.entity)} : <>; + } + if (isDescriptionField(field) && query) return {getDescriptionSlice(field.value, query)}; + return {field.value}; +}; + +const MatchedFieldsList = ({ + groupedMatch, + limit, + tooltip, + matchSuffix = '', + customFieldRenderer, +}: { + groupedMatch: MatchesGroupedByFieldName; + limit: number; + tooltip?: JSX.Element; + matchSuffix?: string; + customFieldRenderer?: CustomFieldRenderer; +}) => { + const label = useMatchedFieldLabel(groupedMatch.fieldName); + const count = groupedMatch.matchedFields.length; + const moreCount = Math.max(count - limit, 0); + const andMore = ( + <> + {' '} + & more + + ); + return ( + <> + Matches {count > 1 && `${count} `} + {label} + {count > 1 && 's'}{' '} + {groupedMatch.matchedFields.slice(0, limit).map((field, index) => ( + <> + {index > 0 && ', '} + <> + + + + ))} + {moreCount > 0 && + (tooltip ? ( + + {andMore} + + ) : ( + <>{andMore} + ))}{' '} + {matchSuffix} + + ); +}; + +export const MatchedFieldList = ({ customFieldRenderer, matchSuffix = '' }: Props) => { + const groupedMatches = useMatchedFieldsForList('fieldLabels'); + + return ( + <> + {groupedMatches.length > 0 ? ( + + {groupedMatches.map((groupedMatch) => { + return ( + + + } + /> + + ); + })} + + ) : null} + + ); +}; diff --git a/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx b/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx new file mode 100644 index 0000000000000..d8da1088ea89d --- /dev/null +++ b/datahub-web-react/src/app/search/matches/SearchTextHighlighter.tsx @@ -0,0 +1,42 @@ +import React from 'react'; +import Highlight from 'react-highlighter'; +import styled from 'styled-components'; +import { useMatchedFieldsByGroup } from '../context/SearchResultContext'; +import { useSearchQuery } from '../context/SearchContext'; +import { MatchedFieldName } from './constants'; +import { useAppConfig } from '../../useAppConfig'; + +type Props = { + field: MatchedFieldName; + text: string; + enableFullHighlight?: boolean; +}; + +const HIGHLIGHT_ALL_PATTERN = /.*/; + +const StyledHighlight = styled(Highlight).attrs((props) => ({ + matchStyle: { background: props.theme.styles['highlight-color'] }, +}))``; + +const SearchTextHighlighter = ({ field, text, enableFullHighlight = false }: Props) => { + const appConfig = useAppConfig(); + const enableNameHighlight = appConfig.config.visualConfig.searchResult?.enableNameHighlight; + const matchedFields = useMatchedFieldsByGroup(field); + const hasMatchedField = !!matchedFields?.length; + const normalizedSearchQuery = useSearchQuery()?.trim().toLowerCase(); + const normalizedText = text.trim().toLowerCase(); + const hasSubstring = hasMatchedField && !!normalizedSearchQuery && normalizedText.includes(normalizedSearchQuery); + const pattern = enableFullHighlight ? HIGHLIGHT_ALL_PATTERN : undefined; + + return ( + <> + {enableNameHighlight && hasMatchedField ? 
( + {text} + ) : ( + text + )} + + ); +}; + +export default SearchTextHighlighter; diff --git a/datahub-web-react/src/app/search/matches/constants.ts b/datahub-web-react/src/app/search/matches/constants.ts new file mode 100644 index 0000000000000..25ca82eef9597 --- /dev/null +++ b/datahub-web-react/src/app/search/matches/constants.ts @@ -0,0 +1,129 @@ +import { EntityType, MatchedField } from '../../../types.generated'; + +export type MatchedFieldName = + | 'urn' + | 'name' + | 'displayName' + | 'title' + | 'description' + | 'editedDescription' + | 'editedFieldDescriptions' + | 'fieldDescriptions' + | 'tags' + | 'fieldTags' + | 'editedFieldTags' + | 'glossaryTerms' + | 'fieldGlossaryTerms' + | 'editedFieldGlossaryTerms' + | 'fieldLabels' + | 'fieldPaths'; + +export type MatchedFieldConfig = { + name: MatchedFieldName; + groupInto?: MatchedFieldName; + label: string; + showInMatchedFieldList?: boolean; +}; + +const DEFAULT_MATCHED_FIELD_CONFIG: Array = [ + { + name: 'urn', + label: 'urn', + }, + { + name: 'title', + label: 'title', + }, + { + name: 'displayName', + groupInto: 'name', + label: 'display name', + }, + { + name: 'name', + groupInto: 'name', + label: 'name', + }, + { + name: 'editedDescription', + groupInto: 'description', + label: 'description', + }, + { + name: 'description', + groupInto: 'description', + label: 'description', + }, + { + name: 'editedFieldDescriptions', + groupInto: 'fieldDescriptions', + label: 'column description', + showInMatchedFieldList: true, + }, + { + name: 'fieldDescriptions', + groupInto: 'fieldDescriptions', + label: 'column description', + showInMatchedFieldList: true, + }, + { + name: 'tags', + label: 'tag', + }, + { + name: 'editedFieldTags', + groupInto: 'fieldTags', + label: 'column tag', + showInMatchedFieldList: true, + }, + { + name: 'fieldTags', + groupInto: 'fieldTags', + label: 'column tag', + showInMatchedFieldList: true, + }, + { + name: 'glossaryTerms', + label: 'term', + }, + { + name: 'editedFieldGlossaryTerms', + groupInto: 'fieldGlossaryTerms', + label: 'column term', + showInMatchedFieldList: true, + }, + { + name: 'fieldGlossaryTerms', + groupInto: 'fieldGlossaryTerms', + label: 'column term', + showInMatchedFieldList: true, + }, + { + name: 'fieldLabels', + label: 'label', + showInMatchedFieldList: true, + }, + { + name: 'fieldPaths', + label: 'column', + showInMatchedFieldList: true, + }, +]; + +export const CHART_DASHBOARD_FIELD_CONFIG: Array = DEFAULT_MATCHED_FIELD_CONFIG.map((config) => { + if (config.name === 'title') return { ...config, groupInto: 'name' }; + return config; +}); + +export const MATCHED_FIELD_CONFIG = { + [EntityType.Chart]: CHART_DASHBOARD_FIELD_CONFIG, + [EntityType.Dashboard]: CHART_DASHBOARD_FIELD_CONFIG, + DEFAULT: DEFAULT_MATCHED_FIELD_CONFIG, +} as const; + +export type MatchesGroupedByFieldName = { + fieldName: string; + matchedFields: Array; +}; + +export const HIGHLIGHTABLE_ENTITY_TYPES = [EntityType.Tag, EntityType.GlossaryTerm]; diff --git a/datahub-web-react/src/app/search/matches/matchedFieldPathsRenderer.tsx b/datahub-web-react/src/app/search/matches/matchedFieldPathsRenderer.tsx new file mode 100644 index 0000000000000..0a33530552864 --- /dev/null +++ b/datahub-web-react/src/app/search/matches/matchedFieldPathsRenderer.tsx @@ -0,0 +1,8 @@ +import React from 'react'; + +import { MatchedField } from '../../../types.generated'; +import { downgradeV2FieldPath } from '../../entity/dataset/profile/schema/utils/utils'; + +export const matchedFieldPathsRenderer = (matchedField: MatchedField) => { + 
return matchedField?.name === 'fieldPaths' ? {downgradeV2FieldPath(matchedField.value)} : null; +}; diff --git a/datahub-web-react/src/app/search/matches/matchedInputFieldRenderer.tsx b/datahub-web-react/src/app/search/matches/matchedInputFieldRenderer.tsx new file mode 100644 index 0000000000000..25634c9e8b80e --- /dev/null +++ b/datahub-web-react/src/app/search/matches/matchedInputFieldRenderer.tsx @@ -0,0 +1,40 @@ +import React from 'react'; + +import { Chart, Dashboard, EntityType, GlossaryTerm, MatchedField } from '../../../types.generated'; +import { useEntityRegistry } from '../../useEntityRegistry'; + +const LABEL_INDEX_NAME = 'fieldLabels'; +const TYPE_PROPERTY_KEY_NAME = 'type'; + +const TermName = ({ term }: { term: GlossaryTerm }) => { + const entityRegistry = useEntityRegistry(); + return <>{entityRegistry.getDisplayName(EntityType.GlossaryTerm, term)}; +}; + +export const matchedInputFieldRenderer = (matchedField: MatchedField, entity: Chart | Dashboard) => { + if (matchedField?.name === LABEL_INDEX_NAME) { + const matchedSchemaField = entity.inputFields?.fields?.find( + (field) => field?.schemaField?.label === matchedField.value, + ); + const matchedGlossaryTerm = matchedSchemaField?.schemaField?.glossaryTerms?.terms?.find( + (term) => term?.term?.name === matchedField.value, + ); + + if (matchedGlossaryTerm) { + let termType = 'term'; + const typeProperty = matchedGlossaryTerm.term.properties?.customProperties?.find( + (property) => property.key === TYPE_PROPERTY_KEY_NAME, + ); + if (typeProperty) { + termType = typeProperty.value || termType; + } + + return ( + <> + {termType} + + ); + } + } + return null; +}; diff --git a/datahub-web-react/src/app/search/matches/utils.test.ts b/datahub-web-react/src/app/search/matches/utils.test.ts new file mode 100644 index 0000000000000..8b5ed27f5c2ad --- /dev/null +++ b/datahub-web-react/src/app/search/matches/utils.test.ts @@ -0,0 +1,110 @@ +import { EntityType } from '../../../types.generated'; +import { getMatchesPrioritized } from './utils'; + +const mapping = new Map(); +mapping.set('fieldPaths', 'column'); +mapping.set('fieldDescriptions', 'column description'); +mapping.set('fieldTags', 'column tag'); + +const MOCK_MATCHED_FIELDS = [ + { + name: 'fieldPaths', + value: 'rain', + }, + { + name: 'fieldDescriptions', + value: 'rainbow', + }, + { + name: 'fieldPaths', + value: 'rainbow', + }, + { + name: 'fieldPaths', + value: 'rainbows', + }, +]; + +const MOCK_MATCHED_DESCRIPTION_FIELDS = [ + { + name: 'editedDescription', + value: 'edited description value', + }, + { + name: 'description', + value: 'description value', + }, + { + name: 'fieldDescriptions', + value: 'field descriptions value', + }, + { + name: 'editedFieldDescriptions', + value: 'edited field descriptions value', + }, +]; + +describe('utils', () => { + describe('getMatchPrioritizingPrimary', () => { + it('prioritizes exact match', () => { + global.window.location.search = 'query=rainbow'; + const groupedMatches = getMatchesPrioritized(EntityType.Dataset, MOCK_MATCHED_FIELDS, 'fieldPaths'); + expect(groupedMatches).toEqual([ + { + fieldName: 'fieldPaths', + matchedFields: [ + { name: 'fieldPaths', value: 'rainbow' }, + { name: 'fieldPaths', value: 'rainbows' }, + { name: 'fieldPaths', value: 'rain' }, + ], + }, + { + fieldName: 'fieldDescriptions', + matchedFields: [{ name: 'fieldDescriptions', value: 'rainbow' }], + }, + ]); + }); + it('will accept first contains match', () => { + global.window.location.search = 'query=bow'; + const groupedMatches = 
getMatchesPrioritized(EntityType.Dataset, MOCK_MATCHED_FIELDS, 'fieldPaths'); + expect(groupedMatches).toEqual([ + { + fieldName: 'fieldPaths', + matchedFields: [ + { name: 'fieldPaths', value: 'rainbow' }, + { name: 'fieldPaths', value: 'rainbows' }, + { name: 'fieldPaths', value: 'rain' }, + ], + }, + { + fieldName: 'fieldDescriptions', + matchedFields: [{ name: 'fieldDescriptions', value: 'rainbow' }], + }, + ]); + }); + it('will group by field name', () => { + global.window.location.search = ''; + const groupedMatches = getMatchesPrioritized( + EntityType.Dataset, + MOCK_MATCHED_DESCRIPTION_FIELDS, + 'fieldPaths', + ); + expect(groupedMatches).toEqual([ + { + fieldName: 'description', + matchedFields: [ + { name: 'editedDescription', value: 'edited description value' }, + { name: 'description', value: 'description value' }, + ], + }, + { + fieldName: 'fieldDescriptions', + matchedFields: [ + { name: 'fieldDescriptions', value: 'field descriptions value' }, + { name: 'editedFieldDescriptions', value: 'edited field descriptions value' }, + ], + }, + ]); + }); + }); +}); diff --git a/datahub-web-react/src/app/search/matches/utils.ts b/datahub-web-react/src/app/search/matches/utils.ts new file mode 100644 index 0000000000000..78c62f7eef458 --- /dev/null +++ b/datahub-web-react/src/app/search/matches/utils.ts @@ -0,0 +1,136 @@ +import * as QueryString from 'query-string'; +import { EntityType, MatchedField } from '../../../types.generated'; +import { + HIGHLIGHTABLE_ENTITY_TYPES, + MATCHED_FIELD_CONFIG, + MatchedFieldConfig, + MatchedFieldName, + MatchesGroupedByFieldName, +} from './constants'; + +const getFieldConfigsByEntityType = (entityType: EntityType | undefined): Array => { + return entityType && entityType in MATCHED_FIELD_CONFIG + ? MATCHED_FIELD_CONFIG[entityType] + : MATCHED_FIELD_CONFIG.DEFAULT; +}; + +export const shouldShowInMatchedFieldList = (entityType: EntityType | undefined, field: MatchedField): boolean => { + const configs = getFieldConfigsByEntityType(entityType); + return configs.some((config) => config.name === field.name && config.showInMatchedFieldList); +}; + +export const getMatchedFieldLabel = (entityType: EntityType | undefined, fieldName: string): string => { + const configs = getFieldConfigsByEntityType(entityType); + return configs.find((config) => config.name === fieldName)?.label ?? 
'';
+};
+
+export const getGroupedFieldName = (
+    entityType: EntityType | undefined,
+    fieldName: string,
+): MatchedFieldName | undefined => {
+    const configs = getFieldConfigsByEntityType(entityType);
+    const fieldConfig = configs.find((config) => config.name === fieldName);
+    return fieldConfig?.groupInto;
+};
+
+export const getMatchedFieldNames = (
+    entityType: EntityType | undefined,
+    fieldName: MatchedFieldName,
+): Array<MatchedFieldName> => {
+    return getFieldConfigsByEntityType(entityType)
+        .filter((config) => fieldName === config.groupInto || fieldName === config.name)
+        .map((field) => field.name);
+};
+
+export const getMatchedFieldsByNames = (fields: Array<MatchedField>, names: Array<string>): Array<MatchedField> => {
+    return fields.filter((field) => names.includes(field.name));
+};
+
+export const getMatchedFieldsByUrn = (fields: Array<MatchedField>, urn: string): Array<MatchedField> => {
+    return fields.filter((field) => field.value === urn);
+};
+
+function normalize(value: string) {
+    return value.trim().toLowerCase();
+}
+
+function fromQueryGetBestMatch(
+    selectedMatchedFields: MatchedField[],
+    rawQuery: string,
+    prioritizedField: string,
+): Array<MatchedField> {
+    const query = normalize(rawQuery);
+    const priorityMatches: Array<MatchedField> = selectedMatchedFields.filter(
+        (field) => field.name === prioritizedField,
+    );
+    const nonPriorityMatches: Array<MatchedField> = selectedMatchedFields.filter(
+        (field) => field.name !== prioritizedField,
+    );
+    const exactMatches: Array<MatchedField> = [];
+    const containedMatches: Array<MatchedField> = [];
+    const rest: Array<MatchedField> = [];
+
+    [...priorityMatches, ...nonPriorityMatches].forEach((field) => {
+        const normalizedValue = normalize(field.value);
+        if (normalizedValue === query) exactMatches.push(field);
+        else if (normalizedValue.includes(query)) containedMatches.push(field);
+        else rest.push(field);
+    });
+
+    return [...exactMatches, ...containedMatches, ...rest];
+}
+
+const getMatchesGroupedByFieldName = (
+    entityType: EntityType,
+    matchedFields: Array<MatchedField>,
+): Array<MatchesGroupedByFieldName> => {
+    const fieldNameToMatches = new Map<string, Array<MatchedField>>();
+    const fieldNames: Array<string> = [];
+    matchedFields.forEach((field) => {
+        const groupedFieldName = getGroupedFieldName(entityType, field.name) || field.name;
+        const matchesInMap = fieldNameToMatches.get(groupedFieldName);
+        if (matchesInMap) {
+            matchesInMap.push(field);
+        } else {
+            fieldNameToMatches.set(groupedFieldName, [field]);
+            fieldNames.push(groupedFieldName);
+        }
+    });
+    return fieldNames.map((fieldName) => ({
+        fieldName,
+        matchedFields: fieldNameToMatches.get(fieldName) ?? [],
+    }));
+};
+
+export const getMatchesPrioritized = (
+    entityType: EntityType,
+    matchedFields: MatchedField[],
+    prioritizedField: string,
+): Array<MatchesGroupedByFieldName> => {
+    const { location } = window;
+    const params = QueryString.parse(location.search, { arrayFormat: 'comma' });
+    const query: string = decodeURIComponent(params.query ? (params.query as string) : '');
+    const matches = fromQueryGetBestMatch(matchedFields, query, prioritizedField);
+    return getMatchesGroupedByFieldName(entityType, matches);
+};
+
+export const isHighlightableEntityField = (field: MatchedField) =>
+    !!field.entity && HIGHLIGHTABLE_ENTITY_TYPES.includes(field.entity.type);
+
+export const isDescriptionField = (field: MatchedField) => field.name.toLowerCase().includes('description');
+
+const SURROUNDING_DESCRIPTION_CHARS = 10;
+const MAX_DESCRIPTION_CHARS = 50;
+
+export const getDescriptionSlice = (text: string, target: string) => {
+    const queryIndex = text.indexOf(target);
+    const start = Math.max(0, queryIndex - SURROUNDING_DESCRIPTION_CHARS);
+    const end = Math.min(
+        start + MAX_DESCRIPTION_CHARS,
+        text.length,
+        queryIndex + target.length + SURROUNDING_DESCRIPTION_CHARS,
+    );
+    const startEllipsis = start > 0 ? '...' : '';
+    const endEllipsis = end < text.length ? '...' : '';
+    return `${startEllipsis}${text.slice(start, end)}${endEllipsis}`;
+};
diff --git a/datahub-web-react/src/app/search/suggestions/SearchQuerySugggester.tsx b/datahub-web-react/src/app/search/suggestions/SearchQuerySugggester.tsx
new file mode 100644
index 0000000000000..9dbd67883bf64
--- /dev/null
+++ b/datahub-web-react/src/app/search/suggestions/SearchQuerySugggester.tsx
@@ -0,0 +1,39 @@
+import styled from 'styled-components';
+import React from 'react';
+import { useHistory } from 'react-router';
+import { SearchSuggestion } from '../../../types.generated';
+import { navigateToSearchUrl } from '../utils/navigateToSearchUrl';
+import { ANTD_GRAY_V2 } from '../../entity/shared/constants';
+
+const TextWrapper = styled.div`
+    font-size: 14px;
+    color: ${ANTD_GRAY_V2[8]};
+    margin: 16px 0 -8px 32px;
+`;
+
+export const SuggestedText = styled.span`
+    color: ${(props) => props.theme.styles['primary-color']};
+    text-decoration: underline ${(props) => props.theme.styles['primary-color']};
+    cursor: pointer;
+`;
+
+interface Props {
+    suggestions: SearchSuggestion[];
+}
+
+export default function SearchQuerySuggester({ suggestions }: Props) {
+    const history = useHistory();
+
+    if (suggestions.length === 0) return null;
+    const suggestText = suggestions[0].text;
+
+    function searchForSuggestion() {
+        navigateToSearchUrl({ query: suggestText, history });
+    }
+
+    return (
+        <TextWrapper>
+            Did you mean <SuggestedText onClick={searchForSuggestion}>{suggestText}</SuggestedText>
+        </TextWrapper>
+    );
+}
diff --git a/datahub-web-react/src/app/search/utils/combineSiblingsInAutoComplete.ts b/datahub-web-react/src/app/search/utils/combineSiblingsInAutoComplete.ts
new file mode 100644
index 0000000000000..e8e64559e67a0
--- /dev/null
+++ b/datahub-web-react/src/app/search/utils/combineSiblingsInAutoComplete.ts
@@ -0,0 +1,31 @@
+import { AutoCompleteResultForEntity, EntityType } from '../../../types.generated';
+import { CombinedEntity, createSiblingEntityCombiner } from '../../entity/shared/siblingUtils';
+
+export type CombinedSuggestion = {
+    type: EntityType;
+    combinedEntities: Array<CombinedEntity>;
+    suggestions?: AutoCompleteResultForEntity['suggestions'];
+};
+
+export function combineSiblingsInAutoComplete(
+    autoCompleteResultForEntity: AutoCompleteResultForEntity,
+    { combineSiblings = false } = {},
+): CombinedSuggestion {
+    const combine = createSiblingEntityCombiner();
+    const combinedEntities: Array<CombinedEntity> = [];
+
+    autoCompleteResultForEntity.entities.forEach((entity) => {
+        if (!combineSiblings) {
+            combinedEntities.push({ entity });
+            return;
+        }
+        const combinedResult = combine(entity);
+        if (!combinedResult.skipped) combinedEntities.push(combinedResult.combinedEntity);
+ }); + + return { + type: autoCompleteResultForEntity.type, + suggestions: autoCompleteResultForEntity.suggestions, + combinedEntities, + }; +} diff --git a/datahub-web-react/src/app/search/utils/combineSiblingsInSearchResults.test.ts b/datahub-web-react/src/app/search/utils/combineSiblingsInSearchResults.test.ts new file mode 100644 index 0000000000000..4cf61c715b0e9 --- /dev/null +++ b/datahub-web-react/src/app/search/utils/combineSiblingsInSearchResults.test.ts @@ -0,0 +1,521 @@ +import { combineSiblingsInSearchResults } from './combineSiblingsInSearchResults'; + +const searchResultWithSiblings = [ + { + entity: { + urn: 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', + exists: true, + type: 'DATASET', + name: 'cypress_project.jaffle_shop.raw_orders', + origin: 'PROD', + uri: null, + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + dataPlatformInstance: null, + editableProperties: null, + platformNativeType: null, + properties: { + name: 'raw_orders', + description: null, + qualifiedName: null, + customProperties: [], + __typename: 'DatasetProperties', + }, + ownership: null, + globalTags: null, + glossaryTerms: null, + subTypes: { + typeNames: ['table'], + __typename: 'SubTypes', + }, + domain: null, + container: { + urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + properties: { + name: 'jaffle_shop', + __typename: 'ContainerProperties', + }, + subTypes: { + typeNames: ['Dataset'], + __typename: 'SubTypes', + }, + deprecation: null, + __typename: 'Container', + }, + parentContainers: { + count: 2, + containers: [ + { + urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + properties: { + name: 'jaffle_shop', + __typename: 'ContainerProperties', + }, + subTypes: { + typeNames: ['Dataset'], + __typename: 'SubTypes', + }, + deprecation: null, + __typename: 'Container', + }, + { + urn: 'urn:li:container:b5e95fce839e7d78151ed7e0a7420d84', + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + properties: { + name: 'cypress_project', + __typename: 'ContainerProperties', + }, + subTypes: { + typeNames: ['Project'], + __typename: 'SubTypes', + }, + deprecation: null, + __typename: 'Container', + }, + ], + __typename: 
'ParentContainersResult', + }, + deprecation: null, + siblings: { + isPrimary: false, + siblings: [ + { + urn: 'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', + exists: true, + type: 'DATASET', + platform: { + urn: 'urn:li:dataPlatform:dbt', + type: 'DATA_PLATFORM', + name: 'dbt', + properties: { + type: 'OTHERS', + displayName: 'dbt', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/dbtlogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + name: 'cypress_project.jaffle_shop.raw_orders', + properties: { + name: 'raw_orders', + description: '', + qualifiedName: null, + __typename: 'DatasetProperties', + }, + __typename: 'Dataset', + }, + ], + __typename: 'SiblingProperties', + }, + __typename: 'Dataset', + }, + matchedFields: [ + { + name: 'name', + value: 'raw_orders', + __typename: 'MatchedField', + }, + { + name: 'id', + value: 'cypress_project.jaffle_shop.raw_orders', + __typename: 'MatchedField', + }, + ], + insights: [], + __typename: 'SearchResult', + }, + { + entity: { + urn: 'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', + exists: true, + type: 'DATASET', + name: 'cypress_project.jaffle_shop.raw_orders', + origin: 'PROD', + uri: null, + platform: { + urn: 'urn:li:dataPlatform:dbt', + type: 'DATA_PLATFORM', + name: 'dbt', + properties: { + type: 'OTHERS', + displayName: 'dbt', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/dbtlogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + dataPlatformInstance: null, + editableProperties: null, + platformNativeType: null, + properties: { + name: 'raw_orders', + description: '', + qualifiedName: null, + customProperties: [ + { + key: 'catalog_version', + value: '1.0.4', + __typename: 'StringMapEntry', + }, + { + key: 'node_type', + value: 'seed', + __typename: 'StringMapEntry', + }, + { + key: 'materialization', + value: 'seed', + __typename: 'StringMapEntry', + }, + { + key: 'dbt_file_path', + value: 'data/raw_orders.csv', + __typename: 'StringMapEntry', + }, + { + key: 'catalog_schema', + value: 'https://schemas.getdbt.com/dbt/catalog/v1.json', + __typename: 'StringMapEntry', + }, + { + key: 'catalog_type', + value: 'table', + __typename: 'StringMapEntry', + }, + { + key: 'manifest_version', + value: '1.0.4', + __typename: 'StringMapEntry', + }, + { + key: 'manifest_schema', + value: 'https://schemas.getdbt.com/dbt/manifest/v4.json', + __typename: 'StringMapEntry', + }, + ], + __typename: 'DatasetProperties', + }, + ownership: null, + globalTags: null, + glossaryTerms: null, + subTypes: { + typeNames: ['seed'], + __typename: 'SubTypes', + }, + domain: null, + container: null, + parentContainers: { + count: 0, + containers: [], + __typename: 'ParentContainersResult', + }, + deprecation: null, + siblings: { + isPrimary: true, + siblings: [ + { + urn: 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', + type: 'DATASET', + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + name: 'cypress_project.jaffle_shop.raw_orders', + properties: { + name: 'raw_orders', 
+ description: null, + qualifiedName: null, + __typename: 'DatasetProperties', + }, + __typename: 'Dataset', + }, + ], + __typename: 'SiblingProperties', + }, + __typename: 'Dataset', + }, + matchedFields: [ + { + name: 'name', + value: 'raw_orders', + __typename: 'MatchedField', + }, + { + name: 'id', + value: 'cypress_project.jaffle_shop.raw_orders', + __typename: 'MatchedField', + }, + ], + insights: [], + __typename: 'SearchResult', + }, +]; + +const searchResultWithGhostSiblings = [ + { + entity: { + urn: 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', + exists: true, + type: 'DATASET', + name: 'cypress_project.jaffle_shop.raw_orders', + origin: 'PROD', + uri: null, + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + dataPlatformInstance: null, + editableProperties: null, + platformNativeType: null, + properties: { + name: 'raw_orders', + description: null, + qualifiedName: null, + customProperties: [], + __typename: 'DatasetProperties', + }, + ownership: null, + globalTags: null, + glossaryTerms: null, + subTypes: { + typeNames: ['table'], + __typename: 'SubTypes', + }, + domain: null, + container: { + urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + properties: { + name: 'jaffle_shop', + __typename: 'ContainerProperties', + }, + subTypes: { + typeNames: ['Dataset'], + __typename: 'SubTypes', + }, + deprecation: null, + __typename: 'Container', + }, + parentContainers: { + count: 2, + containers: [ + { + urn: 'urn:li:container:348c96555971d3f5c1ffd7dd2e7446cb', + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + properties: { + name: 'jaffle_shop', + __typename: 'ContainerProperties', + }, + subTypes: { + typeNames: ['Dataset'], + __typename: 'SubTypes', + }, + deprecation: null, + __typename: 'Container', + }, + { + urn: 'urn:li:container:b5e95fce839e7d78151ed7e0a7420d84', + platform: { + urn: 'urn:li:dataPlatform:bigquery', + type: 'DATA_PLATFORM', + name: 'bigquery', + properties: { + type: 'RELATIONAL_DB', + displayName: 'BigQuery', + datasetNameDelimiter: '.', + logoUrl: '/assets/platforms/bigquerylogo.png', + __typename: 'DataPlatformProperties', + }, + displayName: null, + info: null, + __typename: 'DataPlatform', + }, + properties: { + name: 'cypress_project', + __typename: 'ContainerProperties', + }, + subTypes: { + typeNames: ['Project'], + __typename: 'SubTypes', + }, + deprecation: null, + __typename: 'Container', + }, + ], + __typename: 'ParentContainersResult', + }, + deprecation: null, + siblings: { + isPrimary: false, + siblings: [ + { + urn: 
'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', + exists: false, + type: 'DATASET', + }, + ], + __typename: 'SiblingProperties', + }, + __typename: 'Dataset', + }, + matchedFields: [ + { + name: 'name', + value: 'raw_orders', + __typename: 'MatchedField', + }, + { + name: 'id', + value: 'cypress_project.jaffle_shop.raw_orders', + __typename: 'MatchedField', + }, + ], + insights: [], + __typename: 'SearchResult', + }, +]; + +describe('siblingUtils', () => { + describe('combineSiblingsInSearchResults', () => { + it('combines search results to deduplicate siblings', () => { + const result = combineSiblingsInSearchResults(searchResultWithSiblings as any); + + expect(result).toHaveLength(1); + expect(result?.[0]?.matchedEntities?.[0]?.urn).toEqual( + 'urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_orders,PROD)', + ); + expect(result?.[0]?.matchedEntities?.[1]?.urn).toEqual( + 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', + ); + + expect(result?.[0]?.matchedEntities).toHaveLength(2); + + expect(result?.[0]?.matchedFields).toHaveLength(2); + }); + + it('will not combine an entity with a ghost node', () => { + const result = combineSiblingsInSearchResults(searchResultWithGhostSiblings as any); + + expect(result).toHaveLength(1); + expect(result?.[0]?.matchedEntities?.[0]?.urn).toEqual( + 'urn:li:dataset:(urn:li:dataPlatform:bigquery,cypress_project.jaffle_shop.raw_orders,PROD)', + ); + expect(result?.[0]?.matchedEntities).toHaveLength(1); + + expect(result?.[0]?.matchedFields).toHaveLength(2); + }); + }); +}); diff --git a/datahub-web-react/src/app/search/utils/combineSiblingsInSearchResults.ts b/datahub-web-react/src/app/search/utils/combineSiblingsInSearchResults.ts new file mode 100644 index 0000000000000..4a5c8da6381b8 --- /dev/null +++ b/datahub-web-react/src/app/search/utils/combineSiblingsInSearchResults.ts @@ -0,0 +1,28 @@ +import { Entity, MatchedField } from '../../../types.generated'; +import { CombinedEntity, createSiblingEntityCombiner } from '../../entity/shared/siblingUtils'; + +type UncombinedSeaerchResults = { + entity: Entity; + matchedFields: Array; +}; + +export type CombinedSearchResult = CombinedEntity & Pick; + +export function combineSiblingsInSearchResults( + searchResults: Array | undefined = [], +): Array { + const combine = createSiblingEntityCombiner(); + const combinedSearchResults: Array = []; + + searchResults.forEach((searchResult) => { + const combinedResult = combine(searchResult.entity); + if (!combinedResult.skipped) { + combinedSearchResults.push({ + ...searchResult, + ...combinedResult.combinedEntity, + }); + } + }); + + return combinedSearchResults; +} diff --git a/datahub-web-react/src/app/search/utils/constants.ts b/datahub-web-react/src/app/search/utils/constants.ts index eecd18441e7a5..af45129022cc1 100644 --- a/datahub-web-react/src/app/search/utils/constants.ts +++ b/datahub-web-react/src/app/search/utils/constants.ts @@ -10,7 +10,6 @@ export const TAGS_FILTER_NAME = 'tags'; export const GLOSSARY_TERMS_FILTER_NAME = 'glossaryTerms'; export const CONTAINER_FILTER_NAME = 'container'; export const DOMAINS_FILTER_NAME = 'domains'; -export const DATA_PRODUCTS_FILTER_NAME = 'dataProducts'; export const OWNERS_FILTER_NAME = 'owners'; export const TYPE_NAMES_FILTER_NAME = 'typeNames'; export const PLATFORM_FILTER_NAME = 'platform'; @@ -57,7 +56,6 @@ export const ORDERED_FIELDS = [ TAGS_FILTER_NAME, GLOSSARY_TERMS_FILTER_NAME, DOMAINS_FILTER_NAME, - 
DATA_PRODUCTS_FILTER_NAME, FIELD_TAGS_FILTER_NAME, FIELD_GLOSSARY_TERMS_FILTER_NAME, FIELD_PATHS_FILTER_NAME, @@ -74,7 +72,6 @@ export const FIELD_TO_LABEL = { owners: 'Owner', tags: 'Tag', domains: 'Domain', - [DATA_PRODUCTS_FILTER_NAME]: 'Data Product', platform: 'Platform', fieldTags: 'Column Tag', glossaryTerms: 'Glossary Term', diff --git a/datahub-web-react/src/app/settings/SettingsPage.tsx b/datahub-web-react/src/app/settings/SettingsPage.tsx index bfec9b395cff2..339cc0cf44bac 100644 --- a/datahub-web-react/src/app/settings/SettingsPage.tsx +++ b/datahub-web-react/src/app/settings/SettingsPage.tsx @@ -7,6 +7,7 @@ import { ToolOutlined, FilterOutlined, TeamOutlined, + PushpinOutlined, } from '@ant-design/icons'; import { Redirect, Route, useHistory, useLocation, useRouteMatch, Switch } from 'react-router'; import styled from 'styled-components'; @@ -19,6 +20,7 @@ import { Preferences } from './Preferences'; import { ManageViews } from '../entity/view/ManageViews'; import { useUserContext } from '../context/useUserContext'; import { ManageOwnership } from '../entity/ownership/ManageOwnership'; +import ManagePosts from './posts/ManagePosts'; const PageContainer = styled.div` display: flex; @@ -62,6 +64,7 @@ const PATHS = [ { path: 'preferences', content: }, { path: 'views', content: }, { path: 'ownership', content: }, + { path: 'posts', content: }, ]; /** @@ -91,6 +94,7 @@ export const SettingsPage = () => { const showUsersGroups = (isIdentityManagementEnabled && me && me?.platformPrivileges?.manageIdentities) || false; const showViews = isViewsEnabled || false; const showOwnershipTypes = me && me?.platformPrivileges?.manageOwnershipTypes; + const showHomePagePosts = me && me?.platformPrivileges?.manageGlobalAnnouncements; return ( @@ -143,6 +147,11 @@ export const SettingsPage = () => { Ownership Types )} + {showHomePagePosts && ( + + Home Page Posts + + )} diff --git a/datahub-web-react/src/app/settings/posts/CreatePostForm.tsx b/datahub-web-react/src/app/settings/posts/CreatePostForm.tsx new file mode 100644 index 0000000000000..a8d6cfa64c9c1 --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/CreatePostForm.tsx @@ -0,0 +1,91 @@ +import React, { useState } from 'react'; +import { Form, Input, Typography, FormInstance, Radio } from 'antd'; +import styled from 'styled-components'; +import { + DESCRIPTION_FIELD_NAME, + LINK_FIELD_NAME, + LOCATION_FIELD_NAME, + TITLE_FIELD_NAME, + TYPE_FIELD_NAME, +} from './constants'; +import { PostContentType } from '../../../types.generated'; + +const TopFormItem = styled(Form.Item)` + margin-bottom: 24px; +`; + +const SubFormItem = styled(Form.Item)` + margin-bottom: 0; +`; + +type Props = { + setCreateButtonEnabled: (isEnabled: boolean) => void; + form: FormInstance; +}; + +export default function CreatePostForm({ setCreateButtonEnabled, form }: Props) { + const [postType, setPostType] = useState(PostContentType.Text); + + return ( +
{ + setCreateButtonEnabled(!form.getFieldsError().some((field) => field.errors.length > 0)); + }} + > + Post Type}> + setPostType(e.target.value)} + value={postType} + defaultValue={postType} + optionType="button" + buttonStyle="solid" + > + Announcement + Link + + + + Title}> + The title for your new post. + + + + + {postType === PostContentType.Text && ( + Description}> + The main content for your new post. + + + + + )} + {postType === PostContentType.Link && ( + <> + Link URL}> + + Where users will be directed when they click this post. + + + + + + Image URL}> + + A URL to an image you want to display on your link post. + + + + + + + )} +
+ ); +} diff --git a/datahub-web-react/src/app/settings/posts/CreatePostModal.tsx b/datahub-web-react/src/app/settings/posts/CreatePostModal.tsx new file mode 100644 index 0000000000000..b4851ecb02969 --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/CreatePostModal.tsx @@ -0,0 +1,107 @@ +import React, { useState } from 'react'; +import { Button, Form, message, Modal } from 'antd'; +import CreatePostForm from './CreatePostForm'; +import { + CREATE_POST_BUTTON_ID, + DESCRIPTION_FIELD_NAME, + LINK_FIELD_NAME, + LOCATION_FIELD_NAME, + TYPE_FIELD_NAME, + TITLE_FIELD_NAME, +} from './constants'; +import { useEnterKeyListener } from '../../shared/useEnterKeyListener'; +import { MediaType, PostContentType, PostType } from '../../../types.generated'; +import { useCreatePostMutation } from '../../../graphql/mutations.generated'; + +type Props = { + onClose: () => void; + onCreate: ( + contentType: string, + title: string, + description: string | undefined, + link: string | undefined, + location: string | undefined, + ) => void; +}; + +export default function CreatePostModal({ onClose, onCreate }: Props) { + const [createPostMutation] = useCreatePostMutation(); + const [createButtonEnabled, setCreateButtonEnabled] = useState(false); + const [form] = Form.useForm(); + const onCreatePost = () => { + const contentTypeValue = form.getFieldValue(TYPE_FIELD_NAME) ?? PostContentType.Text; + const mediaValue = + form.getFieldValue(TYPE_FIELD_NAME) && form.getFieldValue(LOCATION_FIELD_NAME) + ? { + type: MediaType.Image, + location: form.getFieldValue(LOCATION_FIELD_NAME) ?? null, + } + : null; + createPostMutation({ + variables: { + input: { + postType: PostType.HomePageAnnouncement, + content: { + contentType: contentTypeValue, + title: form.getFieldValue(TITLE_FIELD_NAME), + description: form.getFieldValue(DESCRIPTION_FIELD_NAME) ?? null, + link: form.getFieldValue(LINK_FIELD_NAME) ?? null, + media: mediaValue, + }, + }, + }, + }) + .then(({ errors }) => { + if (!errors) { + message.success({ + content: `Created Post!`, + duration: 3, + }); + onCreate( + form.getFieldValue(TYPE_FIELD_NAME) ?? PostContentType.Text, + form.getFieldValue(TITLE_FIELD_NAME), + form.getFieldValue(DESCRIPTION_FIELD_NAME), + form.getFieldValue(LINK_FIELD_NAME), + form.getFieldValue(LOCATION_FIELD_NAME), + ); + form.resetFields(); + } + }) + .catch((e) => { + message.destroy(); + message.error({ content: 'Failed to create Post! 
An unknown error occured.', duration: 3 }); + console.error('Failed to create Post:', e.message); + }); + onClose(); + }; + + // Handle the Enter press + useEnterKeyListener({ + querySelectorToExecuteClick: '#createPostButton', + }); + + return ( + + + + + } + > + + + ); +} diff --git a/datahub-web-react/src/app/settings/posts/ManagePosts.tsx b/datahub-web-react/src/app/settings/posts/ManagePosts.tsx new file mode 100644 index 0000000000000..e0f694c192c62 --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/ManagePosts.tsx @@ -0,0 +1,40 @@ +import { Typography } from 'antd'; +import React from 'react'; +import styled from 'styled-components/macro'; +import { PostList } from './PostsList'; + +const PageContainer = styled.div` + padding-top: 20px; + width: 100%; + height: 100%; +`; + +const PageHeaderContainer = styled.div` + && { + padding-left: 24px; + } +`; + +const PageTitle = styled(Typography.Title)` + && { + margin-bottom: 12px; + } +`; + +const ListContainer = styled.div``; + +export default function ManagePosts() { + return ( + + + Home Page Posts + + View and manage pinned posts that appear to all users on the landing page. + + + + + + + ); +} diff --git a/datahub-web-react/src/app/settings/posts/PostItemMenu.tsx b/datahub-web-react/src/app/settings/posts/PostItemMenu.tsx new file mode 100644 index 0000000000000..e3fc424a47ef2 --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/PostItemMenu.tsx @@ -0,0 +1,62 @@ +import React from 'react'; +import { DeleteOutlined } from '@ant-design/icons'; +import { Dropdown, Menu, message, Modal } from 'antd'; +import { MenuIcon } from '../../entity/shared/EntityDropdown/EntityDropdown'; +import { useDeletePostMutation } from '../../../graphql/post.generated'; + +type Props = { + urn: string; + title: string; + onDelete?: () => void; +}; + +export default function PostItemMenu({ title, urn, onDelete }: Props) { + const [deletePostMutation] = useDeletePostMutation(); + + const deletePost = () => { + deletePostMutation({ + variables: { + urn, + }, + }) + .then(({ errors }) => { + if (!errors) { + message.success('Deleted Post!'); + onDelete?.(); + } + }) + .catch(() => { + message.destroy(); + message.error({ content: `Failed to delete Post!: An unknown error occurred.`, duration: 3 }); + }); + }; + + const onConfirmDelete = () => { + Modal.confirm({ + title: `Delete Post '${title}'`, + content: `Are you sure you want to remove this Post?`, + onOk() { + deletePost(); + }, + onCancel() {}, + okText: 'Yes', + maskClosable: true, + closable: true, + }); + }; + + return ( + + +  Delete + + + } + > + + + ); +} diff --git a/datahub-web-react/src/app/settings/posts/PostsList.tsx b/datahub-web-react/src/app/settings/posts/PostsList.tsx new file mode 100644 index 0000000000000..5ae2be1547f9b --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/PostsList.tsx @@ -0,0 +1,200 @@ +import React, { useEffect, useState } from 'react'; +import { Button, Empty, Pagination, Typography } from 'antd'; +import { useLocation } from 'react-router'; +import styled from 'styled-components'; +import * as QueryString from 'query-string'; +import { PlusOutlined } from '@ant-design/icons'; +import { AlignType } from 'rc-table/lib/interface'; +import CreatePostModal from './CreatePostModal'; +import { PostColumn, PostEntry, PostListMenuColumn } from './PostsListColumns'; +import { useEntityRegistry } from '../../useEntityRegistry'; +import { useListPostsQuery } from '../../../graphql/post.generated'; +import { scrollToTop } from 
'../../shared/searchUtils'; +import { addToListPostCache, removeFromListPostCache } from './utils'; +import { Message } from '../../shared/Message'; +import TabToolbar from '../../entity/shared/components/styled/TabToolbar'; +import { SearchBar } from '../../search/SearchBar'; +import { StyledTable } from '../../entity/shared/components/styled/StyledTable'; +import { POST_TYPE_TO_DISPLAY_TEXT } from './constants'; + +const PostsContainer = styled.div``; + +export const PostsPaginationContainer = styled.div` + display: flex; + justify-content: center; + padding: 12px; + padding-left: 16px; + border-bottom: 1px solid; + border-color: ${(props) => props.theme.styles['border-color-base']}; + display: flex; + justify-content: space-between; + align-items: center; +`; + +const PaginationInfo = styled(Typography.Text)` + padding: 0px; +`; + +const DEFAULT_PAGE_SIZE = 10; + +export const PostList = () => { + const entityRegistry = useEntityRegistry(); + const location = useLocation(); + const params = QueryString.parse(location.search, { arrayFormat: 'comma' }); + const paramsQuery = (params?.query as string) || undefined; + const [query, setQuery] = useState(undefined); + useEffect(() => setQuery(paramsQuery), [paramsQuery]); + + const [page, setPage] = useState(1); + const [isCreatingPost, setIsCreatingPost] = useState(false); + + const pageSize = DEFAULT_PAGE_SIZE; + const start = (page - 1) * pageSize; + + const { loading, error, data, client, refetch } = useListPostsQuery({ + variables: { + input: { + start, + count: pageSize, + query, + }, + }, + fetchPolicy: query && query.length > 0 ? 'no-cache' : 'cache-first', + }); + + const totalPosts = data?.listPosts?.total || 0; + const lastResultIndex = start + pageSize > totalPosts ? totalPosts : start + pageSize; + const posts = data?.listPosts?.posts || []; + + const onChangePage = (newPage: number) => { + scrollToTop(); + setPage(newPage); + }; + + const handleDelete = (urn: string) => { + removeFromListPostCache(client, urn, page, pageSize); + setTimeout(() => { + refetch?.(); + }, 2000); + }; + + const allColumns = [ + { + title: 'Title', + dataIndex: '', + key: 'title', + sorter: (sourceA, sourceB) => { + return sourceA.title.localeCompare(sourceB.title); + }, + render: (record: PostEntry) => PostColumn(record.title, 200), + width: '20%', + }, + { + title: 'Description', + dataIndex: '', + key: 'description', + render: (record: PostEntry) => PostColumn(record.description || ''), + }, + { + title: 'Type', + dataIndex: '', + key: 'type', + render: (record: PostEntry) => PostColumn(POST_TYPE_TO_DISPLAY_TEXT[record.contentType]), + style: { minWidth: 100 }, + width: '10%', + }, + { + title: '', + dataIndex: '', + width: '5%', + align: 'right' as AlignType, + key: 'menu', + render: PostListMenuColumn(handleDelete), + }, + ]; + + const tableData = posts.map((post) => { + return { + urn: post.urn, + title: post.content.title, + description: post.content.description, + contentType: post.content.contentType, + }; + }); + + return ( + <> + {!data && loading && } + {error && } + + + + null} + onQueryChange={(q) => setQuery(q && q.length > 0 ? q : undefined)} + entityRegistry={entityRegistry} + hideRecommendations + /> + + }} + /> + {totalPosts > pageSize && ( + + + + {lastResultIndex > 0 ? 
(page - 1) * pageSize + 1 : 0} - {lastResultIndex} + {' '} + of {totalPosts} + + + + + )} + {isCreatingPost && ( + setIsCreatingPost(false)} + onCreate={(urn, title, description) => { + addToListPostCache( + client, + { + urn, + properties: { + title, + description: description || null, + }, + }, + pageSize, + ); + setTimeout(() => refetch(), 2000); + }} + /> + )} + + + ); +}; diff --git a/datahub-web-react/src/app/settings/posts/PostsListColumns.tsx b/datahub-web-react/src/app/settings/posts/PostsListColumns.tsx new file mode 100644 index 0000000000000..38f910baf8f41 --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/PostsListColumns.tsx @@ -0,0 +1,26 @@ +import React from 'react'; +// import { Typography } from 'antd'; +import styled from 'styled-components/macro'; +import { Maybe } from 'graphql/jsutils/Maybe'; +import PostItemMenu from './PostItemMenu'; + +export interface PostEntry { + title: string; + contentType: string; + description: Maybe; + urn: string; +} + +const PostText = styled.div<{ minWidth?: number }>` + ${(props) => props.minWidth !== undefined && `min-width: ${props.minWidth}px;`} +`; + +export function PostListMenuColumn(handleDelete: (urn: string) => void) { + return (record: PostEntry) => ( + handleDelete(record.urn)} /> + ); +} + +export function PostColumn(text: string, minWidth?: number) { + return {text}; +} diff --git a/datahub-web-react/src/app/settings/posts/constants.ts b/datahub-web-react/src/app/settings/posts/constants.ts new file mode 100644 index 0000000000000..5a164019fe2e5 --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/constants.ts @@ -0,0 +1,13 @@ +import { PostContentType } from '../../../types.generated'; + +export const TITLE_FIELD_NAME = 'title'; +export const DESCRIPTION_FIELD_NAME = 'description'; +export const LINK_FIELD_NAME = 'link'; +export const LOCATION_FIELD_NAME = 'location'; +export const TYPE_FIELD_NAME = 'type'; +export const CREATE_POST_BUTTON_ID = 'createPostButton'; + +export const POST_TYPE_TO_DISPLAY_TEXT = { + [PostContentType.Link]: 'Link', + [PostContentType.Text]: 'Announcement', +}; diff --git a/datahub-web-react/src/app/settings/posts/utils.ts b/datahub-web-react/src/app/settings/posts/utils.ts new file mode 100644 index 0000000000000..ce48c7400738c --- /dev/null +++ b/datahub-web-react/src/app/settings/posts/utils.ts @@ -0,0 +1,77 @@ +import { ListPostsDocument, ListPostsQuery } from '../../../graphql/post.generated'; + +/** + * Add an entry to the list posts cache. + */ +export const addToListPostCache = (client, newPost, pageSize) => { + // Read the data from our cache for this query. + const currData: ListPostsQuery | null = client.readQuery({ + query: ListPostsDocument, + variables: { + input: { + start: 0, + count: pageSize, + }, + }, + }); + + // Add our new post into the existing list. + const newPosts = [newPost, ...(currData?.listPosts?.posts || [])]; + + // Write our data back to the cache. + client.writeQuery({ + query: ListPostsDocument, + variables: { + input: { + start: 0, + count: pageSize, + }, + }, + data: { + listPosts: { + start: 0, + count: (currData?.listPosts?.count || 0) + 1, + total: (currData?.listPosts?.total || 0) + 1, + posts: newPosts, + }, + }, + }); +}; + +/** + * Remove an entry from the list posts cache. + */ +export const removeFromListPostCache = (client, urn, page, pageSize) => { + // Read the data from our cache for this query. 
+ const currData: ListPostsQuery | null = client.readQuery({ + query: ListPostsDocument, + variables: { + input: { + start: (page - 1) * pageSize, + count: pageSize, + }, + }, + }); + + // Remove the post from the existing posts set. + const newPosts = [...(currData?.listPosts?.posts || []).filter((post) => post.urn !== urn)]; + + // Write our data back to the cache. + client.writeQuery({ + query: ListPostsDocument, + variables: { + input: { + start: (page - 1) * pageSize, + count: pageSize, + }, + }, + data: { + listPosts: { + start: currData?.listPosts?.start || 0, + count: (currData?.listPosts?.count || 1) - 1, + total: (currData?.listPosts?.total || 1) - 1, + posts: newPosts, + }, + }, + }); +}; diff --git a/datahub-web-react/src/app/shared/tags/tag/Tag.tsx b/datahub-web-react/src/app/shared/tags/tag/Tag.tsx index 2288238091776..ed2460b6eea3c 100644 --- a/datahub-web-react/src/app/shared/tags/tag/Tag.tsx +++ b/datahub-web-react/src/app/shared/tags/tag/Tag.tsx @@ -8,6 +8,7 @@ import { StyledTag } from '../../../entity/shared/components/styled/StyledTag'; import { HoverEntityTooltip } from '../../../recommendations/renderer/component/HoverEntityTooltip'; import { useEntityRegistry } from '../../../useEntityRegistry'; import { TagProfileDrawer } from '../TagProfileDrawer'; +import { useHasMatchedFieldByUrn } from '../../../search/context/SearchResultContext'; const TagLink = styled.span` display: inline-block; @@ -41,6 +42,7 @@ export default function Tag({ }: Props) { const entityRegistry = useEntityRegistry(); const [removeTagMutation] = useRemoveTagMutation(); + const highlightTag = useHasMatchedFieldByUrn(tag.tag.urn, 'tags'); const [tagProfileDrawerVisible, setTagProfileDrawerVisible] = useState(false); const [addTagUrn, setAddTagUrn] = useState(''); @@ -110,6 +112,7 @@ export default function Tag({ removeTag(tag); }} fontSize={fontSize} + highlightTag={highlightTag} > ` +const StyledTag = styled(Tag)<{ fontSize?: number; highlightTerm?: boolean }>` + &&& { + ${(props) => + props.highlightTerm && + ` + background: ${props.theme.styles['highlight-color']}; + border: 1px solid ${props.theme.styles['highlight-border-color']}; + `} + } ${(props) => props.fontSize && `font-size: ${props.fontSize}px;`} `; @@ -38,6 +47,7 @@ export default function TermContent({ }: Props) { const entityRegistry = useEntityRegistry(); const [removeTermMutation] = useRemoveTermMutation(); + const highlightTerm = useHasMatchedFieldByUrn(term.term.urn, 'glossaryTerms'); const removeTerm = (termToRemove: GlossaryTermAssociation) => { onOpenModal?.(); @@ -85,6 +95,7 @@ export default function TermContent({ removeTerm(term); }} fontSize={fontSize} + highlightTerm={highlightTerm} > diff --git a/datahub-web-react/src/appConfigContext.tsx b/datahub-web-react/src/appConfigContext.tsx index 3b34b108ecc93..807a17c4fd6a4 100644 --- a/datahub-web-react/src/appConfigContext.tsx +++ b/datahub-web-react/src/appConfigContext.tsx @@ -27,6 +27,9 @@ export const DEFAULT_APP_CONFIG = { entityProfile: { domainDefaultTab: null, }, + searchResult: { + enableNameHighlight: false, + }, }, authConfig: { tokenAuthEnabled: false, diff --git a/datahub-web-react/src/conf/Global.ts b/datahub-web-react/src/conf/Global.ts index b16dd1eaace57..e1220b8c81b53 100644 --- a/datahub-web-react/src/conf/Global.ts +++ b/datahub-web-react/src/conf/Global.ts @@ -28,6 +28,7 @@ export enum PageRoutes { SETTINGS_VIEWS = '/settings/views', EMBED = '/embed', EMBED_LOOKUP = '/embed/lookup/:url', + SETTINGS_POSTS = '/settings/posts', } /** diff --git 
a/datahub-web-react/src/conf/theme/theme_dark.config.json b/datahub-web-react/src/conf/theme/theme_dark.config.json index b648f3d997f21..9746c3ddde5f3 100644 --- a/datahub-web-react/src/conf/theme/theme_dark.config.json +++ b/datahub-web-react/src/conf/theme/theme_dark.config.json @@ -17,7 +17,9 @@ "disabled-color": "fade(white, 25%)", "steps-nav-arrow-color": "fade(white, 25%)", "homepage-background-upper-fade": "#FFFFFF", - "homepage-background-lower-fade": "#333E4C" + "homepage-background-lower-fade": "#333E4C", + "highlight-color": "#E6F4FF", + "highlight-border-color": "#BAE0FF" }, "assets": { "logoUrl": "/assets/logo.png" diff --git a/datahub-web-react/src/conf/theme/theme_light.config.json b/datahub-web-react/src/conf/theme/theme_light.config.json index e842fdb1bb8aa..906c04e38a1ba 100644 --- a/datahub-web-react/src/conf/theme/theme_light.config.json +++ b/datahub-web-react/src/conf/theme/theme_light.config.json @@ -20,7 +20,9 @@ "homepage-background-lower-fade": "#FFFFFF", "homepage-text-color": "#434343", "box-shadow": "0px 0px 30px 0px rgb(239 239 239)", - "box-shadow-hover": "0px 1px 0px 0.5px rgb(239 239 239)" + "box-shadow-hover": "0px 1px 0px 0.5px rgb(239 239 239)", + "highlight-color": "#E6F4FF", + "highlight-border-color": "#BAE0FF" }, "assets": { "logoUrl": "/assets/logo.png" diff --git a/datahub-web-react/src/conf/theme/types.ts b/datahub-web-react/src/conf/theme/types.ts index 98140cbbd553d..7d78230092700 100644 --- a/datahub-web-react/src/conf/theme/types.ts +++ b/datahub-web-react/src/conf/theme/types.ts @@ -18,6 +18,8 @@ export type Theme = { 'homepage-background-lower-fade': string; 'box-shadow': string; 'box-shadow-hover': string; + 'highlight-color': string; + 'highlight-border-color': string; }; assets: { logoUrl: string; diff --git a/datahub-web-react/src/graphql/app.graphql b/datahub-web-react/src/graphql/app.graphql index 4b1295f1024a2..bf15e5f757f8f 100644 --- a/datahub-web-react/src/graphql/app.graphql +++ b/datahub-web-react/src/graphql/app.graphql @@ -45,6 +45,9 @@ query appConfig { defaultTab } } + searchResult { + enableNameHighlight + } } telemetryConfig { enableThirdPartyLogging diff --git a/datahub-web-react/src/graphql/me.graphql b/datahub-web-react/src/graphql/me.graphql index 2c693c747af56..af850c9c3ce28 100644 --- a/datahub-web-react/src/graphql/me.graphql +++ b/datahub-web-react/src/graphql/me.graphql @@ -46,6 +46,7 @@ query getMe { createTags manageGlobalViews manageOwnershipTypes + manageGlobalAnnouncements } } } diff --git a/datahub-web-react/src/graphql/post.graphql b/datahub-web-react/src/graphql/post.graphql index c19f38fc7751c..ee092ad4fba90 100644 --- a/datahub-web-react/src/graphql/post.graphql +++ b/datahub-web-react/src/graphql/post.graphql @@ -20,3 +20,11 @@ query listPosts($input: ListPostsInput!) { } } } + +mutation createPost($input: CreatePostInput!) { + createPost(input: $input) +} + +mutation deletePost($urn: String!) { + deletePost(urn: $urn) +} diff --git a/datahub-web-react/src/graphql/search.graphql b/datahub-web-react/src/graphql/search.graphql index 09610c9a9cfc1..7cd868d7cd2b2 100644 --- a/datahub-web-react/src/graphql/search.graphql +++ b/datahub-web-react/src/graphql/search.graphql @@ -2,6 +2,7 @@ fragment autoCompleteFields on Entity { urn type ... on Dataset { + exists name platform { ...platformFields @@ -19,6 +20,29 @@ fragment autoCompleteFields on Entity { subTypes { typeNames } + siblings { + isPrimary + siblings { + urn + type + ... 
on Dataset { + exists + platform { + ...platformFields + } + parentContainers { + ...parentContainersFields + } + name + properties { + name + description + qualifiedName + externalUrl + } + } + } + } ...datasetStatsFields } ... on CorpUser { @@ -808,6 +832,11 @@ fragment searchResults on SearchResults { matchedFields { name value + entity { + urn + type + ...entityDisplayNameFields + } } insights { text @@ -817,6 +846,11 @@ fragment searchResults on SearchResults { facets { ...facetFields } + suggestions { + text + frequency + score + } } fragment schemaFieldEntityFields on SchemaFieldEntity { diff --git a/docker/build.gradle b/docker/build.gradle index f33e06f383240..ae101fe1defc5 100644 --- a/docker/build.gradle +++ b/docker/build.gradle @@ -35,8 +35,31 @@ task quickstart(type: Exec, dependsOn: ':metadata-ingestion:install') { environment "DATAHUB_TELEMETRY_ENABLED", "false" environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}" - environment "ACTIONS_VERSION", 'alpine3.17-slim' - environment "DATAHUB_ACTIONS_IMAGE", 'nginx' + // environment "ACTIONS_VERSION", 'alpine3.17-slim' + // environment "DATAHUB_ACTIONS_IMAGE", 'nginx' + + def cmd = [ + 'source ../metadata-ingestion/venv/bin/activate && ', + 'datahub docker quickstart', + '--no-pull-images', + '--standalone_consumers', + '--version', "v${version}", + '--dump-logs-on-failure' + ] + + commandLine 'bash', '-c', cmd.join(" ") +} + +task quickstartSlim(type: Exec, dependsOn: ':metadata-ingestion:install') { + dependsOn(([':docker:datahub-ingestion'] + quickstart_modules).collect { it + ':dockerTag' }) + shouldRunAfter ':metadata-ingestion:clean', 'quickstartNuke' + + environment "DATAHUB_TELEMETRY_ENABLED", "false" + environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}" + environment "DATAHUB_ACTIONS_IMAGE", "acryldata/datahub-ingestion" + environment "ACTIONS_VERSION", "v${version}-slim" + environment "ACTIONS_EXTRA_PACKAGES", 'acryl-datahub-actions[executor] acryl-datahub-actions' + environment "ACTIONS_CONFIG", 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml' def cmd = [ 'source ../metadata-ingestion/venv/bin/activate && ', @@ -64,6 +87,7 @@ task quickstartDebug(type: Exec, dependsOn: ':metadata-ingestion:install') { dependsOn(debug_modules.collect { it + ':dockerTagDebug' }) shouldRunAfter ':metadata-ingestion:clean', 'quickstartNuke' + environment "DATAHUB_PRECREATE_TOPICS", "true" environment "DATAHUB_TELEMETRY_ENABLED", "false" environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}" diff --git a/docker/datahub-ingestion-base/Dockerfile b/docker/datahub-ingestion-base/Dockerfile index 9893d44caf460..3d47f79617370 100644 --- a/docker/datahub-ingestion-base/Dockerfile +++ b/docker/datahub-ingestion-base/Dockerfile @@ -1,3 +1,6 @@ +ARG APP_ENV=full +ARG BASE_IMAGE=base + FROM golang:1-alpine3.17 AS binary ENV DOCKERIZE_VERSION v0.6.1 @@ -16,9 +19,7 @@ ENV CONFLUENT_KAFKA_VERSION=1.6.1 ENV DEBIAN_FRONTEND noninteractive -RUN apt-get update && apt-get install -y \ - && apt-get install -y -qq \ - # gcc \ +RUN apt-get update && apt-get install -y -qq \ make \ python3-ldap \ libldap2-dev \ @@ -31,15 +32,34 @@ RUN apt-get update && apt-get install -y \ zip \ unzip \ ldap-utils \ - openjdk-11-jre-headless \ - && python -m pip install --upgrade pip wheel setuptools==57.5.0 \ - && curl -Lk -o /root/librdkafka-${LIBRDKAFKA_VERSION}.tar.gz https://github.com/edenhill/librdkafka/archive/v${LIBRDKAFKA_VERSION}.tar.gz \ - && tar -xzf 
/root/librdkafka-${LIBRDKAFKA_VERSION}.tar.gz -C /root \ - && cd /root/librdkafka-${LIBRDKAFKA_VERSION} \ - && ./configure --prefix /usr && make && make install && make clean && ./configure --clean \ - && apt-get remove -y make + && python -m pip install --no-cache --upgrade pip wheel setuptools \ + && wget -q https://github.com/edenhill/librdkafka/archive/v${LIBRDKAFKA_VERSION}.tar.gz -O - | \ + tar -xz -C /root \ + && cd /root/librdkafka-${LIBRDKAFKA_VERSION} \ + && ./configure --prefix /usr && make && make install && cd .. && rm -rf /root/librdkafka-${LIBRDKAFKA_VERSION} \ + && apt-get remove -y make \ + && rm -rf /var/lib/apt/lists/* /var/cache/apk/* + +# compiled against newer golang for security fixes COPY --from=binary /go/bin/dockerize /usr/local/bin +COPY ./docker/datahub-ingestion-base/base-requirements.txt requirements.txt +COPY ./docker/datahub-ingestion-base/entrypoint.sh /entrypoint.sh + +RUN pip install --no-cache -r requirements.txt && \ + pip uninstall -y acryl-datahub && \ + chmod +x /entrypoint.sh && \ + addgroup --gid 1000 datahub && \ + adduser --disabled-password --uid 1000 --gid 1000 --home /datahub-ingestion datahub + +ENTRYPOINT [ "/entrypoint.sh" ] + +FROM ${BASE_IMAGE} as full-install + +RUN apt-get update && apt-get install -y -qq \ + default-jre-headless \ + && rm -rf /var/lib/apt/lists/* /var/cache/apk/* + RUN if [ $(arch) = "x86_64" ]; then \ mkdir /opt/oracle && \ cd /opt/oracle && \ @@ -58,7 +78,10 @@ RUN if [ $(arch) = "x86_64" ]; then \ ldconfig; \ fi; -COPY ./docker/datahub-ingestion-base/base-requirements.txt requirements.txt +FROM ${BASE_IMAGE} as slim-install +# Do nothing else on top of base + +FROM ${APP_ENV}-install -RUN pip install -r requirements.txt && \ - pip uninstall -y acryl-datahub +USER datahub +ENV PATH="/datahub-ingestion/.local/bin:$PATH" \ No newline at end of file diff --git a/docker/datahub-ingestion-base/base-requirements.txt b/docker/datahub-ingestion-base/base-requirements.txt index 3d9e0777e5ce0..82d9a93a9a2c3 100644 --- a/docker/datahub-ingestion-base/base-requirements.txt +++ b/docker/datahub-ingestion-base/base-requirements.txt @@ -1,3 +1,7 @@ +# Excluded for slim +# pyspark==3.0.3 +# pydeequ==1.0.1 + acryl-datahub-classify==0.0.6 acryl-iceberg-legacy==0.0.4 acryl-PyHive==0.6.13 @@ -253,7 +257,6 @@ pycryptodome==3.18.0 pycryptodomex==3.18.0 pydantic==1.10.8 pydash==7.0.3 -pydeequ==1.0.1 pydruid==0.6.5 Pygments==2.15.1 pymongo==4.3.3 @@ -261,7 +264,6 @@ PyMySQL==1.0.3 pyOpenSSL==22.0.0 pyparsing==3.0.9 pyrsistent==0.19.3 -pyspark==3.0.3 pyspnego==0.9.0 python-daemon==3.0.1 python-dateutil==2.8.2 diff --git a/docker/datahub-ingestion-base/build.gradle b/docker/datahub-ingestion-base/build.gradle index fe3c12a59886f..10cd2ee71cce3 100644 --- a/docker/datahub-ingestion-base/build.gradle +++ b/docker/datahub-ingestion-base/build.gradle @@ -12,14 +12,17 @@ ext { } docker { - name "${docker_registry}/${docker_repo}:v${version}" - version "v${version}" + name "${docker_registry}/${docker_repo}:v${version}-slim" + version "v${version}-slim" dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } + buildArgs([APP_ENV: 'slim']) } -tasks.getByPath('docker').dependsOn('build') +tasks.getByName('docker').dependsOn('build') task mkdirBuildDocker { doFirst { @@ -27,10 +30,11 @@ task mkdirBuildDocker { } } dockerClean.finalizedBy(mkdirBuildDocker) 
+dockerClean.dependsOn([':docker:datahub-ingestion:dockerClean']) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) \ No newline at end of file diff --git a/docker/datahub-ingestion-base/entrypoint.sh b/docker/datahub-ingestion-base/entrypoint.sh new file mode 100644 index 0000000000000..518bb21561467 --- /dev/null +++ b/docker/datahub-ingestion-base/entrypoint.sh @@ -0,0 +1,14 @@ +#!/usr/bin/bash + +if [ ! -z "$ACTIONS_EXTRA_PACKAGES" ]; then + pip install --user $ACTIONS_EXTRA_PACKAGES +fi + +if [[ ! -z "$ACTIONS_CONFIG" && ! -z "$ACTIONS_EXTRA_PACKAGES" ]]; then + mkdir -p /tmp/datahub/logs + curl -q "$ACTIONS_CONFIG" -o config.yaml + exec dockerize -wait ${DATAHUB_GMS_PROTOCOL:-http}://$DATAHUB_GMS_HOST:$DATAHUB_GMS_PORT/health -timeout 240s \ + datahub actions --config config.yaml +else + exec datahub $@ +fi diff --git a/docker/datahub-ingestion-slim/Dockerfile b/docker/datahub-ingestion-slim/Dockerfile deleted file mode 100644 index 580dcc4277124..0000000000000 --- a/docker/datahub-ingestion-slim/Dockerfile +++ /dev/null @@ -1,9 +0,0 @@ -# Defining environment -ARG APP_ENV=prod -ARG DOCKER_VERSION=latest - -FROM acryldata/datahub-ingestion:$DOCKER_VERSION as base - -USER 0 -RUN pip uninstall -y pyspark -USER datahub diff --git a/docker/datahub-ingestion-slim/build.gradle b/docker/datahub-ingestion-slim/build.gradle deleted file mode 100644 index f21b66b576a0c..0000000000000 --- a/docker/datahub-ingestion-slim/build.gradle +++ /dev/null @@ -1,39 +0,0 @@ -plugins { - id 'com.palantir.docker' - id 'java' // required for versioning -} - -apply from: "../../gradle/versioning/versioning.gradle" - -ext { - docker_registry = rootProject.ext.docker_registry == 'linkedin' ? 'acryldata' : docker_registry - docker_repo = 'datahub-ingestion-slim' - docker_dir = 'datahub-ingestion-slim' -} - -docker { - name "${docker_registry}/${docker_repo}:v${version}" - version "v${version}" - dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") - files fileTree(rootProject.projectDir) { - include "docker/${docker_dir}/*" - } - buildArgs([DOCKER_VERSION: version]) - - buildx(false) -} -tasks.getByPath('docker').dependsOn(['build', ':docker:datahub-ingestion:docker']) - -task mkdirBuildDocker { - doFirst { - mkdir "${project.buildDir}/docker" - } -} -dockerClean.finalizedBy(mkdirBuildDocker) - -task cleanLocalDockerImages { - doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) - } -} -dockerClean.finalizedBy(cleanLocalDockerImages) \ No newline at end of file diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile index 45a98efb7f6fb..0ecc30d02ac3f 100644 --- a/docker/datahub-ingestion/Dockerfile +++ b/docker/datahub-ingestion/Dockerfile @@ -1,42 +1,27 @@ # Defining environment -ARG APP_ENV=prod +ARG APP_ENV=full +ARG BASE_IMAGE=acryldata/datahub-ingestion-base ARG DOCKER_VERSION=latest -FROM acryldata/datahub-ingestion-base:$DOCKER_VERSION as base - -FROM eclipse-temurin:11 as prod-build -COPY . /datahub-src -WORKDIR /datahub-src -# We noticed that the gradle wrapper download failed frequently on in CI on arm64 machines. -# I suspect this was due because of the QEMU emulation slowdown, combined with the arm64 -# build being starved for CPU by the x86_64 build's codegen step. 
-# -# The middle step will attempt to download gradle wrapper 5 times with exponential backoff. -# The ./gradlew --version will force the download of the gradle wrapper but is otherwise a no-op. -# Note that the retry logic will always return success, so we should always attempt to run codegen. -# Inspired by https://github.com/gradle/gradle/issues/18124#issuecomment-958182335. -# and https://unix.stackexchange.com/a/82610/378179. -# This is a workaround for https://github.com/gradle/gradle/issues/18124. -RUN (for attempt in 1 2 3 4 5; do ./gradlew --version && break ; echo "Failed to download gradle wrapper (attempt $attempt)" && sleep $((2<<$attempt)) ; done ) && \ - ./gradlew :metadata-events:mxe-schemas:build - -FROM base as prod-codegen -COPY --from=prod-build /datahub-src /datahub-src -RUN cd /datahub-src/metadata-ingestion && \ - pip install -e ".[base]" && \ - ./scripts/codegen.sh - -FROM base as prod-install -COPY --from=prod-codegen /datahub-src/metadata-ingestion /datahub-ingestion -COPY --from=prod-codegen /root/.cache/pip /root/.cache/pip +FROM $BASE_IMAGE:$DOCKER_VERSION as base +USER 0 + +COPY ./metadata-ingestion /datahub-ingestion + ARG RELEASE_VERSION -RUN cd /datahub-ingestion && \ - sed -i.bak "s/__version__ = \"0.0.0.dev0\"/__version__ = \"$RELEASE_VERSION\"/" src/datahub/__init__.py && \ +WORKDIR /datahub-ingestion +RUN sed -i.bak "s/__version__ = \"0.0.0.dev0\"/__version__ = \"$RELEASE_VERSION\"/" src/datahub/__init__.py && \ cat src/datahub/__init__.py && \ - pip install ".[all]" && \ - pip freeze && \ - # This is required to fix security vulnerability in htrace-core4 - rm -f /usr/local/lib/python3.10/site-packages/pyspark/jars/htrace-core4-4.1.0-incubating.jar + chown -R datahub /datahub-ingestion + +USER datahub +ENV PATH="/datahub-ingestion/.local/bin:$PATH" + +FROM base as slim-install +RUN pip install --no-cache --user ".[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]" + +FROM base as full-install +RUN pip install --no-cache --user ".[all]" FROM base as dev-install # Dummy stage for development. Assumes code is built on your machine and mounted to this image. 
@@ -44,7 +29,5 @@ FROM base as dev-install FROM ${APP_ENV}-install as final -RUN addgroup --system datahub && adduser --system datahub --ingroup datahub USER datahub - -ENTRYPOINT [ "datahub" ] +ENV PATH="/datahub-ingestion/.local/bin:$PATH" diff --git a/docker/datahub-ingestion/build.gradle b/docker/datahub-ingestion/build.gradle index 7a24d87794c0e..22531c0c4fd0e 100644 --- a/docker/datahub-ingestion/build.gradle +++ b/docker/datahub-ingestion/build.gradle @@ -11,24 +11,30 @@ ext { docker_dir = 'datahub-ingestion' } +dependencies { + project(':docker:datahub-ingestion-base') + project(':metadata-ingestion') +} + docker { - name "${docker_registry}/${docker_repo}:v${version}" - version "v${version}" + name "${docker_registry}/${docker_repo}:v${version}-slim" + version "v${version}-slim" dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" include "metadata-ingestion/**" - include "metadata-events/**" - include "metadata-models/**" - include "li-utils/**" - include "docs/**" - include "gradle/**" - include "buildSrc/**" - include "*" + }.exclude { + i -> i.file.isHidden() || + i.file == buildDir || + i.file == project(':metadata-ingestion').buildDir } - buildArgs([DOCKER_VERSION: version]) + buildArgs([DOCKER_VERSION: version, + RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace('-slim', ''), + APP_ENV: 'slim']) } -tasks.getByPath('docker').dependsOn(['build', ':docker:datahub-ingestion-base:docker']) +tasks.getByName('docker').dependsOn(['build', + ':docker:datahub-ingestion-base:docker', + ':metadata-ingestion:codegen']) task mkdirBuildDocker { doFirst { @@ -39,7 +45,7 @@ dockerClean.finalizedBy(mkdirBuildDocker) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) \ No newline at end of file diff --git a/docker/docker-compose-with-cassandra.yml b/docker/docker-compose-with-cassandra.yml index 5ea364dd31ca7..08f8cc1ec9c45 100644 --- a/docker/docker-compose-with-cassandra.yml +++ b/docker/docker-compose-with-cassandra.yml @@ -26,6 +26,9 @@ services: hostname: actions image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head} env_file: datahub-actions/env/docker.env + environment: + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} depends_on: datahub-gms: condition: service_healthy diff --git a/docker/docker-compose-without-neo4j.yml b/docker/docker-compose-without-neo4j.yml index 10b3f3c0eca5e..a755eda21cbf5 100644 --- a/docker/docker-compose-without-neo4j.yml +++ b/docker/docker-compose-without-neo4j.yml @@ -27,6 +27,9 @@ services: hostname: actions image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head} env_file: datahub-actions/env/docker.env + environment: + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} depends_on: datahub-gms: condition: service_healthy diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 9228c11446ddf..d07ea5fa88f8b 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -26,6 +26,9 @@ services: hostname: actions image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head} env_file: datahub-actions/env/docker.env + environment: + - 
ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} depends_on: datahub-gms: condition: service_healthy diff --git a/docker/elasticsearch-setup/build.gradle b/docker/elasticsearch-setup/build.gradle index cc2fe1ec5c4db..ffee3b9c65cf4 100644 --- a/docker/elasticsearch-setup/build.gradle +++ b/docker/elasticsearch-setup/build.gradle @@ -17,6 +17,8 @@ docker { files fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" include "metadata-service/restli-servlet-impl/src/main/resources/index/**" + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -25,7 +27,7 @@ docker { load(true) push(false) } -tasks.getByPath('docker').dependsOn('build') +tasks.getByName('docker').dependsOn('build') task mkdirBuildDocker { doFirst { @@ -36,7 +38,7 @@ dockerClean.finalizedBy(mkdirBuildDocker) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) \ No newline at end of file diff --git a/docker/kafka-setup/Dockerfile b/docker/kafka-setup/Dockerfile index 8cf9d0869dc9b..5707234b85f57 100644 --- a/docker/kafka-setup/Dockerfile +++ b/docker/kafka-setup/Dockerfile @@ -1,5 +1,7 @@ +ARG KAFKA_DOCKER_VERSION=7.4.1 + # Using as a base image because to get the needed jars for confluent utils -FROM confluentinc/cp-base-new@sha256:ac4e0f9bcaecdab728740529f37452231fa40760fcf561759fc3b219f46d2cc9 as confluent_base +FROM confluentinc/cp-base-new:$KAFKA_DOCKER_VERSION as confluent_base ARG MAVEN_REPO="https://repo1.maven.org/maven2" ARG SNAKEYAML_VERSION="2.0" @@ -16,12 +18,6 @@ ENV SCALA_VERSION 2.13 # Set the classpath for JARs required by `cub` ENV CUB_CLASSPATH='"/usr/share/java/cp-base-new/*"' -# Confluent Docker Utils Version (Namely the tag or branch to grab from git to install) -ARG PYTHON_CONFLUENT_DOCKER_UTILS_VERSION="v0.0.60" - -# This can be overriden for an offline/air-gapped builds -ARG PYTHON_CONFLUENT_DOCKER_UTILS_INSTALL_SPEC="git+https://github.com/confluentinc/confluent-docker-utils@${PYTHON_CONFLUENT_DOCKER_UTILS_VERSION}" - LABEL name="kafka" version=${KAFKA_VERSION} RUN apk add --no-cache bash coreutils @@ -39,7 +35,6 @@ RUN mkdir -p /opt \ && pip install --no-cache-dir --upgrade pip wheel setuptools \ && pip install jinja2 requests \ && pip install "Cython<3.0" "PyYAML<6" --no-build-isolation \ - && pip install --prefer-binary --prefix=/usr/local --upgrade "${PYTHON_CONFLUENT_DOCKER_UTILS_INSTALL_SPEC}" \ && rm -rf /tmp/* \ && apk del --purge .build-deps @@ -69,7 +64,8 @@ ENV USE_CONFLUENT_SCHEMA_REGISTRY="TRUE" COPY docker/kafka-setup/kafka-setup.sh ./kafka-setup.sh COPY docker/kafka-setup/kafka-config.sh ./kafka-config.sh COPY docker/kafka-setup/kafka-topic-workers.sh ./kafka-topic-workers.sh +COPY docker/kafka-setup/kafka-ready.sh ./kafka-ready.sh -RUN chmod +x ./kafka-setup.sh && chmod +x ./kafka-topic-workers.sh +RUN chmod +x ./kafka-setup.sh ./kafka-topic-workers.sh ./kafka-ready.sh CMD ./kafka-setup.sh diff --git a/docker/kafka-setup/build.gradle b/docker/kafka-setup/build.gradle index a5d33457e45f7..573ef21c88bf9 100644 --- a/docker/kafka-setup/build.gradle +++ b/docker/kafka-setup/build.gradle @@ -16,6 +16,8 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { include 
"docker/${docker_dir}/*" + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -24,7 +26,7 @@ docker { load(true) push(false) } -tasks.getByPath('docker').dependsOn('build') +tasks.getByName('docker').dependsOn('build') task mkdirBuildDocker { doFirst { @@ -35,7 +37,7 @@ dockerClean.finalizedBy(mkdirBuildDocker) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) diff --git a/docker/kafka-setup/kafka-ready.sh b/docker/kafka-setup/kafka-ready.sh new file mode 100755 index 0000000000000..ba87bde047ef5 --- /dev/null +++ b/docker/kafka-setup/kafka-ready.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +for i in {1..60} +do + kafka-broker-api-versions.sh --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER + if [ $? -eq 0 ]; then + break + fi + if [ $i -eq 60 ]; then + echo "Kafka bootstrap server $KAFKA_BOOTSTRAP_SERVER not ready." + exit 1 + fi + sleep 5s +done diff --git a/docker/kafka-setup/kafka-setup.sh b/docker/kafka-setup/kafka-setup.sh old mode 100644 new mode 100755 index 7b015421b7963..629e9bc9484ee --- a/docker/kafka-setup/kafka-setup.sh +++ b/docker/kafka-setup/kafka-setup.sh @@ -49,8 +49,8 @@ if [[ -n "$KAFKA_PROPERTIES_SASL_CLIENT_CALLBACK_HANDLER_CLASS" ]]; then echo "sasl.client.callback.handler.class=$KAFKA_PROPERTIES_SASL_CLIENT_CALLBACK_HANDLER_CLASS" >> $CONNECTION_PROPERTIES_PATH fi -cub kafka-ready -c $CONNECTION_PROPERTIES_PATH -b $KAFKA_BOOTSTRAP_SERVER 1 180 - +# cub kafka-ready -c $CONNECTION_PROPERTIES_PATH -b $KAFKA_BOOTSTRAP_SERVER 1 180 +. 
kafka-ready.sh ############################################################ # Start Topic Creation Logic diff --git a/docker/mysql-setup/build.gradle b/docker/mysql-setup/build.gradle index 48a28f15a581d..0d8941cce4833 100644 --- a/docker/mysql-setup/build.gradle +++ b/docker/mysql-setup/build.gradle @@ -17,6 +17,8 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -25,7 +27,7 @@ docker { load(true) push(false) } -tasks.getByPath('docker').dependsOn('build') +tasks.getByName('docker').dependsOn('build') task mkdirBuildDocker { doFirst { @@ -36,7 +38,7 @@ dockerClean.finalizedBy(mkdirBuildDocker) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}") + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) diff --git a/docker/postgres-setup/build.gradle b/docker/postgres-setup/build.gradle index a5b0413ec4be8..8a026be09d2b4 100644 --- a/docker/postgres-setup/build.gradle +++ b/docker/postgres-setup/build.gradle @@ -17,6 +17,8 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -25,7 +27,7 @@ docker { load(true) push(false) } -tasks.getByPath('docker').dependsOn('build') +tasks.getByName('docker').dependsOn('build') task mkdirBuildDocker { doFirst { @@ -36,7 +38,7 @@ dockerClean.finalizedBy(mkdirBuildDocker) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}") + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) diff --git a/docker/quickstart/docker-compose-m1.quickstart.yml b/docker/quickstart/docker-compose-m1.quickstart.yml index 5a8edd6eacf19..38418bc8c41b9 100644 --- a/docker/quickstart/docker-compose-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-m1.quickstart.yml @@ -34,6 +34,8 @@ services: datahub-gms: condition: service_healthy environment: + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} - DATAHUB_GMS_HOST=datahub-gms - DATAHUB_GMS_PORT=8080 - DATAHUB_GMS_PROTOCOL=http diff --git a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml index 6d51f2efcfcf2..cf879faa6a3f0 100644 --- a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml @@ -34,6 +34,8 @@ services: datahub-gms: condition: service_healthy environment: + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} - DATAHUB_GMS_HOST=datahub-gms - DATAHUB_GMS_PORT=8080 - DATAHUB_GMS_PROTOCOL=http diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml index 48f2d797bd8a4..007830078d2b4 100644 --- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml @@ -34,6 +34,8 
@@ services: datahub-gms: condition: service_healthy environment: + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} - DATAHUB_GMS_HOST=datahub-gms - DATAHUB_GMS_PORT=8080 - DATAHUB_GMS_PROTOCOL=http diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml index bd30c359a2a76..390543b92123f 100644 --- a/docker/quickstart/docker-compose.quickstart.yml +++ b/docker/quickstart/docker-compose.quickstart.yml @@ -34,6 +34,8 @@ services: datahub-gms: condition: service_healthy environment: + - ACTIONS_CONFIG=${ACTIONS_CONFIG:-} + - ACTIONS_EXTRA_PACKAGES=${ACTIONS_EXTRA_PACKAGES:-} - DATAHUB_GMS_HOST=datahub-gms - DATAHUB_GMS_PORT=8080 - DATAHUB_GMS_PROTOCOL=http diff --git a/docs/advanced/no-code-modeling.md b/docs/advanced/no-code-modeling.md index e1fadee6d371a..9c8f6761a62bc 100644 --- a/docs/advanced/no-code-modeling.md +++ b/docs/advanced/no-code-modeling.md @@ -211,7 +211,7 @@ record ServiceKey { * Name of the service */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true } name: string diff --git a/docs/authorization/access-policies-guide.md b/docs/authorization/access-policies-guide.md index 5820e513a83e3..1eabb64d2878f 100644 --- a/docs/authorization/access-policies-guide.md +++ b/docs/authorization/access-policies-guide.md @@ -110,10 +110,13 @@ In the second step, we can simply select the Privileges that this Platform Polic | Manage Tags | Allow the actor to create and remove any Tags | | Manage Public Views | Allow the actor to create, edit, and remove any public (shared) Views. | | Manage Ownership Types | Allow the actor to create, edit, and remove any Ownership Types. | +| Manage Platform Settings | (Acryl DataHub only) Allow the actor to manage global integrations and notification settings | +| Manage Monitors | (Acryl DataHub only) Allow the actor to create, remove, start, or stop any entity assertion monitors | | Restore Indices API[^1] | Allow the actor to restore indices for a set of entities via API | | Enable/Disable Writeability API[^1] | Allow the actor to enable or disable GMS writeability for use in data migrations | | Apply Retention API[^1] | Allow the actor to apply aspect retention via API | + [^1]: Only active if REST_API_AUTHORIZATION_ENABLED environment flag is enabled #### Step 3: Choose Policy Actors @@ -204,8 +207,15 @@ The common Metadata Privileges, which span across entity types, include: | Edit Status | Allow actor to edit the status of an entity (soft deleted or not). | | Edit Domain | Allow actor to edit the Domain of an entity. | | Edit Deprecation | Allow actor to edit the Deprecation status of an entity. | -| Edit Assertions | Allow actor to add and remove assertions from an entity. | -| Edit All | Allow actor to edit any information about an entity. Super user privileges. Controls the ability to ingest using API when REST API Authorization is enabled. | +| Edit Lineage | Allow actor to edit custom lineage edges for the entity. | +| Edit Data Product | Allow actor to edit the data product that an entity is part of | +| Propose Tags | (Acryl DataHub only) Allow actor to propose new Tags for the entity. | +| Propose Glossary Terms | (Acryl DataHub only) Allow actor to propose new Glossary Terms for the entity. | +| Propose Documentation | (Acryl DataHub only) Allow actor to propose new Documentation for the entity. 
| +| Manage Tag Proposals | (Acryl DataHub only) Allow actor to accept or reject proposed Tags for the entity. | +| Manage Glossary Terms Proposals | (Acryl DataHub only) Allow actor to accept or reject proposed Glossary Terms for the entity. | +| Manage Documentation Proposals | (Acryl DataHub only) Allow actor to accept or reject proposed Documentation for the entity | +| Edit Entity | Allow actor to edit any information about an entity. Super user privileges. Controls the ability to ingest using API when REST API Authorization is enabled. | | Get Timeline API[^1] | Allow actor to get the timeline of an entity via API. | | Get Entity API[^1] | Allow actor to get an entity via API. | | Get Timeseries Aspect API[^1] | Allow actor to get a timeseries aspect via API. | @@ -225,10 +235,19 @@ The common Metadata Privileges, which span across entity types, include: | Dataset | Edit Dataset Queries | Allow actor to edit the Highlighted Queries on the Queries tab of the dataset. | | Dataset | View Dataset Usage | Allow actor to access usage metadata about a dataset both in the UI and in the GraphQL API. This includes example queries, number of queries, etc. Also applies to REST APIs when REST API Authorization is enabled. | | Dataset | View Dataset Profile | Allow actor to access a dataset's profile both in the UI and in the GraphQL API. This includes snapshot statistics like #rows, #columns, null percentage per field, etc. | +| Dataset | Edit Assertions | Allow actor to change the assertions associated with a dataset. | +| Dataset | Edit Incidents | (Acryl DataHub only) Allow actor to change the incidents associated with a dataset. | +| Dataset | Edit Monitors | (Acryl DataHub only) Allow actor to change the assertion monitors associated with a dataset. | | Tag | Edit Tag Color | Allow actor to change the color of a Tag. | | Group | Edit Group Members | Allow actor to add and remove members to a group. | +| Group | Edit Contact Information | Allow actor to change email, slack handle associated with the group. | +| Group | Manage Group Subscriptions | (Acryl DataHub only) Allow actor to subscribe the group to entities. | +| Group | Manage Group Notifications | (Acryl DataHub only) Allow actor to change notification settings for the group. | | User | Edit User Profile | Allow actor to change the user's profile including display name, bio, title, profile image, etc. | | User + Group | Edit Contact Information | Allow actor to change the contact information such as email & chat handles. | +| Term Group | Manage Direct Glossary Children | Allow actor to change the direct child Term Groups or Terms of the group. | +| Term Group | Manage All Glossary Children | Allow actor to change any direct or indirect child Term Groups or Terms of the group. | + > **Still have questions about Privileges?** Let us know in [Slack](https://slack.datahubproject.io)! diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index 2b6fd5571cc9e..7ba516c82cf1b 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -15,6 +15,9 @@ This file documents any backwards-incompatible changes in DataHub and assists pe - #8300: Clickhouse source now inherited from TwoTierSQLAlchemy. In old way we have platform_instance -> container -> co container db (None) -> container schema and now we have platform_instance -> container database. - #8300: Added `uri_opts` argument; now we can add any options for clickhouse client. 
+- #8659: BigQuery ingestion no longer creates DataPlatformInstance aspects by default. + This will only affect users that were depending on this aspect for custom functionality, + and can be enabled via the `include_data_platform_instance` config option. ## 0.10.5 diff --git a/docs/lineage/airflow.md b/docs/lineage/airflow.md index ef4071f89c585..21d59b777dd7c 100644 --- a/docs/lineage/airflow.md +++ b/docs/lineage/airflow.md @@ -62,6 +62,7 @@ lazy_load_plugins = False | datahub.cluster | prod | name of the airflow cluster | | datahub.capture_ownership_info | true | If true, the owners field of the DAG will be capture as a DataHub corpuser. | | datahub.capture_tags_info | true | If true, the tags field of the DAG will be captured as DataHub tags. | + | datahub.capture_executions | true | If true, we'll capture task runs in DataHub in addition to DAG definitions. | | datahub.graceful_exceptions | true | If set to true, most runtime errors in the lineage backend will be suppressed and will not cause the overall task to fail. Note that configuration issues will still throw exceptions. | 5. Configure `inlets` and `outlets` for your Airflow operators. For reference, look at the sample DAG in [`lineage_backend_demo.py`](../../metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_demo.py), or reference [`lineage_backend_taskflow_demo.py`](../../metadata-ingestion/src/datahub_provider/example_dags/lineage_backend_taskflow_demo.py) if you're using the [TaskFlow API](https://airflow.apache.org/docs/apache-airflow/stable/concepts/taskflow.html). @@ -80,9 +81,7 @@ Emitting DataHub ... If you have created a custom Airflow operator [docs](https://airflow.apache.org/docs/apache-airflow/stable/howto/custom-operator.html) that inherits from the BaseOperator class, when overriding the `execute` function, set inlets and outlets via `context['ti'].task.inlets` and `context['ti'].task.outlets`. -The DataHub Airflow plugin will then pick up those inlets and outlets after the task runs. - - +The DataHub Airflow plugin will then pick up those inlets and outlets after the task runs. ```python class DbtOperator(BaseOperator): @@ -97,8 +96,8 @@ class DbtOperator(BaseOperator): def _get_lineage(self): # Do some processing to get inlets/outlets - - return inlets, outlets + + return inlets, outlets ``` If you override the `pre_execute` and `post_execute` function, ensure they include the `@prepare_lineage` and `@apply_lineage` decorators respectively. [source](https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/lineage.html#lineage) @@ -172,7 +171,6 @@ Take a look at this sample DAG: In order to use this example, you must first configure the Datahub hook. Like in ingestion, we support a Datahub REST hook and a Kafka-based hook. See step 1 above for details. - ## Debugging ### Incorrect URLs diff --git a/docs/modeling/extending-the-metadata-model.md b/docs/modeling/extending-the-metadata-model.md index 32951ab2e41eb..f47630f44e772 100644 --- a/docs/modeling/extending-the-metadata-model.md +++ b/docs/modeling/extending-the-metadata-model.md @@ -323,7 +323,7 @@ It takes the following parameters: annotations. To customize the set of analyzers used to index a certain field, you must add a new field type and define the set of mappings to be applied in the MappingsBuilder. - Thus far, we have implemented 10 fieldTypes: + Thus far, we have implemented 11 fieldTypes: 1. 
*KEYWORD* - Short text fields that only support exact matches, often used only for filtering @@ -332,20 +332,25 @@ It takes the following parameters: 3. *TEXT_PARTIAL* - Text fields delimited by spaces/slashes/periods with partial matching support. Note, partial matching is expensive, so this field type should not be applied to fields with long values (like description) - 4. *BROWSE_PATH* - Field type for browse paths. Applies specific mappings for slash delimited paths. + 4. *WORD_GRAM* - Text fields delimited by spaces, slashes, periods, dashes, or underscores with partial matching AND + word gram support. That is, the text will be split by the delimiters and can be matched with delimited queries + matching two, three, or four length tokens in addition to single tokens. As with partial match, this type is + expensive, so should not be applied to fields with long values such as description. - 5. *URN* - Urn fields where each sub-component inside the urn is indexed. For instance, for a data platform urn like + 5. *BROWSE_PATH* - Field type for browse paths. Applies specific mappings for slash delimited paths. + + 6. *URN* - Urn fields where each sub-component inside the urn is indexed. For instance, for a data platform urn like "urn:li:dataplatform:kafka", it will index the platform name "kafka" and ignore the common components - 6. *URN_PARTIAL* - Urn fields where each sub-component inside the urn is indexed with partial matching support. + 7. *URN_PARTIAL* - Urn fields where each sub-component inside the urn is indexed with partial matching support. - 7. *BOOLEAN* - Boolean fields used for filtering. + 8. *BOOLEAN* - Boolean fields used for filtering. - 8. *COUNT* - Count fields used for filtering. + 9. *COUNT* - Count fields used for filtering. - 9. *DATETIME* - Datetime fields used to represent timestamps. + 10. *DATETIME* - Datetime fields used to represent timestamps. - 10. *OBJECT* - Each property in an object will become an extra column in Elasticsearch and can be referenced as + 11. *OBJECT* - Each property in an object will become an extra column in Elasticsearch and can be referenced as `field.property` in queries. You should be careful to not use it on objects with many properties as it can cause a mapping explosion in Elasticsearch. diff --git a/docs/quickstart.md b/docs/quickstart.md index b93713c4efa5c..cd91dc8d1ac84 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -145,6 +145,27 @@ Please refer to [Change the default user datahub in quickstart](authentication/c We recommend deploying DataHub to production using Kubernetes. We provide helpful [Helm Charts](https://artifacthub.io/packages/helm/datahub/datahub) to help you quickly get up and running. Check out [Deploying DataHub to Kubernetes](./deploy/kubernetes.md) for a step-by-step walkthrough. +The `quickstart` method of running DataHub is intended for local development and as a quick way to experience the features that DataHub has to offer. It is not +intended for a production environment. This recommendation is based on the following points. + +#### Default Credentials + +`quickstart` uses a docker-compose configuration which includes default credentials for both DataHub and its underlying +prerequisite data stores, such as MySQL. Additionally, other components are unauthenticated out of the box. This is a +design choice to make development easier and is not best practice for a production environment.
+ +#### Exposed Ports + +DataHub's services and its backend data stores use the docker default behavior of binding to all interface addresses. +This makes it useful for development but is not recommended in a production environment. + +#### Performance & Management + +* `quickstart` is limited by the resources available on a single host; there is no ability to scale horizontally. +* Rollout of new versions requires downtime. +* The configuration is largely pre-determined and not easily managed. +* `quickstart`, by default, follows the most recent builds, forcing updates to the latest released and unreleased builds. + ## Other Common Operations ### Stopping DataHub diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/SearchableFieldSpecExtractor.java b/entity-registry/src/main/java/com/linkedin/metadata/models/SearchableFieldSpecExtractor.java index 2ffd9283ed456..8f2f42cd69cae 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/models/SearchableFieldSpecExtractor.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/models/SearchableFieldSpecExtractor.java @@ -155,7 +155,8 @@ private void extractSearchableAnnotation(final Object annotationObj, final DataS annotation.getBoostScore(), annotation.getHasValuesFieldName(), annotation.getNumValuesFieldName(), - annotation.getWeightsPerFieldValue()); + annotation.getWeightsPerFieldValue(), + annotation.getFieldNameAliases()); } } log.debug("Searchable annotation for field: {} : {}", schemaPathSpec, annotation); diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java b/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java index f2e65c771c6eb..d5e5044f95c23 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java @@ -4,7 +4,10 @@ import com.google.common.collect.ImmutableSet; import com.linkedin.data.schema.DataSchema; import com.linkedin.metadata.models.ModelValidationException; + +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; @@ -19,9 +22,10 @@ @Value public class SearchableAnnotation { + public static final String FIELD_NAME_ALIASES = "fieldNameAliases"; public static final String ANNOTATION_NAME = "Searchable"; private static final Set DEFAULT_QUERY_FIELD_TYPES = - ImmutableSet.of(FieldType.TEXT, FieldType.TEXT_PARTIAL, FieldType.URN, FieldType.URN_PARTIAL); + ImmutableSet.of(FieldType.TEXT, FieldType.TEXT_PARTIAL, FieldType.WORD_GRAM, FieldType.URN, FieldType.URN_PARTIAL); // Name of the field in the search index. Defaults to the field name in the schema String fieldName; @@ -47,6 +51,8 @@ public class SearchableAnnotation { Optional numValuesFieldName; // (Optional) Weights to apply to score for a given value Map weightsPerFieldValue; + // (Optional) Aliases for this given field that can be used for sorting etc.
+ List fieldNameAliases; public enum FieldType { KEYWORD, @@ -59,7 +65,8 @@ public enum FieldType { COUNT, DATETIME, OBJECT, - BROWSE_PATH_V2 + BROWSE_PATH_V2, + WORD_GRAM } @Nonnull @@ -93,6 +100,7 @@ public static SearchableAnnotation fromPegasusAnnotationObject(@Nonnull final Ob final Optional numValuesFieldName = AnnotationUtils.getField(map, "numValuesFieldName", String.class); final Optional weightsPerFieldValueMap = AnnotationUtils.getField(map, "weightsPerFieldValue", Map.class).map(m -> (Map) m); + final List fieldNameAliases = getFieldNameAliases(map); final FieldType resolvedFieldType = getFieldType(fieldType, schemaDataType); return new SearchableAnnotation( @@ -107,7 +115,8 @@ public static SearchableAnnotation fromPegasusAnnotationObject(@Nonnull final Ob boostScore.orElse(1.0), hasValuesFieldName, numValuesFieldName, - weightsPerFieldValueMap.orElse(ImmutableMap.of())); + weightsPerFieldValueMap.orElse(ImmutableMap.of()), + fieldNameAliases); } private static FieldType getFieldType(Optional maybeFieldType, DataSchema.Type schemaDataType) { @@ -155,4 +164,15 @@ private static String capitalizeFirstLetter(String str) { return str.substring(0, 1).toUpperCase() + str.substring(1); } } + + private static List getFieldNameAliases(Map map) { + final List aliases = new ArrayList<>(); + final Optional fieldNameAliases = AnnotationUtils.getField(map, FIELD_NAME_ALIASES, List.class); + if (fieldNameAliases.isPresent()) { + for (Object alias : fieldNameAliases.get()) { + aliases.add((String) alias); + } + } + return aliases; + } } diff --git a/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java b/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java index 1ab5ff640ce32..3618108970afa 100644 --- a/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java +++ b/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java @@ -142,7 +142,7 @@ private void validateTestEntityInfo(final AspectSpec testEntityInfo) { assertEquals(new TestEntityInfo().schema().getFullName(), testEntityInfo.getPegasusSchema().getFullName()); // Assert on Searchable Fields - assertEquals(9, testEntityInfo.getSearchableFieldSpecs().size()); + assertEquals(testEntityInfo.getSearchableFieldSpecs().size(), 10); assertEquals("customProperties", testEntityInfo.getSearchableFieldSpecMap().get( new PathSpec("customProperties").toString()).getSearchableAnnotation().getFieldName()); assertEquals(SearchableAnnotation.FieldType.KEYWORD, testEntityInfo.getSearchableFieldSpecMap().get( @@ -158,6 +158,11 @@ private void validateTestEntityInfo(final AspectSpec testEntityInfo) { assertEquals(SearchableAnnotation.FieldType.TEXT_PARTIAL, testEntityInfo.getSearchableFieldSpecMap().get( new PathSpec("textArrayField", "*").toString()) .getSearchableAnnotation().getFieldType()); + assertEquals("wordGramField", testEntityInfo.getSearchableFieldSpecMap().get( + new PathSpec("wordGramField").toString()).getSearchableAnnotation().getFieldName()); + assertEquals(SearchableAnnotation.FieldType.WORD_GRAM, testEntityInfo.getSearchableFieldSpecMap().get( + new PathSpec("wordGramField").toString()) + .getSearchableAnnotation().getFieldType()); assertEquals("nestedIntegerField", testEntityInfo.getSearchableFieldSpecMap().get( new PathSpec("nestedRecordField", "nestedIntegerField").toString()).getSearchableAnnotation().getFieldName()); assertEquals(SearchableAnnotation.FieldType.COUNT, testEntityInfo.getSearchableFieldSpecMap().get( 
diff --git a/gradle/docker/docker.gradle b/gradle/docker/docker.gradle index f0bb4a5500b33..db2979a8ff6dc 100644 --- a/gradle/docker/docker.gradle +++ b/gradle/docker/docker.gradle @@ -21,6 +21,7 @@ ext.getDockerContainers = { ext.cleanLocalDockerImages = { String docker_registry, String docker_repo, String docker_tag -> + println("Docker image string: ${docker_registry}/${docker_repo}:${docker_tag}") def containers = getDockerContainers(docker_registry, docker_repo, docker_tag) if(!containers.isEmpty()) { println "Stopping containers: $containers" @@ -35,6 +36,7 @@ ext.cleanLocalDockerImages = { if(!images.isEmpty()) { println "Removing images: $images" exec { + ignoreExitValue true // may not work if used by downstream image commandLine = ["docker", "rmi", "-f"] + images } } diff --git a/metadata-dao-impl/kafka-producer/build.gradle b/metadata-dao-impl/kafka-producer/build.gradle index 5b40eb5f32232..6b08ac50a4c17 100644 --- a/metadata-dao-impl/kafka-producer/build.gradle +++ b/metadata-dao-impl/kafka-producer/build.gradle @@ -23,5 +23,8 @@ dependencies { implementation(externalDependency.log4jApi) { because("previous versions are vulnerable to CVE-2021-45105") } + implementation(externalDependency.snappy) { + because("previous versions are vulnerable to CVE-2023-34453 through CVE-2023-34455") + } } } \ No newline at end of file diff --git a/metadata-ingestion/build.gradle b/metadata-ingestion/build.gradle index f636cf25c67f7..199ccc59c21e0 100644 --- a/metadata-ingestion/build.gradle +++ b/metadata-ingestion/build.gradle @@ -21,11 +21,13 @@ task checkPythonVersion(type: Exec) { } task environmentSetup(type: Exec, dependsOn: checkPythonVersion) { + def sentinel_file = "${venv_name}/.venv_environment_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") + outputs.file(sentinel_file) commandLine 'bash', '-c', "${python_executable} -m venv ${venv_name} && " + - "${venv_name}/bin/python -m pip install --upgrade pip wheel 'setuptools>=63.0.0'" + "${venv_name}/bin/python -m pip install --upgrade pip wheel 'setuptools>=63.0.0' && " + + "touch ${sentinel_file}" } task runPreFlightScript(type: Exec, dependsOn: environmentSetup) { @@ -39,7 +41,6 @@ task runPreFlightScript(type: Exec, dependsOn: environmentSetup) { task installPackageOnly(type: Exec, dependsOn: runPreFlightScript) { def sentinel_file = "${venv_name}/.build_install_package_only_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") outputs.file(sentinel_file) commandLine 'bash', '-x', '-c', "${venv_name}/bin/pip install -e . &&" + @@ -47,9 +48,12 @@ task installPackageOnly(type: Exec, dependsOn: runPreFlightScript) { } task installPackage(type: Exec, dependsOn: installPackageOnly) { + def sentinel_file = "${venv_name}/.build_install_package_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") - commandLine 'bash', '-x', '-c', "${venv_name}/bin/pip install -e . ${extra_pip_requirements}" + outputs.file(sentinel_file) + commandLine 'bash', '-x', '-c', + "${venv_name}/bin/pip install -e . 
${extra_pip_requirements} && " + + "touch ${sentinel_file}" } task codegen(type: Exec, dependsOn: [environmentSetup, installPackage, ':metadata-events:mxe-schemas:build']) { @@ -63,7 +67,6 @@ task install(dependsOn: [installPackage, codegen]) task installDev(type: Exec, dependsOn: [install]) { def sentinel_file = "${venv_name}/.build_install_dev_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") outputs.file(sentinel_file) commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + @@ -75,7 +78,6 @@ task installDev(type: Exec, dependsOn: [install]) { task installAll(type: Exec, dependsOn: [install]) { def sentinel_file = "${venv_name}/.build_install_all_sentinel" inputs.file file('setup.py') - outputs.dir("${venv_name}") outputs.file(sentinel_file) commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + diff --git a/metadata-ingestion/docs/sources/kafka-connect/kafka-connect.md b/metadata-ingestion/docs/sources/kafka-connect/kafka-connect.md index 9d400460407c8..03bcef70e1860 100644 --- a/metadata-ingestion/docs/sources/kafka-connect/kafka-connect.md +++ b/metadata-ingestion/docs/sources/kafka-connect/kafka-connect.md @@ -1,5 +1,60 @@ ## Advanced Configurations +### Working with Platform Instances +If you have multiple instances of Kafka or of the source/sink systems referenced in your `kafka-connect` setup, you need to configure a platform instance for these systems in the `kafka-connect` recipe to generate correct lineage edges. You must have already set `platform_instance` in the recipes of the original source/sink systems. Refer to the document [Working with Platform Instances](https://datahubproject.io/docs/platform-instances) to learn more about this. + +There are two options available to declare a source/sink system's `platform_instance` in the `kafka-connect` recipe. If a single instance of a platform is used across all `kafka-connect` connectors, you can use `platform_instance_map` to specify the platform_instance to use for that platform when constructing URNs for lineage. + +Example: +```yml + # Map of platform name to platform instance + platform_instance_map: + snowflake: snowflake_platform_instance + mysql: mysql_platform_instance + +``` +If multiple instances of a platform are used across `kafka-connect` connectors, you need to specify the platform_instance to use for the platform for every connector. + +#### Example - Multiple MySQL Source Connectors each reading from a different mysql instance +```yml + # Map of platform name to platform instance per connector + connect_to_platform_map: + mysql_connector1: + mysql: mysql_instance1 + + mysql_connector2: + mysql: mysql_instance2 +``` +Here mysql_connector1 and mysql_connector2 are the names of MySQL source connectors as defined in the `kafka-connect` connector config. + +#### Example - Multiple MySQL Source Connectors each reading from a different mysql instance and writing to a different kafka cluster +```yml + connect_to_platform_map: + mysql_connector1: + mysql: mysql_instance1 + kafka: kafka_instance1 + + mysql_connector2: + mysql: mysql_instance2 + kafka: kafka_instance2 +``` +You can also use a combination of `platform_instance_map` and `connect_to_platform_map` in your recipe, as sketched below. Note that the platform_instance specified for a connector in `connect_to_platform_map` always takes precedence, even if a platform_instance for the same platform is set in `platform_instance_map`. + +If you do not use `platform_instance` in the original source/sink recipes, you do not need to specify it in the above configurations.
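As a rough combined example (the connector and instance names below are hypothetical), a recipe can declare defaults in `platform_instance_map` and override them per connector in `connect_to_platform_map`:

```yml
  # Default platform instances, used when a connector has no specific override
  platform_instance_map:
    mysql: mysql_default_instance
    kafka: kafka_default_instance

  # Per-connector overrides; these take precedence over platform_instance_map
  connect_to_platform_map:
    mysql_connector2:
      mysql: mysql_instance2
      kafka: kafka_instance2
```

With this configuration, lineage URNs for `mysql_connector2` are built with `mysql_instance2` and `kafka_instance2`, while all other connectors fall back to the default instances.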
+ +Note that you do not need to specify platform_instance for BigQuery. + +#### Example - Multiple BigQuery Sink Connectors each writing to a different kafka cluster +```yml + connect_to_platform_map: + bigquery_connector1: + kafka: kafka_instance1 + + bigquery_connector2: + kafka: kafka_instance2 +``` + +### Provided Configurations from External Sources Kafka Connect supports pluggable configuration providers which can load configuration data from external sources at runtime. These values are not available to DataHub ingestion source through Kafka Connect APIs. If you are using such provided configurations to specify connection url (database, etc) in Kafka Connect connector configuration then you will need also add these in `provided_configs` section in recipe for DataHub to generate correct lineage. ```yml diff --git a/metadata-ingestion/docs/sources/kafka-connect/kafka-connect_recipe.yml b/metadata-ingestion/docs/sources/kafka-connect/kafka-connect_recipe.yml index f5e33e661622d..cacbda5ca078a 100644 --- a/metadata-ingestion/docs/sources/kafka-connect/kafka-connect_recipe.yml +++ b/metadata-ingestion/docs/sources/kafka-connect/kafka-connect_recipe.yml @@ -3,14 +3,16 @@ source: config: # Coordinates connect_uri: "http://localhost:8083" - + # Credentials username: admin password: password # Optional - platform_instance_map: - bigquery: bigquery_platform_instance_id - + # Platform instance mapping to use when constructing URNs. + # Use if single instance of platform is referred across connectors. + platform_instance_map: + mysql: mysql_platform_instance + sink: - # sink configs \ No newline at end of file + # sink configs diff --git a/metadata-ingestion/docs/sources/mssql/mssql_pre.md b/metadata-ingestion/docs/sources/mssql/mssql_pre.md new file mode 100644 index 0000000000000..396581966e691 --- /dev/null +++ b/metadata-ingestion/docs/sources/mssql/mssql_pre.md @@ -0,0 +1,14 @@ +### Prerequisites + +If you want to ingest MSSQL Jobs and stored procedures (with code), the user credentials need the proper privileges. + +Script for granting the privileges: +``` +USE MSDB +GRANT SELECT ON OBJECT::msdb.dbo.sysjobsteps TO 'USERNAME' +GRANT SELECT ON OBJECT::msdb.dbo.sysjobs TO 'USERNAME' + +USE 'DATA_DB_NAME' +GRANT VIEW DEFINITION TO 'USERNAME' +GRANT SELECT ON OBJECT::sys.sql_expression_dependencies TO 'USERNAME' +``` \ No newline at end of file diff --git a/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md b/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md index 9a381fb351aec..75bd579417a48 100644 --- a/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md +++ b/metadata-ingestion/docs/sources/snowflake/snowflake_pre.md @@ -99,6 +99,24 @@ The steps slightly differ based on which you decide to use. including `client_id` and `client_secret`, plus your Okta user's `Username` and `Password` * Note: the `username` and `password` config options are not nested under `oauth_config` +### Snowflake Shares +If you are using [Snowflake Shares](https://docs.snowflake.com/en/user-guide/data-sharing-provider) to share data across different snowflake accounts, and you have set up DataHub recipes for ingesting metadata from all these accounts, you may end up having multiple similar dataset entities corresponding to virtual versions of the same table in different snowflake accounts.
DataHub Snowflake connector can automatically link such tables together through Siblings and Lineage relationship if user provides information necessary to establish the relationship using configuration `shares` in recipe. + +#### Example +- Snowflake account `account1` (ingested as platform_instance `instance1`) owns a database `db1`. A share `X` is created in `account1` that includes database `db1` along with schemas and tables inside it. +- Now, `X` is shared with snowflake account `account2` (ingested as platform_instance `instance2`). A database `db1_from_X` is created from inbound share `X` in `account2`. In this case, all tables and views included in share `X` will also be present in `instance2`.`db1_from_X`. +- This can be represented in `shares` configuration section as + ```yaml + shares: + X: # name of the share + database_name: db1 + platform_instance: instance1 + consumers: # list of all databases created from share X + - database_name: db1_from_X + platform_instance: instance2 + + ``` +- If share `X` is shared with more snowflake accounts and database is created from share `X` in those account then additional entries need to be added in `consumers` list for share `X`, one per snowflake account. The same `shares` config can then be copied across recipes of all accounts. ### Caveats - Some of the features are only available in the Snowflake Enterprise Edition. This doc has notes mentioning where this applies. diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index e08c41a8ac78c..ded9186e08a22 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -358,7 +358,7 @@ def get_long_description(): "mysql": mysql, # mariadb should have same dependency as mysql "mariadb": sql_common | {"pymysql>=1.0.2"}, - "okta": {"okta~=1.7.0"}, + "okta": {"okta~=1.7.0", "nest-asyncio"}, "oracle": sql_common | {"cx_Oracle"}, "postgres": sql_common | {"psycopg2-binary", "GeoAlchemy2"}, "presto": sql_common | pyhive_common | trino, @@ -376,6 +376,7 @@ def get_long_description(): "salesforce": {"simple-salesforce"}, "snowflake": snowflake_common | usage_common | sqlglot_lib, "sqlalchemy": sql_common, + "sql-queries": usage_common | sqlglot_lib, "superset": { "requests", "sqlalchemy", @@ -388,7 +389,7 @@ def get_long_description(): "trino": sql_common | trino, "starburst-trino-usage": sql_common | usage_common | trino, "nifi": {"requests", "packaging", "requests-gssapi"}, - "powerbi": microsoft_common | {"lark[regex]==1.1.4", "sqlparse"}, + "powerbi": microsoft_common | {"lark[regex]==1.1.4", "sqlparse"} | sqlglot_lib, "powerbi-report-server": powerbi_report_server, "vertica": sql_common | {"vertica-sqlalchemy-dialect[vertica-python]==0.0.8"}, "unity-catalog": databricks | sqllineage_lib, @@ -454,7 +455,7 @@ def get_long_description(): "mypy==1.0.0", # pydantic 1.8.2 is incompatible with mypy 0.910. # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910. 
- "pydantic>=1.9.0", + "pydantic>=1.10.0", *test_api_requirements, pytest_dep, "pytest-asyncio>=0.16.0", @@ -608,6 +609,7 @@ def get_long_description(): "demo-data = datahub.ingestion.source.demo_data.DemoDataSource", "unity-catalog = datahub.ingestion.source.unity.source:UnityCatalogSource", "gcs = datahub.ingestion.source.gcs.gcs_source:GCSSource", + "sql-queries = datahub.ingestion.source.sql_queries:SqlQueriesSource", ], "datahub.ingestion.transformer.plugins": [ "simple_remove_dataset_ownership = datahub.ingestion.transformer.remove_dataset_ownership:SimpleRemoveDatasetOwnership", diff --git a/metadata-ingestion/src/datahub/cli/docker_cli.py b/metadata-ingestion/src/datahub/cli/docker_cli.py index 918f610ce4635..9fde47c82873c 100644 --- a/metadata-ingestion/src/datahub/cli/docker_cli.py +++ b/metadata-ingestion/src/datahub/cli/docker_cli.py @@ -893,6 +893,7 @@ def download_compose_files( tmp_file.write(quickstart_download_response.content) logger.debug(f"Copied to {path}") if kafka_setup: + base_url = get_docker_compose_base_url(compose_git_ref) kafka_setup_github_file = f"{base_url}/{KAFKA_SETUP_QUICKSTART_COMPOSE_FILE}" default_kafka_compose_file = ( diff --git a/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py b/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py new file mode 100644 index 0000000000000..071d590f270f8 --- /dev/null +++ b/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py @@ -0,0 +1,289 @@ +import logging +import time +from collections import defaultdict +from dataclasses import dataclass, field +from datetime import datetime +from typing import Collection, Dict, Iterable, List, Optional, Set + +from datahub.emitter.mce_builder import make_schema_field_urn +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.usage.usage_common import BaseUsageConfig, UsageAggregator +from datahub.metadata.schema_classes import ( + AuditStampClass, + DatasetLineageTypeClass, + FineGrainedLineageClass, + FineGrainedLineageDownstreamTypeClass, + FineGrainedLineageUpstreamTypeClass, + OperationClass, + OperationTypeClass, + UpstreamClass, + UpstreamLineageClass, +) +from datahub.utilities.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult + +logger = logging.getLogger(__name__) + +# TODO: Use this over other sources' equivalent code, if possible + +DatasetUrn = str +FieldUrn = str +UserUrn = str + + +@dataclass +class LineageEdge: + """Stores information about a single lineage edge, from an upstream table to a downstream table.""" + + downstream_urn: DatasetUrn + upstream_urn: DatasetUrn + audit_stamp: Optional[datetime] + actor: Optional[UserUrn] + type: str = DatasetLineageTypeClass.TRANSFORMED + + # Maps downstream_col -> {upstream_col} + column_map: Dict[str, Set[str]] = field(default_factory=lambda: defaultdict(set)) + + def gen_upstream_aspect(self) -> UpstreamClass: + return UpstreamClass( + auditStamp=AuditStampClass( + time=int(self.audit_stamp.timestamp() * 1000), actor=self.actor or "" + ) + if self.audit_stamp + else None, + dataset=self.upstream_urn, + type=self.type, + ) + + def gen_fine_grained_lineage_aspects(self) -> Iterable[FineGrainedLineageClass]: + for downstream_col, upstream_cols in self.column_map.items(): + yield FineGrainedLineageClass( + upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET, + # Sort to avoid creating multiple aspects in backend with same lineage but different order + upstreams=sorted( + 
make_schema_field_urn(self.upstream_urn, col) + for col in upstream_cols + ), + downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, + downstreams=[ + make_schema_field_urn(self.downstream_urn, downstream_col) + ], + ) + + +@dataclass +class SqlParsingBuilder: + # Open question: does it make sense to iterate over out_tables? When will we have multiple? + + generate_lineage: bool = True + generate_usage_statistics: bool = True + generate_operations: bool = True + usage_config: Optional[BaseUsageConfig] = None + + # TODO: Make inner dict a FileBackedDict and make LineageEdge frozen + # Builds up a single LineageEdge for each upstream -> downstream pair + _lineage_map: Dict[DatasetUrn, Dict[DatasetUrn, LineageEdge]] = field( + default_factory=lambda: defaultdict(dict), init=False + ) + + # TODO: Replace with FileBackedDict approach like in BigQuery usage + _usage_aggregator: UsageAggregator[DatasetUrn] = field(init=False) + + def __post_init__(self) -> None: + if self.usage_config: + self._usage_aggregator = UsageAggregator(self.usage_config) + else: + logger.info("No usage config provided, not generating usage statistics") + self.generate_usage_statistics = False + + def process_sql_parsing_result( + self, + result: SqlParsingResult, + *, + query: str, + query_timestamp: Optional[datetime] = None, + is_view_ddl: bool = False, + user: Optional[UserUrn] = None, + custom_operation_type: Optional[str] = None, + include_urns: Optional[Set[DatasetUrn]] = None, + ) -> Iterable[MetadataWorkUnit]: + """Process a single query and yield any generated workunits. + + Args: + result: The result of parsing the query, or a mock result if parsing failed. + query: The SQL query to parse and process. + query_timestamp: When the query was run. + is_view_ddl: Whether the query is a DDL statement that creates a view. + user: The urn of the user who ran the query. + custom_operation_type: Platform-specific operation type, used if the operation type can't be parsed. + include_urns: If provided, only generate workunits for these urns. 
+ """ + downstreams_to_ingest = result.out_tables + upstreams_to_ingest = result.in_tables + if include_urns: + logger.debug(f"Skipping urns {set(downstreams_to_ingest) - include_urns}") + downstreams_to_ingest = list(set(downstreams_to_ingest) & include_urns) + upstreams_to_ingest = list(set(upstreams_to_ingest) & include_urns) + + if self.generate_lineage: + for downstream_urn in downstreams_to_ingest: + _merge_lineage_data( + downstream_urn=downstream_urn, + upstream_urns=result.in_tables, + column_lineage=result.column_lineage, + upstream_edges=self._lineage_map[downstream_urn], + query_timestamp=query_timestamp, + is_view_ddl=is_view_ddl, + user=user, + ) + + if self.generate_usage_statistics and query_timestamp is not None: + upstream_fields = _compute_upstream_fields(result) + for upstream_urn in upstreams_to_ingest: + self._usage_aggregator.aggregate_event( + resource=upstream_urn, + start_time=query_timestamp, + query=query, + user=user, + fields=sorted(upstream_fields.get(upstream_urn, [])), + ) + + if self.generate_operations and query_timestamp is not None: + for downstream_urn in downstreams_to_ingest: + yield from _gen_operation_workunit( + result, + downstream_urn=downstream_urn, + query_timestamp=query_timestamp, + user=user, + custom_operation_type=custom_operation_type, + ) + + def add_lineage( + self, + downstream_urn: DatasetUrn, + upstream_urns: Collection[DatasetUrn], + timestamp: Optional[datetime] = None, + is_view_ddl: bool = False, + user: Optional[UserUrn] = None, + ) -> None: + """Manually add a single upstream -> downstream lineage edge, e.g. if sql parsing fails.""" + _merge_lineage_data( + downstream_urn=downstream_urn, + upstream_urns=upstream_urns, + column_lineage=None, + upstream_edges=self._lineage_map[downstream_urn], + query_timestamp=timestamp, + is_view_ddl=is_view_ddl, + user=user, + ) + + def gen_workunits(self) -> Iterable[MetadataWorkUnit]: + if self.generate_lineage: + yield from self._gen_lineage_workunits() + if self.generate_usage_statistics: + yield from self._gen_usage_statistics_workunits() + + def _gen_lineage_workunits(self) -> Iterable[MetadataWorkUnit]: + for downstream_urn in self._lineage_map: + upstreams: List[UpstreamClass] = [] + fine_upstreams: List[FineGrainedLineageClass] = [] + for upstream_urn, edge in self._lineage_map[downstream_urn].items(): + upstreams.append(edge.gen_upstream_aspect()) + fine_upstreams.extend(edge.gen_fine_grained_lineage_aspects()) + + upstream_lineage = UpstreamLineageClass( + upstreams=sorted(upstreams, key=lambda x: x.dataset), + fineGrainedLineages=sorted( + fine_upstreams, + key=lambda x: (x.downstreams, x.upstreams), + ) + or None, + ) + yield MetadataChangeProposalWrapper( + entityUrn=downstream_urn, aspect=upstream_lineage + ).as_workunit() + + def _gen_usage_statistics_workunits(self) -> Iterable[MetadataWorkUnit]: + yield from self._usage_aggregator.generate_workunits( + resource_urn_builder=lambda urn: urn, user_urn_builder=lambda urn: urn + ) + + +def _merge_lineage_data( + downstream_urn: DatasetUrn, + *, + upstream_urns: Collection[DatasetUrn], + column_lineage: Optional[List[ColumnLineageInfo]], + upstream_edges: Dict[DatasetUrn, LineageEdge], + query_timestamp: Optional[datetime], + is_view_ddl: bool, + user: Optional[UserUrn], +) -> None: + for upstream_urn in upstream_urns: + edge = upstream_edges.setdefault( + upstream_urn, + LineageEdge( + downstream_urn=downstream_urn, + upstream_urn=upstream_urn, + audit_stamp=query_timestamp, + actor=user, + type=DatasetLineageTypeClass.VIEW + if 
is_view_ddl + else DatasetLineageTypeClass.TRANSFORMED, + ), + ) + if query_timestamp and ( # Use the most recent query + edge.audit_stamp is None or query_timestamp > edge.audit_stamp + ): + edge.audit_stamp = query_timestamp + if user: + edge.actor = user + + # Note: Inefficient as we loop through all column_lineage entries for each downstream table + for cl in column_lineage or []: + if cl.downstream.table == downstream_urn: + for upstream_column_info in cl.upstreams: + if upstream_column_info.table not in upstream_urns: + continue + column_map = upstream_edges[upstream_column_info.table].column_map + column_map[cl.downstream.column].add(upstream_column_info.column) + + +def _compute_upstream_fields( + result: SqlParsingResult, +) -> Dict[DatasetUrn, Set[DatasetUrn]]: + upstream_fields: Dict[DatasetUrn, Set[DatasetUrn]] = defaultdict(set) + for cl in result.column_lineage or []: + for upstream in cl.upstreams: + upstream_fields[upstream.table].add(upstream.column) + return upstream_fields + + +def _gen_operation_workunit( + result: SqlParsingResult, + *, + downstream_urn: DatasetUrn, + query_timestamp: datetime, + user: Optional[UserUrn], + custom_operation_type: Optional[str], +) -> Iterable[MetadataWorkUnit]: + operation_type = result.query_type.to_operation_type() + # Filter out SELECT and other undesired statements + if operation_type is None: + return + elif operation_type == OperationTypeClass.UNKNOWN: + if custom_operation_type is None: + return + else: + operation_type = OperationTypeClass.CUSTOM + + aspect = OperationClass( + timestampMillis=int(time.time() * 1000), + operationType=operation_type, + lastUpdatedTimestamp=int(query_timestamp.timestamp() * 1000), + actor=user, + customOperationType=custom_operation_type, + ) + yield MetadataChangeProposalWrapper( + entityUrn=downstream_urn, aspect=aspect + ).as_workunit() diff --git a/metadata-ingestion/src/datahub/ingestion/api/registry.py b/metadata-ingestion/src/datahub/ingestion/api/registry.py index ec4884e7e805f..56ea716948199 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/registry.py +++ b/metadata-ingestion/src/datahub/ingestion/api/registry.py @@ -127,7 +127,7 @@ def _ensure_not_lazy(self, key: str) -> Union[Type[T], Exception]: plugin_class = import_path(path) self.register(key, plugin_class, override=True) return plugin_class - except (AssertionError, ImportError) as e: + except Exception as e: self.register_disabled(key, e, override=True) return e diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py index 8e313e92cbf84..c943b83a887ed 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py @@ -435,6 +435,7 @@ def _field_from_complex_type( field_path._set_parent_type_if_not_exists( DataHubType(type=MapTypeClass, nested_type=value_type) ) + # FIXME: description not set. This is present in schema["description"]. 
yield from JsonSchemaTranslator.get_fields( JsonSchemaTranslator._get_type_from_schema( schema["additionalProperties"] diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index 243c1848279c7..50ea69b6c13a9 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from datetime import datetime from json.decoder import JSONDecodeError -from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Tuple, Type from avro.schema import RecordSchema from deprecated import deprecated @@ -38,6 +38,8 @@ SystemMetadataClass, TelemetryClientIdClass, ) +from datahub.utilities.perf_timer import PerfTimer +from datahub.utilities.urns.dataset_urn import DatasetUrn from datahub.utilities.urns.urn import Urn, guess_entity_type if TYPE_CHECKING: @@ -957,7 +959,11 @@ def delete_references_to_urn( @functools.lru_cache() def _make_schema_resolver( - self, platform: str, platform_instance: Optional[str], env: str + self, + platform: str, + platform_instance: Optional[str], + env: str, + include_graph: bool = True, ) -> "SchemaResolver": from datahub.utilities.sqlglot_lineage import SchemaResolver @@ -965,8 +971,50 @@ def _make_schema_resolver( platform=platform, platform_instance=platform_instance, env=env, - graph=self, + graph=self if include_graph else None, + ) + + def initialize_schema_resolver_from_datahub( + self, platform: str, platform_instance: Optional[str], env: str + ) -> Tuple["SchemaResolver", Set[str]]: + logger.info("Initializing schema resolver") + + # TODO: Filter on platform instance? + logger.info(f"Fetching urns for platform {platform}, env {env}") + with PerfTimer() as timer: + urns = set( + self.get_urns_by_filter( + entity_types=[DatasetUrn.ENTITY_TYPE], + platform=platform, + env=env, + batch_size=3000, + ) + ) + logger.info( + f"Fetched {len(urns)} urns in {timer.elapsed_seconds()} seconds" + ) + + schema_resolver = self._make_schema_resolver( + platform, platform_instance, env, include_graph=False ) + with PerfTimer() as timer: + count = 0 + for i, urn in enumerate(urns): + if i % 1000 == 0: + logger.debug(f"Loaded {i} schema metadata") + try: + schema_metadata = self.get_aspect(urn, SchemaMetadataClass) + if schema_metadata: + schema_resolver.add_schema_metadata(urn, schema_metadata) + count += 1 + except Exception: + logger.warning("Failed to load schema metadata", exc_info=True) + logger.info( + f"Loaded {count} schema metadata in {timer.elapsed_seconds()} seconds" + ) + + logger.info("Finished initializing schema resolver") + return schema_resolver, urns def parse_sql_lineage( self, @@ -982,9 +1030,7 @@ def parse_sql_lineage( # Cache the schema resolver to make bulk parsing faster. 
schema_resolver = self._make_schema_resolver( - platform=platform, - platform_instance=platform_instance, - env=env, + platform=platform, platform_instance=platform_instance, env=env ) return sqlglot_lineage( diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index d1f39a3ba1ba6..1446812c29216 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -129,6 +129,7 @@ # Handle table snapshots # See https://cloud.google.com/bigquery/docs/table-snapshots-intro. SNAPSHOT_TABLE_REGEX = re.compile(r"^(.+)@(\d{13})$") +CLUSTERING_COLUMN_TAG = "CLUSTERING_COLUMN" # We can't use close as it is not called if the ingestion is not successful @@ -428,7 +429,9 @@ def get_dataplatform_instance_aspect( ) -> MetadataWorkUnit: aspect = DataPlatformInstanceClass( platform=make_data_platform_urn(self.platform), - instance=make_dataplatform_instance_urn(self.platform, project_id), + instance=make_dataplatform_instance_urn(self.platform, project_id) + if self.config.include_data_platform_instance + else None, ) return MetadataChangeProposalWrapper( entityUrn=dataset_urn, aspect=aspect @@ -1151,6 +1154,21 @@ def gen_schema_fields(self, columns: List[BigqueryColumn]) -> List[SchemaField]: field.description = col.comment schema_fields[idx] = field else: + tags = [] + if col.is_partition_column: + tags.append( + TagAssociationClass(make_tag_urn(Constants.TAG_PARTITION_KEY)) + ) + + if col.cluster_column_position is not None: + tags.append( + TagAssociationClass( + make_tag_urn( + f"{CLUSTERING_COLUMN_TAG}_{col.cluster_column_position}" + ) + ) + ) + field = SchemaField( fieldPath=col.name, type=SchemaFieldDataType( @@ -1160,15 +1178,7 @@ def gen_schema_fields(self, columns: List[BigqueryColumn]) -> List[SchemaField]: nativeDataType=col.data_type, description=col.comment, nullable=col.is_nullable, - globalTags=GlobalTagsClass( - tags=[ - TagAssociationClass( - make_tag_urn(Constants.TAG_PARTITION_KEY) - ) - ] - ) - if col.is_partition_column - else GlobalTagsClass(tags=[]), + globalTags=GlobalTagsClass(tags=tags), ) schema_fields.append(field) last_id = col.ordinal_position diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index e5730ee87daf4..0f2082c5e53bf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -81,6 +81,13 @@ class BigQueryV2Config( description="Whether to populate BigQuery Console url to Datasets/Tables", ) + include_data_platform_instance: bool = Field( + default=False, + description="Whether to create a DataPlatformInstance aspect, equal to the BigQuery project id." + " If enabled, will cause redundancy in the browse path for BigQuery entities in the UI," + " because the project id is represented as the top-level container.", + ) + debug_include_full_payloads: bool = Field( default=False, description="Include full payload into events. 
It is only for debugging and internal use.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py index 2450dbd0e2391..f8256f8e6fed6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py @@ -33,6 +33,7 @@ class BigqueryTableType: class BigqueryColumn(BaseColumn): field_path: str is_partition_column: bool + cluster_column_position: Optional[int] RANGE_PARTITION_NAME: str = "RANGE" @@ -285,7 +286,8 @@ class BigqueryQuery: CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, description as comment, c.is_hidden as is_hidden, - c.is_partitioning_column as is_partitioning_column + c.is_partitioning_column as is_partitioning_column, + c.clustering_ordinal_position as clustering_ordinal_position, from `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMNS c join `{project_id}`.`{dataset_name}`.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS as cfp on cfp.table_name = c.table_name @@ -307,6 +309,7 @@ class BigqueryQuery: description as comment, c.is_hidden as is_hidden, c.is_partitioning_column as is_partitioning_column, + c.clustering_ordinal_position as clustering_ordinal_position, -- We count the columns to be able limit it later row_number() over (partition by c.table_catalog, c.table_schema, c.table_name order by c.ordinal_position asc, c.data_type DESC) as column_num, -- Getting the maximum shard for each table @@ -333,6 +336,7 @@ class BigqueryQuery: CASE WHEN CONTAINS_SUBSTR(field_path, ".") THEN NULL ELSE c.data_type END as data_type, c.is_hidden as is_hidden, c.is_partitioning_column as is_partitioning_column, + c.clustering_ordinal_position as clustering_ordinal_position, description as comment from `{table_identifier.project_id}`.`{table_identifier.dataset}`.INFORMATION_SCHEMA.COLUMNS as c @@ -583,6 +587,7 @@ def get_columns_for_dataset( data_type=column.data_type, comment=column.comment, is_partition_column=column.is_partitioning_column == "YES", + cluster_column_position=column.clustering_ordinal_position, ) ) @@ -621,6 +626,7 @@ def get_columns_for_table( data_type=column.data_type, comment=column.comment, is_partition_column=column.is_partitioning_column == "YES", + cluster_column_position=column.clustering_ordinal_position, ) ) last_seen_table = column.table_name diff --git a/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py b/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py index dcaec4e45737f..0bdcb115b377c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py +++ b/metadata-ingestion/src/datahub/ingestion/source/confluent_schema_registry.py @@ -113,7 +113,7 @@ def get_schema_str_replace_confluent_ref_avro( schema_seen = set() schema_str = self._compact_schema(schema.schema_str) for schema_ref in schema.references: - ref_subject = schema_ref["subject"] + ref_subject = schema_ref.subject if ref_subject in schema_seen: continue @@ -132,7 +132,7 @@ def get_schema_str_replace_confluent_ref_avro( # Replace only external type references with the reference schema recursively. # NOTE: The type pattern is dependent on _compact_schema. 
avro_type_kwd = '"type"' - ref_name = schema_ref["name"] + ref_name = schema_ref.name # Try by name first pattern_to_replace = f'{avro_type_kwd}:"{ref_name}"' if pattern_to_replace not in schema_str: @@ -164,7 +164,7 @@ def get_schemas_from_confluent_ref_protobuf( schema_ref: SchemaReference for schema_ref in schema.references: - ref_subject: str = schema_ref["subject"] + ref_subject: str = schema_ref.subject if ref_subject in schema_seen: continue reference_schema: RegisteredSchema = ( @@ -173,7 +173,7 @@ def get_schemas_from_confluent_ref_protobuf( schema_seen.add(ref_subject) all_schemas.append( ProtobufSchema( - name=schema_ref["name"], content=reference_schema.schema.schema_str + name=schema_ref.name, content=reference_schema.schema.schema_str ) ) return all_schemas @@ -192,19 +192,19 @@ def get_schemas_from_confluent_ref_json( schema_ref: SchemaReference for schema_ref in schema.references: - ref_subject: str = schema_ref["subject"] + ref_subject: str = schema_ref.subject if ref_subject in schema_seen: continue reference_schema: RegisteredSchema = ( self.schema_registry_client.get_version( - subject_name=ref_subject, version=schema_ref["version"] + subject_name=ref_subject, version=schema_ref.version ) ) schema_seen.add(ref_subject) all_schemas.extend( self.get_schemas_from_confluent_ref_json( reference_schema.schema, - name=schema_ref["name"], + name=schema_ref.name, subject=ref_subject, schema_seen=schema_seen, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py index 1cd5ed8164854..af9769bc9d94c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py @@ -162,9 +162,11 @@ class DBTCloudConfig(DBTCommonConfig): } _DBT_GRAPHQL_QUERY = """ -query DatahubMetadataQuery_{type}($jobId: Int!, $runId: Int) {{ - {type}(jobId: $jobId, runId: $runId) {{ +query DatahubMetadataQuery_{type}($jobId: BigInt!, $runId: BigInt) {{ + job(id: $jobId, runId: $runId) {{ + {type} {{ {fields} + }} }} }} """ @@ -218,7 +220,7 @@ def load_nodes(self) -> Tuple[List[DBTNode], Dict[str, Optional[str]]]: }, ) - raw_nodes.extend(data[node_type]) + raw_nodes.extend(data["job"][node_type]) nodes = [self._parse_into_dbt_node(node) for node in raw_nodes] diff --git a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py index a299023b88e64..5805790fe8bb7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py +++ b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py @@ -7,6 +7,7 @@ from time import sleep from typing import Dict, Iterable, List, Optional, Union +import nest_asyncio from okta.client import Client as OktaClient from okta.exceptions import OktaAPIException from okta.models import Group, GroupProfile, User, UserProfile, UserStatus @@ -51,6 +52,7 @@ ) logger = logging.getLogger(__name__) +nest_asyncio.apply() class OktaConfig(StatefulIngestionConfigBase, ConfigModel): diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py index c8a4c7a6ab8fa..b3fa5e3401c07 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py @@ -626,12 +626,17 @@ def _extract_lineages(self): @dataclass class DebeziumSourceConnector: connector_manifest: 
ConnectorManifest + report: KafkaConnectSourceReport def __init__( - self, connector_manifest: ConnectorManifest, config: KafkaConnectSourceConfig + self, + connector_manifest: ConnectorManifest, + config: KafkaConnectSourceConfig, + report: KafkaConnectSourceReport, ) -> None: self.connector_manifest = connector_manifest self.config = config + self.report = report self._extract_lineages() @dataclass @@ -683,10 +688,19 @@ def get_parser( database_name=connector_manifest.config.get("database.dbname"), ) elif connector_class == "io.debezium.connector.sqlserver.SqlServerConnector": + database_name = connector_manifest.config.get( + "database.names" + ) or connector_manifest.config.get("database.dbname") + + if "," in str(database_name): + raise Exception( + f"Only one database is supported for Debezium's SQL Server connector. Found: {database_name}" + ) + parser = self.DebeziumParser( source_platform="mssql", server_name=self.get_server_name(connector_manifest), - database_name=connector_manifest.config.get("database.dbname"), + database_name=database_name, ) elif connector_class == "io.debezium.connector.db2.Db2Connector": parser = self.DebeziumParser( @@ -707,29 +721,37 @@ def get_parser( def _extract_lineages(self): lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - source_platform = parser.source_platform - server_name = parser.server_name - database_name = parser.database_name - topic_naming_pattern = r"({0})\.(\w+\.\w+)".format(server_name) - if not self.connector_manifest.topic_names: - return lineages + try: + parser = self.get_parser(self.connector_manifest) + source_platform = parser.source_platform + server_name = parser.server_name + database_name = parser.database_name + topic_naming_pattern = r"({0})\.(\w+\.\w+)".format(server_name) - for topic in self.connector_manifest.topic_names: - found = re.search(re.compile(topic_naming_pattern), topic) + if not self.connector_manifest.topic_names: + return lineages - if found: - table_name = get_dataset_name(database_name, found.group(2)) + for topic in self.connector_manifest.topic_names: + found = re.search(re.compile(topic_naming_pattern), topic) - lineage = KafkaConnectLineage( - source_dataset=table_name, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - self.connector_manifest.lineages = lineages + if found: + table_name = get_dataset_name(database_name, found.group(2)) + + lineage = KafkaConnectLineage( + source_dataset=table_name, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + self.connector_manifest.lineages = lineages + except Exception as e: + self.report.report_warning( + self.connector_manifest.name, f"Error resolving lineage: {e}" + ) + + return @dataclass @@ -1061,7 +1083,9 @@ def get_connectors_manifest(self) -> List[ConnectorManifest]: "io.debezium.connector" ): connector_manifest = DebeziumSourceConnector( - connector_manifest=connector_manifest, config=self.config + connector_manifest=connector_manifest, + config=self.config, + report=self.report, ).connector_manifest elif ( connector_manifest.config.get(CONNECTOR_CLASS, "") diff --git a/metadata-ingestion/src/datahub/ingestion/source/ldap.py b/metadata-ingestion/src/datahub/ingestion/source/ldap.py index 085878245c60d..e1d035a96d42f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ldap.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ldap.py @@ -146,6 +146,11 @@ 
class LDAPSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin): description="Use pagination while do search query (enabled by default).", ) + use_email_as_username: bool = Field( + default=False, + description="Use email for users' usernames instead of username (disabled by default). \ + If enabled, the user and group urn would be having email as the id part of the urn.", + ) # default mapping for attrs user_attrs_map: Dict[str, Any] = {} group_attrs_map: Dict[str, Any] = {} @@ -266,10 +271,11 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: if dn is None: continue - if not attrs: + if not attrs or "objectClass" not in attrs: self.report.report_warning( "", - f"skipping {dn} because attrs is empty; check your permissions if this is unexpected", + f"skipping {dn} because attrs ({attrs}) does not contain expected data; " + f"check your permissions if this is unexpected", ) continue @@ -306,6 +312,7 @@ def handle_user(self, dn: str, attrs: Dict[str, Any]) -> Iterable[MetadataWorkUn work unit based on the information. """ manager_ldap = None + make_manager_urn = None if self.config.user_attrs_map["managerUrn"] in attrs: try: m_cn = attrs[self.config.user_attrs_map["managerUrn"]][0].decode() @@ -322,10 +329,19 @@ def handle_user(self, dn: str, attrs: Dict[str, Any]) -> Iterable[MetadataWorkUn result = self.ldap_client.result3(manager_msgid) if result[1]: _m_dn, m_attrs = result[1][0] + manager_ldap = guess_person_ldap(m_attrs, self.config, self.report) + + m_email = get_attr_or_none( + m_attrs, self.config.user_attrs_map["email"], manager_ldap + ) + make_manager_urn = ( + m_email if self.config.use_email_as_username else manager_ldap + ) + except ldap.LDAPError as e: self.report.report_warning(dn, f"manager LDAP search failed: {e}") - mce = self.build_corp_user_mce(dn, attrs, manager_ldap) + mce = self.build_corp_user_mce(dn, attrs, make_manager_urn) if mce: yield MetadataWorkUnit(dn, mce) else: @@ -387,8 +403,10 @@ def build_corp_user_mce( manager_urn = f"urn:li:corpuser:{manager_ldap}" if manager_ldap else None + make_user_urn = email if self.config.use_email_as_username else ldap_user + user_snapshot = CorpUserSnapshotClass( - urn=f"urn:li:corpuser:{ldap_user}", + urn=f"urn:li:corpuser:{make_user_urn}", aspects=[ CorpUserInfoClass( active=True, @@ -429,8 +447,10 @@ def build_corp_group_mce(self, attrs: dict) -> Optional[MetadataChangeEvent]: attrs, self.config.group_attrs_map["displayName"] ) + make_group_urn = email if self.config.use_email_as_username else full_name + group_snapshot = CorpGroupSnapshotClass( - urn=f"urn:li:corpGroup:{full_name}", + urn=f"urn:li:corpGroup:{make_group_urn}", aspects=[ CorpGroupInfoClass( email=email, diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index d568ddcb02afa..40b90d216348c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -34,6 +34,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalSourceReport, ) +from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( DatasetLineageTypeClass, FineGrainedLineageDownstreamType, @@ -76,6 +77,8 @@ from datahub.utilities.lossy_collections import LossyList, LossySet from datahub.utilities.url_util import remove_port_from_url 
+CORPUSER_DATAHUB = "urn:li:corpuser:datahub" + if TYPE_CHECKING: from datahub.ingestion.source.looker.lookml_source import ( LookerViewFileLoader, @@ -786,6 +789,7 @@ def _to_metadata_events( # noqa: C901 if self.upstream_views is not None: assert self.project_name is not None upstreams = [] + observed_lineage_ts = datetime.datetime.now(tz=datetime.timezone.utc) for view_ref in sorted(self.upstream_views): view_urn = LookerViewId( project_name=view_ref.project @@ -799,6 +803,10 @@ def _to_metadata_events( # noqa: C901 UpstreamClass( dataset=view_urn, type=DatasetLineageTypeClass.VIEW, + auditStamp=AuditStamp( + time=int(observed_lineage_ts.timestamp() * 1000), + actor=CORPUSER_DATAHUB, + ), ) ) view_name_to_urn_map[view_ref.include] = view_urn diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index 362b4e5530638..1a32afa2b7fdd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -6,7 +6,7 @@ import re import tempfile from dataclasses import dataclass, field as dataclass_field, replace -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from typing import ( Any, ClassVar, @@ -50,6 +50,7 @@ from datahub.ingestion.source.common.subtypes import DatasetSubTypes from datahub.ingestion.source.git.git_import import GitClone from datahub.ingestion.source.looker.looker_common import ( + CORPUSER_DATAHUB, LookerCommonConfig, LookerExplore, LookerUtil, @@ -83,6 +84,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent from datahub.metadata.schema_classes import ( + AuditStampClass, DatasetPropertiesClass, FineGrainedLineageClass, FineGrainedLineageUpstreamTypeClass, @@ -1615,11 +1617,16 @@ def _get_upstream_lineage( # Generate the upstream + fine grained lineage objects. upstreams = [] + observed_lineage_ts = datetime.now(tz=timezone.utc) fine_grained_lineages: List[FineGrainedLineageClass] = [] for upstream_dataset_urn in upstream_dataset_urns: upstream = UpstreamClass( dataset=upstream_dataset_urn, type=DatasetLineageTypeClass.VIEW, + auditStamp=AuditStampClass( + time=int(observed_lineage_ts.timestamp() * 1000), + actor=CORPUSER_DATAHUB, + ), ) upstreams.append(upstream) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index 31d067f984d2d..ffa685fb25826 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -121,6 +121,12 @@ class DataPlatformPair: powerbi_data_platform_name: str +@dataclass +class PowerBIPlatformDetail: + data_platform_pair: DataPlatformPair + data_platform_server: str + + class SupportedDataPlatform(Enum): POSTGRES_SQL = DataPlatformPair( powerbi_data_platform_name="PostgreSQL", datahub_data_platform_name="postgres" @@ -382,6 +388,15 @@ class PowerBiDashboardSourceConfig( description="The instance of the platform that all assets produced by this recipe belong to", ) + # Enable advance sql construct + enable_advance_lineage_sql_construct: bool = pydantic.Field( + default=False, + description="Whether to enable advance native sql construct for parsing like join, sub-queries. 
" + "along this flag , the native_query_parsing should be enabled. " + "By default convert_lineage_urns_to_lowercase is enabled, in-case if you have disabled it in previous ingestion execution then it may break lineage " + "as this option generates the upstream datasets URN in lowercase.", + ) + @validator("dataset_type_mapping") @classmethod def map_data_platform(cls, value): diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py index 396da2d79e3b7..baaa8d5b85ae1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py @@ -5,8 +5,8 @@ from datahub.ingestion.source.powerbi.config import ( PlatformDetail, PowerBiDashboardSourceConfig, + PowerBIPlatformDetail, ) -from datahub.ingestion.source.powerbi.m_query.resolver import DataPlatformTable logger = logging.getLogger(__name__) @@ -14,7 +14,7 @@ class AbstractDataPlatformInstanceResolver(ABC): @abstractmethod def get_platform_instance( - self, dataplatform_table: DataPlatformTable + self, data_platform_detail: PowerBIPlatformDetail ) -> PlatformDetail: pass @@ -32,10 +32,10 @@ class ResolvePlatformInstanceFromDatasetTypeMapping( BaseAbstractDataPlatformInstanceResolver ): def get_platform_instance( - self, dataplatform_table: DataPlatformTable + self, data_platform_detail: PowerBIPlatformDetail ) -> PlatformDetail: platform: Union[str, PlatformDetail] = self.config.dataset_type_mapping[ - dataplatform_table.data_platform_pair.powerbi_data_platform_name + data_platform_detail.data_platform_pair.powerbi_data_platform_name ] if isinstance(platform, PlatformDetail): @@ -48,13 +48,13 @@ class ResolvePlatformInstanceFromServerToPlatformInstance( BaseAbstractDataPlatformInstanceResolver ): def get_platform_instance( - self, dataplatform_table: DataPlatformTable + self, data_platform_detail: PowerBIPlatformDetail ) -> PlatformDetail: return ( self.config.server_to_platform_instance[ - dataplatform_table.datasource_server + data_platform_detail.data_platform_server ] - if dataplatform_table.datasource_server + if data_platform_detail.data_platform_server in self.config.server_to_platform_instance else PlatformDetail.parse_obj({}) ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py index 640bc4bd60d80..021c429c3c633 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py @@ -1,8 +1,12 @@ import logging -from typing import List +from typing import List, Optional import sqlparse +import datahub.utilities.sqlglot_lineage as sqlglot_l +from datahub.ingestion.api.common import PipelineContext +from datahub.utilities.sqlglot_lineage import SqlParsingResult + SPECIAL_CHARACTERS = ["#(lf)", "(lf)"] logger = logging.getLogger() @@ -45,3 +49,30 @@ def get_tables(native_query: str) -> List[str]: from_index = from_index + 1 return tables + + +def parse_custom_sql( + ctx: PipelineContext, + query: str, + schema: Optional[str], + database: Optional[str], + platform: str, + env: str, + platform_instance: Optional[str], +) -> Optional["SqlParsingResult"]: + + logger.debug("Using sqlglot_lineage to parse custom sql") + + 
sql_query = remove_special_characters(query) + + logger.debug(f"Parsing sql={sql_query}") + + return sqlglot_l.create_lineage_sql_parsed_result( + query=sql_query, + schema=schema, + database=database, + platform=platform, + platform_instance=platform_instance, + env=env, + graph=ctx.graph, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py index 83106c04529d1..8cc38c366c42a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py @@ -6,7 +6,14 @@ import lark from lark import Lark, Tree -from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.powerbi.config import ( + PowerBiDashboardSourceConfig, + PowerBiDashboardSourceReport, +) +from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( + AbstractDataPlatformInstanceResolver, +) from datahub.ingestion.source.powerbi.m_query import resolver, validator from datahub.ingestion.source.powerbi.m_query.data_classes import ( TRACE_POWERBI_MQUERY_PARSER, @@ -45,7 +52,9 @@ def _parse_expression(expression: str) -> Tree: def get_upstream_tables( table: Table, reporter: PowerBiDashboardSourceReport, - native_query_enabled: bool = True, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + ctx: PipelineContext, + config: PowerBiDashboardSourceConfig, parameters: Dict[str, str] = {}, ) -> List[resolver.DataPlatformTable]: if table.expression is None: @@ -58,7 +67,7 @@ def get_upstream_tables( parse_tree: Tree = _parse_expression(table.expression) valid, message = validator.validate_parse_tree( - parse_tree, native_query_enabled=native_query_enabled + parse_tree, native_query_enabled=config.native_query_parsing ) if valid is False: assert message is not None @@ -84,7 +93,11 @@ def get_upstream_tables( parse_tree=parse_tree, reporter=reporter, parameters=parameters, - ).resolve_to_data_platform_table_list() + ).resolve_to_data_platform_table_list( + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + ) except BaseException as e: reporter.report_warning(table.full_name, "Failed to process m-query expression") diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index e2b448124c89d..479f1decff903 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -6,11 +6,19 @@ from lark import Tree +import datahub.emitter.mce_builder as builder +from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.powerbi.config import ( DataPlatformPair, + PlatformDetail, + PowerBiDashboardSourceConfig, PowerBiDashboardSourceReport, + PowerBIPlatformDetail, SupportedDataPlatform, ) +from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( + AbstractDataPlatformInstanceResolver, +) from datahub.ingestion.source.powerbi.m_query import native_sql_parser, tree_function from datahub.ingestion.source.powerbi.m_query.data_classes import ( TRACE_POWERBI_MQUERY_PARSER, @@ -19,19 +27,98 @@ IdentifierAccessor, ) from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table +from 
datahub.utilities.sqlglot_lineage import SqlParsingResult logger = logging.getLogger(__name__) @dataclass class DataPlatformTable: - name: str - full_name: str - datasource_server: str data_platform_pair: DataPlatformPair + urn: str + + +def urn_to_lowercase(value: str, flag: bool) -> str: + if flag is True: + return value.lower() + + return value + + +def urn_creator( + config: PowerBiDashboardSourceConfig, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + data_platform_pair: DataPlatformPair, + server: str, + qualified_table_name: str, +) -> str: + + platform_detail: PlatformDetail = platform_instance_resolver.get_platform_instance( + PowerBIPlatformDetail( + data_platform_pair=data_platform_pair, + data_platform_server=server, + ) + ) + + return builder.make_dataset_urn_with_platform_instance( + platform=data_platform_pair.datahub_data_platform_name, + platform_instance=platform_detail.platform_instance, + env=platform_detail.env, + name=urn_to_lowercase( + qualified_table_name, config.convert_lineage_urns_to_lowercase + ), + ) class AbstractDataPlatformTableCreator(ABC): + """ + Base class to share common functionalities among different dataplatform for M-Query parsing. + + To create qualified table name we need to parse M-Query data-access-functions(https://learn.microsoft.com/en-us/powerquery-m/accessing-data-functions) and + the data-access-functions has some define pattern to access database-name, schema-name and table-name, for example see below M-Query. + + let + Source = Sql.Database("localhost", "library"), + dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data] + in + dbo_book_issue + + It is MSSQL M-Query and Sql.Database is the data-access-function to access MSSQL. If this function is available in M-Query then database name is available in second argument + of first statement and schema-name and table-name is available in second statement. second statement can be repeated to access different tables from MSSQL. + + DefaultTwoStepDataAccessSources extends the AbstractDataPlatformTableCreator and provides the common functionalities for data-platform which has above type of M-Query pattern + + data-access-function varies as per data-platform for example for MySQL.Database for MySQL, PostgreSQL.Database for Postgres and Oracle.Database for Oracle and number of statement to + find out database-name , schema-name and table-name also varies as per dataplatform. + + Value.NativeQuery is one of the function which is used to execute native query inside M-Query, for example see below M-Query + + let + Source = Value.NativeQuery(AmazonRedshift.Database("redshift-url","dev"), "select * from dev.public.category", null, [EnableFolding=true]) + in + Source + + In this M-Query database-name is available in first argument and rest of the detail i.e database & schema is available in native query. + + NativeQueryDataPlatformTableCreator extends AbstractDataPlatformTableCreator to support Redshift and Snowflake native query parsing. 
+ + """ + + ctx: PipelineContext + config: PowerBiDashboardSourceConfig + platform_instance_resolver: AbstractDataPlatformInstanceResolver + + def __init__( + self, + ctx: PipelineContext, + config: PowerBiDashboardSourceConfig, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + ) -> None: + super().__init__() + self.ctx = ctx + self.config = config + self.platform_instance_resolver = platform_instance_resolver + @abstractmethod def create_dataplatform_tables( self, data_access_func_detail: DataAccessFunctionDetail @@ -58,6 +145,49 @@ def get_db_detail_from_argument( return arguments[0], arguments[1] + def parse_custom_sql( + self, query: str, server: str, database: Optional[str], schema: Optional[str] + ) -> List[DataPlatformTable]: + + dataplatform_tables: List[DataPlatformTable] = [] + + platform_detail: PlatformDetail = ( + self.platform_instance_resolver.get_platform_instance( + PowerBIPlatformDetail( + data_platform_pair=self.get_platform_pair(), + data_platform_server=server, + ) + ) + ) + + parsed_result: Optional[ + "SqlParsingResult" + ] = native_sql_parser.parse_custom_sql( + ctx=self.ctx, + query=query, + platform=self.get_platform_pair().datahub_data_platform_name, + platform_instance=platform_detail.platform_instance, + env=platform_detail.env, + database=database, + schema=schema, + ) + + if parsed_result is None: + logger.debug("Failed to parse query") + return dataplatform_tables + + for urn in parsed_result.in_tables: + dataplatform_tables.append( + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ) + + logger.debug(f"Generated dataplatform_tables={dataplatform_tables}") + + return dataplatform_tables + class AbstractDataAccessMQueryResolver(ABC): table: Table @@ -80,11 +210,29 @@ def __init__( self.data_access_functions = SupportedResolver.get_function_names() @abstractmethod - def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: + def resolve_to_data_platform_table_list( + self, + ctx: PipelineContext, + config: PowerBiDashboardSourceConfig, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + ) -> List[DataPlatformTable]: pass class MQueryResolver(AbstractDataAccessMQueryResolver, ABC): + """ + This class parses the M-Query recursively to generate DataAccessFunctionDetail (see method create_data_access_functional_detail). + + This class has generic code to process M-Query tokens and create instance of DataAccessFunctionDetail. + + Once DataAccessFunctionDetail instance is initialized thereafter MQueryResolver generates the DataPlatformTable with the help of AbstractDataPlatformTableCreator + (see method resolve_to_data_platform_table_list). + + Classes which extended from AbstractDataPlatformTableCreator knows how to convert generated DataAccessFunctionDetail instance + to respective DataPlatformTable instance as per dataplatform. 
+ + """ + def get_item_selector_tokens( self, expression_tree: Tree, @@ -318,9 +466,15 @@ def internal( return table_links - def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: + def resolve_to_data_platform_table_list( + self, + ctx: PipelineContext, + config: PowerBiDashboardSourceConfig, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + ) -> List[DataPlatformTable]: data_platform_tables: List[DataPlatformTable] = [] + # Find out output variable as we are doing backtracking in M-Query output_variable: Optional[str] = tree_function.get_output_variable( self.parse_tree ) @@ -332,12 +486,14 @@ def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: ) return data_platform_tables + # Parse M-Query and use output_variable as root of tree and create instance of DataAccessFunctionDetail table_links: List[ DataAccessFunctionDetail ] = self.create_data_access_functional_detail(output_variable) # Each item is data-access function for f_detail in table_links: + # Get & Check if we support data-access-function available in M-Query supported_resolver = SupportedResolver.get_resolver( f_detail.data_access_function_name ) @@ -351,8 +507,14 @@ def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: ) continue + # From supported_resolver enum get respective resolver like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it + # & also pass additional information that will be need to generate urn table_full_name_creator: AbstractDataPlatformTableCreator = ( - supported_resolver.get_table_full_name_creator()() + supported_resolver.get_table_full_name_creator()( + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + ) ) data_platform_tables.extend( @@ -393,18 +555,24 @@ def two_level_access_pattern( IdentifierAccessor, data_access_func_detail.identifier_accessor ).items["Item"] - full_table_name: str = f"{db_name}.{schema_name}.{table_name}" + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" logger.debug( - f"Platform({self.get_platform_pair().datahub_data_platform_name}) full_table_name= {full_table_name}" + f"Platform({self.get_platform_pair().datahub_data_platform_name}) qualified_table_name= {qualified_table_name}" + ) + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, ) return [ DataPlatformTable( - name=table_name, - full_name=full_table_name, - datasource_server=server, data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -420,9 +588,48 @@ def get_platform_pair(self) -> DataPlatformPair: class MSSqlDataPlatformTableCreator(DefaultTwoStepDataAccessSources): + # https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16 + DEFAULT_SCHEMA = "dbo" # Default schema name in MS-SQL is dbo + def get_platform_pair(self) -> DataPlatformPair: return SupportedDataPlatform.MS_SQL.value + def create_urn_using_old_parser( + self, query: str, db_name: str, server: str + ) -> List[DataPlatformTable]: + dataplatform_tables: List[DataPlatformTable] = [] + + tables: List[str] = native_sql_parser.get_tables(query) + + for table in tables: + schema_and_table: List[str] = table.split(".") + if len(schema_and_table) == 1: + # schema name is not present. 
set default schema + schema_and_table.insert(0, MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA) + + qualified_table_name = ( + f"{db_name}.{schema_and_table[0]}.{schema_and_table[1]}" + ) + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + + dataplatform_tables.append( + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ) + + logger.debug(f"Generated upstream tables = {dataplatform_tables}") + + return dataplatform_tables + def create_dataplatform_tables( self, data_access_func_detail: DataAccessFunctionDetail ) -> List[DataPlatformTable]: @@ -442,28 +649,20 @@ def create_dataplatform_tables( logger.debug("Unsupported case is found. Second index is not the Query") return dataplatform_tables - db_name: str = arguments[1] - - tables: List[str] = native_sql_parser.get_tables(arguments[3]) - for table in tables: - schema_and_table: List[str] = table.split(".") - if len(schema_and_table) == 1: - # schema name is not present. Default schema name in MS-SQL is dbo - # https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16 - schema_and_table.insert(0, "dbo") - - dataplatform_tables.append( - DataPlatformTable( - name=schema_and_table[1], - full_name=f"{db_name}.{schema_and_table[0]}.{schema_and_table[1]}", - datasource_server=arguments[0], - data_platform_pair=self.get_platform_pair(), - ) + if self.config.enable_advance_lineage_sql_construct is False: + # Use previous parser to generate URN to keep backward compatibility + return self.create_urn_using_old_parser( + query=arguments[3], + db_name=arguments[1], + server=arguments[0], ) - logger.debug("MS-SQL full-table-names %s", dataplatform_tables) - - return dataplatform_tables + return self.parse_custom_sql( + query=arguments[3], + database=arguments[1], + server=arguments[0], + schema=MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA, + ) class OracleDataPlatformTableCreator(AbstractDataPlatformTableCreator): @@ -510,12 +709,20 @@ def create_dataplatform_tables( cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next, ).items["Name"] + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + return [ DataPlatformTable( - name=table_name, - full_name=f"{db_name}.{schema_name}.{table_name}", - datasource_server=server, data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -547,14 +754,28 @@ def create_dataplatform_tables( db_name: str = value_dict["Database"] schema_name: str = value_dict["Schema"] table_name: str = value_dict["Table"] + + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" + server, _ = self.get_db_detail_from_argument(data_access_func_detail.arg_list) + if server is None: + logger.info( + f"server information is not available for {qualified_table_name}. 
Skipping upstream table" + ) + return [] + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) return [ DataPlatformTable( - name=table_name, - full_name=f"{db_name}.{schema_name}.{table_name}", - datasource_server=server if server else "", data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -589,20 +810,26 @@ def create_dataplatform_tables( IdentifierAccessor, data_access_func_detail.identifier_accessor.next.next # type: ignore ).items["Name"] - full_table_name: str = f"{db_name}.{schema_name}.{table_name}" + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" logger.debug( - f"{self.get_platform_pair().datahub_data_platform_name} full-table-name {full_table_name}" + f"{self.get_platform_pair().datahub_data_platform_name} qualified_table_name {qualified_table_name}" + ) + + server: str = self.get_datasource_server(arguments, data_access_func_detail) + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, ) return [ DataPlatformTable( - name=table_name, - full_name=full_table_name, - datasource_server=self.get_datasource_server( - arguments, data_access_func_detail - ), data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -654,12 +881,20 @@ def create_dataplatform_tables( cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next, ).items["Name"] + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + return [ DataPlatformTable( - name=table_name, - full_name=f"{db_name}.{schema_name}.{table_name}", - datasource_server=server, data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -681,6 +916,39 @@ def is_native_parsing_supported(data_access_function_name: str) -> bool: in NativeQueryDataPlatformTableCreator.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM ) + def create_urn_using_old_parser( + self, query: str, server: str + ) -> List[DataPlatformTable]: + dataplatform_tables: List[DataPlatformTable] = [] + + tables: List[str] = native_sql_parser.get_tables(query) + + for qualified_table_name in tables: + if len(qualified_table_name.split(".")) != 3: + logger.debug( + f"Skipping table {qualified_table_name} as it is not as per qualified_table_name format" + ) + continue + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + + dataplatform_tables.append( + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ) + + logger.debug(f"Generated dataplatform_tables {dataplatform_tables}") + + return dataplatform_tables + def create_dataplatform_tables( self, data_access_func_detail: DataAccessFunctionDetail ) -> List[DataPlatformTable]: @@ -727,25 +995,21 @@ def create_dataplatform_tables( 0 ] # Remove any whitespaces and double quotes character - for table in native_sql_parser.get_tables(sql_query): - if len(table.split(".")) != 3: - logger.debug( - f"Skipping table {table} as it is not as per 
full_table_name format" - ) - continue + server = tree_function.strip_char_from_list([data_access_tokens[2]])[0] - dataplatform_tables.append( - DataPlatformTable( - name=table.split(".")[2], - full_name=table, - datasource_server=tree_function.strip_char_from_list( - [data_access_tokens[2]] - )[0], - data_platform_pair=self.get_platform_pair(), - ) + if self.config.enable_advance_lineage_sql_construct is False: + # Use previous parser to generate URN to keep backward compatibility + return self.create_urn_using_old_parser( + query=sql_query, + server=server, ) - return dataplatform_tables + return self.parse_custom_sql( + query=sql_query, + server=server, + database=None, # database and schema is available inside custom sql as per PowerBI Behavior + schema=None, + ) class FunctionName(Enum): diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index 919cb83e4d832..5d477ee090e7e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -28,7 +28,6 @@ ) from datahub.ingestion.source.powerbi.config import ( Constant, - PlatformDetail, PowerBiDashboardSourceConfig, PowerBiDashboardSourceReport, ) @@ -96,10 +95,12 @@ def __hash__(self): def __init__( self, + ctx: PipelineContext, config: PowerBiDashboardSourceConfig, reporter: PowerBiDashboardSourceReport, dataplatform_instance_resolver: AbstractDataPlatformInstanceResolver, ): + self.__ctx = ctx self.__config = config self.__reporter = reporter self.__dataplatform_instance_resolver = dataplatform_instance_resolver @@ -172,43 +173,40 @@ def extract_lineage( # table.dataset should always be set, but we check it just in case. parameters = table.dataset.parameters if table.dataset else {} - upstreams: List[UpstreamClass] = [] - upstream_tables: List[resolver.DataPlatformTable] = parser.get_upstream_tables( - table, self.__reporter, parameters=parameters + upstream: List[UpstreamClass] = [] + + upstream_dpts: List[resolver.DataPlatformTable] = parser.get_upstream_tables( + table=table, + reporter=self.__reporter, + platform_instance_resolver=self.__dataplatform_instance_resolver, + ctx=self.__ctx, + config=self.__config, + parameters=parameters, ) + logger.debug( - f"PowerBI virtual table {table.full_name} and it's upstream dataplatform tables = {upstream_tables}" + f"PowerBI virtual table {table.full_name} and it's upstream dataplatform tables = {upstream_dpts}" ) - for upstream_table in upstream_tables: + + for upstream_dpt in upstream_dpts: if ( - upstream_table.data_platform_pair.powerbi_data_platform_name + upstream_dpt.data_platform_pair.powerbi_data_platform_name not in self.__config.dataset_type_mapping.keys() ): logger.debug( - f"Skipping upstream table for {ds_urn}. The platform {upstream_table.data_platform_pair.powerbi_data_platform_name} is not part of dataset_type_mapping", + f"Skipping upstream table for {ds_urn}. 
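# Condensed sketch of the lineage emission performed in extract_lineage() now that each
# resolved DataPlatformTable already carries a URN: the upstream URNs are wrapped in an
# UpstreamLineage aspect attached to the PowerBI dataset URN. Both URNs below are
# placeholders; the actual MCP in the source also sets entityType/changeType explicitly.
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import (
    DatasetLineageTypeClass,
    UpstreamClass,
    UpstreamLineageClass,
)

dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:powerbi,workspace.sales_model.sales,PROD)"
upstream_urn = "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.public.sales,PROD)"

mcp = MetadataChangeProposalWrapper(
    entityUrn=dataset_urn,
    aspect=UpstreamLineageClass(
        upstreams=[UpstreamClass(upstream_urn, DatasetLineageTypeClass.TRANSFORMED)]
    ),
)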
The platform {upstream_dpt.data_platform_pair.powerbi_data_platform_name} is not part of dataset_type_mapping", ) continue - platform_detail: PlatformDetail = ( - self.__dataplatform_instance_resolver.get_platform_instance( - upstream_table - ) - ) - upstream_urn = builder.make_dataset_urn_with_platform_instance( - platform=upstream_table.data_platform_pair.datahub_data_platform_name, - platform_instance=platform_detail.platform_instance, - env=platform_detail.env, - name=self.lineage_urn_to_lowercase(upstream_table.full_name), - ) - upstream_table_class = UpstreamClass( - upstream_urn, + upstream_dpt.urn, DatasetLineageTypeClass.TRANSFORMED, ) - upstreams.append(upstream_table_class) - if len(upstreams) > 0: - upstream_lineage = UpstreamLineageClass(upstreams=upstreams) + upstream.append(upstream_table_class) + + if len(upstream) > 0: + upstream_lineage = UpstreamLineageClass(upstreams=upstream) logger.debug(f"Dataset urn = {ds_urn} and its lineage = {upstream_lineage}") mcp = MetadataChangeProposalWrapper( entityType=Constant.DATASET, @@ -1107,7 +1105,9 @@ def __init__(self, config: PowerBiDashboardSourceConfig, ctx: PipelineContext): ) # Exit pipeline as we are not able to connect to PowerBI API Service. This exit will avoid raising # unwanted stacktrace on console - self.mapper = Mapper(config, self.reporter, self.dataplatform_instance_resolver) + self.mapper = Mapper( + ctx, config, self.reporter, self.dataplatform_instance_resolver + ) # Create and register the stateful ingestion use-case handler. self.stale_entity_removal_handler = StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py index 2d2d9f527788f..0d41ab00c66f5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/rest_api_wrapper/data_classes.py @@ -1,3 +1,4 @@ +import dataclasses from dataclasses import dataclass from enum import Enum from typing import Any, Dict, List, Optional, Union @@ -105,7 +106,7 @@ class Measure: dataType: str = "measure" datahubDataType: Union[ BooleanTypeClass, DateTypeClass, NullTypeClass, NumberTypeClass, StringTypeClass - ] = NullTypeClass() + ] = dataclasses.field(default_factory=NullTypeClass) description: Optional[str] = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index 35d39e40fc913..4247ee9330cfb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -37,6 +37,7 @@ from datahub.emitter.mce_builder import ( make_data_platform_urn, + make_dataplatform_instance_urn, make_dataset_urn_with_platform_instance, ) from datahub.emitter.mcp import MetadataChangeProposalWrapper @@ -81,6 +82,7 @@ TimeTypeClass, ) from datahub.metadata.schema_classes import ( + DataPlatformInstanceClass, DatasetPropertiesClass, MapTypeClass, OperationClass, @@ -562,6 +564,15 @@ def ingest_table( self.source_config.env, ) + if self.source_config.platform_instance: + data_platform_instance = DataPlatformInstanceClass( + platform=data_platform_urn, + instance=make_dataplatform_instance_urn( + self.source_config.platform, self.source_config.platform_instance + ), + ) + aspects.append(data_platform_instance) + customProperties = {"schema_inferred_from": 
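# Sketch of the aspect the S3 source now emits when `platform_instance` is configured:
# a DataPlatformInstance aspect pointing at the configured instance URN. The instance
# name "datalake-accounting" is an example value, not a default shipped with the source.
from datahub.emitter.mce_builder import (
    make_data_platform_urn,
    make_dataplatform_instance_urn,
)
from datahub.metadata.schema_classes import DataPlatformInstanceClass

aspect = DataPlatformInstanceClass(
    platform=make_data_platform_urn("s3"),
    instance=make_dataplatform_instance_urn("s3", "datalake-accounting"),
)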
str(table_data.full_path)} if not path_spec.sample_files: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index e8e80e172a9ce..a7d946e99d806 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -1,10 +1,12 @@ import logging +from collections import defaultdict +from dataclasses import dataclass from enum import Enum -from typing import Dict, List, Optional, cast +from typing import Dict, List, Optional, Set, cast from pydantic import Field, SecretStr, root_validator, validator -from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.pattern_utils import UUID_REGEX from datahub.configuration.validate_field_removal import pydantic_removed_field from datahub.configuration.validate_field_rename import pydantic_renamed_field @@ -42,6 +44,31 @@ class TagOption(str, Enum): skip = "skip" +@dataclass(frozen=True) +class DatabaseId: + database: str = Field( + description="Database created from share in consumer account." + ) + platform_instance: str = Field( + description="Platform instance of consumer snowflake account." + ) + + +class SnowflakeShareConfig(ConfigModel): + database: str = Field(description="Database from which share is created.") + platform_instance: str = Field( + description="Platform instance for snowflake account in which share is created." + ) + + consumers: Set[DatabaseId] = Field( + description="List of databases created in consumer accounts." + ) + + @property + def source_database(self) -> DatabaseId: + return DatabaseId(self.database, self.platform_instance) + + class SnowflakeV2Config( SnowflakeConfig, SnowflakeUsageConfig, @@ -91,13 +118,8 @@ class SnowflakeV2Config( description="Whether `schema_pattern` is matched against fully qualified schema name `.`.", ) - use_legacy_lineage_method: bool = Field( - default=False, - description=( - "Whether to use the legacy lineage computation method. " - "By default, uses new optimised lineage extraction method that requires less ingestion process memory. " - "Table-to-view and view-to-view column-level lineage are not supported with the legacy method." - ), + _use_legacy_lineage_method_removed = pydantic_removed_field( + "use_legacy_lineage_method" ) validate_upstreams_against_patterns: bool = Field( @@ -113,13 +135,20 @@ class SnowflakeV2Config( # This is required since access_history table does not capture whether the table was temporary table. temporary_tables_pattern: List[str] = Field( default=DEFAULT_TABLES_DENY_LIST, - description="[Advanced] Regex patterns for temporary tables to filter in lineage ingestion. Specify regex to match the entire table name in database.schema.table format. Defaults are to set in such a way to ignore the temporary staging tables created by known ETL tools. Not used if `use_legacy_lineage_method=True`", + description="[Advanced] Regex patterns for temporary tables to filter in lineage ingestion. Specify regex to match the entire table name in database.schema.table format. 
Defaults are to set in such a way to ignore the temporary staging tables created by known ETL tools.", ) rename_upstreams_deny_pattern_to_temporary_table_pattern = pydantic_renamed_field( "upstreams_deny_pattern", "temporary_tables_pattern" ) + shares: Optional[Dict[str, SnowflakeShareConfig]] = Field( + default=None, + description="Required if current account owns or consumes snowflake share." + " If specified, connector creates lineage and siblings relationship between current account's database tables and consumer/producer account's database tables." + " Map of share name -> details of share.", + ) + email_as_user_identifier: bool = Field( default=True, description="Format user urns as an email, if the snowflake user's email is set. If `email_domain` is provided, generates email addresses for snowflake users with unset emails, based on their username.", @@ -197,3 +226,77 @@ def get_sql_alchemy_url( @property def parse_view_ddl(self) -> bool: return self.include_view_column_lineage + + @validator("shares") + def validate_shares( + cls, shares: Optional[Dict[str, SnowflakeShareConfig]], values: Dict + ) -> Optional[Dict[str, SnowflakeShareConfig]]: + current_platform_instance = values.get("platform_instance") + + if shares: + # Check: platform_instance should be present + assert current_platform_instance is not None, ( + "Did you forget to set `platform_instance` for current ingestion ? " + "It is required to use `platform_instance` when ingesting from multiple snowflake accounts." + ) + + databases_included_in_share: List[DatabaseId] = [] + databases_created_from_share: List[DatabaseId] = [] + + for share_details in shares.values(): + shared_db = DatabaseId( + share_details.database, share_details.platform_instance + ) + assert all( + consumer.platform_instance != share_details.platform_instance + for consumer in share_details.consumers + ), "Share's platform_instance can not be same as consumer's platform instance. Self-sharing not supported in Snowflake." + + databases_included_in_share.append(shared_db) + databases_created_from_share.extend(share_details.consumers) + + for db_from_share in databases_created_from_share: + assert ( + db_from_share not in databases_included_in_share + ), "Database included in a share can not be present as consumer in any share." + assert ( + databases_created_from_share.count(db_from_share) == 1 + ), "Same database can not be present as consumer in more than one share." + + return shares + + def outbounds(self) -> Dict[str, Set[DatabaseId]]: + """ + Returns mapping of + database included in current account's outbound share -> all databases created from this share in other accounts + """ + outbounds: Dict[str, Set[DatabaseId]] = defaultdict(set) + if self.shares: + for share_name, share_details in self.shares.items(): + if share_details.platform_instance == self.platform_instance: + logger.debug( + f"database {share_details.database} is included in outbound share(s) {share_name}." 
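# Example shape of the new `shares` setting (all names are hypothetical). Each key is a
# share name; `database` and `platform_instance` describe the producer account, and
# `consumers` lists the databases other accounts created from that share. The
# outbounds()/inbounds() helpers derive producer->consumer and consumer->producer
# lookups from this mapping. Shown as a plain dict, roughly as it would appear in a recipe.
shares_config = {
    "sales_share": {
        "database": "SALES_DB",                # database included in the share
        "platform_instance": "acct_producer",  # account that owns the share
        "consumers": [
            {"database": "SALES_DB_SHARED", "platform_instance": "acct_consumer"},
        ],
    },
}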
+ ) + outbounds[share_details.database].update(share_details.consumers) + return outbounds + + def inbounds(self) -> Dict[str, DatabaseId]: + """ + Returns mapping of + database created from an current account's inbound share -> other-account database from which this share was created + """ + inbounds: Dict[str, DatabaseId] = {} + if self.shares: + for share_name, share_details in self.shares.items(): + for consumer in share_details.consumers: + if consumer.platform_instance == self.platform_instance: + logger.debug( + f"database {consumer.database} is created from inbound share {share_name}." + ) + inbounds[consumer.database] = share_details.source_database + break + else: + logger.info( + f"Skipping Share {share_name}, as it does not include current platform instance {self.platform_instance}", + ) + return inbounds diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_legacy.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_legacy.py deleted file mode 100644 index 832a072c619f8..0000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_legacy.py +++ /dev/null @@ -1,664 +0,0 @@ -import json -import logging -from collections import defaultdict -from dataclasses import dataclass, field -from typing import Any, Callable, Dict, FrozenSet, Iterable, List, Optional, Set - -from pydantic import Field -from pydantic.error_wrappers import ValidationError -from snowflake.connector import SnowflakeConnection - -import datahub.emitter.mce_builder as builder -from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.aws.s3_util import make_s3_urn -from datahub.ingestion.source.snowflake.constants import ( - LINEAGE_PERMISSION_ERROR, - SnowflakeEdition, - SnowflakeObjectDomain, -) -from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config -from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery -from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report -from datahub.ingestion.source.snowflake.snowflake_usage_v2 import ( - SnowflakeColumnReference, -) -from datahub.ingestion.source.snowflake.snowflake_utils import ( - SnowflakeCommonMixin, - SnowflakeConnectionMixin, - SnowflakePermissionError, - SnowflakeQueryMixin, -) -from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( - FineGrainedLineage, - FineGrainedLineageDownstreamType, - FineGrainedLineageUpstreamType, - UpstreamLineage, -) -from datahub.metadata.schema_classes import DatasetLineageTypeClass, UpstreamClass -from datahub.utilities.perf_timer import PerfTimer - -logger: logging.Logger = logging.getLogger(__name__) - - -class SnowflakeColumnWithLineage(SnowflakeColumnReference): - class Config: - # This is for backward compatibility and can be removed later - allow_population_by_field_name = True - - directSourceColumns: Optional[List[SnowflakeColumnReference]] = Field( - default=None, alias="directSources" - ) - - -@dataclass(frozen=True) -class SnowflakeColumnId: - columnName: str - objectName: str - objectDomain: Optional[str] = None - - -@dataclass(frozen=True) -class SnowflakeColumnFineGrainedLineage: - """ - Fie grained upstream of column, - which represents a transformation applied on input columns""" - - inputColumns: FrozenSet[SnowflakeColumnId] - # Transform function, query etc can be added here - - -@dataclass -class SnowflakeColumnUpstreams: - """All 
upstreams of a column""" - - upstreams: Set[SnowflakeColumnFineGrainedLineage] = field( - default_factory=set, init=False - ) - - def update_column_lineage( - self, directSourceColumns: List[SnowflakeColumnReference] - ) -> None: - input_columns = frozenset( - [ - SnowflakeColumnId( - upstream_col.columnName, - upstream_col.objectName, - upstream_col.objectDomain, - ) - for upstream_col in directSourceColumns - if upstream_col.objectName - ] - ) - if not input_columns: - return - upstream = SnowflakeColumnFineGrainedLineage(inputColumns=input_columns) - if upstream not in self.upstreams: - self.upstreams.add(upstream) - - -@dataclass -class SnowflakeUpstreamTable: - upstreamDataset: str - upstreamColumns: List[SnowflakeColumnReference] - downstreamColumns: List[SnowflakeColumnWithLineage] - - @classmethod - def from_dict( - cls, - dataset: str, - upstreams_columns_json: Optional[str], - downstream_columns_json: Optional[str], - ) -> "SnowflakeUpstreamTable": - try: - upstreams_columns_list = [] - downstream_columns_list = [] - if upstreams_columns_json is not None: - upstreams_columns_list = json.loads(upstreams_columns_json) - if downstream_columns_json is not None: - downstream_columns_list = json.loads(downstream_columns_json) - - table_with_upstreams = cls( - dataset, - [ - SnowflakeColumnReference.parse_obj(col) - for col in upstreams_columns_list - ], - [ - SnowflakeColumnWithLineage.parse_obj(col) - for col in downstream_columns_list - ], - ) - except ValidationError: - # Earlier versions of column lineage did not include columnName, only columnId - table_with_upstreams = cls(dataset, [], []) - return table_with_upstreams - - -@dataclass -class SnowflakeTableLineage: - # key: upstream table name - upstreamTables: Dict[str, SnowflakeUpstreamTable] = field( - default_factory=dict, init=False - ) - - # key: downstream column name - columnLineages: Dict[str, SnowflakeColumnUpstreams] = field( - default_factory=lambda: defaultdict(SnowflakeColumnUpstreams), init=False - ) - - def update_lineage( - self, table: SnowflakeUpstreamTable, include_column_lineage: bool = True - ) -> None: - if table.upstreamDataset not in self.upstreamTables.keys(): - self.upstreamTables[table.upstreamDataset] = table - - if include_column_lineage and table.downstreamColumns: - for col in table.downstreamColumns: - if col.directSourceColumns: - self.columnLineages[col.columnName].update_column_lineage( - col.directSourceColumns - ) - - -class SnowflakeLineageExtractor( - SnowflakeQueryMixin, SnowflakeConnectionMixin, SnowflakeCommonMixin -): - """ - Extracts Lineage from Snowflake. - Following lineage edges are considered. - - 1. "Table to View" lineage via `snowflake.account_usage.object_dependencies` view - 2. "S3 to Table" lineage via `show external tables` query. - 3. "View to Table" lineage via `snowflake.account_usage.access_history` view (requires Snowflake Enterprise Edition or above) - 4. "Table to Table" lineage via `snowflake.account_usage.access_history` view (requires Snowflake Enterprise Edition or above) - 5. "S3 to Table" lineage via `snowflake.account_usage.access_history` view (requires Snowflake Enterprise Edition or above) - - Edition Note - Snowflake Standard Edition does not have Access History Feature. So it does not support lineage extraction for edges 3, 4, 5 mentioned above. 
- """ - - def __init__( - self, - config: SnowflakeV2Config, - report: SnowflakeV2Report, - dataset_urn_builder: Callable[[str], str], - ) -> None: - self._lineage_map: Dict[str, SnowflakeTableLineage] = defaultdict( - SnowflakeTableLineage - ) - self._external_lineage_map: Dict[str, Set[str]] = defaultdict(set) - self.config = config - self.report = report - self.logger = logger - self.dataset_urn_builder = dataset_urn_builder - self.connection: Optional[SnowflakeConnection] = None - - # Kwargs used by new snowflake lineage extractor need to be ignored here - def get_workunits( - self, discovered_tables: List[str], discovered_views: List[str], **_kwargs: Any - ) -> Iterable[MetadataWorkUnit]: - self.connection = self.create_connection() - if self.connection is None: - return - - self._populate_table_lineage() - - if self.config.include_view_lineage: - if len(discovered_views) > 0: - self._populate_view_lineage() - else: - logger.info("No views found. Skipping View Lineage Extraction.") - - self._populate_external_lineage() - - if ( - len(self._lineage_map.keys()) == 0 - and len(self._external_lineage_map.keys()) == 0 - ): - logger.debug("No lineage found.") - return - - yield from self.get_table_upstream_workunits(discovered_tables) - yield from self.get_view_upstream_workunits(discovered_views) - - def _populate_table_lineage(self): - if self.report.edition == SnowflakeEdition.STANDARD: - logger.info( - "Snowflake Account is Standard Edition. Table to Table Lineage Feature is not supported." - ) # See Edition Note above for why - else: - with PerfTimer() as timer: - self._populate_lineage() - self.report.table_lineage_query_secs = timer.elapsed_seconds() - - def get_table_upstream_workunits(self, discovered_tables): - if self.config.include_table_lineage: - for dataset_name in discovered_tables: - upstream_lineage = self._get_upstream_lineage_info(dataset_name) - if upstream_lineage is not None: - yield MetadataChangeProposalWrapper( - entityUrn=self.dataset_urn_builder(dataset_name), - aspect=upstream_lineage, - ).as_workunit() - - def get_view_upstream_workunits(self, discovered_views): - if self.config.include_view_lineage: - for view_name in discovered_views: - upstream_lineage = self._get_upstream_lineage_info(view_name) - if upstream_lineage is not None: - yield MetadataChangeProposalWrapper( - entityUrn=self.dataset_urn_builder(view_name), - aspect=upstream_lineage, - ).as_workunit() - - def _get_upstream_lineage_info( - self, dataset_name: str - ) -> Optional[UpstreamLineage]: - lineage = self._lineage_map[dataset_name] - external_lineage = self._external_lineage_map[dataset_name] - if not (lineage.upstreamTables or lineage.columnLineages or external_lineage): - logger.debug(f"No lineage found for {dataset_name}") - return None - - upstream_tables: List[UpstreamClass] = [] - finegrained_lineages: List[FineGrainedLineage] = [] - - # Populate the table-lineage in aspect - self.update_upstream_tables_lineage(upstream_tables, lineage) - - # Populate the column-lineage in aspect - self.update_upstream_columns_lineage( - self.dataset_urn_builder(dataset_name), finegrained_lineages, lineage - ) - - # Populate the external-table-lineage(s3->snowflake) in aspect - self.update_external_tables_lineage(upstream_tables, external_lineage) - - if len(upstream_tables) > 0: - logger.debug( - f"Upstream lineage of '{dataset_name}': {[u.dataset for u in upstream_tables]}" - ) - if self.config.upstream_lineage_in_report: - self.report.upstream_lineage[dataset_name] = [ - u.dataset for u in 
upstream_tables - ] - return UpstreamLineage( - upstreams=upstream_tables, - fineGrainedLineages=sorted( - finegrained_lineages, key=lambda x: (x.downstreams, x.upstreams) - ) - or None, - ) - else: - return None - - def _populate_view_lineage(self) -> None: - with PerfTimer() as timer: - self._populate_view_upstream_lineage() - self.report.view_upstream_lineage_query_secs = timer.elapsed_seconds() - - if self.report.edition == SnowflakeEdition.STANDARD: - logger.info( - "Snowflake Account is Standard Edition. View to Table Lineage Feature is not supported." - ) # See Edition Note above for why - else: - with PerfTimer() as timer: - self._populate_view_downstream_lineage() - self.report.view_downstream_lineage_query_secs = timer.elapsed_seconds() - - def _populate_external_lineage(self) -> None: - with PerfTimer() as timer: - self.report.num_external_table_edges_scanned = 0 - - if self.report.edition == SnowflakeEdition.STANDARD: - logger.info( - "Snowflake Account is Standard Edition. External Lineage Feature via Access History is not supported." - ) # See Edition Note above for why - else: - self._populate_external_lineage_from_access_history() - - self._populate_external_lineage_from_show_query() - - logger.info( - f"Found {self.report.num_external_table_edges_scanned} external lineage edges." - ) - - self.report.external_lineage_queries_secs = timer.elapsed_seconds() - - # Handles the case for explicitly created external tables. - # NOTE: Snowflake does not log this information to the access_history table. - def _populate_external_lineage_from_show_query(self): - external_tables_query: str = SnowflakeQuery.show_external_tables() - try: - for db_row in self.query(external_tables_query): - key = self.get_dataset_identifier( - db_row["name"], db_row["schema_name"], db_row["database_name"] - ) - - if not self._is_dataset_pattern_allowed( - key, SnowflakeObjectDomain.TABLE - ): - continue - self._external_lineage_map[key].add(db_row["location"]) - logger.debug( - f"ExternalLineage[Table(Down)={key}]:External(Up)={self._external_lineage_map[key]} via show external tables" - ) - self.report.num_external_table_edges_scanned += 1 - except Exception as e: - logger.debug(e, exc_info=e) - self.report_warning( - "external_lineage", - f"Populating external table lineage from Snowflake failed due to error {e}.", - ) - - # Handles the case where a table is populated from an external location via copy. - # Eg: copy into category_english from 's3://acryl-snow-demo-olist/olist_raw_data/category_english'credentials=(aws_key_id='...' aws_secret_key='...') pattern='.*.csv'; - def _populate_external_lineage_from_access_history(self): - query: str = SnowflakeQuery.external_table_lineage_history( - start_time_millis=int(self.config.start_time.timestamp() * 1000) - if not self.config.ignore_start_time_lineage - else 0, - end_time_millis=int(self.config.end_time.timestamp() * 1000), - ) - - try: - for db_row in self.query(query): - self._process_external_lineage_result_row(db_row) - except Exception as e: - if isinstance(e, SnowflakePermissionError): - error_msg = "Failed to get external lineage. Please grant imported privileges on SNOWFLAKE database. 
" - self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg) - else: - logger.debug(e, exc_info=e) - self.report_warning( - "external_lineage", - f"Populating table external lineage from Snowflake failed due to error {e}.", - ) - - def _process_external_lineage_result_row(self, db_row): - # key is the down-stream table name - key: str = self.get_dataset_identifier_from_qualified_name( - db_row["DOWNSTREAM_TABLE_NAME"] - ) - if not self._is_dataset_pattern_allowed(key, SnowflakeObjectDomain.TABLE): - return - - if db_row["UPSTREAM_LOCATIONS"] is not None: - external_locations = json.loads(db_row["UPSTREAM_LOCATIONS"]) - - for loc in external_locations: - if loc not in self._external_lineage_map[key]: - self._external_lineage_map[key].add(loc) - self.report.num_external_table_edges_scanned += 1 - - logger.debug( - f"ExternalLineage[Table(Down)={key}]:External(Up)={self._external_lineage_map[key]} via access_history" - ) - - def _populate_lineage(self) -> None: - query: str = SnowflakeQuery.table_to_table_lineage_history( - start_time_millis=int(self.config.start_time.timestamp() * 1000) - if not self.config.ignore_start_time_lineage - else 0, - end_time_millis=int(self.config.end_time.timestamp() * 1000), - include_column_lineage=self.config.include_column_lineage, - ) - self.report.num_table_to_table_edges_scanned = 0 - try: - for db_row in self.query(query): - self._process_table_lineage_row(db_row) - except Exception as e: - if isinstance(e, SnowflakePermissionError): - error_msg = "Failed to get table to table lineage. Please grant imported privileges on SNOWFLAKE database. " - self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg) - else: - logger.debug(e, exc_info=e) - self.report_warning( - "table-lineage", - f"Extracting lineage from Snowflake failed due to error {e}.", - ) - logger.info( - f"A total of {self.report.num_table_to_table_edges_scanned} Table->Table edges found" - f" for {len(self._lineage_map)} downstream tables.", - ) - - def _process_table_lineage_row(self, db_row): - # key is the down-stream table name - key: str = self.get_dataset_identifier_from_qualified_name( - db_row["DOWNSTREAM_TABLE_NAME"] - ) - upstream_table_name = self.get_dataset_identifier_from_qualified_name( - db_row["UPSTREAM_TABLE_NAME"] - ) - if not self._is_dataset_pattern_allowed( - key, SnowflakeObjectDomain.TABLE - ) or not ( - self._is_dataset_pattern_allowed( - upstream_table_name, SnowflakeObjectDomain.TABLE, is_upstream=True - ) - ): - return - self._lineage_map[key].update_lineage( - # (, , ) - SnowflakeUpstreamTable.from_dict( - upstream_table_name, - db_row["UPSTREAM_TABLE_COLUMNS"], - db_row["DOWNSTREAM_TABLE_COLUMNS"], - ), - self.config.include_column_lineage, - ) - self.report.num_table_to_table_edges_scanned += 1 - logger.debug(f"Lineage[Table(Down)={key}]:Table(Up)={self._lineage_map[key]}") - - def _populate_view_upstream_lineage(self) -> None: - # NOTE: This query captures only the upstream lineage of a view (with no column lineage). - # For more details see: https://docs.snowflake.com/en/user-guide/object-dependencies.html#object-dependencies - # and also https://docs.snowflake.com/en/sql-reference/account-usage/access_history.html#usage-notes for current limitations on capturing the lineage for views. 
- view_upstream_lineage_query: str = SnowflakeQuery.view_dependencies() - - self.report.num_table_to_view_edges_scanned = 0 - - try: - for db_row in self.query(view_upstream_lineage_query): - self._process_view_upstream_lineage_row(db_row) - except Exception as e: - if isinstance(e, SnowflakePermissionError): - error_msg = "Failed to get table to view lineage. Please grant imported privileges on SNOWFLAKE database." - self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg) - else: - logger.debug(e, exc_info=e) - self.report_warning( - "view-upstream-lineage", - f"Extracting the upstream view lineage from Snowflake failed due to error {e}.", - ) - logger.info( - f"A total of {self.report.num_table_to_view_edges_scanned} View upstream edges found." - ) - - def _process_view_upstream_lineage_row(self, db_row): - # Process UpstreamTable/View/ExternalTable/Materialized View->View edge. - view_upstream: str = self.get_dataset_identifier_from_qualified_name( - db_row["VIEW_UPSTREAM"] - ) - view_name: str = self.get_dataset_identifier_from_qualified_name( - db_row["DOWNSTREAM_VIEW"] - ) - - if not self._is_dataset_pattern_allowed( - dataset_name=view_name, - dataset_type=db_row["REFERENCING_OBJECT_DOMAIN"], - ) or not self._is_dataset_pattern_allowed( - view_upstream, db_row["REFERENCED_OBJECT_DOMAIN"], is_upstream=True - ): - return - # key is the downstream view name - self._lineage_map[view_name].update_lineage( - # (, , ) - SnowflakeUpstreamTable.from_dict(view_upstream, None, None), - self.config.include_column_lineage, - ) - self.report.num_table_to_view_edges_scanned += 1 - logger.debug( - f"Upstream->View: Lineage[View(Down)={view_name}]:Upstream={view_upstream}" - ) - - def _populate_view_downstream_lineage(self) -> None: - # This query captures the downstream table lineage for views. - # See https://docs.snowflake.com/en/sql-reference/account-usage/access_history.html#usage-notes for current limitations on capturing the lineage for views. - # Eg: For viewA->viewB->ViewC->TableD, snowflake does not yet log intermediate view logs, resulting in only the viewA->TableD edge. - view_lineage_query: str = SnowflakeQuery.view_lineage_history( - start_time_millis=int(self.config.start_time.timestamp() * 1000) - if not self.config.ignore_start_time_lineage - else 0, - end_time_millis=int(self.config.end_time.timestamp() * 1000), - include_column_lineage=self.config.include_column_lineage, - ) - - self.report.num_view_to_table_edges_scanned = 0 - - try: - for db_row in self.query(view_lineage_query): - self._process_view_downstream_lineage_row(db_row) - except Exception as e: - if isinstance(e, SnowflakePermissionError): - error_msg = "Failed to get view to table lineage. Please grant imported privileges on SNOWFLAKE database. " - self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg) - else: - logger.debug(e, exc_info=e) - self.report_warning( - "view-downstream-lineage", - f"Extracting the view lineage from Snowflake failed due to error {e}.", - ) - - logger.info( - f"Found {self.report.num_view_to_table_edges_scanned} View->Table edges." 
- ) - - def _process_view_downstream_lineage_row(self, db_row): - view_name: str = self.get_dataset_identifier_from_qualified_name( - db_row["VIEW_NAME"] - ) - downstream_table: str = self.get_dataset_identifier_from_qualified_name( - db_row["DOWNSTREAM_TABLE_NAME"] - ) - if not self._is_dataset_pattern_allowed( - view_name, db_row["VIEW_DOMAIN"], is_upstream=True - ) or not self._is_dataset_pattern_allowed( - downstream_table, db_row["DOWNSTREAM_TABLE_DOMAIN"] - ): - return - - # Capture view->downstream table lineage. - self._lineage_map[downstream_table].update_lineage( - # (, , ) - SnowflakeUpstreamTable.from_dict( - view_name, - db_row["VIEW_COLUMNS"], - db_row["DOWNSTREAM_TABLE_COLUMNS"], - ), - self.config.include_column_lineage, - ) - self.report.num_view_to_table_edges_scanned += 1 - - logger.debug( - f"View->Table: Lineage[Table(Down)={downstream_table}]:View(Up)={self._lineage_map[downstream_table]}" - ) - - def update_upstream_tables_lineage( - self, upstream_tables: List[UpstreamClass], lineage: SnowflakeTableLineage - ) -> None: - for lineage_entry in sorted( - lineage.upstreamTables.values(), key=lambda x: x.upstreamDataset - ): - upstream_table_name = lineage_entry.upstreamDataset - upstream_table = UpstreamClass( - dataset=self.dataset_urn_builder(upstream_table_name), - type=DatasetLineageTypeClass.TRANSFORMED, - ) - upstream_tables.append(upstream_table) - - def update_upstream_columns_lineage( - self, - dataset_urn: str, - finegrained_lineages: List[FineGrainedLineage], - lineage: SnowflakeTableLineage, - ) -> None: - # For every column for which upstream lineage is available - for col, col_upstreams in lineage.columnLineages.items(): - # For every upstream of column - self.update_upstream_columns_lineage_of_column( - dataset_urn, col, finegrained_lineages, col_upstreams - ) - - def update_upstream_columns_lineage_of_column( - self, - dataset_urn: str, - col: str, - finegrained_lineages: List[FineGrainedLineage], - col_upstreams: SnowflakeColumnUpstreams, - ) -> None: - for fine_upstream in col_upstreams.upstreams: - finegrained_lineage_entry = self.build_finegrained_lineage( - dataset_urn, col, fine_upstream - ) - if finegrained_lineage_entry.upstreams: - finegrained_lineages.append(finegrained_lineage_entry) - - def build_finegrained_lineage( - self, - dataset_urn: str, - col: str, - fine_upstream: SnowflakeColumnFineGrainedLineage, - ) -> FineGrainedLineage: - fieldPath = col - - column_upstreams = self.build_finegrained_lineage_upstreams(fine_upstream) - finegrained_lineage_entry = FineGrainedLineage( - upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, - # Sorting the list of upstream lineage events in order to avoid creating multiple aspects in backend - # even if the lineage is same but the order is different. 
- upstreams=sorted(column_upstreams), - downstreamType=FineGrainedLineageDownstreamType.FIELD, - downstreams=[ - builder.make_schema_field_urn( - dataset_urn, self.snowflake_identifier(fieldPath) - ) - ], - ) - - return finegrained_lineage_entry - - def build_finegrained_lineage_upstreams( - self, fine_upstream: SnowflakeColumnFineGrainedLineage - ) -> List[str]: - column_upstreams = [] - for upstream_col in fine_upstream.inputColumns: - if ( - upstream_col.objectName - and upstream_col.columnName - and self._is_dataset_pattern_allowed( - upstream_col.objectName, upstream_col.objectDomain, is_upstream=True - ) - ): - upstream_dataset_name = self.get_dataset_identifier_from_qualified_name( - upstream_col.objectName - ) - column_upstreams.append( - builder.make_schema_field_urn( - self.dataset_urn_builder(upstream_dataset_name), - self.snowflake_identifier(upstream_col.columnName), - ) - ) - return column_upstreams - - def update_external_tables_lineage( - self, upstream_tables: List[UpstreamClass], external_lineage: Set[str] - ) -> None: - for external_lineage_entry in sorted(external_lineage): - # For now, populate only for S3 - if external_lineage_entry.startswith("s3://"): - external_upstream_table = UpstreamClass( - dataset=make_s3_urn(external_lineage_entry, self.config.env), - type=DatasetLineageTypeClass.COPY, - ) - upstream_tables.append(external_upstream_table) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py index 039eac1e93819..0f89324f5efc6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py @@ -1,5 +1,6 @@ from typing import List, Optional +from datahub.configuration.time_window_config import BucketDuration from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain from datahub.ingestion.source.snowflake.snowflake_config import DEFAULT_TABLES_DENY_LIST @@ -505,35 +506,6 @@ def view_dependencies_v2() -> str: def show_external_tables() -> str: return "show external tables in account" - # Note - This method should be removed once legacy lineage is removed - @staticmethod - def external_table_lineage_history( - start_time_millis: int, end_time_millis: int - ) -> str: - return f""" - WITH external_table_lineage_history AS ( - SELECT - r.value:"locations" AS upstream_locations, - w.value:"objectName"::varchar AS downstream_table_name, - w.value:"objectDomain"::varchar AS downstream_table_domain, - w.value:"columns" AS downstream_table_columns, - t.query_start_time AS query_start_time - FROM - (SELECT * from snowflake.account_usage.access_history) t, - lateral flatten(input => t.BASE_OBJECTS_ACCESSED) r, - lateral flatten(input => t.OBJECTS_MODIFIED) w - WHERE r.value:"locations" IS NOT NULL - AND w.value:"objectId" IS NOT NULL - AND t.query_start_time >= to_timestamp_ltz({start_time_millis}, 3) - AND t.query_start_time < to_timestamp_ltz({end_time_millis}, 3)) - SELECT - upstream_locations AS "UPSTREAM_LOCATIONS", - downstream_table_name AS "DOWNSTREAM_TABLE_NAME", - downstream_table_columns AS "DOWNSTREAM_TABLE_COLUMNS" - FROM external_table_lineage_history - WHERE downstream_table_domain = '{SnowflakeObjectDomain.TABLE.capitalize()}' - QUALIFY ROW_NUMBER() OVER (PARTITION BY downstream_table_name ORDER BY query_start_time DESC) = 1""" - @staticmethod def copy_lineage_history( start_time_millis: int, @@ -575,14 +547,17 @@ def 
get_access_history_date_range() -> str: def usage_per_object_per_time_bucket_for_time_window( start_time_millis: int, end_time_millis: int, - time_bucket_size: str, + time_bucket_size: BucketDuration, use_base_objects: bool, top_n_queries: int, include_top_n_queries: bool, ) -> str: if not include_top_n_queries: top_n_queries = 0 - assert time_bucket_size == "DAY" or time_bucket_size == "HOUR" + assert ( + time_bucket_size == BucketDuration.DAY + or time_bucket_size == BucketDuration.HOUR + ) objects_column = ( "BASE_OBJECTS_ACCESSED" if use_base_objects else "DIRECT_OBJECTS_ACCESSED" ) @@ -629,7 +604,7 @@ def usage_per_object_per_time_bucket_for_time_window( SELECT object_name, ANY_VALUE(object_domain) AS object_domain, - DATE_TRUNC('{time_bucket_size}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, + DATE_TRUNC('{time_bucket_size.value}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, count(distinct(query_id)) AS total_queries, count( distinct(user_name) ) AS total_users FROM @@ -644,7 +619,7 @@ def usage_per_object_per_time_bucket_for_time_window( SELECT object_name, column_name, - DATE_TRUNC('{time_bucket_size}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, + DATE_TRUNC('{time_bucket_size.value}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, count(distinct(query_id)) AS total_queries FROM field_access_history @@ -658,7 +633,7 @@ def usage_per_object_per_time_bucket_for_time_window( ( SELECT object_name, - DATE_TRUNC('{time_bucket_size}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, + DATE_TRUNC('{time_bucket_size.value}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, count(distinct(query_id)) AS total_queries, user_name, ANY_VALUE(users.email) AS user_email @@ -677,7 +652,7 @@ def usage_per_object_per_time_bucket_for_time_window( ( SELECT object_name, - DATE_TRUNC('{time_bucket_size}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, + DATE_TRUNC('{time_bucket_size.value}', CONVERT_TIMEZONE('UTC', query_start_time)) AS bucket_start_time, query_history.query_text AS query_text, count(distinct(access_history.query_id)) AS total_queries FROM diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py index dab46645bffcc..e5b214ba35e4b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py @@ -261,6 +261,7 @@ def get_tables_for_database( for table in cur: if table["TABLE_SCHEMA"] not in tables: tables[table["TABLE_SCHEMA"]] = [] + tables[table["TABLE_SCHEMA"]].append( SnowflakeTable( name=table["TABLE_NAME"], diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py new file mode 100644 index 0000000000000..6f7520bbf1988 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_shares.py @@ -0,0 +1,158 @@ +import logging +from typing import Callable, Iterable, List + +from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.snowflake.snowflake_config import ( + DatabaseId, + SnowflakeV2Config, +) +from 
datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report +from datahub.ingestion.source.snowflake.snowflake_schema import SnowflakeDatabase +from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin +from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings +from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( + DatasetLineageType, + Upstream, + UpstreamLineage, +) + +logger: logging.Logger = logging.getLogger(__name__) + + +class SnowflakeSharesHandler(SnowflakeCommonMixin): + def __init__( + self, + config: SnowflakeV2Config, + report: SnowflakeV2Report, + dataset_urn_builder: Callable[[str], str], + ) -> None: + self.config = config + self.report = report + self.logger = logger + self.dataset_urn_builder = dataset_urn_builder + + def get_shares_workunits( + self, databases: List[SnowflakeDatabase] + ) -> Iterable[MetadataWorkUnit]: + inbounds = self.config.inbounds() + outbounds = self.config.outbounds() + # None of the databases are shared + if not (inbounds or outbounds): + return + + logger.debug("Checking databases for inbound or outbound shares.") + for db in databases: + is_inbound = db.name in inbounds + is_outbound = db.name in outbounds + + if not (is_inbound or is_outbound): + logger.debug(f"database {db.name} is not shared.") + continue + + sibling_dbs = ( + list(outbounds[db.name]) if is_outbound else [inbounds[db.name]] + ) + + for schema in db.schemas: + for table_name in schema.tables + schema.views: + # TODO: If this is outbound database, + # 1. attempt listing shares using `show shares` to identify name of share associated with this database (cache query result). + # 2. if corresponding share is listed, then run `show grants to share ` to identify exact tables, views included in share. + # 3. emit siblings only for the objects listed above. + # This will work only if the configured role has accountadmin role access OR is owner of share. + # Otherwise ghost nodes may be shown in "Composed Of" section for tables/views in original database which are not granted to share. + yield from self.gen_siblings( + db.name, + schema.name, + table_name, + is_outbound, + sibling_dbs, + ) + + if is_inbound: + assert len(sibling_dbs) == 1 + # SnowflakeLineageExtractor is unaware of database->schema->table hierarchy + # hence this lineage code is not written in SnowflakeLineageExtractor + # also this is not governed by configs include_table_lineage and include_view_lineage + yield self.get_upstream_lineage_with_primary_sibling( + db.name, schema.name, table_name, sibling_dbs[0] + ) + + self.report_missing_databases( + databases, list(inbounds.keys()), list(outbounds.keys()) + ) + + def report_missing_databases( + self, + databases: List[SnowflakeDatabase], + inbounds: List[str], + outbounds: List[str], + ) -> None: + db_names = [db.name for db in databases] + missing_dbs = [db for db in inbounds + outbounds if db not in db_names] + + if missing_dbs: + self.report_warning( + "snowflake-shares", + f"Databases {missing_dbs} were not ingested. 
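# Sketch of the sibling relationship that gen_siblings() (defined just below) emits for a
# shared table: the dataset URN in the current account is linked to the equivalent dataset
# in the other account's platform instance, with the producer side marked as primary.
# All identifiers are illustrative.
from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings

local_urn = make_dataset_urn_with_platform_instance(
    "snowflake", "sales_db.public.orders", "acct_producer"
)
remote_urn = make_dataset_urn_with_platform_instance(
    "snowflake", "sales_db_shared.public.orders", "acct_consumer"
)

wu = MetadataChangeProposalWrapper(
    entityUrn=local_urn,
    aspect=Siblings(primary=True, siblings=[remote_urn]),
).as_workunit()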
Siblings/Lineage will not be set for these.", + ) + + def gen_siblings( + self, + database_name: str, + schema_name: str, + table_name: str, + primary: bool, + sibling_databases: List[DatabaseId], + ) -> Iterable[MetadataWorkUnit]: + if not sibling_databases: + return + dataset_identifier = self.get_dataset_identifier( + table_name, schema_name, database_name + ) + urn = self.dataset_urn_builder(dataset_identifier) + + sibling_urns = [ + make_dataset_urn_with_platform_instance( + self.platform, + self.get_dataset_identifier( + table_name, schema_name, sibling_db.database + ), + sibling_db.platform_instance, + ) + for sibling_db in sibling_databases + ] + + yield MetadataChangeProposalWrapper( + entityUrn=urn, + aspect=Siblings(primary=primary, siblings=sorted(sibling_urns)), + ).as_workunit() + + def get_upstream_lineage_with_primary_sibling( + self, + database_name: str, + schema_name: str, + table_name: str, + primary_sibling_db: DatabaseId, + ) -> MetadataWorkUnit: + dataset_identifier = self.get_dataset_identifier( + table_name, schema_name, database_name + ) + urn = self.dataset_urn_builder(dataset_identifier) + + upstream_urn = make_dataset_urn_with_platform_instance( + self.platform, + self.get_dataset_identifier( + table_name, schema_name, primary_sibling_db.database + ), + primary_sibling_db.platform_instance, + ) + + return MetadataChangeProposalWrapper( + entityUrn=urn, + aspect=UpstreamLineage( + upstreams=[Upstream(dataset=upstream_urn, type=DatasetLineageType.COPY)] + ), + ).as_workunit() diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py index 3605205b6055c..f8dfa612952d8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py @@ -356,7 +356,6 @@ def _check_usage_date_ranges(self) -> Any: def _get_operation_aspect_work_unit( self, event: SnowflakeJoinedAccessEvent, discovered_datasets: List[str] ) -> Iterable[MetadataWorkUnit]: - if event.query_start_time and event.query_type: start_time = event.query_start_time query_type = event.query_type diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 7dd51d5b20e8e..2cb4b37fdd696 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -51,9 +51,6 @@ SnowflakeV2Config, TagOption, ) -from datahub.ingestion.source.snowflake.snowflake_lineage_legacy import ( - SnowflakeLineageExtractor as SnowflakeLineageLegacyExtractor, -) from datahub.ingestion.source.snowflake.snowflake_lineage_v2 import ( SnowflakeLineageExtractor, ) @@ -71,6 +68,7 @@ SnowflakeTag, SnowflakeView, ) +from datahub.ingestion.source.snowflake.snowflake_shares import SnowflakeSharesHandler from datahub.ingestion.source.snowflake.snowflake_tag import SnowflakeTagExtractor from datahub.ingestion.source.snowflake.snowflake_usage_v2 import ( SnowflakeUsageExtractor, @@ -240,19 +238,10 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): # For database, schema, tables, views, etc self.data_dictionary = SnowflakeDataDictionary() - self.lineage_extractor: Union[ - SnowflakeLineageExtractor, SnowflakeLineageLegacyExtractor - ] if config.include_table_lineage: - # For lineage - if 
self.config.use_legacy_lineage_method: - self.lineage_extractor = SnowflakeLineageLegacyExtractor( - config, self.report, dataset_urn_builder=self.gen_dataset_urn - ) - else: - self.lineage_extractor = SnowflakeLineageExtractor( - config, self.report, dataset_urn_builder=self.gen_dataset_urn - ) + self.lineage_extractor = SnowflakeLineageExtractor( + config, self.report, dataset_urn_builder=self.gen_dataset_urn + ) if config.include_usage_stats or config.include_operational_stats: self.usage_extractor = SnowflakeUsageExtractor( @@ -503,9 +492,16 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: return self.data_dictionary.set_connection(self.connection) - databases = self.get_databases() + databases: List[SnowflakeDatabase] = [] - if databases is None or len(databases) == 0: + for database in self.get_databases() or []: + self.report.report_entity_scanned(database.name, "database") + if not self.config.database_pattern.allowed(database.name): + self.report.report_dropped(f"{database.name}.*") + else: + databases.append(database) + + if len(databases) == 0: return for snowflake_db in databases: @@ -532,25 +528,22 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: # TODO: The checkpoint state for stale entity detection can be committed here. + if self.config.shares: + yield from SnowflakeSharesHandler( + self.config, self.report, self.gen_dataset_urn + ).get_shares_workunits(databases) + discovered_tables: List[str] = [ self.get_dataset_identifier(table_name, schema.name, db.name) for db in databases for schema in db.schemas for table_name in schema.tables - if self._is_dataset_pattern_allowed( - self.get_dataset_identifier(table_name, schema.name, db.name), - SnowflakeObjectDomain.TABLE, - ) ] discovered_views: List[str] = [ self.get_dataset_identifier(table_name, schema.name, db.name) for db in databases for schema in db.schemas for table_name in schema.views - if self._is_dataset_pattern_allowed( - self.get_dataset_identifier(table_name, schema.name, db.name), - SnowflakeObjectDomain.VIEW, - ) ] if len(discovered_tables) == 0 and len(discovered_views) == 0: @@ -654,11 +647,6 @@ def get_databases_from_ischema(self, databases): def _process_database( self, snowflake_db: SnowflakeDatabase ) -> Iterable[MetadataWorkUnit]: - self.report.report_entity_scanned(snowflake_db.name, "database") - if not self.config.database_pattern.allowed(snowflake_db.name): - self.report.report_dropped(f"{snowflake_db.name}.*") - return - db_name = snowflake_db.name try: @@ -704,11 +692,22 @@ def _process_database( if self.config.is_profiling_enabled() and self.db_tables: yield from self.profiler.get_workunits(snowflake_db, self.db_tables) - def fetch_schemas_for_database(self, snowflake_db, db_name): + def fetch_schemas_for_database( + self, snowflake_db: SnowflakeDatabase, db_name: str + ) -> None: + schemas: List[SnowflakeSchema] = [] try: - snowflake_db.schemas = self.data_dictionary.get_schemas_for_database( - db_name - ) + for schema in self.data_dictionary.get_schemas_for_database(db_name): + self.report.report_entity_scanned(schema.name, "schema") + if not is_schema_allowed( + self.config.schema_pattern, + schema.name, + db_name, + self.config.match_fully_qualified_names, + ): + self.report.report_dropped(f"{db_name}.{schema.name}.*") + else: + schemas.append(schema) except Exception as e: if isinstance(e, SnowflakePermissionError): error_msg = f"Failed to get schemas for database {db_name}. Please check permissions." 
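# Small sketch of the AllowDenyPattern check that the fetch_* helpers now apply while
# listing databases, schemas, tables and views (dropped entities are reported right away
# instead of later in the _process_* methods). The deny regex is an example only.
from datahub.configuration.common import AllowDenyPattern

table_pattern = AllowDenyPattern(deny=[r"sales_db\.public\.tmp_.*"])

for identifier in ["sales_db.public.orders", "sales_db.public.tmp_stage_1"]:
    if table_pattern.allowed(identifier):
        print(f"keeping {identifier}")
    else:
        print(f"dropping {identifier}")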
@@ -724,25 +723,17 @@ def fetch_schemas_for_database(self, snowflake_db, db_name): db_name, ) - if not snowflake_db.schemas: + if not schemas: self.report_warning( "No schemas found in database. If schemas exist, please grant USAGE permissions on them.", db_name, ) + else: + snowflake_db.schemas = schemas def _process_schema( self, snowflake_schema: SnowflakeSchema, db_name: str ) -> Iterable[MetadataWorkUnit]: - self.report.report_entity_scanned(snowflake_schema.name, "schema") - if not is_schema_allowed( - self.config.schema_pattern, - snowflake_schema.name, - db_name, - self.config.match_fully_qualified_names, - ): - self.report.report_dropped(f"{db_name}.{snowflake_schema.name}.*") - return - schema_name = snowflake_schema.name if self.config.extract_tags != TagOption.skip: @@ -784,9 +775,20 @@ def _process_schema( f"{db_name}.{schema_name}", ) - def fetch_views_for_schema(self, snowflake_schema, db_name, schema_name): + def fetch_views_for_schema( + self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str + ) -> List[SnowflakeView]: try: - views = self.get_views_for_schema(schema_name, db_name) + views: List[SnowflakeView] = [] + for view in self.get_views_for_schema(schema_name, db_name): + view_name = self.get_dataset_identifier(view.name, schema_name, db_name) + + self.report.report_entity_scanned(view_name, "view") + + if not self.config.view_pattern.allowed(view_name): + self.report.report_dropped(view_name) + else: + views.append(view) snowflake_schema.views = [view.name for view in views] return views except Exception as e: @@ -804,10 +806,22 @@ def fetch_views_for_schema(self, snowflake_schema, db_name, schema_name): "Failed to get views for schema", f"{db_name}.{schema_name}", ) + return [] - def fetch_tables_for_schema(self, snowflake_schema, db_name, schema_name): + def fetch_tables_for_schema( + self, snowflake_schema: SnowflakeSchema, db_name: str, schema_name: str + ) -> List[SnowflakeTable]: try: - tables = self.get_tables_for_schema(schema_name, db_name) + tables: List[SnowflakeTable] = [] + for table in self.get_tables_for_schema(schema_name, db_name): + table_identifier = self.get_dataset_identifier( + table.name, schema_name, db_name + ) + self.report.report_entity_scanned(table_identifier) + if not self.config.table_pattern.allowed(table_identifier): + self.report.report_dropped(table_identifier) + else: + tables.append(table) snowflake_schema.tables = [table.name for table in tables] return tables except Exception as e: @@ -824,6 +838,7 @@ def fetch_tables_for_schema(self, snowflake_schema, db_name, schema_name): "Failed to get tables for schema", f"{db_name}.{schema_name}", ) + return [] def _process_table( self, @@ -833,12 +848,6 @@ def _process_table( ) -> Iterable[MetadataWorkUnit]: table_identifier = self.get_dataset_identifier(table.name, schema_name, db_name) - self.report.report_entity_scanned(table_identifier) - - if not self.config.table_pattern.allowed(table_identifier): - self.report.report_dropped(table_identifier) - return - self.fetch_columns_for_table(table, schema_name, db_name, table_identifier) self.fetch_pk_for_table(table, schema_name, db_name, table_identifier) @@ -950,12 +959,6 @@ def _process_view( ) -> Iterable[MetadataWorkUnit]: view_name = self.get_dataset_identifier(view.name, schema_name, db_name) - self.report.report_entity_scanned(view_name, "view") - - if not self.config.view_pattern.allowed(view_name): - self.report.report_dropped(view_name) - return - try: view.columns = self.get_columns_for_table(view.name, 
schema_name, db_name) if self.config.extract_tags != TagOption.skip: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql.py deleted file mode 100644 index a9afd40fd45b6..0000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql.py +++ /dev/null @@ -1,278 +0,0 @@ -import logging -import urllib.parse -from typing import Any, Dict, Iterable, List, Optional, Tuple - -import pydantic -import sqlalchemy.dialects.mssql - -# This import verifies that the dependencies are available. -import sqlalchemy_pytds # noqa: F401 -from pydantic.fields import Field -from sqlalchemy import create_engine, inspect -from sqlalchemy.engine.base import Connection -from sqlalchemy.engine.reflection import Inspector - -from datahub.configuration.common import AllowDenyPattern -from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.api.decorators import ( - SourceCapability, - SupportStatus, - capability, - config_class, - platform_name, - support_status, -) -from datahub.ingestion.source.sql.sql_common import ( - SQLAlchemySource, - register_custom_type, -) -from datahub.ingestion.source.sql.sql_config import ( - BasicSQLAlchemyConfig, - make_sqlalchemy_uri, -) -from datahub.metadata.schema_classes import BooleanTypeClass, UnionTypeClass - -logger: logging.Logger = logging.getLogger(__name__) - -register_custom_type(sqlalchemy.dialects.mssql.BIT, BooleanTypeClass) -register_custom_type(sqlalchemy.dialects.mssql.SQL_VARIANT, UnionTypeClass) - - -class SQLServerConfig(BasicSQLAlchemyConfig): - # defaults - host_port: str = Field(default="localhost:1433", description="MSSQL host URL.") - scheme: str = Field(default="mssql+pytds", description="", hidden_from_docs=True) - use_odbc: bool = Field( - default=False, - description="See https://docs.sqlalchemy.org/en/14/dialects/mssql.html#module-sqlalchemy.dialects.mssql.pyodbc.", - ) - uri_args: Dict[str, str] = Field( - default={}, - description="Arguments to URL-encode when connecting. See https://docs.microsoft.com/en-us/sql/connect/odbc/dsn-connection-string-attribute?view=sql-server-ver15.", - ) - database_pattern: AllowDenyPattern = Field( - default=AllowDenyPattern.allow_all(), - description="Regex patterns for databases to filter in ingestion.", - ) - database: Optional[str] = Field( - default=None, - description="database (catalog). If set to Null, all databases will be considered for ingestion.", - ) - convert_urns_to_lowercase: bool = Field( - default=False, - description="Enable to convert the SQL Server assets urns to lowercase", - ) - - @pydantic.validator("uri_args") - def passwords_match(cls, v, values, **kwargs): - if values["use_odbc"] and "driver" not in v: - raise ValueError("uri_args must contain a 'driver' option") - elif not values["use_odbc"] and v: - raise ValueError("uri_args is not supported when ODBC is disabled") - return v - - def get_sql_alchemy_url( - self, - uri_opts: Optional[Dict[str, Any]] = None, - current_db: Optional[str] = None, - ) -> str: - if self.use_odbc: - # Ensure that the import is available. 
- import pyodbc # noqa: F401 - - self.scheme = "mssql+pyodbc" - - uri: str = self.sqlalchemy_uri or make_sqlalchemy_uri( - self.scheme, # type: ignore - self.username, - self.password.get_secret_value() if self.password else None, - self.host_port, # type: ignore - current_db if current_db else self.database, - uri_opts=uri_opts, - ) - if self.use_odbc: - uri = f"{uri}?{urllib.parse.urlencode(self.uri_args)}" - return uri - - -@platform_name("Microsoft SQL Server", id="mssql") -@config_class(SQLServerConfig) -@support_status(SupportStatus.CERTIFIED) -@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") -@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") -@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") -@capability(SourceCapability.DESCRIPTIONS, "Enabled by default") -@capability( - SourceCapability.USAGE_STATS, - "Not provided by this module, use `bigquery-usage` for that.", - supported=False, -) -@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") -class SQLServerSource(SQLAlchemySource): - """ - This plugin extracts the following: - - - Metadata for databases, schemas, views and tables - - Column types associated with each table/view - - Table, row, and column statistics via optional SQL profiling - - We have two options for the underlying library used to connect to SQL Server: (1) [python-tds](https://github.com/denisenkom/pytds) and (2) [pyodbc](https://github.com/mkleehammer/pyodbc). The TDS library is pure Python and hence easier to install, but only PyODBC supports encrypted connections. - """ - - def __init__(self, config: SQLServerConfig, ctx: PipelineContext): - super().__init__(config, ctx, "mssql") - # Cache the table and column descriptions - self.config: SQLServerConfig = config - self.current_database = None - self.table_descriptions: Dict[str, str] = {} - self.column_descriptions: Dict[str, str] = {} - for inspector in self.get_inspectors(): - db_name: str = self.get_db_name(inspector) - with inspector.engine.connect() as conn: - if self.config.use_odbc: - self._add_output_converters(conn) - self._populate_table_descriptions(conn, db_name) - self._populate_column_descriptions(conn, db_name) - - @staticmethod - def _add_output_converters(conn: Connection) -> None: - def handle_sql_variant_as_string(value): - return value.decode("utf-16le") - - # see https://stackoverflow.com/questions/45677374/pandas-pyodbc-odbc-sql-type-150-is-not-yet-supported - # and https://stackoverflow.com/questions/11671170/adding-output-converter-to-pyodbc-connection-in-sqlalchemy - try: - conn.connection.add_output_converter(-150, handle_sql_variant_as_string) - except AttributeError as e: - logger.debug( - f"Failed to mount output converter for MSSQL data type -150 due to {e}" - ) - - def _populate_table_descriptions(self, conn: Connection, db_name: str) -> None: - # see https://stackoverflow.com/questions/5953330/how-do-i-map-the-id-in-sys-extended-properties-to-an-object-name - # also see https://www.mssqltips.com/sqlservertip/5384/working-with-sql-server-extended-properties/ - table_metadata = conn.execute( - """ - SELECT - SCHEMA_NAME(T.SCHEMA_ID) AS schema_name, - T.NAME AS table_name, - EP.VALUE AS table_description - FROM sys.tables AS T - INNER JOIN sys.extended_properties AS EP - ON EP.MAJOR_ID = T.[OBJECT_ID] - AND EP.MINOR_ID = 0 - AND EP.NAME = 'MS_Description' - AND EP.CLASS = 1 - """ - ) - for row in table_metadata: - self.table_descriptions[ - 
f"{db_name}.{row['schema_name']}.{row['table_name']}" - ] = row["table_description"] - - def _populate_column_descriptions(self, conn: Connection, db_name: str) -> None: - column_metadata = conn.execute( - """ - SELECT - SCHEMA_NAME(T.SCHEMA_ID) AS schema_name, - T.NAME AS table_name, - C.NAME AS column_name , - EP.VALUE AS column_description - FROM sys.tables AS T - INNER JOIN sys.all_columns AS C - ON C.OBJECT_ID = T.[OBJECT_ID] - INNER JOIN sys.extended_properties AS EP - ON EP.MAJOR_ID = T.[OBJECT_ID] - AND EP.MINOR_ID = C.COLUMN_ID - AND EP.NAME = 'MS_Description' - AND EP.CLASS = 1 - """ - ) - for row in column_metadata: - self.column_descriptions[ - f"{db_name}.{row['schema_name']}.{row['table_name']}.{row['column_name']}" - ] = row["column_description"] - - @classmethod - def create(cls, config_dict: Dict, ctx: PipelineContext) -> "SQLServerSource": - config = SQLServerConfig.parse_obj(config_dict) - return cls(config, ctx) - - # override to get table descriptions - def get_table_properties( - self, inspector: Inspector, schema: str, table: str - ) -> Tuple[Optional[str], Dict[str, str], Optional[str]]: - description, properties, location_urn = super().get_table_properties( - inspector, schema, table - ) - # Update description if available. - db_name: str = self.get_db_name(inspector) - description = self.table_descriptions.get( - f"{db_name}.{schema}.{table}", description - ) - return description, properties, location_urn - - # override to get column descriptions - def _get_columns( - self, dataset_name: str, inspector: Inspector, schema: str, table: str - ) -> List[Dict]: - columns: List[Dict] = super()._get_columns( - dataset_name, inspector, schema, table - ) - # Update column description if available. - db_name: str = self.get_db_name(inspector) - for column in columns: - description: Optional[str] = self.column_descriptions.get( - f"{db_name}.{schema}.{table}.{column['name']}", - ) - if description: - column["comment"] = description - return columns - - def get_inspectors(self) -> Iterable[Inspector]: - # This method can be overridden in the case that you want to dynamically - # run on multiple databases. 
- url = self.config.get_sql_alchemy_url() - logger.debug(f"sql_alchemy_url={url}") - engine = create_engine(url, **self.config.options) - with engine.connect() as conn: - if self.config.database and self.config.database != "": - inspector = inspect(conn) - yield inspector - else: - databases = conn.execute( - "SELECT name FROM master.sys.databases WHERE name NOT IN \ - ('master', 'model', 'msdb', 'tempdb', 'Resource', \ - 'distribution' , 'reportserver', 'reportservertempdb'); " - ) - for db in databases: - if self.config.database_pattern.allowed(db["name"]): - url = self.config.get_sql_alchemy_url(current_db=db["name"]) - with create_engine( - url, **self.config.options - ).connect() as conn: - inspector = inspect(conn) - self.current_database = db["name"] - yield inspector - - def get_identifier( - self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any - ) -> str: - regular = f"{schema}.{entity}" - - qualified_table_name = regular - - if self.config.database: - if self.config.database_alias: - qualified_table_name = f"{self.config.database_alias}.{regular}" - else: - qualified_table_name = f"{self.config.database}.{regular}" - - if self.current_database: - qualified_table_name = f"{self.current_database}.{regular}" - - return ( - qualified_table_name.lower() - if self.config.convert_urns_to_lowercase - else qualified_table_name - ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/__init__.py new file mode 100644 index 0000000000000..8db89505a9cf6 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/__init__.py @@ -0,0 +1 @@ +from datahub.ingestion.source.sql.mssql.source import SQLServerConfig, SQLServerSource diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py new file mode 100644 index 0000000000000..8aeb5421891aa --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py @@ -0,0 +1,239 @@ +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Union + +from datahub.emitter.mce_builder import make_data_flow_urn, make_data_job_urn +from datahub.metadata.schema_classes import ( + DataFlowInfoClass, + DataJobInfoClass, + DataJobInputOutputClass, +) + + +@dataclass +class ProcedureDependency: + db: str + schema: str + name: str + type: str + env: str + server: str + source: str = "mssql" + + +@dataclass +class ProcedureLineageStream: + dependencies: List[ProcedureDependency] + + @property + def as_property(self) -> Dict[str, str]: + return { + f"{dep.db}.{dep.schema}.{dep.name}": dep.type for dep in self.dependencies + } + + +@dataclass +class MSSQLJob: + db: str + platform_instance: str + name: str + env: str + source: str = "mssql" + type: str = "JOB" + + @property + def formatted_name(self) -> str: + return f"{self.formatted_platform_instance}.{self.name.replace(',', '-')}" + + @property + def full_type(self) -> str: + return f"({self.source},{self.formatted_name},{self.env})" + + @property + def orchestrator(self) -> str: + return self.source + + @property + def formatted_platform_instance(self) -> str: + return self.platform_instance.replace(".", "/") + + @property + def cluster(self) -> str: + return f"{self.env}" + + +@dataclass +class MSSQLProceduresContainer: + db: str + platform_instance: str + name: str + env: str + source: str = "mssql" + type: str = "JOB" + + @property + def 
formatted_name(self) -> str: + return f"{self.formatted_platform_instance}.{self.name.replace(',', '-')}" + + @property + def orchestrator(self) -> str: + return self.source + + @property + def formatted_platform_instance(self) -> str: + return self.platform_instance.replace(".", "/") + + @property + def cluster(self) -> str: + return f"{self.env}" + + @property + def full_type(self) -> str: + return f"({self.source},{self.name},{self.env})" + + +@dataclass +class ProcedureParameter: + name: str + type: str + + @property + def properties(self) -> Dict[str, str]: + return {"type": self.type} + + +@dataclass +class StoredProcedure: + db: str + schema: str + name: str + flow: Union[MSSQLJob, MSSQLProceduresContainer] + type: str = "STORED_PROCEDURE" + source: str = "mssql" + + @property + def full_type(self) -> str: + return self.source.upper() + "_" + self.type + + @property + def formatted_name(self) -> str: + return self.name.replace(",", "-") + + @property + def full_name(self) -> str: + return f"{self.db}.{self.schema}.{self.formatted_name}" + + @property + def escape_full_name(self) -> str: + return f"[{self.db}].[{self.schema}].[{self.formatted_name}]" + + +@dataclass +class JobStep: + job_name: str + step_name: str + flow: MSSQLJob + type: str = "JOB_STEP" + source: str = "mssql" + + @property + def formatted_step(self) -> str: + return self.step_name.replace(",", "-").replace(" ", "_").lower() + + @property + def formatted_name(self) -> str: + return self.job_name.replace(",", "-") + + @property + def full_type(self) -> str: + return self.source.upper() + "_" + self.type + + @property + def full_name(self) -> str: + return f"{self.formatted_name}.{self.formatted_name}" + + +@dataclass +class MSSQLDataJob: + entity: Union[StoredProcedure, JobStep] + type: str = "dataJob" + source: str = "mssql" + external_url: str = "" + description: Optional[str] = None + status: Optional[str] = None + incoming: List[str] = field(default_factory=list) + outgoing: List[str] = field(default_factory=list) + input_jobs: List[str] = field(default_factory=list) + job_properties: Dict[str, str] = field(default_factory=dict) + + @property + def urn(self) -> str: + return make_data_job_urn( + orchestrator=self.entity.flow.orchestrator, + flow_id=self.entity.flow.formatted_name, + job_id=self.entity.formatted_name, + cluster=self.entity.flow.cluster, + ) + + def add_property( + self, + name: str, + value: str, + ) -> None: + self.job_properties[name] = value + + @property + def valued_properties(self) -> Dict[str, str]: + if self.job_properties: + return {k: v for k, v in self.job_properties.items() if v is not None} + return self.job_properties + + @property + def as_datajob_input_output_aspect(self) -> DataJobInputOutputClass: + return DataJobInputOutputClass( + inputDatasets=sorted(self.incoming), + outputDatasets=sorted(self.outgoing), + inputDatajobs=sorted(self.input_jobs), + ) + + @property + def as_datajob_info_aspect(self) -> DataJobInfoClass: + return DataJobInfoClass( + name=self.entity.full_name, + type=self.entity.full_type, + description=self.description, + customProperties=self.valued_properties, + externalUrl=self.external_url, + status=self.status, + ) + + +@dataclass +class MSSQLDataFlow: + entity: Union[MSSQLJob, MSSQLProceduresContainer] + type: str = "dataFlow" + source: str = "mssql" + external_url: str = "" + flow_properties: Dict[str, str] = field(default_factory=dict) + + def add_property( + self, + name: str, + value: str, + ) -> None: + self.flow_properties[name] = value + + 
@property + def urn(self) -> str: + return make_data_flow_urn( + orchestrator=self.entity.orchestrator, + flow_id=self.entity.formatted_name, + cluster=self.entity.cluster, + ) + + @property + def as_dataflow_info_aspect(self) -> DataFlowInfoClass: + return DataFlowInfoClass( + name=self.entity.formatted_name, + customProperties=self.flow_properties, + externalUrl=self.external_url, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py new file mode 100644 index 0000000000000..3c7701d93edeb --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py @@ -0,0 +1,665 @@ +import logging +import re +import urllib.parse +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +import pydantic +import sqlalchemy.dialects.mssql + +# This import verifies that the dependencies are available. +import sqlalchemy_pytds # noqa: F401 +from pydantic.fields import Field +from sqlalchemy import create_engine, inspect +from sqlalchemy.engine.base import Connection +from sqlalchemy.engine.reflection import Inspector +from sqlalchemy.exc import ProgrammingError, ResourceClosedError + +from datahub.configuration.common import AllowDenyPattern +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.sql.mssql.job_models import ( + JobStep, + MSSQLDataFlow, + MSSQLDataJob, + MSSQLJob, + MSSQLProceduresContainer, + ProcedureDependency, + ProcedureLineageStream, + ProcedureParameter, + StoredProcedure, +) +from datahub.ingestion.source.sql.sql_common import ( + SQLAlchemySource, + SqlWorkUnit, + register_custom_type, +) +from datahub.ingestion.source.sql.sql_config import ( + BasicSQLAlchemyConfig, + make_sqlalchemy_uri, +) +from datahub.metadata.schema_classes import BooleanTypeClass, UnionTypeClass + +logger: logging.Logger = logging.getLogger(__name__) + +register_custom_type(sqlalchemy.dialects.mssql.BIT, BooleanTypeClass) +register_custom_type(sqlalchemy.dialects.mssql.SQL_VARIANT, UnionTypeClass) + + +class SQLServerConfig(BasicSQLAlchemyConfig): + # defaults + host_port: str = Field(default="localhost:1433", description="MSSQL host URL.") + scheme: str = Field(default="mssql+pytds", description="", hidden_from_docs=True) + include_stored_procedures: bool = Field( + default=True, + description="Include ingest of stored procedures. Requires access to the 'sys' schema.", + ) + include_stored_procedures_code: bool = Field( + default=True, description="Include information about object code." + ) + include_jobs: bool = Field( + default=True, + description="Include ingest of MSSQL Jobs. Requires access to the 'msdb' and 'sys' schema.", + ) + include_descriptions: bool = Field( + default=True, description="Include table descriptions information." + ) + use_odbc: bool = Field( + default=False, + description="See https://docs.sqlalchemy.org/en/14/dialects/mssql.html#module-sqlalchemy.dialects.mssql.pyodbc.", + ) + uri_args: Dict[str, str] = Field( + default={}, + description="Arguments to URL-encode when connecting. 
See https://docs.microsoft.com/en-us/sql/connect/odbc/dsn-connection-string-attribute?view=sql-server-ver15.", + ) + database_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="Regex patterns for databases to filter in ingestion.", + ) + database: Optional[str] = Field( + default=None, + description="database (catalog). If set to Null, all databases will be considered for ingestion.", + ) + convert_urns_to_lowercase: bool = Field( + default=False, + description="Enable to convert the SQL Server assets urns to lowercase", + ) + + @pydantic.validator("uri_args") + def passwords_match(cls, v, values, **kwargs): + if values["use_odbc"] and "driver" not in v: + raise ValueError("uri_args must contain a 'driver' option") + elif not values["use_odbc"] and v: + raise ValueError("uri_args is not supported when ODBC is disabled") + return v + + def get_sql_alchemy_url( + self, + uri_opts: Optional[Dict[str, Any]] = None, + current_db: Optional[str] = None, + ) -> str: + if self.use_odbc: + # Ensure that the import is available. + import pyodbc # noqa: F401 + + self.scheme = "mssql+pyodbc" + + uri: str = self.sqlalchemy_uri or make_sqlalchemy_uri( + self.scheme, # type: ignore + self.username, + self.password.get_secret_value() if self.password else None, + self.host_port, # type: ignore + current_db if current_db else self.database, + uri_opts=uri_opts, + ) + if self.use_odbc: + uri = f"{uri}?{urllib.parse.urlencode(self.uri_args)}" + return uri + + @property + def host(self): + return self.platform_instance or self.host_port.split(":")[0] + + @property + def db(self): + return self.database_alias or self.database + + +@platform_name("Microsoft SQL Server", id="mssql") +@config_class(SQLServerConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") +@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") +@capability(SourceCapability.DESCRIPTIONS, "Enabled by default") +@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") +class SQLServerSource(SQLAlchemySource): + """ + This plugin extracts the following: + - Metadata for databases, schemas, views and tables + - Column types associated with each table/view + - Table, row, and column statistics via optional SQL profiling + We have two options for the underlying library used to connect to SQL Server: (1) [python-tds](https://github.com/denisenkom/pytds) and (2) [pyodbc](https://github.com/mkleehammer/pyodbc). The TDS library is pure Python and hence easier to install, but only PyODBC supports encrypted connections. 
+ """ + + def __init__(self, config: SQLServerConfig, ctx: PipelineContext): + super().__init__(config, ctx, "mssql") + # Cache the table and column descriptions + self.config: SQLServerConfig = config + self.current_database = None + self.table_descriptions: Dict[str, str] = {} + self.column_descriptions: Dict[str, str] = {} + if self.config.include_descriptions: + for inspector in self.get_inspectors(): + db_name: str = self.get_db_name(inspector) + with inspector.engine.connect() as conn: + if self.config.use_odbc: + self._add_output_converters(conn) + self._populate_table_descriptions(conn, db_name) + self._populate_column_descriptions(conn, db_name) + + @staticmethod + def _add_output_converters(conn: Connection) -> None: + def handle_sql_variant_as_string(value): + try: + return value.decode("utf-16le") + except UnicodeDecodeError: + return value.decode("Windows-1251") + + # see https://stackoverflow.com/questions/45677374/pandas-pyodbc-odbc-sql-type-150-is-not-yet-supported + # and https://stackoverflow.com/questions/11671170/adding-output-converter-to-pyodbc-connection-in-sqlalchemy + try: + conn.connection.add_output_converter(-150, handle_sql_variant_as_string) + except AttributeError as e: + logger.debug( + f"Failed to mount output converter for MSSQL data type -150 due to {e}" + ) + + def _populate_table_descriptions(self, conn: Connection, db_name: str) -> None: + # see https://stackoverflow.com/questions/5953330/how-do-i-map-the-id-in-sys-extended-properties-to-an-object-name + # also see https://www.mssqltips.com/sqlservertip/5384/working-with-sql-server-extended-properties/ + table_metadata = conn.execute( + """ + SELECT + SCHEMA_NAME(T.SCHEMA_ID) AS schema_name, + T.NAME AS table_name, + EP.VALUE AS table_description + FROM sys.tables AS T + INNER JOIN sys.extended_properties AS EP + ON EP.MAJOR_ID = T.[OBJECT_ID] + AND EP.MINOR_ID = 0 + AND EP.NAME = 'MS_Description' + AND EP.CLASS = 1 + """ + ) + for row in table_metadata: + self.table_descriptions[ + f"{db_name}.{row['schema_name']}.{row['table_name']}" + ] = row["table_description"] + + def _populate_column_descriptions(self, conn: Connection, db_name: str) -> None: + column_metadata = conn.execute( + """ + SELECT + SCHEMA_NAME(T.SCHEMA_ID) AS schema_name, + T.NAME AS table_name, + C.NAME AS column_name , + EP.VALUE AS column_description + FROM sys.tables AS T + INNER JOIN sys.all_columns AS C + ON C.OBJECT_ID = T.[OBJECT_ID] + INNER JOIN sys.extended_properties AS EP + ON EP.MAJOR_ID = T.[OBJECT_ID] + AND EP.MINOR_ID = C.COLUMN_ID + AND EP.NAME = 'MS_Description' + AND EP.CLASS = 1 + """ + ) + for row in column_metadata: + self.column_descriptions[ + f"{db_name}.{row['schema_name']}.{row['table_name']}.{row['column_name']}" + ] = row["column_description"] + + @classmethod + def create(cls, config_dict: Dict, ctx: PipelineContext) -> "SQLServerSource": + config = SQLServerConfig.parse_obj(config_dict) + return cls(config, ctx) + + # override to get table descriptions + def get_table_properties( + self, inspector: Inspector, schema: str, table: str + ) -> Tuple[Optional[str], Dict[str, str], Optional[str]]: + description, properties, location_urn = super().get_table_properties( + inspector, schema, table + ) + # Update description if available. 
+ db_name: str = self.get_db_name(inspector) + description = self.table_descriptions.get( + f"{db_name}.{schema}.{table}", description + ) + return description, properties, location_urn + + # override to get column descriptions + def _get_columns( + self, dataset_name: str, inspector: Inspector, schema: str, table: str + ) -> List[Dict]: + columns: List[Dict] = super()._get_columns( + dataset_name, inspector, schema, table + ) + # Update column description if available. + db_name: str = self.get_db_name(inspector) + for column in columns: + description: Optional[str] = self.column_descriptions.get( + f"{db_name}.{schema}.{table}.{column['name']}", + ) + if description: + column["comment"] = description + return columns + + def get_database_level_workunits( + self, + inspector: Inspector, + database: str, + ) -> Iterable[MetadataWorkUnit]: + yield from super().get_database_level_workunits( + inspector=inspector, + database=database, + ) + if self.config.include_jobs: + try: + yield from self.loop_jobs(inspector, self.config) + except Exception as e: + self.report.report_failure( + "jobs", + f"Failed to list jobs due to error {e}", + ) + + def get_schema_level_workunits( + self, + inspector: Inspector, + schema: str, + database: str, + ) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]: + yield from super().get_schema_level_workunits( + inspector=inspector, + schema=schema, + database=database, + ) + if self.config.include_stored_procedures: + try: + yield from self.loop_stored_procedures(inspector, schema, self.config) + except Exception as e: + self.report.report_failure( + "jobs", + f"Failed to list jobs due to error {e}", + ) + + def _get_jobs(self, conn: Connection, db_name: str) -> Dict[str, Dict[str, Any]]: + jobs_data = conn.execute( + f""" + SELECT + job.job_id, + job.name, + job.description, + job.date_created, + job.date_modified, + steps.step_id, + steps.step_name, + steps.subsystem, + steps.command, + steps.database_name + FROM + msdb.dbo.sysjobs job + INNER JOIN + msdb.dbo.sysjobsteps steps + ON + job.job_id = steps.job_id + where database_name = '{db_name}' + """ + ) + jobs: Dict[str, Dict[str, Any]] = {} + for row in jobs_data: + step_data = dict( + job_id=row["job_id"], + job_name=row["name"], + description=row["description"], + date_created=row["date_created"], + date_modified=row["date_modified"], + step_id=row["step_id"], + step_name=row["step_name"], + subsystem=row["subsystem"], + command=row["command"], + ) + if row["name"] in jobs: + jobs[row["name"]][row["step_id"]] = step_data + else: + jobs[row["name"]] = {row["step_id"]: step_data} + return jobs + + def loop_jobs( + self, + inspector: Inspector, + sql_config: SQLServerConfig, + ) -> Iterable[MetadataWorkUnit]: + """ + Loop MS SQL jobs as dataFlow-s. 
+ :return: + """ + db_name = self.get_db_name(inspector) + with inspector.engine.connect() as conn: + jobs = self._get_jobs(conn, db_name) + for job_name, job_steps in jobs.items(): + job = MSSQLJob( + name=job_name, + env=sql_config.env, + db=db_name, + platform_instance=sql_config.host, + ) + data_flow = MSSQLDataFlow(entity=job) + yield from self.construct_flow_workunits(data_flow=data_flow) + yield from self.loop_job_steps(job, job_steps) + + def loop_job_steps( + self, job: MSSQLJob, job_steps: Dict[str, Any] + ) -> Iterable[MetadataWorkUnit]: + for step_id, step_data in job_steps.items(): + step = JobStep( + job_name=job.formatted_name, + step_name=step_data["step_name"], + flow=job, + ) + data_job = MSSQLDataJob(entity=step) + for data_name, data_value in step_data.items(): + data_job.add_property(name=data_name, value=str(data_value)) + yield from self.construct_job_workunits(data_job) + + def loop_stored_procedures( # noqa: C901 + self, + inspector: Inspector, + schema: str, + sql_config: SQLServerConfig, + ) -> Iterable[MetadataWorkUnit]: + """ + Loop schema data for get stored procedures as dataJob-s. + """ + db_name = self.get_db_name(inspector) + procedure_flow_name = f"{db_name}.{schema}.stored_procedures" + mssql_default_job = MSSQLProceduresContainer( + name=procedure_flow_name, + env=sql_config.env, + db=db_name, + platform_instance=sql_config.host, + ) + data_flow = MSSQLDataFlow(entity=mssql_default_job) + with inspector.engine.connect() as conn: + procedures_data_list = self._get_stored_procedures(conn, db_name, schema) + procedures = [ + StoredProcedure(flow=mssql_default_job, **procedure_data) + for procedure_data in procedures_data_list + ] + if procedures: + yield from self.construct_flow_workunits(data_flow=data_flow) + for procedure in procedures: + upstream = self._get_procedure_upstream(conn, procedure) + downstream = self._get_procedure_downstream(conn, procedure) + data_job = MSSQLDataJob( + entity=procedure, + ) + # TODO: because of this upstream and downstream are more dependencies, + # can't be used as DataJobInputOutput. + # Should be reorganized into lineage. 
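# Illustrative note (not part of the change itself): `upstream.as_property` and
# `downstream.as_property` flatten the ProcedureDependency list into a plain
# {"db.schema.name": "type_desc"} mapping, e.g. {"mydb.dbo.orders": "USER_TABLE"}
# where "mydb.dbo.orders" is a made-up example table. That flattened dict is why
# the result is attached as a string job property below rather than emitted as
# DataJobInputOutput lineage (see the TODO above).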
+ data_job.add_property("procedure_depends_on", str(upstream.as_property)) + data_job.add_property( + "depending_on_procedure", str(downstream.as_property) + ) + procedure_definition, procedure_code = self._get_procedure_code( + conn, procedure + ) + if procedure_definition: + data_job.add_property("definition", procedure_definition) + if sql_config.include_stored_procedures_code and procedure_code: + data_job.add_property("code", procedure_code) + procedure_inputs = self._get_procedure_inputs(conn, procedure) + properties = self._get_procedure_properties(conn, procedure) + data_job.add_property( + "input parameters", str([param.name for param in procedure_inputs]) + ) + for param in procedure_inputs: + data_job.add_property( + f"parameter {param.name}", str(param.properties) + ) + for property_name, property_value in properties.items(): + data_job.add_property(property_name, str(property_value)) + yield from self.construct_job_workunits(data_job) + + @staticmethod + def _get_procedure_downstream( + conn: Connection, procedure: StoredProcedure + ) -> ProcedureLineageStream: + downstream_data = conn.execute( + f""" + SELECT DISTINCT OBJECT_SCHEMA_NAME ( referencing_id ) AS [schema], + OBJECT_NAME(referencing_id) AS [name], + o.type_desc AS [type] + FROM sys.sql_expression_dependencies AS sed + INNER JOIN sys.objects AS o ON sed.referencing_id = o.object_id + left join sys.objects o1 on sed.referenced_id = o1.object_id + WHERE referenced_id = OBJECT_ID(N'{procedure.escape_full_name}') + AND o.type_desc in ('TABLE_TYPE', 'VIEW', 'USER_TABLE') + """ + ) + downstream_dependencies = [] + for row in downstream_data: + downstream_dependencies.append( + ProcedureDependency( + db=procedure.db, + schema=row["schema"], + name=row["name"], + type=row["type"], + env=procedure.flow.env, + server=procedure.flow.platform_instance, + ) + ) + return ProcedureLineageStream(dependencies=downstream_dependencies) + + @staticmethod + def _get_procedure_upstream( + conn: Connection, procedure: StoredProcedure + ) -> ProcedureLineageStream: + upstream_data = conn.execute( + f""" + SELECT DISTINCT + coalesce(lower(referenced_database_name), db_name()) AS db, + referenced_schema_name AS [schema], + referenced_entity_name AS [name], + o1.type_desc AS [type] + FROM sys.sql_expression_dependencies AS sed + INNER JOIN sys.objects AS o ON sed.referencing_id = o.object_id + left join sys.objects o1 on sed.referenced_id = o1.object_id + WHERE referencing_id = OBJECT_ID(N'{procedure.escape_full_name}') + AND referenced_schema_name is not null + AND o1.type_desc in ('TABLE_TYPE', 'VIEW', 'SQL_STORED_PROCEDURE', 'USER_TABLE') + """ + ) + upstream_dependencies = [] + for row in upstream_data: + upstream_dependencies.append( + ProcedureDependency( + db=row["db"], + schema=row["schema"], + name=row["name"], + type=row["type"], + env=procedure.flow.env, + server=procedure.flow.platform_instance, + ) + ) + return ProcedureLineageStream(dependencies=upstream_dependencies) + + @staticmethod + def _get_procedure_inputs( + conn: Connection, procedure: StoredProcedure + ) -> List[ProcedureParameter]: + inputs_data = conn.execute( + f""" + SELECT + name, + type_name(user_type_id) AS 'type' + FROM sys.parameters + WHERE object_id = object_id('{procedure.escape_full_name}') + """ + ) + inputs_list = [] + for row in inputs_data: + inputs_list.append(ProcedureParameter(name=row["name"], type=row["type"])) + return inputs_list + + @staticmethod + def _get_procedure_code( + conn: Connection, procedure: StoredProcedure + ) -> 
Tuple[Optional[str], Optional[str]]: + query = f"EXEC [{procedure.db}].dbo.sp_helptext '{procedure.full_name}'" + try: + code_data = conn.execute(query) + except ProgrammingError: + logger.warning( + "Denied permission for read text from procedure '%s'", + procedure.full_name, + ) + return None, None + code_list = [] + code_slice_index = 0 + code_slice_text = "create procedure" + try: + for index, row in enumerate(code_data): + code_list.append(row["Text"]) + if code_slice_text in re.sub(" +", " ", row["Text"].lower()).strip(): + code_slice_index = index + definition = "\n".join(code_list[:code_slice_index]) + code = "\n".join(code_list[code_slice_index:]) + except ResourceClosedError: + logger.warning( + "Connection was closed from procedure '%s'", + procedure.full_name, + ) + return None, None + return definition, code + + @staticmethod + def _get_procedure_properties( + conn: Connection, procedure: StoredProcedure + ) -> Dict[str, Any]: + properties_data = conn.execute( + f""" + SELECT + create_date as date_created, + modify_date as date_modified + FROM sys.procedures + WHERE object_id = object_id('{procedure.full_name}') + """ + ) + properties = {} + for row in properties_data: + properties = dict( + date_created=row["date_created"], date_modified=row["date_modified"] + ) + return properties + + @staticmethod + def _get_stored_procedures( + conn: Connection, db_name: str, schema: str + ) -> List[Dict[str, str]]: + stored_procedures_data = conn.execute( + f""" + SELECT + pr.name as procedure_name, + s.name as schema_name + FROM + [{db_name}].[sys].[procedures] pr + INNER JOIN + [{db_name}].[sys].[schemas] s ON pr.schema_id = s.schema_id + where s.name = '{schema}' + """ + ) + procedures_list = [] + for row in stored_procedures_data: + procedures_list.append( + dict(db=db_name, schema=row["schema_name"], name=row["procedure_name"]) + ) + return procedures_list + + def construct_job_workunits( + self, + data_job: MSSQLDataJob, + ) -> Iterable[MetadataWorkUnit]: + yield MetadataChangeProposalWrapper( + entityUrn=data_job.urn, + aspect=data_job.as_datajob_info_aspect, + ).as_workunit() + + yield MetadataChangeProposalWrapper( + entityUrn=data_job.urn, + aspect=data_job.as_datajob_input_output_aspect, + ).as_workunit() + # TODO: Add SubType when it appear + + def construct_flow_workunits( + self, + data_flow: MSSQLDataFlow, + ) -> Iterable[MetadataWorkUnit]: + yield MetadataChangeProposalWrapper( + entityUrn=data_flow.urn, + aspect=data_flow.as_dataflow_info_aspect, + ).as_workunit() + # TODO: Add SubType when it appear + + def get_inspectors(self) -> Iterable[Inspector]: + # This method can be overridden in the case that you want to dynamically + # run on multiple databases. 
+ url = self.config.get_sql_alchemy_url() + logger.debug(f"sql_alchemy_url={url}") + engine = create_engine(url, **self.config.options) + with engine.connect() as conn: + if self.config.database and self.config.database != "": + inspector = inspect(conn) + yield inspector + else: + databases = conn.execute( + "SELECT name FROM master.sys.databases WHERE name NOT IN \ + ('master', 'model', 'msdb', 'tempdb', 'Resource', \ + 'distribution' , 'reportserver', 'reportservertempdb'); " + ) + for db in databases: + if self.config.database_pattern.allowed(db["name"]): + url = self.config.get_sql_alchemy_url(current_db=db["name"]) + with create_engine( + url, **self.config.options + ).connect() as conn: + inspector = inspect(conn) + self.current_database = db["name"] + yield inspector + + def get_identifier( + self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any + ) -> str: + regular = f"{schema}.{entity}" + qualified_table_name = regular + if self.config.database: + if self.config.database_alias: + qualified_table_name = f"{self.config.database_alias}.{regular}" + else: + qualified_table_name = f"{self.config.database}.{regular}" + if self.current_database: + qualified_table_name = f"{self.current_database}.{regular}" + return ( + qualified_table_name.lower() + if self.config.convert_urns_to_lowercase + else qualified_table_name + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 280f4f47adcdf..b5458a42192fc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -478,6 +478,27 @@ def add_table_to_schema_container( parent_container_key=schema_container_key, ) + def get_database_level_workunits( + self, + inspector: Inspector, + database: str, + ) -> Iterable[MetadataWorkUnit]: + yield from self.gen_database_containers(database=database) + + def get_schema_level_workunits( + self, + inspector: Inspector, + schema: str, + database: str, + ) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]: + yield from self.gen_schema_containers(schema=schema, database=database) + + if self.config.include_tables: + yield from self.loop_tables(inspector, schema, self.config) + + if self.config.include_views: + yield from self.loop_views(inspector, schema, self.config) + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), @@ -516,27 +537,20 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit ) db_name = self.get_db_name(inspector) - yield from self.gen_database_containers( + yield from self.get_database_level_workunits( + inspector=inspector, database=db_name, ) for schema in self.get_allowed_schemas(inspector, db_name): self.add_information_for_schema(inspector, schema) - yield from self.gen_schema_containers( - database=db_name, + yield from self.get_schema_level_workunits( + inspector=inspector, schema=schema, - extra_properties=self.get_schema_properties( - inspector=inspector, schema=schema, database=db_name - ), + database=db_name, ) - if sql_config.include_tables: - yield from self.loop_tables(inspector, schema, sql_config) - - if sql_config.include_views: - yield from self.loop_views(inspector, schema, sql_config) - if profiler: profile_requests += list( self.loop_profiler_requests(inspector, schema, sql_config) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py 
b/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py new file mode 100644 index 0000000000000..2fcc93292c2ef --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py @@ -0,0 +1,223 @@ +import json +import logging +import os +from dataclasses import dataclass +from datetime import datetime, timezone +from functools import partial +from typing import Iterable, List, Optional, Set + +from pydantic import Field + +from datahub.configuration.source_common import ( + EnvConfigMixin, + PlatformInstanceConfigMixin, +) +from datahub.emitter.mce_builder import ( + make_dataset_urn_with_platform_instance, + make_user_urn, +) +from datahub.emitter.sql_parsing_builder import SqlParsingBuilder +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SupportStatus, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport +from datahub.ingestion.api.source_helpers import auto_workunit_reporter +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.graph.client import DataHubGraph +from datahub.ingestion.source.usage.usage_common import BaseUsageConfig +from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage + +logger = logging.getLogger(__name__) + + +class SqlQueriesSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin): + query_file: str = Field(description="Path to file to ingest") + + platform: str = Field( + description="The platform for which to generate data, e.g. snowflake" + ) + + usage: BaseUsageConfig = Field( + description="The usage config to use when generating usage statistics", + default=BaseUsageConfig(), + ) + + use_schema_resolver: bool = Field( + description="Read SchemaMetadata aspects from DataHub to aid in SQL parsing. 
Turn off only for testing.", + default=True, + hidden_from_docs=True, + ) + default_db: Optional[str] = Field( + description="The default database to use for unqualified table names", + default=None, + ) + default_schema: Optional[str] = Field( + description="The default schema to use for unqualified table names", + default=None, + ) + + +class SqlQueriesSourceReport(SourceReport): + num_queries_parsed: int = 0 + num_table_parse_failures: int = 0 + num_column_parse_failures: int = 0 + + def compute_stats(self) -> None: + super().compute_stats() + self.table_failure_rate = ( + f"{self.num_table_parse_failures / self.num_queries_parsed:.4f}" + if self.num_queries_parsed + else "0" + ) + self.column_failure_rate = ( + f"{self.num_column_parse_failures / self.num_queries_parsed:.4f}" + if self.num_queries_parsed + else "0" + ) + + +@platform_name("SQL Queries") +@config_class(SqlQueriesSourceConfig) +@support_status(SupportStatus.TESTING) +class SqlQueriesSource(Source): + # TODO: Documentation + urns: Optional[Set[str]] + schema_resolver: SchemaResolver + builder: SqlParsingBuilder + + def __init__(self, ctx: PipelineContext, config: SqlQueriesSourceConfig): + if not ctx.graph: + raise ValueError( + "SqlQueriesSource needs a datahub_api from which to pull schema metadata" + ) + + self.graph: DataHubGraph = ctx.graph + self.ctx = ctx + self.config = config + self.report = SqlQueriesSourceReport() + + self.builder = SqlParsingBuilder(usage_config=self.config.usage) + + if self.config.use_schema_resolver: + schema_resolver, urns = self.graph.initialize_schema_resolver_from_datahub( + platform=self.config.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + ) + self.schema_resolver = schema_resolver + self.urns = urns + else: + self.schema_resolver = self.graph._make_schema_resolver( + platform=self.config.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + ) + self.urns = None + + @classmethod + def create(cls, config_dict: dict, ctx: PipelineContext) -> "SqlQueriesSource": + config = SqlQueriesSourceConfig.parse_obj(config_dict) + return cls(ctx, config) + + def get_report(self) -> SqlQueriesSourceReport: + return self.report + + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: + return [partial(auto_workunit_reporter, self.get_report())] + + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + logger.info(f"Parsing queries from {os.path.basename(self.config.query_file)}") + with open(self.config.query_file) as f: + for line in f: + try: + query_dict = json.loads(line, strict=False) + entry = QueryEntry.create(query_dict, config=self.config) + yield from self._process_query(entry) + except Exception as e: + logger.warning("Error processing query", exc_info=True) + self.report.report_warning("process-query", str(e)) + + logger.info("Generating workunits") + yield from self.builder.gen_workunits() + + def _process_query(self, entry: "QueryEntry") -> Iterable[MetadataWorkUnit]: + self.report.num_queries_parsed += 1 + if self.report.num_queries_parsed % 1000 == 0: + logger.info(f"Parsed {self.report.num_queries_parsed} queries") + + result = sqlglot_lineage( + sql=entry.query, + schema_resolver=self.schema_resolver, + default_db=self.config.default_db, + default_schema=self.config.default_schema, + ) + if result.debug_info.table_error: + logger.info(f"Error parsing table lineage, {result.debug_info.table_error}") + self.report.num_table_parse_failures += 1 + for downstream_urn in 
set(entry.downstream_tables): + self.builder.add_lineage( + downstream_urn=downstream_urn, + upstream_urns=entry.upstream_tables, + timestamp=entry.timestamp, + user=entry.user, + ) + return + elif result.debug_info.column_error: + logger.debug( + f"Error parsing column lineage, {result.debug_info.column_error}" + ) + self.report.num_column_parse_failures += 1 + + yield from self.builder.process_sql_parsing_result( + result, + query=entry.query, + query_timestamp=entry.timestamp, + user=entry.user, + custom_operation_type=entry.operation_type, + include_urns=self.urns, + ) + + +@dataclass +class QueryEntry: + query: str + timestamp: Optional[datetime] + user: Optional[str] + operation_type: Optional[str] + downstream_tables: List[str] + upstream_tables: List[str] + + @classmethod + def create( + cls, entry_dict: dict, *, config: SqlQueriesSourceConfig + ) -> "QueryEntry": + return cls( + query=entry_dict["query"], + timestamp=datetime.fromtimestamp(entry_dict["timestamp"], tz=timezone.utc) + if "timestamp" in entry_dict + else None, + user=make_user_urn(entry_dict["user"]) if "user" in entry_dict else None, + operation_type=entry_dict.get("operation_type"), + downstream_tables=[ + make_dataset_urn_with_platform_instance( + name=table, + platform=config.platform, + platform_instance=config.platform_instance, + env=config.env, + ) + for table in entry_dict.get("downstream_tables", []) + ], + upstream_tables=[ + make_dataset_urn_with_platform_instance( + name=table, + platform=config.platform, + platform_instance=config.platform_instance, + env=config.env, + ) + for table in entry_dict.get("upstream_tables", []) + ], + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py index 6752bdf519830..ec0af37089b1d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py @@ -31,6 +31,7 @@ from tableauserverclient.server.endpoint.exceptions import NonXMLResponseError import datahub.emitter.mce_builder as builder +import datahub.utilities.sqlglot_lineage as sqlglot_l from datahub.configuration.common import ( AllowDenyPattern, ConfigModel, @@ -136,12 +137,7 @@ ViewPropertiesClass, ) from datahub.utilities import config_clean -from datahub.utilities.sqlglot_lineage import ( - ColumnLineageInfo, - SchemaResolver, - SqlParsingResult, - sqlglot_lineage, -) +from datahub.utilities.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult logger: logging.Logger = logging.getLogger(__name__) @@ -1585,42 +1581,14 @@ def parse_custom_sql( f"Overridden info upstream_db={upstream_db}, platform_instance={platform_instance}, platform={platform}" ) - parsed_result: Optional["SqlParsingResult"] = None - try: - schema_resolver = ( - self.ctx.graph._make_schema_resolver( - platform=platform, - platform_instance=platform_instance, - env=env, - ) - if self.ctx.graph is not None - else SchemaResolver( - platform=platform, - platform_instance=platform_instance, - env=env, - graph=None, - ) - ) - - if schema_resolver.graph is None: - logger.warning( - "Column Level Lineage extraction would not work as DataHub graph client is None." - ) - - parsed_result = sqlglot_lineage( - query, - schema_resolver=schema_resolver, - default_db=upstream_db, - ) - except Exception as e: - self.report.report_warning( - key="csql-lineage", - reason=f"Unable to retrieve lineage from query. 
" - f"Query: {query} " - f"Reason: {str(e)} ", - ) - - return parsed_result + return sqlglot_l.create_lineage_sql_parsed_result( + query=query, + database=upstream_db, + platform=platform, + platform_instance=platform_instance, + env=env, + graph=self.ctx.graph, + ) def _create_lineage_from_unsupported_csql( self, csql_urn: str, csql: dict diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py b/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py index d5da93c7be35e..49f56b46fb012 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py @@ -176,10 +176,8 @@ def _parse_query_via_lineage_runner(self, query: str) -> Optional[StringTableInf for table in runner.target_tables ], ) - except Exception: - logger.info( - f"Could not parse query via lineage runner, {query}", exc_info=True - ) + except Exception as e: + logger.info(f"Could not parse query via lineage runner, {query}: {e!r}") return None @staticmethod @@ -202,8 +200,8 @@ def _parse_query_via_spark_sql_plan(self, query: str) -> Optional[StringTableInf return GenericTableInfo( source_tables=[t for t in tables if t], target_tables=[] ) - except Exception: - logger.info(f"Could not parse query via spark plan, {query}", exc_info=True) + except Exception as e: + logger.info(f"Could not parse query via spark plan, {query}: {e!r}") return None @staticmethod diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index e5a9954802019..534cac5cef2aa 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -7,7 +7,6 @@ from collections import defaultdict from typing import Dict, List, Optional, Set, Tuple, Union -import pydantic import pydantic.dataclasses import sqlglot import sqlglot.errors @@ -23,7 +22,7 @@ from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.graph.client import DataHubGraph from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier -from datahub.metadata.schema_classes import SchemaMetadataClass +from datahub.metadata.schema_classes import OperationTypeClass, SchemaMetadataClass from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedDict from datahub.utilities.urns.dataset_urn import DatasetUrn @@ -34,6 +33,8 @@ # A lightweight table schema: column -> type mapping. SchemaInfo = Dict[str, str] +SQL_PARSE_RESULT_CACHE_SIZE = 1000 + class QueryType(enum.Enum): CREATE = "CREATE" @@ -45,6 +46,22 @@ class QueryType(enum.Enum): UNKNOWN = "UNKNOWN" + def to_operation_type(self) -> Optional[str]: + if self == QueryType.CREATE: + return OperationTypeClass.CREATE + elif self == QueryType.INSERT: + return OperationTypeClass.INSERT + elif self == QueryType.UPDATE: + return OperationTypeClass.UPDATE + elif self == QueryType.DELETE: + return OperationTypeClass.DELETE + elif self == QueryType.MERGE: + return OperationTypeClass.UPDATE + elif self == QueryType.SELECT: + return None + else: + return OperationTypeClass.UNKNOWN + def get_query_type_of_sql(expression: sqlglot.exp.Expression) -> QueryType: # UPGRADE: Once we use Python 3.10, replace this with a match expression. 
@@ -623,16 +640,21 @@ def _translate_internal_column_lineage( ) +def _get_dialect(platform: str) -> str: + # TODO: convert datahub platform names to sqlglot dialect + if platform == "presto-on-hive": + return "hive" + else: + return platform + + def _sqlglot_lineage_inner( sql: str, schema_resolver: SchemaResolver, default_db: Optional[str] = None, default_schema: Optional[str] = None, ) -> SqlParsingResult: - # TODO: convert datahub platform names to sqlglot dialect - # TODO: Pull the platform name from the schema resolver? - dialect = schema_resolver.platform - + dialect = _get_dialect(schema_resolver.platform) if dialect == "snowflake": # in snowflake, table identifiers must be uppercased to match sqlglot's behavior. if default_db: @@ -755,6 +777,7 @@ def _sqlglot_lineage_inner( ) +@functools.lru_cache(maxsize=SQL_PARSE_RESULT_CACHE_SIZE) def sqlglot_lineage( sql: str, schema_resolver: SchemaResolver, @@ -825,3 +848,43 @@ def sqlglot_lineage( table_error=e, ), ) + + +def create_lineage_sql_parsed_result( + query: str, + database: Optional[str], + platform: str, + platform_instance: Optional[str], + env: str, + schema: Optional[str] = None, + graph: Optional[DataHubGraph] = None, +) -> Optional["SqlParsingResult"]: + + parsed_result: Optional["SqlParsingResult"] = None + try: + schema_resolver = ( + graph._make_schema_resolver( + platform=platform, + platform_instance=platform_instance, + env=env, + ) + if graph is not None + else SchemaResolver( + platform=platform, + platform_instance=platform_instance, + env=env, + graph=None, + ) + ) + + parsed_result = sqlglot_lineage( + query, + schema_resolver=schema_resolver, + default_db=database, + default_schema=schema, + ) + except Exception as e: + logger.debug(f"Fail to prase query {query}", exc_info=e) + logger.warning("Fail to parse custom SQL") + + return parsed_result diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py index 3bda6c5cce84b..cc3ee1f6ceaa4 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py @@ -61,6 +61,7 @@ def test_bigquery_v2_ingest( "project_ids": ["project-id-1"], "include_usage_statistics": False, "include_table_lineage": False, + "include_data_platform_instance": True, } pipeline_config_dict: Dict[str, Any] = { diff --git a/metadata-ingestion/tests/integration/ldap/test_ldap.py b/metadata-ingestion/tests/integration/ldap/test_ldap.py index 148a3a6128013..3e76f13fc823d 100644 --- a/metadata-ingestion/tests/integration/ldap/test_ldap.py +++ b/metadata-ingestion/tests/integration/ldap/test_ldap.py @@ -100,3 +100,54 @@ def test_ldap_memberof_ingest(docker_compose_runner, pytestconfig, tmp_path, moc output_path=tmp_path / "ldap_memberof_mces.json", golden_path=test_resources_dir / "ldap_memberof_mces_golden.json", ) + + +@pytest.mark.integration +def test_ldap_ingest_with_email_as_username( + docker_compose_runner, pytestconfig, tmp_path, mock_time +): + test_resources_dir = pytestconfig.rootpath / "tests/integration/ldap" + + with docker_compose_runner( + test_resources_dir / "docker-compose.yml", "ldap" + ) as docker_services: + # The openldap container loads the sample data after exposing the port publicly. As such, + # we must wait a little bit extra to ensure that the sample data is loaded. 
+ wait_for_port(docker_services, "openldap", 389) + time.sleep(5) + + pipeline = Pipeline.create( + { + "run_id": "ldap-test", + "source": { + "type": "ldap", + "config": { + "ldap_server": "ldap://localhost", + "ldap_user": "cn=admin,dc=example,dc=org", + "ldap_password": "admin", + "base_dn": "dc=example,dc=org", + "user_attrs_map": {"email": "mail"}, + "group_attrs_map": { + "members": "memberUid", + "email": "mail", + }, + "use_email_as_username": True, + "custom_props_list": ["givenName"], + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/ldap_mces.json", + }, + }, + } + ) + pipeline.run() + pipeline.raise_from_status() + + mce_helpers.check_golden_file( + pytestconfig, + output_path=tmp_path / "ldap_mces.json", + golden_path=test_resources_dir / "ldap_mces_golden.json", + ) diff --git a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json index 6167c63e6c9b8..dee85b40bb7a8 100644 --- a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json @@ -262,8 +262,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" @@ -412,8 +412,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json index e66ec4bb89d8c..72db36e63daf7 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json @@ -206,8 +206,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json index 11e0760decae3..e5508bdb06b9e 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json @@ -206,8 +206,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,datahub-demo.view.faa_flights,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json index ddfd102cb15b0..91e13debfa028 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json @@ -279,8 +279,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": 
"urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" @@ -429,8 +429,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json index 54624986216b8..e93079119e4f4 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json @@ -206,8 +206,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json index 6cab0db8c33cf..a9c8efa7cdb98 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json @@ -206,32 +206,32 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_joined_view,PROD)", "type": "VIEW" }, { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_joined_view_original_name,PROD)", "type": "VIEW" }, { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view_has_no_fields,PROD)", "type": "VIEW" }, { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json index 9a088a7a8baef..edd15624a14cd 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json @@ -206,24 +206,24 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_joined_view_original_name,PROD)", "type": "VIEW" }, { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" }, { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + 
"actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view_has_no_fields,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json index f8e2565e492e1..aebc89b609a08 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json @@ -206,8 +206,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json index 32d4f7bc64ab4..34bded3cf691e 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json @@ -158,8 +158,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/expected_output.json b/metadata-ingestion/tests/integration/lookml/expected_output.json index cdf520cc23a30..b53d5857f1d66 100644 --- a/metadata-ingestion/tests/integration/lookml/expected_output.json +++ b/metadata-ingestion/tests/integration/lookml/expected_output.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)", "type": "VIEW" @@ -585,8 +585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)", "type": "VIEW" @@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 
1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..fragment_derived_view,PROD)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..order,PROD)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.ecommerce.ability,PROD)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD)", "type": "VIEW" @@ -1732,8 +1732,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1971,8 +1971,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json index 73edecbe62205..238f4c2580cdf 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.looker_schema.include_able,PROD)", "type": "VIEW" @@ -585,8 +585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.looker_schema.events,PROD)", "type": "VIEW" @@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.looker_schema.events,PROD)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { 
"auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.fragment_derived_view,PROD)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.order,PROD)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.ecommerce.ability,PROD)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.owners,PROD)", "type": "VIEW" @@ -1732,8 +1732,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1971,8 +1971,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json index 9aa6a952c40b4..45d5d839e9d21 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,looker_schema.include_able,PROD)", "type": "VIEW" @@ -585,8 +585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + 
"actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,looker_schema.events,PROD)", "type": "VIEW" @@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,looker_schema.events,PROD)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,looker_schema.include_able,PROD)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.fragment_derived_view,PROD)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.order,PROD)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,ecommerce.ability,PROD)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.owners,PROD)", "type": "VIEW" @@ -1732,8 +1732,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1971,8 +1971,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json index 6ce6d809ae8f5..187cedaefb6b2 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json @@ -450,8 +450,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -557,8 +557,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -664,8 +664,8 
@@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -816,8 +816,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -923,8 +923,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1123,8 +1123,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD)", "type": "VIEW" @@ -1230,8 +1230,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.ecommerce.ability,PROD)", "type": "VIEW" @@ -1416,8 +1416,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.owners,PROD)", "type": "VIEW" @@ -1615,8 +1615,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1854,8 +1854,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json index 1016d4e211458..a323118666940 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json index fc91c97a53003..c2c879e38f37b 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json @@ -21,8 +21,8 @@ 
"upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -585,8 +585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.ecommerce.ability,PROD)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.owners,PROD)", "type": "VIEW" @@ -1732,8 +1732,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1971,8 +1971,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": 
"urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json index 8635a570c0621..c1ac54b0fb588 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -128,8 +128,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -235,8 +235,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -387,8 +387,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -494,8 +494,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json index 19168aa323142..f602ca37b3160 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.looker_schema.include_able,DEV)", "type": "VIEW" @@ -585,8 +585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.looker_schema.events,DEV)", "type": "VIEW" 
@@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.looker_schema.events,DEV)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.autodetect_sql_name_based_on_view_name,DEV)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.looker_schema.include_able,DEV)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.fragment_derived_view,DEV)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.order,DEV)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.ecommerce.ability,DEV)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.owners,DEV)", "type": "VIEW" @@ -1732,8 +1732,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1971,8 +1971,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.flightstats.accidents,DEV)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json index d4ced76a7475d..104bd365669e3 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD)", "type": "VIEW" @@ -261,8 +261,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": 
"urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -480,8 +480,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -588,8 +588,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -696,8 +696,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.events,PROD)", "type": "VIEW" @@ -849,8 +849,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -957,8 +957,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1065,8 +1065,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD)", "type": "VIEW" @@ -1248,8 +1248,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD)", "type": "VIEW" @@ -1356,8 +1356,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.ecommerce.ability,PROD)", "type": "VIEW" @@ -1543,8 +1543,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.owners,PROD)", "type": "VIEW" @@ -1743,8 +1743,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -1983,8 +1983,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json b/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json index 2bae6452145df..37a6c94c6952e 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json +++ 
b/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.owners,DEV)", "type": "VIEW" @@ -459,8 +459,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,rs_warehouse.default_db.default_schema.my_table,DEV)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json b/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json index a5c316f365d4b..49831ee554ab1 100644 --- a/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json +++ b/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.book,PROD)", "type": "VIEW" @@ -303,8 +303,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.book,PROD)", "type": "VIEW" @@ -410,8 +410,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.order,PROD)", "type": "VIEW" @@ -607,8 +607,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.issue_history,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json b/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json index de303d50e7acd..dc5e1aa9096f8 100644 --- a/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json +++ b/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json @@ -21,8 +21,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..my_table,PROD)", "type": "VIEW" @@ -260,8 +260,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" @@ -478,8 +478,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)", "type": 
"VIEW" @@ -585,8 +585,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)", "type": "VIEW" @@ -692,8 +692,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)", "type": "VIEW" @@ -844,8 +844,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" @@ -951,8 +951,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)", "type": "VIEW" @@ -1058,8 +1058,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..fragment_derived_view,PROD)", "type": "VIEW" @@ -1240,8 +1240,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..order,PROD)", "type": "VIEW" @@ -1347,8 +1347,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.ecommerce.ability,PROD)", "type": "VIEW" @@ -1533,8 +1533,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD)", "type": "VIEW" @@ -1764,8 +1764,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view_explore,PROD)", "type": "VIEW" @@ -2003,8 +2003,8 @@ "upstreams": [ { "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.flightstats.accidents,PROD)", "type": "VIEW" diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 5c9553402a8c4..e77a12aa4088e 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -1,17 +1,22 @@ import logging import sys -from typing import List +from typing import List, Tuple import pytest from lark import Tree import datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes as powerbi_data_classes -from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport -from datahub.ingestion.source.powerbi.m_query import parser, tree_function -from datahub.ingestion.source.powerbi.m_query.resolver import ( - DataPlatformTable, - SupportedDataPlatform, +from datahub.ingestion.api.common import 
PipelineContext +from datahub.ingestion.source.powerbi.config import ( + PowerBiDashboardSourceConfig, + PowerBiDashboardSourceReport, +) +from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( + AbstractDataPlatformInstanceResolver, + create_dataplatform_instance_resolver, ) +from datahub.ingestion.source.powerbi.m_query import parser, tree_function +from datahub.ingestion.source.powerbi.m_query.resolver import DataPlatformTable M_QUERIES = [ 'let\n Source = Snowflake.Databases("bu10758.ap-unknown-2.fakecomputing.com","PBI_TEST_WAREHOUSE_PROD",[Role="PBI_TEST_MEMBER"]),\n PBI_TEST_Database = Source{[Name="PBI_TEST",Kind="Database"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name="TEST",Kind="Schema"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name="TESTTABLE",Kind="Table"]}[Data]\nin\n TESTTABLE_Table', @@ -38,9 +43,31 @@ 'let\n Source = AmazonRedshift.Database("redshift-url","dev"),\n public = Source{[Name="public"]}[Data],\n category1 = public{[Name="category"]}[Data]\nin\n category1', 'let\n Source = Value.NativeQuery(AmazonRedshift.Database("redshift-url","dev"), "select * from dev.public.category", null, [EnableFolding=true]) \n in Source', 'let\n Source = Databricks.Catalogs("adb-123.azuredatabricks.net", "/sql/1.0/endpoints/12345dc91aa25844", [Catalog=null, Database=null]),\n hive_metastore_Database = Source{[Name="hive_metastore",Kind="Database"]}[Data],\n sandbox_revenue_Schema = hive_metastore_Database{[Name="sandbox_revenue",Kind="Schema"]}[Data],\n public_consumer_price_index_Table = sandbox_revenue_Schema{[Name="public_consumer_price_index",Kind="Table"]}[Data],\n #"Renamed Columns" = Table.RenameColumns(public_consumer_price_index_Table,{{"Country", "country"}, {"Metric", "metric"}}),\n #"Inserted Year" = Table.AddColumn(#"Renamed Columns", "ID", each Date.Year([date_id]) + Date.Month([date_id]), Text.Type),\n #"Added Custom" = Table.AddColumn(#"Inserted Year", "Custom", each Text.Combine({Number.ToText(Date.Year([date_id])), Number.ToText(Date.Month([date_id])), [country]})),\n #"Removed Columns" = Table.RemoveColumns(#"Added Custom",{"ID"}),\n #"Renamed Columns1" = Table.RenameColumns(#"Removed Columns",{{"Custom", "ID"}}),\n #"Filtered Rows" = Table.SelectRows(#"Renamed Columns1", each ([metric] = "Consumer Price Index") and (not Number.IsNaN([value])))\nin\n #"Filtered Rows"', + "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu10758.ap-unknown-2.fakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select #(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS CLIENT_DIRECTOR,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS inner join OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT #(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'#(lf)AND TIER = 'Client Director'\", null, [EnableFolding=true])\nin\n Source", ] +def get_default_instances( + override_config: dict = {}, +) -> Tuple[ + PipelineContext, PowerBiDashboardSourceConfig, AbstractDataPlatformInstanceResolver +]: + config: PowerBiDashboardSourceConfig = PowerBiDashboardSourceConfig.parse_obj( + { + "tenant_id": "fake", + 
"client_id": "foo", + "client_secret": "bar", + **override_config, + } + ) + + platform_instance_resolver: AbstractDataPlatformInstanceResolver = ( + create_dataplatform_instance_resolver(config) + ) + + return PipelineContext(run_id="fake"), config, platform_instance_resolver + + @pytest.mark.integration def test_parse_m_query1(): expression: str = M_QUERIES[0] @@ -145,20 +172,20 @@ def test_snowflake_regular_case(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "TESTTABLE" - assert data_platform_tables[0].full_name == "PBI_TEST.TEST.TESTTABLE" assert ( - data_platform_tables[0].datasource_server - == "bu10758.ap-unknown-2.fakecomputing.com" - ) - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,pbi_test.test.testtable,PROD)" ) @@ -174,17 +201,21 @@ def test_postgres_regular_case(): ) reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "order_date" - assert data_platform_tables[0].full_name == "mics.public.order_date" - assert data_platform_tables[0].datasource_server == "localhost" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.POSTGRES_SQL.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:postgres,mics.public.order_date,PROD)" ) @@ -200,19 +231,21 @@ def test_databricks_regular_case(): ) reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "public_consumer_price_index" assert ( - data_platform_tables[0].full_name - == "hive_metastore.sandbox_revenue.public_consumer_price_index" - ) - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.DATABRICK_SQL.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:databricks,hive_metastore.sandbox_revenue.public_consumer_price_index,PROD)" ) @@ -228,17 +261,21 @@ def test_oracle_regular_case(): ) reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "EMPLOYEES" - assert data_platform_tables[0].full_name 
== "salesdb.HR.EMPLOYEES" - assert data_platform_tables[0].datasource_server == "localhost:1521" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.ORACLE.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:oracle,salesdb.hr.employees,PROD)" ) @@ -255,17 +292,20 @@ def test_mssql_regular_case(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "book_issue" - assert data_platform_tables[0].full_name == "library.dbo.book_issue" - assert data_platform_tables[0].datasource_server == "localhost" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.MS_SQL.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:mssql,library.dbo.book_issue,PROD)" ) @@ -280,14 +320,16 @@ def test_mssql_with_query(): M_QUERIES[11], ] expected_tables = [ - "COMMOPSDB.dbo.V_OIP_ENT_2022", - "COMMOPSDB.dbo.V_INVOICE_BOOKING_2022", - "COMMOPSDB.dbo.V_ARR_ADDS", - "COMMOPSDB.dbo.V_PS_CD_RETENTION", - "COMMOPSDB.dbo.V_TPV_LEADERBOARD", - "COMMOPSDB.dbo.V_ENTERPRISE_INVOICED_REVENUE", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_oip_ent_2022,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_invoice_booking_2022,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_arr_adds,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_ps_cd_retention,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_tpv_leaderboard,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_enterprise_invoiced_revenue,PROD)", ] + ctx, config, platform_instance_resolver = get_default_instances() + for index, query in enumerate(mssql_queries): table: powerbi_data_classes.Table = powerbi_data_classes.Table( columns=[], @@ -299,17 +341,15 @@ def test_mssql_with_query(): reporter = PowerBiDashboardSourceReport() data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == expected_tables[index].split(".")[2] - assert data_platform_tables[0].full_name == expected_tables[index] - assert data_platform_tables[0].datasource_server == "AUPRDWHDB" - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.MS_SQL.value.powerbi_data_platform_name - ) + assert data_platform_tables[0].urn == expected_tables[index] @pytest.mark.integration @@ -322,12 +362,14 @@ def test_snowflake_native_query(): ] expected_tables = [ - "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4", - "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS", - "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS", - "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_aps_sme_units_v4,PROD)", + 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_sme_unit_targets,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_sme_unit_targets,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_sme_unit_targets,PROD)", ] + ctx, config, platform_instance_resolver = get_default_instances() + for index, query in enumerate(snowflake_queries): table: powerbi_data_classes.Table = powerbi_data_classes.Table( columns=[], @@ -339,20 +381,15 @@ def test_snowflake_native_query(): reporter = PowerBiDashboardSourceReport() data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == expected_tables[index].split(".")[2] - assert data_platform_tables[0].full_name == expected_tables[index] - assert ( - data_platform_tables[0].datasource_server - == "bu10758.ap-unknown-2.fakecomputing.com" - ) - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name - ) + assert data_platform_tables[0].urn == expected_tables[index] def test_google_bigquery_1(): @@ -363,16 +400,20 @@ def test_google_bigquery_1(): ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) + assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name - assert data_platform_tables[0].datasource_server == "seraphic-music-344307" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:bigquery,seraphic-music-344307.school_dataset.first,PROD)" ) @@ -387,23 +428,24 @@ def test_google_bigquery_2(): ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( table, reporter, - native_query_enabled=False, parameters={ "Parameter - Source": "my-test-project", "My bq project": "gcp_billing", }, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name - assert data_platform_tables[0].datasource_server == "my-test-project" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-test-project.gcp_billing.gcp_table,PROD)" ) @@ -416,23 +458,24 @@ def test_for_each_expression_1(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( 
table, reporter, - native_query_enabled=False, parameters={ "Parameter - Source": "my-test-project", "My bq project": "gcp_billing", }, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].datasource_server == "my-test-project" - assert data_platform_tables[0].full_name == table.full_name assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-test-project.universal.d_wh_date,PROD)" ) @@ -445,22 +488,23 @@ def test_for_each_expression_2(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( table, reporter, - native_query_enabled=False, parameters={ "dwh-prod": "originally-not-a-variable-ref-and-not-resolved", }, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name - assert data_platform_tables[0].datasource_server == "dwh-prod" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:bigquery,dwh-prod.gcp_billing.d_gcp_custom_label,PROD)" ) @@ -476,8 +520,14 @@ def test_native_query_disabled(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + config.native_query_parsing = False data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 0 @@ -493,26 +543,25 @@ def test_multi_source_table(): ) reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 2 - assert data_platform_tables[0].full_name == "mics.public.order_date" - assert data_platform_tables[0].datasource_server == "localhost" - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.POSTGRES_SQL.value.powerbi_data_platform_name - ) - - assert data_platform_tables[1].full_name == "GSL_TEST_DB.PUBLIC.SALES_ANALYST_VIEW" assert ( - data_platform_tables[1].datasource_server - == "ghh48144.snowflakefakecomputing.com" + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:postgres,mics.public.order_date,PROD)" ) assert ( - data_platform_tables[1].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name + data_platform_tables[1].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_analyst_view,PROD)" ) @@ -521,36 +570,33 @@ def 
test_table_combine(): table: powerbi_data_classes.Table = powerbi_data_classes.Table( columns=[], measures=[], - expression=M_QUERIES[16], # 1st index has the native query + expression=M_QUERIES[16], name="virtual_order_table", full_name="OrderDataSet.virtual_order_table", ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 2 - assert data_platform_tables[0].full_name == "GSL_TEST_DB.PUBLIC.SALES_FORECAST" - assert ( - data_platform_tables[0].datasource_server - == "ghh48144.snowflakefakecomputing.com" - ) - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name - ) - assert data_platform_tables[1].full_name == "GSL_TEST_DB.PUBLIC.SALES_ANALYST" assert ( - data_platform_tables[1].datasource_server - == "ghh48144.snowflakefakecomputing.com" + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_forecast,PROD)" ) + assert ( - data_platform_tables[1].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name + data_platform_tables[1].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_analyst,PROD)" ) @@ -574,8 +620,14 @@ def test_expression_is_none(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 0 @@ -589,15 +641,20 @@ def test_redshift_regular_case(): ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) + assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.category,PROD)" ) @@ -609,13 +666,60 @@ def test_redshift_native_query(): ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + + config.native_query_parsing = True + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=True + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) + assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == 
SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.category,PROD)" + ) + + +def test_sqlglot_parser(): + table: powerbi_data_classes.Table = powerbi_data_classes.Table( + expression=M_QUERIES[24], + name="SALES_TARGET", + full_name="dev.public.sales", + ) + reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances( + override_config={ + "server_to_platform_instance": { + "bu10758.ap-unknown-2.fakecomputing.com": { + "platform_instance": "sales_deployment", + "env": "PROD", + } + }, + "native_query_parsing": True, + "enable_advance_lineage_sql_construct": True, + } + ) + + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + ) + + assert len(data_platform_tables) == 2 + assert ( + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,sales_deployment.operations_analytics.transformed_prod.v_sme_unit,PROD)" + ) + assert ( + data_platform_tables[1].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,sales_deployment.operations_analytics.transformed_prod.v_sme_unit_targets,PROD)" ) diff --git a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_file_without_extension.json b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_file_without_extension.json index d042c3fbb158b..63efc79941d82 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_file_without_extension.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_file_without_extension.json @@ -1,4 +1,20 @@ [ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/no_extension/small,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:file", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "file_without_extension.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/no_extension/small,DEV)", diff --git a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json index 8e4fcb80ff855..ceec764bfbc86 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json @@ -1,4 +1,20 @@ [ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/NPS.7.1.package_data_NPS.6.1_ARCN_Lakes_ChemistryData_v1_csv.csv,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:file", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 
1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/NPS.7.1.package_data_NPS.6.1_ARCN_Lakes_ChemistryData_v1_csv.csv,DEV)", @@ -2740,6 +2756,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:file", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro,DEV)", @@ -3277,6 +3309,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_csv.csv,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:file", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_csv.csv,DEV)", @@ -3852,6 +3900,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/countries_json.json,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:file", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/countries_json.json,DEV)", @@ -4178,6 +4242,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/food_parquet.parquet,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:file", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/food_parquet.parquet,DEV)", @@ -4571,6 +4651,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/small.csv,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:file", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/small.csv,DEV)", @@ -7590,6 +7686,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/wa_fn_usec_hr_employee_attrition_csv.csv,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:file", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:file,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:file,test-platform-instance.tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/NPS.7.1.package_data_NPS.6.1_ARCN_Lakes_ChemistryData_v1_csv.csv,DEV)", diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_file_without_extension.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_file_without_extension.json index 58b81065c190f..d50f00efacaa0 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_file_without_extension.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_file_without_extension.json @@ -1,4 +1,20 @@ [ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/no_extension/small,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "file_without_extension.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/no_extension/small,DEV)", diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json index 0c1d92ed58e3d..36d3ba1b3510d 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json @@ -1,4 +1,20 @@ [ +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/NPS.7.1.package_data_NPS.6.1_ARCN_Lakes_ChemistryData_v1_csv.csv,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/NPS.7.1.package_data_NPS.6.1_ARCN_Lakes_ChemistryData_v1_csv.csv,DEV)", @@ -945,6 +961,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro,DEV)", @@ -1110,6 +1142,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_csv.csv,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_csv.csv,DEV)", @@ -1319,6 +1367,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/countries_json.json,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/countries_json.json,DEV)", @@ -1482,6 +1546,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/food_parquet.parquet,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } 
+}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/food_parquet.parquet,DEV)", @@ -1647,6 +1727,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/small.csv,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/small.csv,DEV)", @@ -2282,6 +2378,22 @@ "runId": "multiple_files.json" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/wa_fn_usec_hr_employee_attrition_csv.csv,DEV)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:s3", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "multiple_files.json" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.my-test-bucket/folder_a/folder_aa/folder_aaa/wa_fn_usec_hr_employee_attrition_csv.csv,DEV)", diff --git a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py index 43f5e04fbc89f..81e307a78ae9e 100644 --- a/metadata-ingestion/tests/integration/snowflake/common.py +++ b/metadata-ingestion/tests/integration/snowflake/common.py @@ -434,11 +434,6 @@ def default_query_results( # noqa: C901 } for op_idx in range(1, num_ops + 1) ] - elif query == snowflake_query.SnowflakeQuery.external_table_lineage_history( - 1654473600000, - 1654586220000, - ): - return [] elif query in [ snowflake_query.SnowflakeQuery.view_dependencies(), ]: @@ -509,10 +504,6 @@ def default_query_results( # noqa: C901 } ] elif query in [ - snowflake_query.SnowflakeQuery.external_table_lineage_history( - 1654473600000, - 1654586220000, - ), snowflake_query.SnowflakeQuery.view_dependencies_v2(), snowflake_query.SnowflakeQuery.view_dependencies(), snowflake_query.SnowflakeQuery.show_external_tables(), diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py index 53b2bcb236cd9..6135b0b3b3274 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py @@ -121,7 +121,6 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): include_table_lineage=True, include_view_lineage=True, include_usage_stats=True, - use_legacy_lineage_method=False, validate_upstreams_against_patterns=False, include_operational_stats=True, email_as_user_identifier=True, @@ -213,7 +212,6 @@ def test_snowflake_private_link(pytestconfig, tmp_path, mock_time, mock_datahub_ include_column_lineage=False, include_views=False, include_view_lineage=False, - use_legacy_lineage_method=False, include_usage_stats=False, 
include_operational_stats=False, start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py index 73a261bb3cb6e..4963e71ae4d96 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py @@ -55,7 +55,6 @@ def snowflake_pipeline_config(tmp_path): schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), include_view_lineage=False, include_usage_stats=False, - use_legacy_lineage_method=False, start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( tzinfo=timezone.utc ), diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py deleted file mode 100644 index a5993793e574d..0000000000000 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py +++ /dev/null @@ -1,291 +0,0 @@ -from datetime import datetime, timezone -from typing import cast -from unittest import mock - -from freezegun import freeze_time -from pytest import fixture - -from datahub.configuration.common import AllowDenyPattern, DynamicTypedConfig -from datahub.ingestion.run.pipeline import Pipeline -from datahub.ingestion.run.pipeline_config import PipelineConfig, SourceConfig -from datahub.ingestion.source.snowflake import snowflake_query -from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config -from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery -from tests.integration.snowflake.common import ( - FROZEN_TIME, - NUM_TABLES, - default_query_results, -) - - -def query_permission_error_override(fn, override_for_query, error_msg): - def my_function(query): - if query in override_for_query: - raise Exception(error_msg) - else: - return fn(query) - - return my_function - - -def query_permission_response_override(fn, override_for_query, response): - def my_function(query): - if query in override_for_query: - return response - else: - return fn(query) - - return my_function - - -@fixture(scope="function") -def snowflake_pipeline_legacy_lineage_config(tmp_path): - output_file = tmp_path / "snowflake_test_events_permission_error.json" - config = PipelineConfig( - source=SourceConfig( - type="snowflake", - config=SnowflakeV2Config( - account_id="ABC12345.ap-south-1.aws", - username="TST_USR", - password="TST_PWD", - role="TEST_ROLE", - warehouse="TEST_WAREHOUSE", - include_technical_schema=True, - match_fully_qualified_names=True, - schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), - include_view_lineage=False, - include_usage_stats=False, - use_legacy_lineage_method=True, - start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace(tzinfo=timezone.utc), - ), - ), - sink=DynamicTypedConfig(type="file", config={"filename": str(output_file)}), - ) - return config - - -@freeze_time(FROZEN_TIME) -def test_snowflake_missing_role_access_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - # Snowflake connection fails role not granted error - mock_connect.side_effect = Exception( - "250001 (08001): Failed to connect to DB: abc12345.ap-south-1.snowflakecomputing.com:443. 
Role 'TEST_ROLE' specified in the connect string is not granted to this user. Contact your local system administrator, or attempt to login with another role, e.g. PUBLIC" - ) - - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_missing_warehouse_access_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Current warehouse query leads to blank result - sf_cursor.execute.side_effect = query_permission_response_override( - default_query_results, - [SnowflakeQuery.current_warehouse()], - [(None,)], - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_no_databases_with_access_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in listing databases - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [SnowflakeQuery.get_databases("TEST_DB")], - "Database 'TEST_DB' does not exist or not authorized.", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_no_tables_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in listing databases - no_tables_fn = query_permission_response_override( - default_query_results, - [SnowflakeQuery.tables_for_schema("TEST_SCHEMA", "TEST_DB")], - [], - ) - sf_cursor.execute.side_effect = query_permission_response_override( - no_tables_fn, - [SnowflakeQuery.show_views_for_schema("TEST_SCHEMA", "TEST_DB")], - [], - ) - - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_list_columns_error_causes_pipeline_warning( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in listing columns - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [ - SnowflakeQuery.columns_for_table( - "TABLE_{}".format(tbl_idx), "TEST_SCHEMA", "TEST_DB" - ) - for tbl_idx in range(1, NUM_TABLES + 1) - ], - "Database 'TEST_DB' does not exist or not authorized.", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - 
pipeline.raise_from_status() # pipeline should not fail - assert ( - "Failed to get columns for table" - in pipeline.source.get_report().warnings.keys() - ) - - -@freeze_time(FROZEN_TIME) -def test_snowflake_list_primary_keys_error_causes_pipeline_warning( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in listing keys leads to warning - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [SnowflakeQuery.show_primary_keys_for_schema("TEST_SCHEMA", "TEST_DB")], - "Insufficient privileges to operate on TEST_DB", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - pipeline.raise_from_status() # pipeline should not fail - assert ( - "Failed to get primary key for table" - in pipeline.source.get_report().warnings.keys() - ) - - -@freeze_time(FROZEN_TIME) -def test_snowflake_missing_snowflake_lineage_permission_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in getting lineage - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [ - snowflake_query.SnowflakeQuery.table_to_table_lineage_history( - 1654473600000, 1654586220000, True - ), - ], - "Database 'SNOWFLAKE' does not exist or not authorized.", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert ( - "lineage-permission-error" in pipeline.source.get_report().failures.keys() - ) - - -@freeze_time(FROZEN_TIME) -def test_snowflake_missing_snowflake_operations_permission_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in getting access history date range - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [snowflake_query.SnowflakeQuery.get_access_history_date_range()], - "Database 'SNOWFLAKE' does not exist or not authorized.", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "usage-permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_unexpected_snowflake_view_lineage_error_causes_pipeline_warning( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in getting view lineage - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [snowflake_query.SnowflakeQuery.view_dependencies()], - "Unexpected Error", - ) - - snowflake_pipeline_config1 = snowflake_pipeline_legacy_lineage_config.copy() - cast( - SnowflakeV2Config, - cast(PipelineConfig, 
snowflake_pipeline_config1).source.config, - ).include_view_lineage = True - pipeline = Pipeline(snowflake_pipeline_config1) - pipeline.run() - pipeline.raise_from_status() # pipeline should not fail - assert "view-upstream-lineage" in pipeline.source.get_report().warnings.keys() diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_legacy_lineage.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_legacy_lineage.py deleted file mode 100644 index 59da7ddf695d8..0000000000000 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_legacy_lineage.py +++ /dev/null @@ -1,207 +0,0 @@ -import random -from datetime import datetime, timezone -from unittest import mock - -import pandas as pd -import pytest -from freezegun import freeze_time - -from datahub.configuration.common import AllowDenyPattern, DynamicTypedConfig -from datahub.ingestion.glossary.classifier import ( - ClassificationConfig, - DynamicTypedClassifierConfig, -) -from datahub.ingestion.glossary.datahub_classifier import ( - DataHubClassifierConfig, - InfoTypeConfig, - PredictionFactorsAndWeights, - ValuesFactorConfig, -) -from datahub.ingestion.run.pipeline import Pipeline -from datahub.ingestion.run.pipeline_config import PipelineConfig, SourceConfig -from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig -from datahub.ingestion.source.snowflake.snowflake_config import ( - SnowflakeV2Config, - TagOption, -) -from tests.integration.snowflake.common import FROZEN_TIME, default_query_results -from tests.integration.snowflake.test_snowflake import random_cloud_region, random_email -from tests.test_helpers import mce_helpers - - -@pytest.mark.integration -def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): - test_resources_dir = pytestconfig.rootpath / "tests/integration/snowflake" - - # Run the metadata ingestion pipeline. 
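
The two deleted modules here (test_snowflake_failures_legacy_lineage.py and test_snowflake_legacy_lineage.py) exercised the Snowflake source with use_legacy_lineage_method=True; the surviving tests drop that flag entirely, as the earlier edits to test_snowflake.py and test_snowflake_failures.py show. Below is a minimal illustrative sketch, not a hunk of this patch, of what a post-change pipeline config looks like once the flag is omitted. It reuses the placeholder credentials from these fixtures; the sink filename is invented for illustration.

    # Post-change Snowflake pipeline config: same fields as the surviving test
    # fixtures, minus the removed legacy-lineage flag.
    from datetime import datetime, timezone

    from datahub.configuration.common import AllowDenyPattern, DynamicTypedConfig
    from datahub.ingestion.run.pipeline_config import PipelineConfig, SourceConfig
    from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config

    config = PipelineConfig(
        source=SourceConfig(
            type="snowflake",
            config=SnowflakeV2Config(
                account_id="ABC12345.ap-south-1.aws",
                username="TST_USR",
                password="TST_PWD",
                role="TEST_ROLE",
                warehouse="TEST_WAREHOUSE",
                include_technical_schema=True,
                match_fully_qualified_names=True,
                schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]),
                include_view_lineage=False,
                include_usage_stats=False,
                # use_legacy_lineage_method is removed by this patch; omit it entirely
                start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace(tzinfo=timezone.utc),
                end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace(tzinfo=timezone.utc),
            ),
        ),
        sink=DynamicTypedConfig(
            type="file", config={"filename": "snowflake_test_events.json"}
        ),
    )

With the flag gone, the only lineage code path left is the one the non-legacy tests already cover, which is why the legacy variants of these tests are deleted rather than updated.
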
- output_file = tmp_path / "snowflake_test_events.json" - golden_file = test_resources_dir / "snowflake_golden.json" - - with mock.patch("snowflake.connector.connect") as mock_connect, mock.patch( - "datahub.ingestion.source.snowflake.snowflake_v2.SnowflakeV2Source.get_sample_values_for_table" - ) as mock_sample_values: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - sf_cursor.execute.side_effect = default_query_results - - mock_sample_values.return_value = pd.DataFrame( - data={ - "col_1": [random.randint(1, 80) for i in range(20)], - "col_2": [random_email() for i in range(20)], - "col_3": [random_cloud_region() for i in range(20)], - } - ) - - datahub_classifier_config = DataHubClassifierConfig( - minimum_values_threshold=10, - confidence_level_threshold=0.58, - info_types_config={ - "Age": InfoTypeConfig( - Prediction_Factors_and_Weights=PredictionFactorsAndWeights( - Name=0, Values=1, Description=0, Datatype=0 - ) - ), - "CloudRegion": InfoTypeConfig( - Prediction_Factors_and_Weights=PredictionFactorsAndWeights( - Name=0, - Description=0, - Datatype=0, - Values=1, - ), - Values=ValuesFactorConfig( - prediction_type="regex", - regex=[ - r"(af|ap|ca|eu|me|sa|us)-(central|north|(north(?:east|west))|south|south(?:east|west)|east|west)-\d+" - ], - ), - ), - }, - ) - - pipeline = Pipeline( - config=PipelineConfig( - source=SourceConfig( - type="snowflake", - config=SnowflakeV2Config( - account_id="ABC12345.ap-south-1.aws", - username="TST_USR", - password="TST_PWD", - match_fully_qualified_names=True, - schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), - include_technical_schema=True, - include_table_lineage=True, - include_view_lineage=True, - include_usage_stats=True, - use_legacy_lineage_method=True, - validate_upstreams_against_patterns=False, - include_operational_stats=True, - start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - classification=ClassificationConfig( - enabled=True, - classifiers=[ - DynamicTypedClassifierConfig( - type="datahub", config=datahub_classifier_config - ) - ], - ), - profiling=GEProfilingConfig( - enabled=True, - profile_if_updated_since_days=None, - profile_table_row_limit=None, - profile_table_size_limit=None, - profile_table_level_only=True, - ), - extract_tags=TagOption.without_lineage, - ), - ), - sink=DynamicTypedConfig( - type="file", config={"filename": str(output_file)} - ), - ) - ) - pipeline.run() - pipeline.pretty_print_summary() - pipeline.raise_from_status() - - # Verify the output. - - mce_helpers.check_golden_file( - pytestconfig, - output_path=output_file, - golden_path=golden_file, - ignore_paths=[ - r"root\[\d+\]\['aspect'\]\['json'\]\['timestampMillis'\]", - r"root\[\d+\]\['aspect'\]\['json'\]\['created'\]", - r"root\[\d+\]\['aspect'\]\['json'\]\['lastModified'\]", - r"root\[\d+\]\['aspect'\]\['json'\]\['fields'\]\[\d+\]\['glossaryTerms'\]\['auditStamp'\]\['time'\]", - r"root\[\d+\]\['systemMetadata'\]", - ], - ) - - -@freeze_time(FROZEN_TIME) -@pytest.mark.integration -def test_snowflake_private_link(pytestconfig, tmp_path, mock_time, mock_datahub_graph): - test_resources_dir = pytestconfig.rootpath / "tests/integration/snowflake" - - # Run the metadata ingestion pipeline. 
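
Both the deleted tests and the ones that remain drive the source against a mocked connector rather than a live warehouse: snowflake.connector.connect is patched and the cursor's execute call is answered by default_query_results from common.py. A condensed sketch of that pattern follows; the helper name is invented for illustration.

    # Condensed illustration of the mocking pattern used throughout these
    # Snowflake tests: patch the connector, answer every issued query from the
    # canned fixtures, then run the pipeline against the supplied config.
    from unittest import mock

    from datahub.ingestion.run.pipeline import Pipeline
    from tests.integration.snowflake.common import default_query_results


    def run_pipeline_with_mocked_snowflake(pipeline_config):
        with mock.patch("snowflake.connector.connect") as mock_connect:
            sf_connection = mock.MagicMock()
            sf_cursor = mock.MagicMock()
            mock_connect.return_value = sf_connection
            sf_connection.cursor.return_value = sf_cursor

            # Every query the source issues is resolved by the fixture responder.
            sf_cursor.execute.side_effect = default_query_results

            pipeline = Pipeline(pipeline_config)
            pipeline.run()
            # The permission-failure tests skip raise_from_status and instead
            # assert on pipeline.source.get_report().failures / .warnings.
            pipeline.raise_from_status()
            return pipeline

With a config like the sketch above this is essentially what the remaining failure-path tests do before asserting on the report, so removing the legacy fixture does not change the mocking strategy.
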
- output_file = tmp_path / "snowflake_privatelink_test_events.json" - golden_file = test_resources_dir / "snowflake_privatelink_golden.json" - - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - sf_cursor.execute.side_effect = default_query_results - - pipeline = Pipeline( - config=PipelineConfig( - source=SourceConfig( - type="snowflake", - config=SnowflakeV2Config( - account_id="ABC12345.ap-south-1.privatelink", - username="TST_USR", - password="TST_PWD", - schema_pattern=AllowDenyPattern(allow=["test_schema"]), - include_technical_schema=True, - include_table_lineage=True, - include_column_lineage=False, - include_views=False, - include_view_lineage=False, - use_legacy_lineage_method=True, - include_usage_stats=False, - include_operational_stats=False, - start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - ), - ), - sink=DynamicTypedConfig( - type="file", config={"filename": str(output_file)} - ), - ) - ) - pipeline.run() - pipeline.pretty_print_summary() - pipeline.raise_from_status() - - # Verify the output. - - mce_helpers.check_golden_file( - pytestconfig, - output_path=output_file, - golden_path=golden_file, - ignore_paths=[], - ) diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json index be4ae9e047aea..67a563baa561c 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json @@ -66,6 +66,70 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "job_id": "1df94c0f-15fd-4b68-8ca3-6053a0332362", + "job_name": "Weekly Demo Data Backup", + "description": "No description available.", + "date_created": "2023-03-10 16:27:54.970000", + "date_modified": "2023-03-10 16:27:55.097000", + "step_id": "1", + "step_name": "Set database to read only", + "subsystem": "TSQL", + "command": "ALTER DATABASE DemoData SET READ_ONLY" + }, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup.localhost.Weekly Demo Data Backup", + "type": { + "string": "MSSQL_JOB_STEP" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, 
+ "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", @@ -1740,6 +1804,68 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.demodata.Foo.stored_procedures" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "procedure_depends_on": "{}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n", + "input parameters": "['@ID']", + "parameter @ID": "{'type': 'int'}", + "date_created": "2023-03-10 16:27:54.907000", + "date_modified": "2023-03-10 16:27:54.907000" + }, + "externalUrl": "", + "name": "demodata.Foo.DBs", + "type": { + "string": "MSSQL_STORED_PROCEDURE" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", @@ -3985,6 +4111,66 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:c6627af82d44de89492e1a9315ae9f4b", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json 
b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json index bc81ce9633432..ef6033dd91943 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json @@ -66,6 +66,70 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "job_id": "1df94c0f-15fd-4b68-8ca3-6053a0332362", + "job_name": "Weekly Demo Data Backup", + "description": "No description available.", + "date_created": "2023-03-10 16:27:54.970000", + "date_modified": "2023-03-10 16:27:55.097000", + "step_id": "1", + "step_name": "Set database to read only", + "subsystem": "TSQL", + "command": "ALTER DATABASE DemoData SET READ_ONLY" + }, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup.localhost.Weekly Demo Data Backup", + "type": { + "string": "MSSQL_JOB_STEP" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", @@ -1740,6 +1804,68 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.demodata.Foo.stored_procedures" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "procedure_depends_on": "{}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n", + "input parameters": "['@ID']", + "parameter @ID": "{'type': 'int'}", + "date_created": "2023-03-10 16:27:54.907000", + "date_modified": "2023-03-10 16:27:54.907000" + }, + "externalUrl": "", + "name": "demodata.Foo.DBs", + "type": { + "string": "MSSQL_STORED_PROCEDURE" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + 
"changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", @@ -2053,6 +2179,66 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:3f157d8292fb473142f19e2250af537f", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json index 8be2fe134dca1..8098accebb424 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json @@ -66,6 +66,70 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "job_id": "1df94c0f-15fd-4b68-8ca3-6053a0332362", + "job_name": "Weekly Demo Data Backup", + "description": "No description available.", + "date_created": "2023-03-10 16:27:54.970000", + "date_modified": "2023-03-10 16:27:55.097000", + "step_id": "1", + "step_name": "Set database to read only", + "subsystem": "TSQL", + "command": "ALTER DATABASE DemoData SET READ_ONLY" + }, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup.localhost.Weekly Demo Data Backup", + "type": { + "string": "MSSQL_JOB_STEP" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + 
"runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", @@ -1740,6 +1804,68 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.demodata.Foo.stored_procedures" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "procedure_depends_on": "{}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n", + "input parameters": "['@ID']", + "parameter @ID": "{'type': 'int'}", + "date_created": "2023-03-10 16:27:54.907000", + "date_modified": "2023-03-10 16:27:54.907000" + }, + "externalUrl": "", + "name": "demodata.Foo.DBs", + "type": { + "string": "MSSQL_STORED_PROCEDURE" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoDataAlias.Foo.SalesReason,PROD)", @@ -2053,6 +2179,66 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { 
+ "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:3f157d8292fb473142f19e2250af537f", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json index ba2ab7330fded..d32002fb5648c 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json @@ -81,6 +81,70 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "job_id": "b6a0c1e2-f90a-4c86-a226-bf7ca59ad79f", + "job_name": "Weekly Demo Data Backup", + "description": "No description available.", + "date_created": "2023-08-06 21:01:05.157000", + "date_modified": "2023-08-06 21:01:05.283000", + "step_id": "1", + "step_name": "Set database to read only", + "subsystem": "TSQL", + "command": "ALTER DATABASE DemoData SET READ_ONLY" + }, + "externalUrl": "", + "name": "localhost.Weekly Demo Data Backup.localhost.Weekly Demo Data Backup", + "type": { + "string": "MSSQL_JOB_STEP" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:f1b4c0e379c4b2e2e09a8ecd6c1b6dec", @@ -1764,6 +1828,68 @@ "runId": "mssql-test" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "", + "name": "localhost.demodata.Foo.stored_procedures" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "procedure_depends_on": "{}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n", + "input parameters": "['@ID']", + "parameter @ID": "{'type': 'int'}", + "date_created": "2023-08-06 21:01:05.093000", + "date_modified": "2023-08-06 21:01:05.093000" + }, + "externalUrl": "", + "name": "demodata.Foo.DBs", + "type": { + 
"string": "MSSQL_STORED_PROCEDURE" + } + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:a6bea84fba7b05fb5d12630c8e6306ac", @@ -2072,5 +2198,65 @@ "lastObserved": 1615443388097, "runId": "mssql-test" } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/setup/setup.sql b/metadata-ingestion/tests/integration/sql_server/setup/setup.sql index 612de3eb1583c..2ff46e249007a 100644 --- a/metadata-ingestion/tests/integration/sql_server/setup/setup.sql +++ b/metadata-ingestion/tests/integration/sql_server/setup/setup.sql @@ -44,6 +44,10 @@ CREATE TABLE Foo.SalesReason ) ; GO +CREATE PROCEDURE Foo.DBs @ID INT +AS + SELECT @ID AS ThatDB; +GO GO EXEC sys.sp_addextendedproperty @@ -59,5 +63,31 @@ EXEC sys.sp_addextendedproperty @value = N'Description for column LastName of table Persons of schema Foo.', @level0type = N'SCHEMA', @level0name = 'Foo', @level1type = N'TABLE', @level1name = 'Persons', -@level2type = N'COLUMN',@level2name = 'LastName'; -GO \ No newline at end of file +@level2type = N'COLUMN',@level2name = 'LastName'; +GO +USE msdb ; +GO +EXEC dbo.sp_add_job + @job_name = N'Weekly Demo Data Backup' ; +GO +EXEC sp_add_jobstep + @job_name = N'Weekly Demo Data Backup', + @step_name = N'Set database to read only', + @database_name = N'DemoData', + @subsystem = N'TSQL', + @command = N'ALTER DATABASE DemoData SET READ_ONLY', + @retry_attempts = 5, + @retry_interval = 5 ; +GO +EXEC dbo.sp_add_schedule + @schedule_name = N'RunOnce', + @freq_type = 1, + @active_start_time = 233000 ; +GO +EXEC sp_attach_schedule + @job_name = N'Weekly Demo Data Backup', + @schedule_name = N'RunOnce'; +GO +EXEC dbo.sp_add_jobserver + @job_name = N'Weekly Demo Data Backup' +GO diff --git 
a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py index 3e7b75edd4878..099690fed34c2 100644 --- a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py +++ b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py @@ -50,4 +50,9 @@ def test_mssql_ingest(mssql_runner, pytestconfig, tmp_path, mock_time, config_fi output_path=tmp_path / "mssql_mces.json", golden_path=test_resources_dir / f"golden_files/golden_mces_{config_file.replace('yml','json')}", + ignore_paths=[ + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['job_id'\]", + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['date_created'\]", + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['date_modified'\]", + ], ) diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index d04c8d905b439..71428a7847953 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -791,11 +791,9 @@ def test_tableau_unsupported_csql(mock_datahub_graph): database_override_map={"production database": "prod"} ) - with mock.patch( - "datahub.ingestion.source.tableau.sqlglot_lineage" - ) as sqlglot_lineage: + with mock.patch("datahub.ingestion.source.tableau.sqlglot_l") as sqlglot_lineage: - sqlglot_lineage.return_value = SqlParsingResult( # type:ignore + sqlglot_lineage.create_lineage_sql_parsed_result.return_value = SqlParsingResult( # type:ignore in_tables=[ "urn:li:dataset:(urn:li:dataPlatform:bigquery,my_bigquery_project.invent_dw.userdetail,PROD)" ], diff --git a/metadata-ingestion/tests/integration/vertica/docker-compose.yml b/metadata-ingestion/tests/integration/vertica/docker-compose.yml index ddaf206f236cf..84af5c32a60e3 100644 --- a/metadata-ingestion/tests/integration/vertica/docker-compose.yml +++ b/metadata-ingestion/tests/integration/vertica/docker-compose.yml @@ -1,6 +1,7 @@ version: "3.9" services: vertica: + platform: linux/amd64 environment: APP_DB_USER: "dbadmin" APP_DB_PASSWORD: "abc123" @@ -18,6 +19,3 @@ services: volumes: vertica-data: - - - diff --git a/metadata-ingestion/tests/integration/vertica/test_vertica.py b/metadata-ingestion/tests/integration/vertica/test_vertica.py index db8bfd247313b..fe306d1d0b2b8 100644 --- a/metadata-ingestion/tests/integration/vertica/test_vertica.py +++ b/metadata-ingestion/tests/integration/vertica/test_vertica.py @@ -58,6 +58,7 @@ def vertica_runner(docker_compose_runner, test_resources_dir): # Test needs more work to be done , currently it is working fine. 
@freeze_time(FROZEN_TIME) +@pytest.mark.skip("Failing in CI, cmd failing with exit code 1") @pytest.mark.integration def test_vertica_ingest_with_db(vertica_runner, pytestconfig, tmp_path): test_resources_dir = pytestconfig.rootpath / "tests/integration/vertica" diff --git a/metadata-ingestion/tests/unit/test_bigquery_profiler.py b/metadata-ingestion/tests/unit/test_bigquery_profiler.py index a2aec8df93d09..44ce5f0a02e37 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_profiler.py +++ b/metadata-ingestion/tests/unit/test_bigquery_profiler.py @@ -37,6 +37,7 @@ def test_generate_day_partitioned_partition_profiler_query(): ordinal_position=1, data_type="TIMESTAMP", is_partition_column=True, + cluster_column_position=None, comment=None, is_nullable=False, ) @@ -79,6 +80,7 @@ def test_generate_day_partitioned_partition_profiler_query_with_set_partition_ti ordinal_position=1, data_type="TIMESTAMP", is_partition_column=True, + cluster_column_position=None, comment=None, is_nullable=False, ) @@ -120,6 +122,7 @@ def test_generate_hour_partitioned_partition_profiler_query(): ordinal_position=1, data_type="TIMESTAMP", is_partition_column=True, + cluster_column_position=None, comment=None, is_nullable=False, ) diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py index fc8ca166b105a..47418d9a989bb 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_source.py +++ b/metadata-ingestion/tests/unit/test_bigquery_source.py @@ -138,13 +138,12 @@ def test_get_dataplatform_instance_aspect_returns_project_id(): f"urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,{project_id})" ) - config = BigQueryV2Config.parse_obj({}) + config = BigQueryV2Config.parse_obj({"include_data_platform_instance": True}) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) data_platform_instance = source.get_dataplatform_instance_aspect( "urn:li:test", project_id ) - metadata = data_platform_instance.get_metadata()["metadata"] assert data_platform_instance is not None @@ -152,6 +151,20 @@ def test_get_dataplatform_instance_aspect_returns_project_id(): assert metadata.aspect.instance == expected_instance +def test_get_dataplatform_instance_default_no_instance(): + config = BigQueryV2Config.parse_obj({}) + source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) + + data_platform_instance = source.get_dataplatform_instance_aspect( + "urn:li:test", "project_id" + ) + metadata = data_platform_instance.get_metadata()["metadata"] + + assert data_platform_instance is not None + assert metadata.aspectName == "dataPlatformInstance" + assert metadata.aspect.instance is None + + @patch("google.cloud.bigquery.client.Client") def test_get_projects_with_single_project_id(client_mock): config = BigQueryV2Config.parse_obj({"project_id": "test-3"}) diff --git a/metadata-ingestion/tests/unit/test_confluent_schema_registry.py b/metadata-ingestion/tests/unit/test_confluent_schema_registry.py index a71e07b68d898..b047cd16c52a9 100644 --- a/metadata-ingestion/tests/unit/test_confluent_schema_registry.py +++ b/metadata-ingestion/tests/unit/test_confluent_schema_registry.py @@ -4,6 +4,7 @@ from confluent_kafka.schema_registry.schema_registry_client import ( RegisteredSchema, Schema, + SchemaReference, ) from datahub.ingestion.source.confluent_schema_registry import ConfluentSchemaRegistry @@ -90,7 +91,9 @@ def new_get_latest_version(subject_name: str) -> RegisteredSchema: schema_str=schema_str_orig, schema_type="AVRO", 
references=[ - dict(name="TestTopic1", subject="schema_subject_1", version=1) + SchemaReference( + name="TestTopic1", subject="schema_subject_1", version=1 + ) ], ) ) @@ -109,7 +112,9 @@ def new_get_latest_version(subject_name: str) -> RegisteredSchema: schema_str=schema_str_orig, schema_type="AVRO", references=[ - dict(name="schema_subject_1", subject="TestTopic1", version=1) + SchemaReference( + name="schema_subject_1", subject="TestTopic1", version=1 + ) ], ) ) diff --git a/metadata-ingestion/tests/unit/test_snowflake_shares.py b/metadata-ingestion/tests/unit/test_snowflake_shares.py new file mode 100644 index 0000000000000..7de86139baf39 --- /dev/null +++ b/metadata-ingestion/tests/unit/test_snowflake_shares.py @@ -0,0 +1,348 @@ +from typing import List + +import pytest + +from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.source.snowflake.snowflake_config import ( + DatabaseId, + SnowflakeShareConfig, + SnowflakeV2Config, +) +from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report +from datahub.ingestion.source.snowflake.snowflake_schema import ( + SnowflakeDatabase, + SnowflakeSchema, +) +from datahub.ingestion.source.snowflake.snowflake_shares import SnowflakeSharesHandler +from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings +from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage +from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeProposal + + +@pytest.fixture(scope="module") +def snowflake_databases() -> List[SnowflakeDatabase]: + return [ + SnowflakeDatabase( + name="db1", + created=None, + comment=None, + last_altered=None, + schemas=[ + SnowflakeSchema( + name="schema11", + created=None, + comment=None, + last_altered=None, + tables=["table111", "table112"], + views=["view111"], + ), + SnowflakeSchema( + name="schema12", + created=None, + comment=None, + last_altered=None, + tables=["table121", "table122"], + views=["view121"], + ), + ], + ), + SnowflakeDatabase( + name="db2", + created=None, + comment=None, + last_altered=None, + schemas=[ + SnowflakeSchema( + name="schema21", + created=None, + comment=None, + last_altered=None, + tables=["table211", "table212"], + views=["view211"], + ), + SnowflakeSchema( + name="schema22", + created=None, + comment=None, + last_altered=None, + tables=["table221", "table222"], + views=["view221"], + ), + ], + ), + SnowflakeDatabase( + name="db3", + created=None, + comment=None, + last_altered=None, + schemas=[ + SnowflakeSchema( + name="schema31", + created=None, + comment=None, + last_altered=None, + tables=["table311", "table312"], + views=["view311"], + ) + ], + ), + ] + + +def make_snowflake_urn(table_name, instance_name=None): + return make_dataset_urn_with_platform_instance( + "snowflake", table_name, instance_name + ) + + +def test_snowflake_shares_workunit_no_shares( + snowflake_databases: List[SnowflakeDatabase], +) -> None: + config = SnowflakeV2Config(account_id="abc12345", platform_instance="instance1") + + report = SnowflakeV2Report() + shares_handler = SnowflakeSharesHandler( + config, report, lambda x: make_snowflake_urn(x) + ) + + wus = list(shares_handler.get_shares_workunits(snowflake_databases)) + + assert len(wus) == 0 + + +def test_same_database_inbound_and_outbound_invalid_config() -> None: + with pytest.raises( + ValueError, + match="Same database can not be present as consumer in more than one share", + ): + 
SnowflakeV2Config( + account_id="abc12345", + platform_instance="instance1", + shares={ + "share1": SnowflakeShareConfig( + database="db1", + platform_instance="instance2", + consumers=[ + DatabaseId(database="db1", platform_instance="instance1") + ], + ), + "share2": SnowflakeShareConfig( + database="db1", + platform_instance="instance3", + consumers=[ + DatabaseId(database="db1", platform_instance="instance1") + ], + ), + }, + ) + + with pytest.raises( + ValueError, + match="Database included in a share can not be present as consumer in any share", + ): + SnowflakeV2Config( + account_id="abc12345", + platform_instance="instance1", + shares={ + "share1": SnowflakeShareConfig( + database="db1", + platform_instance="instance2", + consumers=[ + DatabaseId(database="db1", platform_instance="instance1") + ], + ), + "share2": SnowflakeShareConfig( + database="db1", + platform_instance="instance1", + consumers=[ + DatabaseId(database="db1", platform_instance="instance3") + ], + ), + }, + ) + + with pytest.raises( + ValueError, + match="Database included in a share can not be present as consumer in any share", + ): + SnowflakeV2Config( + account_id="abc12345", + platform_instance="instance1", + shares={ + "share2": SnowflakeShareConfig( + database="db1", + platform_instance="instance1", + consumers=[ + DatabaseId(database="db1", platform_instance="instance3") + ], + ), + "share1": SnowflakeShareConfig( + database="db1", + platform_instance="instance2", + consumers=[ + DatabaseId(database="db1", platform_instance="instance1") + ], + ), + }, + ) + + +def test_snowflake_shares_workunit_inbound_share( + snowflake_databases: List[SnowflakeDatabase], +) -> None: + config = SnowflakeV2Config( + account_id="abc12345", + platform_instance="instance1", + shares={ + "share1": SnowflakeShareConfig( + database="db1", + platform_instance="instance2", + consumers=[DatabaseId(database="db1", platform_instance="instance1")], + ) + }, + ) + + report = SnowflakeV2Report() + shares_handler = SnowflakeSharesHandler( + config, report, lambda x: make_snowflake_urn(x, "instance1") + ) + + wus = list(shares_handler.get_shares_workunits(snowflake_databases)) + + # 2 schemas - 2 tables and 1 view in each schema making total 6 datasets + # Hence 6 Sibling and 6 upstreamLineage aspects + assert len(wus) == 12 + upstream_lineage_aspect_entity_urns = set() + sibling_aspect_entity_urns = set() + + for wu in wus: + assert isinstance( + wu.metadata, (MetadataChangeProposal, MetadataChangeProposalWrapper) + ) + if wu.metadata.aspectName == "upstreamLineage": + upstream_aspect = wu.get_aspect_of_type(UpstreamLineage) + assert upstream_aspect is not None + assert len(upstream_aspect.upstreams) == 1 + assert upstream_aspect.upstreams[0].dataset == wu.get_urn().replace( + "instance1.db1", "instance2.db1" + ) + upstream_lineage_aspect_entity_urns.add(wu.get_urn()) + else: + siblings_aspect = wu.get_aspect_of_type(Siblings) + assert siblings_aspect is not None + assert len(siblings_aspect.siblings) == 1 + assert siblings_aspect.siblings == [ + wu.get_urn().replace("instance1.db1", "instance2.db1") + ] + sibling_aspect_entity_urns.add(wu.get_urn()) + + assert upstream_lineage_aspect_entity_urns == sibling_aspect_entity_urns + + +def test_snowflake_shares_workunit_outbound_share( + snowflake_databases: List[SnowflakeDatabase], +) -> None: + config = SnowflakeV2Config( + account_id="abc12345", + platform_instance="instance1", + shares={ + "share2": SnowflakeShareConfig( + database="db2", + platform_instance="instance1", + consumers=[ + 
DatabaseId( + database="db2_from_share", platform_instance="instance2" + ), + DatabaseId(database="db2", platform_instance="instance3"), + ], + ) + }, + ) + + report = SnowflakeV2Report() + shares_handler = SnowflakeSharesHandler( + config, report, lambda x: make_snowflake_urn(x, "instance1") + ) + + wus = list(shares_handler.get_shares_workunits(snowflake_databases)) + + # 2 schemas - 2 tables and 1 view in each schema making total 6 datasets + # Hence 6 Sibling aspects + assert len(wus) == 6 + entity_urns = set() + + for wu in wus: + siblings_aspect = wu.get_aspect_of_type(Siblings) + assert siblings_aspect is not None + assert len(siblings_aspect.siblings) == 2 + assert siblings_aspect.siblings == [ + wu.get_urn().replace("instance1.db2", "instance2.db2_from_share"), + wu.get_urn().replace("instance1.db2", "instance3.db2"), + ] + entity_urns.add(wu.get_urn()) + + assert len((entity_urns)) == 6 + + +def test_snowflake_shares_workunit_inbound_and_outbound_share( + snowflake_databases: List[SnowflakeDatabase], +) -> None: + config = SnowflakeV2Config( + account_id="abc12345", + platform_instance="instance1", + shares={ + "share1": SnowflakeShareConfig( + database="db1", + platform_instance="instance2", + consumers=[DatabaseId(database="db1", platform_instance="instance1")], + ), + "share2": SnowflakeShareConfig( + database="db2", + platform_instance="instance1", + consumers=[ + DatabaseId( + database="db2_from_share", platform_instance="instance2" + ), + DatabaseId(database="db2", platform_instance="instance3"), + ], + ), + }, + ) + + report = SnowflakeV2Report() + shares_handler = SnowflakeSharesHandler( + config, report, lambda x: make_snowflake_urn(x, "instance1") + ) + + wus = list(shares_handler.get_shares_workunits(snowflake_databases)) + + # 6 Sibling and 6 upstreamLineage aspects for db1 tables + # 6 Sibling aspects for db2 tables + assert len(wus) == 18 + + for wu in wus: + assert isinstance( + wu.metadata, (MetadataChangeProposal, MetadataChangeProposalWrapper) + ) + if wu.metadata.aspectName == "upstreamLineage": + upstream_aspect = wu.get_aspect_of_type(UpstreamLineage) + assert upstream_aspect is not None + assert len(upstream_aspect.upstreams) == 1 + assert upstream_aspect.upstreams[0].dataset == wu.get_urn().replace( + "instance1.db1", "instance2.db1" + ) + else: + siblings_aspect = wu.get_aspect_of_type(Siblings) + assert siblings_aspect is not None + if "db1" in wu.get_urn(): + assert len(siblings_aspect.siblings) == 1 + assert siblings_aspect.siblings == [ + wu.get_urn().replace("instance1.db1", "instance2.db1") + ] + else: + assert len(siblings_aspect.siblings) == 2 + assert siblings_aspect.siblings == [ + wu.get_urn().replace("instance1.db2", "instance2.db2_from_share"), + wu.get_urn().replace("instance1.db2", "instance3.db2"), + ] diff --git a/metadata-integration/java/datahub-client/build.gradle b/metadata-integration/java/datahub-client/build.gradle index 025273fc9263e..82273427974af 100644 --- a/metadata-integration/java/datahub-client/build.gradle +++ b/metadata-integration/java/datahub-client/build.gradle @@ -235,3 +235,7 @@ sourceSets.main.java.srcDir "${generateOpenApiPojos.outputDir}/src/main/java" sourceSets.main.resources.srcDir "${generateOpenApiPojos.outputDir}/src/main/resources" checkstyleMain.exclude '**/generated/**' + +clean { + project.delete("$projectDir/generated") +} \ No newline at end of file diff --git a/metadata-io/build.gradle b/metadata-io/build.gradle index ae56cd4cb8a96..507351f933cf0 100644 --- a/metadata-io/build.gradle +++ 
b/metadata-io/build.gradle @@ -88,6 +88,9 @@ dependencies { implementation(externalDependency.jettison) { because("previous versions are vulnerable") } + implementation(externalDependency.snappy) { + because("previous versions are vulnerable to CVE-2023-34453 through CVE-2023-34455") + } } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java index 555acb2ffdd3b..4bbff3915aca9 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java @@ -6,6 +6,7 @@ import com.linkedin.metadata.models.SearchableFieldSpec; import com.linkedin.metadata.models.annotation.SearchableAnnotation.FieldType; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -42,6 +43,13 @@ public static Map getPartialNgramConfigWithOverrides(Map getMappingsForField(@Nonnull final Searchable mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER); // Add keyword subfield without lowercase filter mappingForField.put(FIELDS, ImmutableMap.of(KEYWORD, KEYWORD_TYPE_MAP)); - } else if (fieldType == FieldType.TEXT || fieldType == FieldType.TEXT_PARTIAL) { + } else if (fieldType == FieldType.TEXT || fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) { mappingForField.put(TYPE, KEYWORD); mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER); Map subFields = new HashMap<>(); - if (fieldType == FieldType.TEXT_PARTIAL) { + if (fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) { subFields.put(NGRAM, getPartialNgramConfigWithOverrides( ImmutableMap.of( ANALYZER, PARTIAL_ANALYZER ) )); + if (fieldType == FieldType.WORD_GRAM) { + for (Map.Entry entry : Map.of( + WORD_GRAMS_LENGTH_2, WORD_GRAM_2_ANALYZER, + WORD_GRAMS_LENGTH_3, WORD_GRAM_3_ANALYZER, + WORD_GRAMS_LENGTH_4, WORD_GRAM_4_ANALYZER).entrySet()) { + String fieldName = entry.getKey(); + String analyzerName = entry.getValue(); + subFields.put(fieldName, ImmutableMap.of( + TYPE, TEXT, + ANALYZER, analyzerName, + SEARCH_ANALYZER, analyzerName + )); + } + } } subFields.put(DELIMITED, ImmutableMap.of( TYPE, TEXT, @@ -163,6 +185,7 @@ private static Map getMappingsForField(@Nonnull final Searchable searchableFieldSpec.getSearchableAnnotation() .getNumValuesFieldName() .ifPresent(fieldName -> mappings.put(fieldName, ImmutableMap.of(TYPE, LONG))); + mappings.putAll(getMappingsForFieldNameAliases(searchableFieldSpec)); return mappings; } @@ -172,4 +195,16 @@ private static Map getMappingsForSearchScoreField( return ImmutableMap.of(searchScoreFieldSpec.getSearchScoreAnnotation().getFieldName(), ImmutableMap.of(TYPE, DOUBLE)); } + + private static Map getMappingsForFieldNameAliases(@Nonnull final SearchableFieldSpec searchableFieldSpec) { + Map mappings = new HashMap<>(); + List fieldNameAliases = searchableFieldSpec.getSearchableAnnotation().getFieldNameAliases(); + fieldNameAliases.forEach(alias -> { + Map aliasMappings = new HashMap<>(); + aliasMappings.put(TYPE, ALIAS); + aliasMappings.put(PATH, searchableFieldSpec.getSearchableAnnotation().getFieldName()); + mappings.put(alias, aliasMappings); + }); + return mappings; + } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java 
b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java index 5b3e396837aa7..e180c8296b48d 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java @@ -66,6 +66,9 @@ public class SettingsBuilder { public static final String KEYWORD_ANALYZER = "keyword"; public static final String URN_ANALYZER = "urn_component"; public static final String URN_SEARCH_ANALYZER = "query_urn_component"; + public static final String WORD_GRAM_2_ANALYZER = "word_gram_2"; + public static final String WORD_GRAM_3_ANALYZER = "word_gram_3"; + public static final String WORD_GRAM_4_ANALYZER = "word_gram_4"; // Filters public static final String ALPHANUM_SPACE_ONLY = "alpha_num_space"; @@ -80,6 +83,10 @@ public class SettingsBuilder { public static final String MULTIFILTER = "multifilter"; public static final String MULTIFILTER_GRAPH = "multifilter_graph"; public static final String PARTIAL_URN_COMPONENT = "partial_urn_component"; + public static final String SHINGLE = "shingle"; + public static final String WORD_GRAM_2_FILTER = "word_gram_2_filter"; + public static final String WORD_GRAM_3_FILTER = "word_gram_3_filter"; + public static final String WORD_GRAM_4_FILTER = "word_gram_4_filter"; public static final String SNOWBALL = "snowball"; public static final String STEM_OVERRIDE = "stem_override"; public static final String STOP = "stop"; @@ -108,6 +115,7 @@ public class SettingsBuilder { public static final String SLASH_TOKENIZER = "slash_tokenizer"; public static final String UNIT_SEPARATOR_PATH_TOKENIZER = "unit_separator_path_tokenizer"; public static final String UNIT_SEPARATOR_TOKENIZER = "unit_separator_tokenizer"; + public static final String WORD_GRAM_TOKENIZER = "word_gram_tokenizer"; // Do not remove the space, needed for multi-term synonyms public static final List ALPHANUM_SPACE_PATTERNS = ImmutableList.of( "([a-z0-9 _-]{2,})", @@ -161,6 +169,13 @@ public class SettingsBuilder { AUTOCOMPLETE_CUSTOM_DELIMITER, LOWERCASE); + public static final List WORD_GRAM_TOKEN_FILTERS = ImmutableList.of( + ASCII_FOLDING, + LOWERCASE, + TRIM, + REMOVE_QUOTES + ); + public final Map settings; public SettingsBuilder(String mainTokenizer) { @@ -275,6 +290,17 @@ private static Map buildFilters() throws IOException { .collect(Collectors.toList())) .build()); } + + for (Map.Entry entry : Map.of(WORD_GRAM_2_FILTER, 2, WORD_GRAM_3_FILTER, 3, WORD_GRAM_4_FILTER, 4).entrySet()) { + String filterName = entry.getKey(); + Integer gramSize = entry.getValue(); + filters.put(filterName, ImmutableMap.builder() + .put(TYPE, SHINGLE) + .put("min_shingle_size", gramSize) + .put("max_shingle_size", gramSize) + .put("output_unigrams", false) + .build()); + } } return filters.build(); @@ -302,13 +328,24 @@ private static Map buildTokenizers() { .put(DELIMITER, "␟") .build()); - // Tokenize by whitespace and most special chars + // Tokenize by most special chars + // Do NOT tokenize by whitespace to keep multi-word synonyms in the same token + // The split by whitespace is done later in the token filters phase tokenizers.put(MAIN_TOKENIZER, ImmutableMap.builder() .put(TYPE, PATTERN) .put(PATTERN, "[(),./:]") .build()); + // Tokenize by whitespace and most special chars for wordgrams + // only split on - when not preceded by a whitespace to preserve exclusion functionality + // i.e. 
"logging-events-bkcp" and "logging-events -bckp" should be handled differently + tokenizers.put(WORD_GRAM_TOKENIZER, + ImmutableMap.builder() + .put(TYPE, PATTERN) + .put(PATTERN, "[(),./:\\s_]|(?<=\\S)(-)") + .build()); + return tokenizers.build(); } @@ -382,6 +419,21 @@ private static Map buildAnalyzers(String mainTokenizer) { .put(FILTER, SEARCH_TOKEN_FILTERS) .build()); + // Support word grams + for (Map.Entry entry : Map.of( + WORD_GRAM_2_ANALYZER, WORD_GRAM_2_FILTER, + WORD_GRAM_3_ANALYZER, WORD_GRAM_3_FILTER, + WORD_GRAM_4_ANALYZER, WORD_GRAM_4_FILTER).entrySet()) { + String analyzerName = entry.getKey(); + String filterName = entry.getValue(); + analyzers.put(analyzerName, ImmutableMap.builder() + .put(TOKENIZER, WORD_GRAM_TOKENIZER) + .put(FILTER, ImmutableList.builder() + .addAll(WORD_GRAM_TOKEN_FILTERS) + .add(filterName).build()) + .build()); + } + // For special analysis, the substitution can be read from the configuration (chinese tokenizer: ik_smart / smartCN) // Analyzer for partial matching (i.e. autocomplete) - Prefix matching of each token analyzers.put(PARTIAL_ANALYZER, ImmutableMap.builder() @@ -395,6 +447,7 @@ private static Map buildAnalyzers(String mainTokenizer) { .put(FILTER, PARTIAL_AUTOCOMPLETE_TOKEN_FILTERS) .build()); + return analyzers.build(); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java index fb7e19a5d67bc..a75ed40ffca52 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java @@ -11,11 +11,8 @@ import java.util.Set; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_HIERARCHY_ANALYZER; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_V2_HIERARCHY_ANALYZER; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.KEYWORD_ANALYZER; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.TEXT_SEARCH_ANALYZER; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.URN_SEARCH_ANALYZER; +import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.*; + @Builder @Getter @@ -33,7 +30,8 @@ public class SearchFieldConfig { private static final Set TYPES_WITH_DELIMITED_SUBFIELD = Set.of( SearchableAnnotation.FieldType.TEXT, - SearchableAnnotation.FieldType.TEXT_PARTIAL + SearchableAnnotation.FieldType.TEXT_PARTIAL, + SearchableAnnotation.FieldType.WORD_GRAM // NOT URN_PARTIAL (urn field is special) ); // NOT comprehensive @@ -56,6 +54,7 @@ public class SearchFieldConfig { SearchableAnnotation.FieldType.TEXT, SearchableAnnotation.FieldType.TEXT_PARTIAL, SearchableAnnotation.FieldType.KEYWORD, + SearchableAnnotation.FieldType.WORD_GRAM, // not analyzed SearchableAnnotation.FieldType.BOOLEAN, SearchableAnnotation.FieldType.COUNT, @@ -69,6 +68,11 @@ public class SearchFieldConfig { SearchableAnnotation.FieldType.URN_PARTIAL ); + public static final Set TYPES_WITH_WORD_GRAM = + Set.of( + SearchableAnnotation.FieldType.WORD_GRAM + ); + @Nonnull private final String fieldName; @Nonnull @@ -78,9 +82,11 @@ public class SearchFieldConfig { private final String analyzer; private boolean hasKeywordSubfield; 
private boolean hasDelimitedSubfield; + private boolean hasWordGramSubfields; private boolean isQueryByDefault; private boolean isDelimitedSubfield; private boolean isKeywordSubfield; + private boolean isWordGramSubfield; public static SearchFieldConfig detectSubFieldType(@Nonnull SearchableFieldSpec fieldSpec) { final SearchableAnnotation searchableAnnotation = fieldSpec.getSearchableAnnotation(); @@ -106,6 +112,7 @@ public static SearchFieldConfig detectSubFieldType(String fieldName, .analyzer(getAnalyzer(fieldName, fieldType)) .hasKeywordSubfield(hasKeywordSubfield(fieldName, fieldType)) .hasDelimitedSubfield(hasDelimitedSubfield(fieldName, fieldType)) + .hasWordGramSubfields(hasWordGramSubfields(fieldName, fieldType)) .isQueryByDefault(isQueryByDefault) .build(); } @@ -118,6 +125,11 @@ private static boolean hasDelimitedSubfield(String fieldName, SearchableAnnotati return !fieldName.contains(".") && ("urn".equals(fieldName) || TYPES_WITH_DELIMITED_SUBFIELD.contains(fieldType)); } + + private static boolean hasWordGramSubfields(String fieldName, SearchableAnnotation.FieldType fieldType) { + return !fieldName.contains(".") + && (TYPES_WITH_WORD_GRAM.contains(fieldType)); + } private static boolean hasKeywordSubfield(String fieldName, SearchableAnnotation.FieldType fieldType) { return !"urn".equals(fieldName) && !fieldName.contains(".") @@ -155,6 +167,7 @@ public SearchFieldConfigBuilder fieldName(@Nonnull String fieldName) { this.fieldName = fieldName; isDelimitedSubfield(fieldName.endsWith(".delimited")); isKeywordSubfield(fieldName.endsWith(".keyword")); + isWordGramSubfield(fieldName.contains("wordGrams")); shortName(fieldName.split("[.]")[0]); return this; } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java index 289c6f1f84e32..49fc882314e0a 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java @@ -3,6 +3,7 @@ import com.linkedin.metadata.config.search.ExactMatchConfiguration; import com.linkedin.metadata.config.search.PartialConfiguration; import com.linkedin.metadata.config.search.SearchConfiguration; +import com.linkedin.metadata.config.search.WordGramConfiguration; import com.linkedin.metadata.config.search.custom.BoolQueryConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; import com.linkedin.metadata.config.search.custom.QueryConfiguration; @@ -51,6 +52,9 @@ import org.elasticsearch.search.SearchModule; import static com.linkedin.metadata.models.SearchableFieldSpecExtractor.PRIMARY_URN_SEARCH_PROPERTIES; +import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.*; +import static com.linkedin.metadata.search.elasticsearch.query.request.SearchFieldConfig.*; + @Slf4j public class SearchQueryBuilder { @@ -69,6 +73,7 @@ public class SearchQueryBuilder { public static final String STRUCTURED_QUERY_PREFIX = "\\\\/q "; private final ExactMatchConfiguration exactMatchConfiguration; private final PartialConfiguration partialConfiguration; + private final WordGramConfiguration wordGramConfiguration; private final CustomizedQueryHandler customizedQueryHandler; @@ -76,6 +81,7 @@ public SearchQueryBuilder(@Nonnull SearchConfiguration searchConfiguration, @Nullable 
CustomSearchConfiguration customSearchConfiguration) { this.exactMatchConfiguration = searchConfiguration.getExactMatch(); this.partialConfiguration = searchConfiguration.getPartial(); + this.wordGramConfiguration = searchConfiguration.getWordGram(); this.customizedQueryHandler = CustomizedQueryHandler.builder(customSearchConfiguration).build(); } @@ -148,6 +154,36 @@ private Set getStandardFields(@Nonnull EntitySpec entitySpec) fields.add(SearchFieldConfig.detectSubFieldType(searchFieldConfig.fieldName() + ".delimited", searchFieldConfig.boost() * partialConfiguration.getFactor(), searchableAnnotation.getFieldType(), searchableAnnotation.isQueryByDefault())); + + if (SearchFieldConfig.detectSubFieldType(fieldSpec).hasWordGramSubfields()) { + fields.add(SearchFieldConfig.builder() + .fieldName(searchFieldConfig.fieldName() + ".wordGrams2") + .boost(searchFieldConfig.boost() * wordGramConfiguration.getTwoGramFactor()) + .analyzer(WORD_GRAM_2_ANALYZER) + .hasKeywordSubfield(true) + .hasDelimitedSubfield(true) + .hasWordGramSubfields(true) + .isQueryByDefault(true) + .build()); + fields.add(SearchFieldConfig.builder() + .fieldName(searchFieldConfig.fieldName() + ".wordGrams3") + .boost(searchFieldConfig.boost() * wordGramConfiguration.getThreeGramFactor()) + .analyzer(WORD_GRAM_3_ANALYZER) + .hasKeywordSubfield(true) + .hasDelimitedSubfield(true) + .hasWordGramSubfields(true) + .isQueryByDefault(true) + .build()); + fields.add(SearchFieldConfig.builder() + .fieldName(searchFieldConfig.fieldName() + ".wordGrams4") + .boost(searchFieldConfig.boost() * wordGramConfiguration.getFourGramFactor()) + .analyzer(WORD_GRAM_4_ANALYZER) + .hasKeywordSubfield(true) + .hasDelimitedSubfield(true) + .hasWordGramSubfields(true) + .isQueryByDefault(true) + .build()); + } } } @@ -188,7 +224,7 @@ private Optional getSimpleQuery(@Nullable QueryConfiguration custo .filter(SearchFieldConfig::isQueryByDefault) .collect(Collectors.groupingBy(SearchFieldConfig::analyzer)); - analyzerGroup.keySet().stream().sorted().forEach(analyzer -> { + analyzerGroup.keySet().stream().sorted().filter(str -> !str.contains("word_gram")).forEach(analyzer -> { List fieldConfigs = analyzerGroup.get(analyzer); SimpleQueryStringBuilder simpleBuilder = QueryBuilders.simpleQueryStringQuery(sanitizedQuery); simpleBuilder.analyzer(analyzer); @@ -253,6 +289,13 @@ private Optional getPrefixAndExactMatchQuery(@Nullable QueryConfig * exactMatchConfiguration.getCaseSensitivityFactor()) .queryName(searchFieldConfig.fieldName())); } + + if (searchFieldConfig.isWordGramSubfield() && isPrefixQuery) { + finalQuery.should(QueryBuilders + .matchPhraseQuery(ESUtils.toKeywordField(searchFieldConfig.fieldName(), false), unquotedQuery) + .boost(searchFieldConfig.boost() * getWordGramFactor(searchFieldConfig.fieldName())) + .queryName(searchFieldConfig.shortName())); + } }); return finalQuery.should().size() > 0 ? 
Optional.of(finalQuery) : Optional.empty(); @@ -377,4 +420,15 @@ private FunctionScoreQueryBuilder toFunctionScoreQueryBuilder(QueryBuilder query throw new RuntimeException(e); } } + + public float getWordGramFactor(String fieldName) { + if (fieldName.endsWith("Grams2")) { + return wordGramConfiguration.getTwoGramFactor(); + } else if (fieldName.endsWith("Grams3")) { + return wordGramConfiguration.getThreeGramFactor(); + } else if (fieldName.endsWith("Grams4")) { + return wordGramConfiguration.getFourGramFactor(); + } + throw new IllegalArgumentException(fieldName + " does not end with Grams[2-4]"); + } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java index bd1e6037ec0c5..5973f77da28aa 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java @@ -28,6 +28,8 @@ import com.linkedin.metadata.search.SearchEntityArray; import com.linkedin.metadata.search.SearchResult; import com.linkedin.metadata.search.SearchResultMetadata; +import com.linkedin.metadata.search.SearchSuggestion; +import com.linkedin.metadata.search.SearchSuggestionArray; import com.linkedin.metadata.search.features.Features; import com.linkedin.metadata.search.utils.ESUtils; import com.linkedin.metadata.utils.SearchUtil; @@ -68,7 +70,9 @@ import org.elasticsearch.search.builder.SearchSourceBuilder; import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder; import org.elasticsearch.search.fetch.subphase.highlight.HighlightField; +import org.elasticsearch.search.suggest.term.TermSuggestion; +import static com.linkedin.metadata.search.utils.ESUtils.NAME_SUGGESTION; import static com.linkedin.metadata.search.utils.ESUtils.toFacetField; import static com.linkedin.metadata.search.utils.SearchUtils.applyDefaultSearchFlags; import static com.linkedin.metadata.utils.SearchUtil.*; @@ -199,6 +203,11 @@ public SearchRequest getSearchRequest(@Nonnull String input, @Nullable Filter fi searchSourceBuilder.highlighter(_highlights); } ESUtils.buildSortOrder(searchSourceBuilder, sortCriterion); + + if (finalSearchFlags.isGetSuggestions()) { + ESUtils.buildNameSuggestions(searchSourceBuilder, input); + } + searchRequest.source(searchSourceBuilder); log.debug("Search request is: " + searchRequest.toString()); @@ -471,6 +480,9 @@ private SearchResultMetadata extractSearchResultMetadata(@Nonnull SearchResponse final List aggregationMetadataList = extractAggregationMetadata(searchResponse, filter); searchResultMetadata.setAggregations(new AggregationMetadataArray(aggregationMetadataList)); + final List searchSuggestions = extractSearchSuggestions(searchResponse); + searchResultMetadata.setSuggestions(new SearchSuggestionArray(searchSuggestions)); + return searchResultMetadata; } @@ -517,6 +529,23 @@ public static Map extractTermAggregations(@Nonnull SearchResponse return extractTermAggregations((ParsedTerms) aggregation, aggregationName.equals("_entityType")); } + private List extractSearchSuggestions(@Nonnull SearchResponse searchResponse) { + final List searchSuggestions = new ArrayList<>(); + if (searchResponse.getSuggest() != null) { + TermSuggestion termSuggestion = searchResponse.getSuggest().getSuggestion(NAME_SUGGESTION); + if (termSuggestion != null && 
termSuggestion.getEntries().size() > 0) { + termSuggestion.getEntries().get(0).getOptions().forEach(suggestOption -> { + SearchSuggestion searchSuggestion = new SearchSuggestion(); + searchSuggestion.setText(String.valueOf(suggestOption.getText())); + searchSuggestion.setFrequency(suggestOption.getFreq()); + searchSuggestion.setScore(suggestOption.getScore()); + searchSuggestions.add(searchSuggestion); + }); + } + } + return searchSuggestions; + } + /** * Adds nested sub-aggregation values to the aggregated results * @param aggs The aggregations to traverse. Could be null (base case) diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java index 8a385e4ab2b54..741eb5568d2ea 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java @@ -27,6 +27,10 @@ import org.elasticsearch.search.sort.FieldSortBuilder; import org.elasticsearch.search.sort.ScoreSortBuilder; import org.elasticsearch.search.sort.SortOrder; +import org.elasticsearch.search.suggest.SuggestBuilder; +import org.elasticsearch.search.suggest.SuggestBuilders; +import org.elasticsearch.search.suggest.SuggestionBuilder; +import org.elasticsearch.search.suggest.term.TermSuggestionBuilder; import static com.linkedin.metadata.search.elasticsearch.query.request.SearchFieldConfig.KEYWORD_FIELDS; import static com.linkedin.metadata.search.elasticsearch.query.request.SearchFieldConfig.PATH_HIERARCHY_FIELDS; @@ -45,6 +49,9 @@ public class ESUtils { public static final int MAX_RESULT_SIZE = 10000; public static final String OPAQUE_ID_HEADER = "X-Opaque-Id"; public static final String HEADER_VALUE_DELIMITER = "|"; + public static final String KEYWORD_TYPE = "keyword"; + public static final String ENTITY_NAME_FIELD = "_entityName"; + public static final String NAME_SUGGESTION = "nameSuggestion"; // we use this to make sure we filter for editable & non-editable fields. Also expands out top-level properties // to field level properties @@ -174,6 +181,8 @@ public static QueryBuilder getQueryBuilderFromCriterion(@Nonnull final Criterion * If no sort criterion is provided then the default sorting criterion is chosen which is descending order of score * Furthermore to resolve conflicts, the results are further sorted by ascending order of urn * If the input sort criterion is urn itself, then no additional sort criterion is applied as there will be no conflicts. + * When sorting, set the unmappedType param to arbitrary "keyword" so we essentially ignore sorting where indices do not + * have the field we are sorting on. *

* * @param searchSourceBuilder {@link SearchSourceBuilder} that needs to be populated with sort order @@ -187,13 +196,24 @@ public static void buildSortOrder(@Nonnull SearchSourceBuild final SortOrder esSortOrder = (sortCriterion.getOrder() == com.linkedin.metadata.query.filter.SortOrder.ASCENDING) ? SortOrder.ASC : SortOrder.DESC; - searchSourceBuilder.sort(new FieldSortBuilder(sortCriterion.getField()).order(esSortOrder)); + searchSourceBuilder.sort(new FieldSortBuilder(sortCriterion.getField()).order(esSortOrder).unmappedType(KEYWORD_TYPE)); } if (sortCriterion == null || !sortCriterion.getField().equals(DEFAULT_SEARCH_RESULTS_SORT_BY_FIELD)) { searchSourceBuilder.sort(new FieldSortBuilder(DEFAULT_SEARCH_RESULTS_SORT_BY_FIELD).order(SortOrder.ASC)); } } + /** + * Populates source field of search query with the suggestions query so that we get search suggestions back. + * Right now we are only supporting suggestions based on the virtual _entityName field alias. + */ + public static void buildNameSuggestions(@Nonnull SearchSourceBuilder searchSourceBuilder, @Nullable String textInput) { + SuggestionBuilder<TermSuggestionBuilder> builder = SuggestBuilders.termSuggestion(ENTITY_NAME_FIELD).text(textInput); + SuggestBuilder suggestBuilder = new SuggestBuilder(); + suggestBuilder.addSuggestion(NAME_SUGGESTION, builder); + searchSourceBuilder.suggest(suggestBuilder); + } + /** * Escapes the Elasticsearch reserved characters in the given input string. * diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java index 35a322d37b2fd..8b56ae0beb3f1 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/SearchUtils.java @@ -78,7 +78,7 @@ public static Map<String, String> getRequestMap(@Nullable Filter requestParams) return criterionArray.stream().collect(Collectors.toMap(Criterion::getField, Criterion::getValue)); } - static boolean isUrn(@Nonnull String value) { + public static boolean isUrn(@Nonnull String value) { // TODO(https://github.com/datahub-project/datahub-gma/issues/51): This method is a bit of a hack to support searching for // URNs that have commas in them, while also using commas a delimiter for search. We should stop supporting commas // as delimiter, and then we can stop using this hack. diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESSampleDataFixture.java b/metadata-io/src/test/java/com/linkedin/metadata/ESSampleDataFixture.java index 847029bc180eb..20501225ef787 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESSampleDataFixture.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ESSampleDataFixture.java @@ -54,6 +54,13 @@ @TestConfiguration @Import(ESTestConfiguration.class) public class ESSampleDataFixture { + /** + * Interested in adding more fixtures? Here's what you will need to update: + * 1. Create a new indexPrefix and FixtureName. Both are needed or else all fixtures will load on top of each other, + * overwriting each other + * 2. Create a new IndexConvention, IndexBuilder, and EntityClient. These are needed + * to index a different set of entities.
+ */ @Autowired private ESBulkProcessor _bulkProcessor; @@ -61,6 +68,9 @@ public class ESSampleDataFixture { @Autowired private RestHighLevelClient _searchClient; + @Autowired + private RestHighLevelClient _longTailSearchClient; + @Autowired private SearchConfiguration _searchConfiguration; @@ -68,24 +78,54 @@ public class ESSampleDataFixture { private CustomSearchConfiguration _customSearchConfiguration; @Bean(name = "sampleDataPrefix") - protected String indexPrefix() { + protected String sampleDataPrefix() { return "smpldat"; } + @Bean(name = "longTailPrefix") + protected String longTailIndexPrefix() { + return "lngtl"; + } + @Bean(name = "sampleDataIndexConvention") protected IndexConvention indexConvention(@Qualifier("sampleDataPrefix") String prefix) { return new IndexConventionImpl(prefix); } + @Bean(name = "longTailIndexConvention") + protected IndexConvention longTailIndexConvention(@Qualifier("longTailPrefix") String prefix) { + return new IndexConventionImpl(prefix); + } + @Bean(name = "sampleDataFixtureName") - protected String fixtureName() { + protected String sampleDataFixtureName() { return "sample_data"; } + @Bean(name = "longTailFixtureName") + protected String longTailFixtureName() { + return "long_tail"; + } + @Bean(name = "sampleDataEntityIndexBuilders") protected EntityIndexBuilders entityIndexBuilders( @Qualifier("entityRegistry") EntityRegistry entityRegistry, @Qualifier("sampleDataIndexConvention") IndexConvention indexConvention + ) { + return entityIndexBuildersHelper(entityRegistry, indexConvention); + } + + @Bean(name = "longTailEntityIndexBuilders") + protected EntityIndexBuilders longTailEntityIndexBuilders( + @Qualifier("longTailEntityRegistry") EntityRegistry longTailEntityRegistry, + @Qualifier("longTailIndexConvention") IndexConvention indexConvention + ) { + return entityIndexBuildersHelper(longTailEntityRegistry, indexConvention); + } + + protected EntityIndexBuilders entityIndexBuildersHelper( + EntityRegistry entityRegistry, + IndexConvention indexConvention ) { GitVersion gitVersion = new GitVersion("0.0.0-test", "123456", Optional.empty()); ESIndexBuilder indexBuilder = new ESIndexBuilder(_searchClient, 1, 0, 1, @@ -100,6 +140,23 @@ protected ElasticSearchService entitySearchService( @Qualifier("entityRegistry") EntityRegistry entityRegistry, @Qualifier("sampleDataEntityIndexBuilders") EntityIndexBuilders indexBuilders, @Qualifier("sampleDataIndexConvention") IndexConvention indexConvention + ) throws IOException { + return entitySearchServiceHelper(entityRegistry, indexBuilders, indexConvention); + } + + @Bean(name = "longTailEntitySearchService") + protected ElasticSearchService longTailEntitySearchService( + @Qualifier("longTailEntityRegistry") EntityRegistry longTailEntityRegistry, + @Qualifier("longTailEntityIndexBuilders") EntityIndexBuilders longTailEndexBuilders, + @Qualifier("longTailIndexConvention") IndexConvention longTailIndexConvention + ) throws IOException { + return entitySearchServiceHelper(longTailEntityRegistry, longTailEndexBuilders, longTailIndexConvention); + } + + protected ElasticSearchService entitySearchServiceHelper( + EntityRegistry entityRegistry, + EntityIndexBuilders indexBuilders, + IndexConvention indexConvention ) throws IOException { CustomConfiguration customConfiguration = new CustomConfiguration(); customConfiguration.setEnabled(true); @@ -107,7 +164,7 @@ protected ElasticSearchService entitySearchService( CustomSearchConfiguration customSearchConfiguration = customConfiguration.resolve(new YAMLMapper()); 
ESSearchDAO searchDAO = new ESSearchDAO(entityRegistry, _searchClient, indexConvention, false, - ELASTICSEARCH_IMPLEMENTATION_ELASTICSEARCH, _searchConfiguration, customSearchConfiguration); + ELASTICSEARCH_IMPLEMENTATION_ELASTICSEARCH, _searchConfiguration, customSearchConfiguration); ESBrowseDAO browseDAO = new ESBrowseDAO(entityRegistry, _searchClient, indexConvention, _searchConfiguration, _customSearchConfiguration); ESWriteDAO writeDAO = new ESWriteDAO(entityRegistry, _searchClient, indexConvention, _bulkProcessor, 1); return new ElasticSearchService(indexBuilders, searchDAO, browseDAO, writeDAO); @@ -120,9 +177,30 @@ protected SearchService searchService( @Qualifier("sampleDataEntitySearchService") ElasticSearchService entitySearchService, @Qualifier("sampleDataEntityIndexBuilders") EntityIndexBuilders indexBuilders, @Qualifier("sampleDataPrefix") String prefix, - @Qualifier("sampleDataFixtureName") String fixtureName + @Qualifier("sampleDataFixtureName") String sampleDataFixtureName ) throws IOException { + return searchServiceHelper(entityRegistry, entitySearchService, indexBuilders, prefix, sampleDataFixtureName); + } + @Bean(name = "longTailSearchService") + @Nonnull + protected SearchService longTailSearchService( + @Qualifier("longTailEntityRegistry") EntityRegistry longTailEntityRegistry, + @Qualifier("longTailEntitySearchService") ElasticSearchService longTailEntitySearchService, + @Qualifier("longTailEntityIndexBuilders") EntityIndexBuilders longTailIndexBuilders, + @Qualifier("longTailPrefix") String longTailPrefix, + @Qualifier("longTailFixtureName") String longTailFixtureName + ) throws IOException { + return searchServiceHelper(longTailEntityRegistry, longTailEntitySearchService, longTailIndexBuilders, longTailPrefix, longTailFixtureName); + } + + public SearchService searchServiceHelper( + EntityRegistry entityRegistry, + ElasticSearchService entitySearchService, + EntityIndexBuilders indexBuilders, + String prefix, + String fixtureName + ) throws IOException { int batchSize = 100; SearchRanker ranker = new SimpleRanker(); CacheManager cacheManager = new ConcurrentMapCacheManager(); @@ -159,6 +237,24 @@ protected EntityClient entityClient( @Qualifier("sampleDataSearchService") SearchService searchService, @Qualifier("sampleDataEntitySearchService") ElasticSearchService entitySearchService, @Qualifier("entityRegistry") EntityRegistry entityRegistry + ) { + return entityClientHelper(searchService, entitySearchService, entityRegistry); + } + + @Bean(name = "longTailEntityClient") + @Nonnull + protected EntityClient longTailEntityClient( + @Qualifier("sampleDataSearchService") SearchService searchService, + @Qualifier("sampleDataEntitySearchService") ElasticSearchService entitySearchService, + @Qualifier("longTailEntityRegistry") EntityRegistry longTailEntityRegistry + ) { + return entityClientHelper(searchService, entitySearchService, longTailEntityRegistry); + } + + private EntityClient entityClientHelper( + SearchService searchService, + ElasticSearchService entitySearchService, + EntityRegistry entityRegistry ) { CachingEntitySearchService cachingEntitySearchService = new CachingEntitySearchService( new ConcurrentMapCacheManager(), @@ -173,7 +269,7 @@ protected EntityClient entityClient( preProcessHooks.setUiEnabled(true); return new JavaEntityClient( new EntityServiceImpl(mockAspectDao, null, entityRegistry, true, null, - preProcessHooks), + preProcessHooks), null, entitySearchService, cachingEntitySearchService, diff --git 
a/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java b/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java index 0d7ac506599af..673474c96cc51 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java @@ -6,6 +6,7 @@ import com.linkedin.metadata.config.search.ExactMatchConfiguration; import com.linkedin.metadata.config.search.PartialConfiguration; import com.linkedin.metadata.config.search.SearchConfiguration; +import com.linkedin.metadata.config.search.WordGramConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; import com.linkedin.metadata.models.registry.ConfigEntityRegistry; import com.linkedin.metadata.models.registry.EntityRegistry; @@ -55,11 +56,17 @@ public SearchConfiguration searchConfiguration() { exactMatchConfiguration.setCaseSensitivityFactor(0.7f); exactMatchConfiguration.setEnableStructured(true); + WordGramConfiguration wordGramConfiguration = new WordGramConfiguration(); + wordGramConfiguration.setTwoGramFactor(1.2f); + wordGramConfiguration.setThreeGramFactor(1.5f); + wordGramConfiguration.setFourGramFactor(1.8f); + PartialConfiguration partialConfiguration = new PartialConfiguration(); partialConfiguration.setFactor(0.4f); partialConfiguration.setUrnFactor(0.5f); searchConfiguration.setExactMatch(exactMatchConfiguration); + searchConfiguration.setWordGram(wordGramConfiguration); searchConfiguration.setPartial(partialConfiguration); return searchConfiguration; } @@ -137,4 +144,10 @@ public EntityRegistry entityRegistry() throws EntityRegistryException { return new ConfigEntityRegistry( ESTestConfiguration.class.getClassLoader().getResourceAsStream("entity-registry.yml")); } + + @Bean(name = "longTailEntityRegistry") + public EntityRegistry longTailEntityRegistry() throws EntityRegistryException { + return new ConfigEntityRegistry( + ESTestConfiguration.class.getClassLoader().getResourceAsStream("entity-registry.yml")); + } } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java b/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java index 79496888650e1..45c4c16864b07 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ESTestUtils.java @@ -77,6 +77,11 @@ public static SearchResult searchAcrossEntities(SearchService searchService, Str 100, new SearchFlags().setFulltext(true).setSkipCache(true), facets); } + public static SearchResult searchAcrossCustomEntities(SearchService searchService, String query, List searchableEntities) { + return searchService.searchAcrossEntities(searchableEntities, query, null, null, 0, + 100, new SearchFlags().setFulltext(true).setSkipCache(true)); + } + public static SearchResult search(SearchService searchService, String query) { return search(searchService, SEARCHABLE_ENTITIES, query); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java new file mode 100644 index 0000000000000..d720c95fef84d --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java @@ -0,0 +1,167 @@ +package com.linkedin.metadata.search.elasticsearch.fixtures; + +import com.linkedin.common.urn.Urn; +import 
com.linkedin.datahub.graphql.generated.EntityType; +import com.linkedin.datahub.graphql.resolvers.EntityTypeMapper; +import com.linkedin.entity.client.EntityClient; +import com.linkedin.metadata.ESSampleDataFixture; +import com.linkedin.metadata.models.registry.EntityRegistry; +import com.linkedin.metadata.search.MatchedFieldArray; +import com.linkedin.metadata.search.SearchEntityArray; +import com.linkedin.metadata.search.SearchResult; +import com.linkedin.metadata.search.SearchService; +import org.elasticsearch.client.RestHighLevelClient; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.context.annotation.Import; +import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; +import org.testng.annotations.Test; + +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static com.linkedin.metadata.ESTestUtils.*; +import static org.testng.Assert.assertTrue; +import static org.testng.AssertJUnit.*; + +@Import(ESSampleDataFixture.class) +public class ElasticSearchGoldenTest extends AbstractTestNGSpringContextTests { + + private static final List SEARCHABLE_LONGTAIL_ENTITIES = Stream.of(EntityType.CHART, EntityType.CONTAINER, + EntityType.DASHBOARD, EntityType.DATASET, EntityType.DOMAIN, EntityType.TAG + ).map(EntityTypeMapper::getName) + .collect(Collectors.toList()); + @Autowired + private RestHighLevelClient _searchClient; + + @Autowired + @Qualifier("longTailSearchService") + protected SearchService searchService; + + @Autowired + @Qualifier("longTailEntityClient") + protected EntityClient entityClient; + + @Autowired + @Qualifier("longTailEntityRegistry") + private EntityRegistry entityRegistry; + + @Test + public void testNameMatchPetProfiles() { + /* + Searching for "pet profiles" should return "pet_profiles" as the first 2 search results + */ + assertNotNull(searchService); + assertNotNull(entityRegistry); + SearchResult searchResult = searchAcrossCustomEntities(searchService, "pet profiles", SEARCHABLE_LONGTAIL_ENTITIES); + assertTrue(searchResult.getEntities().size() >= 2); + Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); + Urn secondResultUrn = searchResult.getEntities().get(1).getEntity(); + + assertTrue(firstResultUrn.toString().contains("pet_profiles")); + assertTrue(secondResultUrn.toString().contains("pet_profiles")); + } + + @Test + public void testNameMatchPetProfile() { + /* + Searching for "pet profile" should return "pet_profiles" as the first 2 search results + */ + assertNotNull(searchService); + SearchResult searchResult = searchAcrossEntities(searchService, "pet profile", SEARCHABLE_LONGTAIL_ENTITIES); + assertTrue(searchResult.getEntities().size() >= 2); + Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); + Urn secondResultUrn = searchResult.getEntities().get(1).getEntity(); + + assertTrue(firstResultUrn.toString().contains("pet_profiles")); + assertTrue(secondResultUrn.toString().contains("pet_profiles")); + } + + @Test + public void testGlossaryTerms() { + /* + Searching for "ReturnRate" should return all tables that have the glossary term applied before + anything else + */ + assertNotNull(searchService); + SearchResult searchResult = searchAcrossEntities(searchService, "ReturnRate", SEARCHABLE_LONGTAIL_ENTITIES); + SearchEntityArray entities = searchResult.getEntities(); + assertTrue(searchResult.getEntities().size() >= 4); + MatchedFieldArray 
firstResultMatchedFields = entities.get(0).getMatchedFields(); + MatchedFieldArray secondResultMatchedFields = entities.get(1).getMatchedFields(); + MatchedFieldArray thirdResultMatchedFields = entities.get(2).getMatchedFields(); + MatchedFieldArray fourthResultMatchedFields = entities.get(3).getMatchedFields(); + + assertTrue(firstResultMatchedFields.toString().contains("ReturnRate")); + assertTrue(secondResultMatchedFields.toString().contains("ReturnRate")); + assertTrue(thirdResultMatchedFields.toString().contains("ReturnRate")); + assertTrue(fourthResultMatchedFields.toString().contains("ReturnRate")); + } + + @Test + public void testNameMatchPartiallyQualified() { + /* + Searching for "analytics.pet_details" (partially qualified) should return the fully qualified table + name as the first search results before any others + */ + assertNotNull(searchService); + SearchResult searchResult = searchAcrossEntities(searchService, "analytics.pet_details", SEARCHABLE_LONGTAIL_ENTITIES); + assertTrue(searchResult.getEntities().size() >= 2); + Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); + Urn secondResultUrn = searchResult.getEntities().get(1).getEntity(); + + assertTrue(firstResultUrn.toString().contains("snowflake,long_tail_companions.analytics.pet_details")); + assertTrue(secondResultUrn.toString().contains("dbt,long_tail_companions.analytics.pet_details")); + } + + @Test + public void testNameMatchCollaborativeActionitems() { + /* + Searching for "collaborative actionitems" should return "collaborative_actionitems" as the first search + result, followed by "collaborative_actionitems_old" + */ + assertNotNull(searchService); + SearchResult searchResult = searchAcrossEntities(searchService, "collaborative actionitems", SEARCHABLE_LONGTAIL_ENTITIES); + assertTrue(searchResult.getEntities().size() >= 2); + Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); + Urn secondResultUrn = searchResult.getEntities().get(1).getEntity(); + + // Checks that the table name is not suffixed with anything + assertTrue(firstResultUrn.toString().contains("collaborative_actionitems,")); + assertTrue(secondResultUrn.toString().contains("collaborative_actionitems_old")); + + Double firstResultScore = searchResult.getEntities().get(0).getScore(); + Double secondResultScore = searchResult.getEntities().get(1).getScore(); + + // Checks that the scores aren't tied so that we are matching on table name more than column name + assertTrue(firstResultScore > secondResultScore); + } + + @Test + public void testNameMatchCustomerOrders() { + /* + Searching for "customer orders" should return "customer_orders" as the first search + result, not suffixed by anything + */ + assertNotNull(searchService); + SearchResult searchResult = searchAcrossEntities(searchService, "customer orders", SEARCHABLE_LONGTAIL_ENTITIES); + assertTrue(searchResult.getEntities().size() >= 2); + Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); + + // Checks that the table name is not suffixed with anything + assertTrue(firstResultUrn.toString().contains("customer_orders,")); + + Double firstResultScore = searchResult.getEntities().get(0).getScore(); + Double secondResultScore = searchResult.getEntities().get(1).getScore(); + + // Checks that the scores aren't tied so that we are matching on table name more than column name + assertTrue(firstResultScore > secondResultScore); + } + + /* + Tests that should pass but do not yet can be added below here, with the following annotation: + @Test(enabled = 
false) + */ + +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java index dada13bd6f479..d989d4ef4fa87 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java @@ -82,6 +82,7 @@ public class SampleDataFixtureTests extends AbstractTestNGSpringContextTests { protected EntityClient entityClient; @Autowired + @Qualifier("entityRegistry") private EntityRegistry entityRegistry; @Test @@ -357,6 +358,84 @@ public void testDelimitedSynonym() throws IOException { }).collect(Collectors.toList()); } + @Test + public void testNegateAnalysis() throws IOException { + String queryWithMinus = "logging_events -bckp"; + AnalyzeRequest request = AnalyzeRequest.withIndexAnalyzer( + "smpldat_datasetindex_v2", + "query_word_delimited", queryWithMinus + ); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + List.of("logging_events -bckp", "logging_ev", "-bckp", "log", "event", "bckp")); + + request = AnalyzeRequest.withIndexAnalyzer( + "smpldat_datasetindex_v2", + "word_gram_3", queryWithMinus + ); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("logging events -bckp")); + + request = AnalyzeRequest.withIndexAnalyzer( + "smpldat_datasetindex_v2", + "word_gram_4", queryWithMinus + ); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of()); + + } + + @Test + public void testWordGram() throws IOException { + String text = "hello.cat_cool_customer"; + AnalyzeRequest request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", text); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat", "cat cool", "cool customer")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", text); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat cool", "cat cool customer")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", text); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat cool customer")); + + String testMoreSeparators = "quick.brown:fox jumped-LAZY_Dog"; + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", testMoreSeparators); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + List.of("quick brown", "brown fox", "fox jumped", "jumped lazy", "lazy dog")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", testMoreSeparators); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + List.of("quick brown fox", "brown fox jumped", "fox jumped lazy", "jumped lazy dog")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", testMoreSeparators); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + 
List.of("quick brown fox jumped", "brown fox jumped lazy", "fox jumped lazy dog")); + + String textWithQuotesAndDuplicateWord = "\"my_db.my_exact_table\""; + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", textWithQuotesAndDuplicateWord); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db", "db my", "my exact", "exact table")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", textWithQuotesAndDuplicateWord); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db my", "db my exact", "my exact table")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", textWithQuotesAndDuplicateWord); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db my exact", "db my exact table")); + + String textWithParens = "(hi) there"; + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", textWithParens); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hi there")); + + String oneWordText = "hello"; + for (String analyzer : List.of("word_gram_2", "word_gram_3", "word_gram_4")) { + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", analyzer, oneWordText); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of()); + } + } + @Test public void testUrnSynonym() throws IOException { List expectedTokens = List.of("bigquery"); @@ -1266,6 +1345,53 @@ public void testParens() { String.format("%s - Expected search results to include matched fields", query)); assertEquals(result.getEntities().size(), 2); } + @Test + public void testGram() { + String query = "jaffle shop customers"; + SearchResult result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.customers,PROD)", + "Expected exact match in 1st position"); + + query = "shop customers source"; + result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.customers_source,PROD)", + "Expected ngram match in 1st position"); + + query = "jaffle shop stg customers"; + result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.stg_customers,PROD)", + "Expected ngram match in 1st position"); + + query = "jaffle shop transformers customers"; + result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + 
"urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.transformers_customers,PROD)", + "Expected ngram match in 1st position"); + + query = "shop raw customers"; + result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_customers,PROD)", + "Expected ngram match in 1st position"); + } @Test public void testPrefixVsExact() { diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java index ed72b46e98c46..0b33185549299 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java @@ -16,7 +16,7 @@ public void testMappingsBuilder() { Map result = MappingsBuilder.getMappings(TestEntitySpecBuilder.getSpec()); assertEquals(result.size(), 1); Map properties = (Map) result.get("properties"); - assertEquals(properties.size(), 17); + assertEquals(properties.size(), 19); assertEquals(properties.get("urn"), ImmutableMap.of("type", "keyword", "fields", ImmutableMap.of("delimited", @@ -66,6 +66,11 @@ public void testMappingsBuilder() { assertTrue(textFieldSubfields.containsKey("delimited")); assertTrue(textFieldSubfields.containsKey("keyword")); + // TEXT with addToFilters aliased under "_entityName" + Map textFieldAlias = (Map) properties.get("_entityName"); + assertEquals(textFieldAlias.get("type"), "alias"); + assertEquals(textFieldAlias.get("path"), "textFieldOverride"); + // TEXT_PARTIAL Map textArrayField = (Map) properties.get("textArrayField"); assertEquals(textArrayField.get("type"), "keyword"); @@ -76,6 +81,19 @@ public void testMappingsBuilder() { assertTrue(textArrayFieldSubfields.containsKey("ngram")); assertTrue(textArrayFieldSubfields.containsKey("keyword")); + // WORD_GRAM + Map wordGramField = (Map) properties.get("wordGramField"); + assertEquals(wordGramField.get("type"), "keyword"); + assertEquals(wordGramField.get("normalizer"), "keyword_normalizer"); + Map wordGramFieldSubfields = (Map) wordGramField.get("fields"); + assertEquals(wordGramFieldSubfields.size(), 6); + assertTrue(wordGramFieldSubfields.containsKey("delimited")); + assertTrue(wordGramFieldSubfields.containsKey("ngram")); + assertTrue(wordGramFieldSubfields.containsKey("keyword")); + assertTrue(wordGramFieldSubfields.containsKey("wordGrams2")); + assertTrue(wordGramFieldSubfields.containsKey("wordGrams3")); + assertTrue(wordGramFieldSubfields.containsKey("wordGrams4")); + // URN Map foreignKey = (Map) properties.get("foreignKey"); assertEquals(foreignKey.get("type"), "text"); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilderTest.java index 10b4ee42b1a71..36c8bb8f9a676 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/AggregationQueryBuilderTest.java @@ -31,7 
+31,8 @@ public void testGetDefaultAggregationsHasFields() { 1.0, Optional.of("hasTest"), Optional.empty(), - Collections.emptyMap() + Collections.emptyMap(), + Collections.emptyList() ); SearchConfiguration config = new SearchConfiguration(); @@ -60,7 +61,8 @@ public void testGetDefaultAggregationsFields() { 1.0, Optional.empty(), Optional.empty(), - Collections.emptyMap() + Collections.emptyMap(), + Collections.emptyList() ); SearchConfiguration config = new SearchConfiguration(); @@ -89,7 +91,8 @@ public void testGetSpecificAggregationsHasFields() { 1.0, Optional.of("hasTest1"), Optional.empty(), - Collections.emptyMap() + Collections.emptyMap(), + Collections.emptyList() ); SearchableAnnotation annotation2 = new SearchableAnnotation( @@ -104,7 +107,8 @@ public void testGetSpecificAggregationsHasFields() { 1.0, Optional.empty(), Optional.empty(), - Collections.emptyMap() + Collections.emptyMap(), + Collections.emptyList() ); SearchConfiguration config = new SearchConfiguration(); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java index a2ec396c34b2d..282b1d8bb6778 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java @@ -4,6 +4,7 @@ import com.linkedin.metadata.config.search.ExactMatchConfiguration; import com.linkedin.metadata.config.search.PartialConfiguration; import com.linkedin.metadata.config.search.SearchConfiguration; +import com.linkedin.metadata.config.search.WordGramConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; import com.fasterxml.jackson.dataformat.yaml.YAMLMapper; import com.google.common.collect.ImmutableList; @@ -18,6 +19,7 @@ import org.elasticsearch.index.query.BoolQueryBuilder; import org.elasticsearch.index.query.MatchAllQueryBuilder; import org.elasticsearch.index.query.MatchPhrasePrefixQueryBuilder; +import org.elasticsearch.index.query.MatchPhraseQueryBuilder; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.QueryStringQueryBuilder; import org.elasticsearch.index.query.SimpleQueryStringBuilder; @@ -46,11 +48,17 @@ public class SearchQueryBuilderTest { exactMatchConfiguration.setCaseSensitivityFactor(0.7f); exactMatchConfiguration.setEnableStructured(true); + WordGramConfiguration wordGramConfiguration = new WordGramConfiguration(); + wordGramConfiguration.setTwoGramFactor(1.2f); + wordGramConfiguration.setThreeGramFactor(1.5f); + wordGramConfiguration.setFourGramFactor(1.8f); + PartialConfiguration partialConfiguration = new PartialConfiguration(); partialConfiguration.setFactor(0.4f); partialConfiguration.setUrnFactor(0.7f); testQueryConfig.setExactMatch(exactMatchConfiguration); + testQueryConfig.setWordGram(wordGramConfiguration); testQueryConfig.setPartial(partialConfiguration); } public static final SearchQueryBuilder TEST_BUILDER = new SearchQueryBuilder(testQueryConfig, null); @@ -70,16 +78,17 @@ public void testQueryBuilderFulltext() { assertEquals(keywordQuery.value(), "testQuery"); assertEquals(keywordQuery.analyzer(), "keyword"); Map keywordFields = keywordQuery.fields(); - assertEquals(keywordFields.size(), 8); + assertEquals(keywordFields.size(), 9); assertEquals(keywordFields, Map.of( - "urn", 10.f, - 
"textArrayField", 1.0f, - "customProperties", 1.0f, - "nestedArrayArrayField", 1.0f, - "textFieldOverride", 1.0f, - "nestedArrayStringField", 1.0f, - "keyPart1", 10.0f, - "esObjectField", 1.0f + "urn", 10.f, + "textArrayField", 1.0f, + "customProperties", 1.0f, + "wordGramField", 1.0f, + "nestedArrayArrayField", 1.0f, + "textFieldOverride", 1.0f, + "nestedArrayStringField", 1.0f, + "keyPart1", 10.0f, + "esObjectField", 1.0f )); SimpleQueryStringBuilder urnComponentQuery = (SimpleQueryStringBuilder) analyzerGroupQuery.should().get(1); @@ -99,7 +108,8 @@ public void testQueryBuilderFulltext() { "nestedArrayArrayField.delimited", 0.4f, "urn.delimited", 7.0f, "textArrayField.delimited", 0.4f, - "nestedArrayStringField.delimited", 0.4f + "nestedArrayStringField.delimited", 0.4f, + "wordGramField.delimited", 0.4f )); BoolQueryBuilder boolPrefixQuery = (BoolQueryBuilder) shouldQueries.get(1); @@ -109,21 +119,30 @@ public void testQueryBuilderFulltext() { if (prefixQuery instanceof MatchPhrasePrefixQueryBuilder) { MatchPhrasePrefixQueryBuilder builder = (MatchPhrasePrefixQueryBuilder) prefixQuery; return Pair.of(builder.fieldName(), builder.boost()); - } else { + } else if (prefixQuery instanceof TermQueryBuilder) { // exact TermQueryBuilder builder = (TermQueryBuilder) prefixQuery; return Pair.of(builder.fieldName(), builder.boost()); + } else { // if (prefixQuery instanceof MatchPhraseQueryBuilder) { + // ngram + MatchPhraseQueryBuilder builder = (MatchPhraseQueryBuilder) prefixQuery; + return Pair.of(builder.fieldName(), builder.boost()); } }).collect(Collectors.toList()); - assertEquals(prefixFieldWeights.size(), 22); + assertEquals(prefixFieldWeights.size(), 28); List.of( Pair.of("urn", 100.0f), Pair.of("urn", 70.0f), Pair.of("keyPart1.delimited", 16.8f), Pair.of("keyPart1.keyword", 100.0f), - Pair.of("keyPart1.keyword", 70.0f) + Pair.of("keyPart1.keyword", 70.0f), + Pair.of("wordGramField.wordGrams2", 1.44f), + Pair.of("wordGramField.wordGrams3", 2.25f), + Pair.of("wordGramField.wordGrams4", 3.2399998f), + Pair.of("wordGramField.keyword", 10.0f), + Pair.of("wordGramField.keyword", 7.0f) ).forEach(p -> assertTrue(prefixFieldWeights.contains(p), "Missing: " + p)); // Validate scorer @@ -144,7 +163,7 @@ public void testQueryBuilderStructured() { assertEquals(keywordQuery.queryString(), "testQuery"); assertNull(keywordQuery.analyzer()); Map keywordFields = keywordQuery.fields(); - assertEquals(keywordFields.size(), 16); + assertEquals(keywordFields.size(), 21); assertEquals(keywordFields.get("keyPart1").floatValue(), 10.0f); assertFalse(keywordFields.containsKey("keyPart3")); assertEquals(keywordFields.get("textFieldOverride").floatValue(), 1.0f); @@ -196,10 +215,14 @@ public void testCustomExactMatch() { List queries = boolPrefixQuery.should().stream().map(prefixQuery -> { if (prefixQuery instanceof MatchPhrasePrefixQueryBuilder) { + // prefix return (MatchPhrasePrefixQueryBuilder) prefixQuery; - } else { + } else if (prefixQuery instanceof TermQueryBuilder) { // exact return (TermQueryBuilder) prefixQuery; + } else { // if (prefixQuery instanceof MatchPhraseQueryBuilder) { + // ngram + return (MatchPhraseQueryBuilder) prefixQuery; } }).collect(Collectors.toList()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java index d66d6a0ab0e76..db56e2d34881b 100644 --- 
a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java @@ -7,6 +7,7 @@ import com.linkedin.data.template.StringArray; import com.linkedin.metadata.ESTestConfiguration; import com.linkedin.metadata.TestEntitySpecBuilder; +import com.linkedin.metadata.config.search.WordGramConfiguration; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@ -65,11 +66,17 @@ public class SearchRequestHandlerTest extends AbstractTestNGSpringContextTests { exactMatchConfiguration.setCaseSensitivityFactor(0.7f); exactMatchConfiguration.setEnableStructured(true); + WordGramConfiguration wordGramConfiguration = new WordGramConfiguration(); + wordGramConfiguration.setTwoGramFactor(1.2f); + wordGramConfiguration.setThreeGramFactor(1.5f); + wordGramConfiguration.setFourGramFactor(1.8f); + PartialConfiguration partialConfiguration = new PartialConfiguration(); partialConfiguration.setFactor(0.4f); partialConfiguration.setUrnFactor(0.7f); testQueryConfig.setExactMatch(exactMatchConfiguration); + testQueryConfig.setWordGram(wordGramConfiguration); testQueryConfig.setPartial(partialConfiguration); } @@ -113,10 +120,10 @@ public void testSearchRequestHandler() { HighlightBuilder highlightBuilder = sourceBuilder.highlighter(); List fields = highlightBuilder.fields().stream().map(HighlightBuilder.Field::name).collect(Collectors.toList()); - assertEquals(fields.size(), 20); + assertEquals(fields.size(), 22); List highlightableFields = ImmutableList.of("keyPart1", "textArrayField", "textFieldOverride", "foreignKey", "nestedForeignKey", - "nestedArrayStringField", "nestedArrayArrayField", "customProperties", "esObjectField"); + "nestedArrayStringField", "nestedArrayArrayField", "customProperties", "esObjectField", "wordGramField"); highlightableFields.forEach(field -> { assertTrue(fields.contains(field), "Missing: " + field); assertTrue(fields.contains(field + ".*"), "Missing: " + field + ".*"); diff --git a/metadata-jobs/mae-consumer-job/build.gradle b/metadata-jobs/mae-consumer-job/build.gradle index e7941a04224e3..3811a9537ac24 100644 --- a/metadata-jobs/mae-consumer-job/build.gradle +++ b/metadata-jobs/mae-consumer-job/build.gradle @@ -43,6 +43,8 @@ docker { include 'docker/monitoring/*' include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -55,7 +57,7 @@ tasks.getByName("docker").dependsOn([bootJar]) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) \ No newline at end of file diff --git a/metadata-jobs/mce-consumer-job/build.gradle b/metadata-jobs/mce-consumer-job/build.gradle index 5981284e9da3f..2229c387f3676 100644 --- a/metadata-jobs/mce-consumer-job/build.gradle +++ b/metadata-jobs/mce-consumer-job/build.gradle @@ -56,6 +56,8 @@ docker { include 'docker/monitoring/*' include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -68,7 +70,7 @@ tasks.getByName("docker").dependsOn([bootJar]) task 
cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}".toString()) + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl index 4339a186f1304..9fea71003ae6e 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl @@ -20,8 +20,9 @@ record ChartInfo includes CustomProperties, ExternalReference { * Title of the chart */ @Searchable = { - "fieldType": "TEXT_PARTIAL", - "enableAutocomplete": true + "fieldType": "WORD_GRAM", + "enableAutocomplete": true, + "fieldNameAliases": [ "_entityName" ] } title: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl index 26745fe46caaa..526878cbe60d3 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl @@ -15,9 +15,10 @@ record ContainerProperties includes CustomProperties, ExternalReference { * Display name of the Asset Container */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string @@ -25,7 +26,7 @@ record ContainerProperties includes CustomProperties, ExternalReference { * Fully-qualified name of the Container */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -61,4 +62,4 @@ record ContainerProperties includes CustomProperties, ExternalReference { } } lastModified: optional TimeStamp -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl index 5cb306039506e..c436011eb58db 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl @@ -22,9 +22,10 @@ record DashboardInfo includes CustomProperties, ExternalReference { * Title of the dashboard */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } title: string @@ -126,4 +127,4 @@ record DashboardInfo includes CustomProperties, ExternalReference { * The time when this dashboard last refreshed */ lastRefreshed: optional Time -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl index 481240740876a..2ff3e8cd930af 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl @@ -17,9 +17,10 @@ record DataFlowInfo includes CustomProperties, ExternalReference { * Flow name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + 
"fieldNameAliases": [ "_entityName" ] } name: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl index 8737dd4d9ef52..250fb76003777 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl @@ -18,9 +18,10 @@ record DataJobInfo includes CustomProperties, ExternalReference { * Job name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl index acc40e9f693ec..5dd35c7f49520 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl @@ -15,9 +15,10 @@ record DataPlatformInfo { */ @validate.strlen.max = 15 @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": false, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string @@ -25,7 +26,7 @@ record DataPlatformInfo { * The name that will be used for displaying a platform type. */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl index d7ce5565103ee..b24e220ac3bcf 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl @@ -16,9 +16,10 @@ record DataPlatformInstanceProperties includes CustomProperties, ExternalReferen * Display name of the Data Platform Instance */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: optional string diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl index 72eefd5e294e4..c63cb1a97c017 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl @@ -19,7 +19,7 @@ record DataProcessInstanceProperties includes CustomProperties, ExternalReferenc * Process name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -31,6 +31,7 @@ record DataProcessInstanceProperties includes CustomProperties, ExternalReferenc @Searchable = { "fieldType": "KEYWORD", "addToFilters": true, + "fieldName": "processType", "filterNameOverride": "Process Type" } type: optional enum DataProcessType { diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl 
b/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl index 3861b7def7669..b2d26094fd0b7 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl @@ -13,9 +13,10 @@ record DataProductProperties includes CustomProperties, ExternalReference { * Display name of the Data Product */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: optional string diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl index 57b1fe7693129..ad8705a29d4ed 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl @@ -17,9 +17,10 @@ record DatasetProperties includes CustomProperties, ExternalReference { * Display name of the Dataset */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: optional string @@ -27,7 +28,7 @@ record DatasetProperties includes CustomProperties, ExternalReference { * Fully-qualified name of the Dataset */ @Searchable = { - "fieldType": "TEXT", + "fieldType": "WORD_GRAM", "addToFilters": false, "enableAutocomplete": true, "boostScore": 10.0 @@ -77,4 +78,4 @@ record DatasetProperties includes CustomProperties, ExternalReference { */ @deprecated = "Use GlobalTags aspect instead." tags: array[string] = [ ] -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl index 5a0b8657ecb47..5c8c8a4912e4c 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl @@ -14,9 +14,10 @@ record DomainProperties { * Display name of the Domain */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl index 1e840e5a1df7e..c3388d4f462d4 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl @@ -35,9 +35,10 @@ record GlossaryNodeInfo { */ @Searchable = { "fieldName": "displayName", - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: optional string @@ -49,4 +50,4 @@ record GlossaryNodeInfo { } id: optional string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl index aa2a8b31e3dde..e987a71be7131 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl +++ 
b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl @@ -23,9 +23,10 @@ record GlossaryTermInfo includes CustomProperties { * Display name of the term */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: optional string @@ -75,4 +76,4 @@ record GlossaryTermInfo includes CustomProperties { */ @deprecated rawSchema: optional string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpGroupInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpGroupInfo.pdl index 8d764604237da..28b87476c61bd 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpGroupInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpGroupInfo.pdl @@ -21,7 +21,8 @@ record CorpGroupInfo { "fieldType": "TEXT_PARTIAL" "queryByDefault": true, "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } displayName: optional string diff --git a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl index 6b050f484fedd..48ee53377e582 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl @@ -45,7 +45,7 @@ record CorpUserEditableInfo { * DataHub-native display name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "queryByDefault": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl index 1cb705d426cc0..382b120fa942a 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl @@ -26,10 +26,11 @@ record CorpUserInfo includes CustomProperties { * displayName of this user , e.g. Hang Zhang(DataHQ) */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "queryByDefault": true, "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } displayName: optional string @@ -89,7 +90,7 @@ record CorpUserInfo includes CustomProperties { * Common name of this user, format is firstName + lastName (split by a whitespace) */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "queryByDefault": true, "enableAutocomplete": true, "boostScore": 10.0 diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl index 075cc14ddc83b..9e65b8f6e9929 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl @@ -11,10 +11,10 @@ record CorpGroupKey { * The URL-encoded name of the AD/LDAP group. Serves as a globally unique identifier within DataHub. 
*/ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "queryByDefault": true, "enableAutocomplete": true, "boostScore": 10.0 } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl index d1a8a4bb5bb23..476a0ad9704b3 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl @@ -12,7 +12,7 @@ record CorpUserKey { */ @Searchable = { "fieldName": "ldap", - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "boostScore": 2.0, "enableAutocomplete": true } diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl index bcdb92f75d055..d8342630248b6 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl @@ -19,7 +19,7 @@ record DataFlowKey { * Unique Identifier of the data flow */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true } flowId: string @@ -31,4 +31,4 @@ record DataFlowKey { "fieldType": "TEXT_PARTIAL" } cluster: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl index d0ac7dbca0f99..60ec51b464dcc 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl @@ -27,7 +27,7 @@ record DataJobKey { * Unique Identifier of the data job */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true } jobId: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl index a5c05029352c2..4df1364a04ebe 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl @@ -13,7 +13,7 @@ record DataProcessKey { * Process name i.e. an ETL job name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 4.0 } @@ -37,4 +37,4 @@ record DataProcessKey { "queryByDefault": false } origin: FabricType -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl index ea1f9510ed438..70c5d174171af 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl @@ -25,7 +25,7 @@ record DatasetKey { //This is no longer to be used for Dataset native name. Use name, qualifiedName from DatasetProperties instead. 
@Searchable = { "fieldName": "id" - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl index 88697fe3ff364..51a3bc00f4e9e 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl @@ -12,9 +12,9 @@ import com.linkedin.common.FabricType record GlossaryNodeKey { @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl index a9f35146da18e..61bcd60cbc754 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl @@ -13,10 +13,10 @@ record GlossaryTermKey { * The term name, which serves as a unique id */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "fieldName": "id" } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl index 579f1966977a9..050b954c89fb8 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl @@ -20,9 +20,10 @@ record MLFeatureKey { * Name of the feature */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 8.0 + "boostScore": 8.0, + "fieldNameAliases": [ "_entityName" ] } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl index 1f786ad417be7..175a7b0d31b00 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl @@ -22,9 +22,10 @@ record MLFeatureTableKey { * Name of the feature table */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 8.0 + "boostScore": 8.0, + "fieldNameAliases": [ "_entityName" ] } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl index 7c36f410fede3..daa1deceb5fc3 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl @@ -19,9 +19,10 @@ record MLModelDeploymentKey { * Name of the MLModelDeployment */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string @@ -35,4 +36,4 @@ record MLModelDeploymentKey { "queryByDefault": false } origin: 
FabricType -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl index 17c401c0b8c48..582a899633c2a 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl @@ -19,9 +19,10 @@ record MLModelGroupKey { * Name of the MLModelGroup */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string @@ -33,4 +34,4 @@ record MLModelGroupKey { "queryByDefault": false } origin: FabricType -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl index 55fd2bc370846..f097bbda738a2 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl @@ -19,9 +19,10 @@ record MLModelKey { * Name of the MLModel */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string @@ -35,4 +36,4 @@ record MLModelKey { "queryByDefault": false } origin: FabricType -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl index 9eb67eaf5f651..ef812df206b46 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl @@ -21,9 +21,10 @@ record MLPrimaryKeyKey { * Name of the primary key */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 8.0 + "boostScore": 8.0, + "fieldNameAliases": [ "_entityName" ] } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl index 47f1a631b4a2c..4622e32dce67b 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl @@ -11,10 +11,10 @@ record TagKey { * The tag name, which serves as a unique id */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0, "fieldName": "id" } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl index 05a94b8fabc4b..be1a30c7f082c 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/query/SearchFlags.pdl @@ -28,4 +28,9 @@ record SearchFlags { * Whether to skip aggregates/facets */ skipAggregates:optional boolean = false + + /** + * Whether to request for search suggestions on the _entityName virtualized field + */ + getSuggestions:optional boolean = false } diff --git 
a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchResultMetadata.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchResultMetadata.pdl index 718d80ba4cb36..60f1b568f586a 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchResultMetadata.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchResultMetadata.pdl @@ -12,4 +12,9 @@ record SearchResultMetadata { */ aggregations: array[AggregationMetadata] = [] + /** + * A list of search query suggestions based on the given query + */ + suggestions: array[SearchSuggestion] = [] + } \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchSuggestion.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchSuggestion.pdl new file mode 100644 index 0000000000000..7776ec54fe03e --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/search/SearchSuggestion.pdl @@ -0,0 +1,24 @@ +namespace com.linkedin.metadata.search + +/** + * The model for the search result + */ +record SearchSuggestion { + + /** + * The suggestion text for this search query + */ + text: string + + /** + * The score for how close this suggestion is to the original search query. + * The closer to 1 means it is closer to the original query and 0 is further away. + */ + score: float + + /** + * How many matches there are with the suggested text for the given field + */ + frequency: long + +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl index 1f4dcf975f48c..8ec5f262890f3 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl @@ -18,9 +18,10 @@ record NotebookInfo includes CustomProperties, ExternalReference { * Title of the Notebook */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } title: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl index 004df6e399be4..3e7b53beff531 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl @@ -14,7 +14,7 @@ record OwnershipTypeInfo { * Display name of the Ownership Type */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -54,4 +54,4 @@ record OwnershipTypeInfo { } } lastModified: AuditStamp -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl index bb7e22900e168..3ba19d348913b 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl @@ -29,7 +29,7 @@ record QueryProperties { * Optional display name to identify the query. 
*/ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -69,4 +69,4 @@ record QueryProperties { } } lastModified: AuditStamp -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl index acebdf5558c59..8422d3c49046c 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl @@ -14,9 +14,10 @@ record RoleProperties { * Display name of the IAM Role in the external system */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl index 41c500c6fff2f..9df47fac3928a 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl @@ -11,9 +11,10 @@ record TagProperties { * Display name of the tag */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, - "boostScore": 10.0 + "boostScore": 10.0, + "fieldNameAliases": [ "_entityName" ] } name: string diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java index 690528059b555..f653ccf72cf54 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java @@ -250,11 +250,11 @@ private void addPoliciesToCache(final Map> cache private void addPolicyToCache(final Map> cache, final DataHubPolicyInfo policy) { final List privileges = policy.getPrivileges(); for (String privilege : privileges) { - List existingPolicies = cache.getOrDefault(privilege, new ArrayList<>()); + List existingPolicies = cache.containsKey(privilege) ? new ArrayList<>(cache.get(privilege)) : new ArrayList<>(); existingPolicies.add(policy); cache.put(privilege, existingPolicies); } - List existingPolicies = cache.getOrDefault(ALL, new ArrayList<>()); + List existingPolicies = cache.containsKey(ALL) ? new ArrayList<>(cache.get(ALL)) : new ArrayList<>(); existingPolicies.add(policy); cache.put(ALL, existingPolicies); } diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/SearchResultVisualConfig.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/SearchResultVisualConfig.java new file mode 100644 index 0000000000000..7094bbd710f75 --- /dev/null +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/SearchResultVisualConfig.java @@ -0,0 +1,11 @@ +package com.linkedin.metadata.config; + +import lombok.Data; + +@Data +public class SearchResultVisualConfig { + /** + * The default tab to show first on a Domain entity profile. Defaults to React code sorting if not present. 
+ */ + public Boolean enableNameHighlight; +} diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/VisualConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/VisualConfiguration.java index d1c357186e1ae..14ac2406c2256 100644 --- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/VisualConfiguration.java +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/VisualConfiguration.java @@ -22,4 +22,9 @@ public class VisualConfiguration { * Queries tab related configurations */ public EntityProfileConfig entityProfile; + + /** + * Search result related configurations + */ + public SearchResultVisualConfig searchResult; } diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java index 1a56db1bd68b0..b2b5260dc5e70 100644 --- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java @@ -11,4 +11,5 @@ public class SearchConfiguration { private PartialConfiguration partial; private CustomConfiguration custom; private GraphQueryConfiguration graph; + private WordGramConfiguration wordGram; } diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/WordGramConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/WordGramConfiguration.java new file mode 100644 index 0000000000000..624d2a4c63c4c --- /dev/null +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/WordGramConfiguration.java @@ -0,0 +1,11 @@ +package com.linkedin.metadata.config.search; + +import lombok.Data; + + +@Data +public class WordGramConfiguration { + private float twoGramFactor; + private float threeGramFactor; + private float fourGramFactor; +} diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml index 9f7bf92039fdc..d21442d0bf5c8 100644 --- a/metadata-service/configuration/src/main/resources/application.yml +++ b/metadata-service/configuration/src/main/resources/application.yml @@ -111,6 +111,8 @@ visualConfig: entityProfile: # we only support default tab for domains right now. In order to implement for other entities, update React code domainDefaultTab: ${DOMAIN_DEFAULT_TAB:} # set to DOCUMENTATION_TAB to show documentation tab first + searchResult: + enableNameHighlight: ${SEARCH_RESULT_NAME_HIGHLIGHT_ENABLED:true} # Enables visual highlighting on search result names/descriptions. 
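The WordGramConfiguration added above only carries three boost multipliers; how they are applied to word-gram subfields is decided by the search query builder, which is not part of this diff. A rough sketch, assuming hypothetical wordGrams2/wordGrams3/wordGrams4 subfields and a made-up helper class (neither is taken from this change):

import java.util.LinkedHashMap;
import java.util.Map;
import com.linkedin.metadata.config.search.WordGramConfiguration;

// Hypothetical helper: expand one searchable field into boosted word-gram subfields.
public class WordGramBoosts {
  public static Map<String, Float> expand(String field, float baseBoost, WordGramConfiguration cfg) {
    Map<String, Float> boosts = new LinkedHashMap<>();
    boosts.put(field, baseBoost); // the plain field keeps its @Searchable boostScore
    boosts.put(field + ".wordGrams2", baseBoost * cfg.getTwoGramFactor());
    boosts.put(field + ".wordGrams3", baseBoost * cfg.getThreeGramFactor());
    boosts.put(field + ".wordGrams4", baseBoost * cfg.getFourGramFactor());
    return boosts;
  }
}

With the application.yml defaults introduced below (1.2, 1.5, 1.8), a name field annotated with boostScore 10.0 would contribute roughly 12, 15, and 18 to 2-, 3-, and 4-gram matches under this sketch.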
# Storage Layer @@ -198,6 +200,10 @@ elasticsearch: prefixFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_PREFIX_FACTOR:1.6} # boost multiplier when exact prefix caseSensitivityFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_CASE_FACTOR:0.7} # stacked boost multiplier when case mismatch enableStructured: ${ELASTICSEARCH_QUERY_EXACT_MATCH_ENABLE_STRUCTURED:true} # enable exact match on structured search + wordGram: + twoGramFactor: ${ELASTICSEARCH_QUERY_TWO_GRAM_FACTOR:1.2} # boost multiplier when match on 2-gram tokens + threeGramFactor: ${ELASTICSEARCH_QUERY_THREE_GRAM_FACTOR:1.5} # boost multiplier when match on 3-gram tokens + fourGramFactor: ${ELASTICSEARCH_QUERY_FOUR_GRAM_FACTOR:1.8} # boost multiplier when match on 4-gram tokens # Field weight annotations are typically calibrated for exact match, if partial match is possible on the field use these adjustments partial: urnFactor: ${ELASTICSEARCH_QUERY_PARTIAL_URN_FACTOR:0.5} # multiplier on Urn token match, a partial match on Urn > non-Urn is assumed @@ -318,4 +324,4 @@ cache: search: lineage: ttlSeconds: ${CACHE_SEARCH_LINEAGE_TTL_SECONDS:86400} # 1 day - lightningThreshold: ${CACHE_SEARCH_LINEAGE_LIGHTNING_THRESHOLD:300} \ No newline at end of file + lightningThreshold: ${CACHE_SEARCH_LINEAGE_LIGHTNING_THRESHOLD:300} diff --git a/metadata-service/factories/build.gradle b/metadata-service/factories/build.gradle index 796b6ee436b78..8e9b859e3b136 100644 --- a/metadata-service/factories/build.gradle +++ b/metadata-service/factories/build.gradle @@ -49,6 +49,12 @@ dependencies { testCompile externalDependency.hazelcastTest implementation externalDependency.jline implementation externalDependency.common + + constraints { + implementation(externalDependency.snappy) { + because("previous versions are vulnerable to CVE-2023-34453 through CVE-2023-34455") + } + } } configurations.all{ diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json index 7aeca546af3c9..e3beef5ac4871 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.aspects.snapshot.json @@ -341,6 +341,7 @@ "doc" : "Title of the chart", "Searchable" : { "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1279,6 +1280,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1405,6 +1407,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1464,6 +1467,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1865,6 +1869,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -2061,6 +2066,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "displayName", + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -2097,6 +2103,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -2161,6 +2168,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + 
"fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -2340,6 +2348,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -3217,6 +3226,7 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } } ], @@ -3282,6 +3292,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -3867,6 +3878,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json index 83ecaf41022c4..0c9b49649bf1e 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entities.snapshot.json @@ -94,6 +94,7 @@ "doc" : "Title of the chart", "Searchable" : { "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1326,6 +1327,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1471,6 +1473,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1530,6 +1533,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1922,6 +1926,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : false, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" }, "validate" : { @@ -2111,6 +2116,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -2437,6 +2443,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -2585,6 +2592,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -3704,6 +3712,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -4302,6 +4311,7 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } } ], @@ -4390,6 +4400,7 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } } ], @@ -4484,6 +4495,7 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } } ], @@ -4590,6 +4602,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -4696,6 +4709,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ 
"_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -4796,6 +4810,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -4879,6 +4894,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -5096,6 +5112,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "displayName", + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -5710,6 +5727,12 @@ "doc" : "Whether to skip aggregates/facets", "default" : false, "optional" : true + }, { + "name" : "getSuggestions", + "type" : "boolean", + "doc" : "Whether to request for search suggestions on the _entityName virtualized field", + "default" : false, + "optional" : true } ] }, { "type" : "enum", @@ -6081,6 +6104,31 @@ }, "doc" : "A list of search result metadata such as aggregations", "default" : [ ] + }, { + "name" : "suggestions", + "type" : { + "type" : "array", + "items" : { + "type" : "record", + "name" : "SearchSuggestion", + "doc" : "The model for the search result", + "fields" : [ { + "name" : "text", + "type" : "string", + "doc" : "The suggestion text for this search query" + }, { + "name" : "score", + "type" : "float", + "doc" : "The score for how close this suggestion is to the original search query.\nThe closer to 1 means it is closer to the original query and 0 is further away." + }, { + "name" : "frequency", + "type" : "long", + "doc" : "How many matches there are with the suggested text for the given field" + } ] + } + }, + "doc" : "A list of search query suggestions based on the given query", + "default" : [ ] } ] }, "doc" : "Metadata specific to the browse result of the queried path" @@ -6187,7 +6235,7 @@ "type" : "int", "doc" : "The total number of entities directly under searched path" } ] - }, "com.linkedin.metadata.search.SearchResultMetadata", "com.linkedin.metadata.snapshot.ChartSnapshot", "com.linkedin.metadata.snapshot.CorpGroupSnapshot", "com.linkedin.metadata.snapshot.CorpUserSnapshot", "com.linkedin.metadata.snapshot.DashboardSnapshot", "com.linkedin.metadata.snapshot.DataFlowSnapshot", "com.linkedin.metadata.snapshot.DataHubPolicySnapshot", "com.linkedin.metadata.snapshot.DataHubRetentionSnapshot", "com.linkedin.metadata.snapshot.DataJobSnapshot", "com.linkedin.metadata.snapshot.DataPlatformSnapshot", "com.linkedin.metadata.snapshot.DataProcessSnapshot", "com.linkedin.metadata.snapshot.DatasetSnapshot", "com.linkedin.metadata.snapshot.GlossaryNodeSnapshot", "com.linkedin.metadata.snapshot.GlossaryTermSnapshot", "com.linkedin.metadata.snapshot.MLFeatureSnapshot", "com.linkedin.metadata.snapshot.MLFeatureTableSnapshot", "com.linkedin.metadata.snapshot.MLModelDeploymentSnapshot", "com.linkedin.metadata.snapshot.MLModelGroupSnapshot", "com.linkedin.metadata.snapshot.MLModelSnapshot", "com.linkedin.metadata.snapshot.MLPrimaryKeySnapshot", "com.linkedin.metadata.snapshot.SchemaFieldSnapshot", "com.linkedin.metadata.snapshot.Snapshot", "com.linkedin.metadata.snapshot.TagSnapshot", "com.linkedin.ml.metadata.BaseData", "com.linkedin.ml.metadata.CaveatDetails", "com.linkedin.ml.metadata.CaveatsAndRecommendations", "com.linkedin.ml.metadata.DeploymentStatus", "com.linkedin.ml.metadata.EthicalConsiderations", "com.linkedin.ml.metadata.EvaluationData", "com.linkedin.ml.metadata.HyperParameterValueType", "com.linkedin.ml.metadata.IntendedUse", "com.linkedin.ml.metadata.IntendedUserType", 
"com.linkedin.ml.metadata.MLFeatureProperties", "com.linkedin.ml.metadata.MLFeatureTableProperties", "com.linkedin.ml.metadata.MLHyperParam", "com.linkedin.ml.metadata.MLMetric", "com.linkedin.ml.metadata.MLModelDeploymentProperties", "com.linkedin.ml.metadata.MLModelFactorPrompts", "com.linkedin.ml.metadata.MLModelFactors", "com.linkedin.ml.metadata.MLModelGroupProperties", "com.linkedin.ml.metadata.MLModelProperties", "com.linkedin.ml.metadata.MLPrimaryKeyProperties", "com.linkedin.ml.metadata.Metrics", "com.linkedin.ml.metadata.QuantitativeAnalyses", "com.linkedin.ml.metadata.ResultsType", "com.linkedin.ml.metadata.SourceCode", "com.linkedin.ml.metadata.SourceCodeUrl", "com.linkedin.ml.metadata.SourceCodeUrlType", "com.linkedin.ml.metadata.TrainingData", { + }, "com.linkedin.metadata.search.SearchResultMetadata", "com.linkedin.metadata.search.SearchSuggestion", "com.linkedin.metadata.snapshot.ChartSnapshot", "com.linkedin.metadata.snapshot.CorpGroupSnapshot", "com.linkedin.metadata.snapshot.CorpUserSnapshot", "com.linkedin.metadata.snapshot.DashboardSnapshot", "com.linkedin.metadata.snapshot.DataFlowSnapshot", "com.linkedin.metadata.snapshot.DataHubPolicySnapshot", "com.linkedin.metadata.snapshot.DataHubRetentionSnapshot", "com.linkedin.metadata.snapshot.DataJobSnapshot", "com.linkedin.metadata.snapshot.DataPlatformSnapshot", "com.linkedin.metadata.snapshot.DataProcessSnapshot", "com.linkedin.metadata.snapshot.DatasetSnapshot", "com.linkedin.metadata.snapshot.GlossaryNodeSnapshot", "com.linkedin.metadata.snapshot.GlossaryTermSnapshot", "com.linkedin.metadata.snapshot.MLFeatureSnapshot", "com.linkedin.metadata.snapshot.MLFeatureTableSnapshot", "com.linkedin.metadata.snapshot.MLModelDeploymentSnapshot", "com.linkedin.metadata.snapshot.MLModelGroupSnapshot", "com.linkedin.metadata.snapshot.MLModelSnapshot", "com.linkedin.metadata.snapshot.MLPrimaryKeySnapshot", "com.linkedin.metadata.snapshot.SchemaFieldSnapshot", "com.linkedin.metadata.snapshot.Snapshot", "com.linkedin.metadata.snapshot.TagSnapshot", "com.linkedin.ml.metadata.BaseData", "com.linkedin.ml.metadata.CaveatDetails", "com.linkedin.ml.metadata.CaveatsAndRecommendations", "com.linkedin.ml.metadata.DeploymentStatus", "com.linkedin.ml.metadata.EthicalConsiderations", "com.linkedin.ml.metadata.EvaluationData", "com.linkedin.ml.metadata.HyperParameterValueType", "com.linkedin.ml.metadata.IntendedUse", "com.linkedin.ml.metadata.IntendedUserType", "com.linkedin.ml.metadata.MLFeatureProperties", "com.linkedin.ml.metadata.MLFeatureTableProperties", "com.linkedin.ml.metadata.MLHyperParam", "com.linkedin.ml.metadata.MLMetric", "com.linkedin.ml.metadata.MLModelDeploymentProperties", "com.linkedin.ml.metadata.MLModelFactorPrompts", "com.linkedin.ml.metadata.MLModelFactors", "com.linkedin.ml.metadata.MLModelGroupProperties", "com.linkedin.ml.metadata.MLModelProperties", "com.linkedin.ml.metadata.MLPrimaryKeyProperties", "com.linkedin.ml.metadata.Metrics", "com.linkedin.ml.metadata.QuantitativeAnalyses", "com.linkedin.ml.metadata.ResultsType", "com.linkedin.ml.metadata.SourceCode", "com.linkedin.ml.metadata.SourceCodeUrl", "com.linkedin.ml.metadata.SourceCodeUrlType", "com.linkedin.ml.metadata.TrainingData", { "type" : "record", "name" : "SystemMetadata", "namespace" : "com.linkedin.mxe", diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json index b1489df3db55e..ffaefc8232e83 100644 --- 
a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.runs.snapshot.json @@ -94,6 +94,7 @@ "doc" : "Title of the chart", "Searchable" : { "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1032,6 +1033,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1158,6 +1160,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1217,6 +1220,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1618,6 +1622,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1806,6 +1811,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "displayName", + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1842,6 +1848,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1906,6 +1913,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -2085,6 +2093,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -2962,6 +2971,7 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } } ], @@ -3027,6 +3037,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -3612,6 +3623,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json index f4c2d16f84747..e385c7c30b21a 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.operations.operations.snapshot.json @@ -94,6 +94,7 @@ "doc" : "Title of the chart", "Searchable" : { "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1032,6 +1033,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1158,6 +1160,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1217,6 +1220,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1618,6 +1622,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1800,6 +1805,7 @@ 
"boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "displayName", + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1836,6 +1842,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1900,6 +1907,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -2079,6 +2087,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -2956,6 +2965,7 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } } ], @@ -3021,6 +3031,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -3606,6 +3617,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json index 2676c2687bd72..b85c84be23795 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.platform.platform.snapshot.json @@ -94,6 +94,7 @@ "doc" : "Title of the chart", "Searchable" : { "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1326,6 +1327,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1471,6 +1473,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1530,6 +1533,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -1922,6 +1926,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : false, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" }, "validate" : { @@ -2111,6 +2116,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -2431,6 +2437,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -2579,6 +2586,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL", "queryByDefault" : true } @@ -3698,6 +3706,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -4296,6 +4305,7 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } } ], @@ -4384,6 +4394,7 @@ "Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } } ], @@ -4478,6 +4489,7 @@ 
"Searchable" : { "boostScore" : 8.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } } ], @@ -4584,6 +4596,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -4690,6 +4703,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -4790,6 +4804,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -4873,6 +4888,7 @@ "Searchable" : { "boostScore" : 10.0, "enableAutocomplete" : true, + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { @@ -5090,6 +5106,7 @@ "boostScore" : 10.0, "enableAutocomplete" : true, "fieldName" : "displayName", + "fieldNameAliases" : [ "_entityName" ], "fieldType" : "TEXT_PARTIAL" } }, { diff --git a/metadata-service/war/build.gradle b/metadata-service/war/build.gradle index 7e9aa90664611..eaf14f7fd6c18 100644 --- a/metadata-service/war/build.gradle +++ b/metadata-service/war/build.gradle @@ -72,6 +72,8 @@ docker { include 'docker/monitoring/*' include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' + }.exclude { + i -> i.file.isHidden() || i.file == buildDir } tag("Debug", "${docker_registry}/${docker_repo}:debug") @@ -84,7 +86,7 @@ tasks.getByName("docker").dependsOn([build, war]) task cleanLocalDockerImages { doLast { - rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "v${version}") + rootProject.ext.cleanLocalDockerImages(docker_registry, docker_repo, "${version}") } } dockerClean.finalizedBy(cleanLocalDockerImages) diff --git a/metadata-service/war/src/main/resources/boot/policies.json b/metadata-service/war/src/main/resources/boot/policies.json index 3fddf3456ecd7..3cda0269b79f1 100644 --- a/metadata-service/war/src/main/resources/boot/policies.json +++ b/metadata-service/war/src/main/resources/boot/policies.json @@ -19,6 +19,7 @@ "GENERATE_PERSONAL_ACCESS_TOKENS", "MANAGE_ACCESS_TOKENS", "MANAGE_DOMAINS", + "MANAGE_GLOBAL_ANNOUNCEMENTS", "MANAGE_TESTS", "MANAGE_GLOSSARIES", "MANAGE_USER_CREDENTIALS", @@ -102,6 +103,7 @@ "VIEW_ANALYTICS", "GENERATE_PERSONAL_ACCESS_TOKENS", "MANAGE_DOMAINS", + "MANAGE_GLOBAL_ANNOUNCEMENTS", "MANAGE_TESTS", "MANAGE_GLOSSARIES", "MANAGE_TAGS", @@ -190,6 +192,7 @@ "GENERATE_PERSONAL_ACCESS_TOKENS", "MANAGE_ACCESS_TOKENS", "MANAGE_DOMAINS", + "MANAGE_GLOBAL_ANNOUNCEMENTS", "MANAGE_TESTS", "MANAGE_GLOSSARIES", "MANAGE_USER_CREDENTIALS", @@ -283,6 +286,7 @@ "privileges":[ "GENERATE_PERSONAL_ACCESS_TOKENS", "MANAGE_DOMAINS", + "MANAGE_GLOBAL_ANNOUNCEMENTS", "MANAGE_GLOSSARIES", "MANAGE_TAGS" ], diff --git a/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java b/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java index c46d02a6eadf0..0b0d462f079bf 100644 --- a/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java +++ b/metadata-utils/src/main/java/com/linkedin/metadata/authorization/PoliciesConfig.java @@ -64,6 +64,11 @@ public class PoliciesConfig { "Manage Domains", "Create and remove Asset Domains."); + public static final Privilege MANAGE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE = Privilege.of( + "MANAGE_GLOBAL_ANNOUNCEMENTS", + "Manage Home Page Posts", + "Create and delete home page posts"); + public static final Privilege 
MANAGE_TESTS_PRIVILEGE = Privilege.of( "MANAGE_TESTS", "Manage Tests", @@ -113,6 +118,7 @@ public class PoliciesConfig { MANAGE_USERS_AND_GROUPS_PRIVILEGE, VIEW_ANALYTICS_PRIVILEGE, MANAGE_DOMAINS_PRIVILEGE, + MANAGE_GLOBAL_ANNOUNCEMENTS_PRIVILEGE, MANAGE_INGESTION_PRIVILEGE, MANAGE_SECRETS_PRIVILEGE, GENERATE_PERSONAL_ACCESS_TOKENS_PRIVILEGE, @@ -192,8 +198,8 @@ public class PoliciesConfig { public static final Privilege EDIT_ENTITY_PRIVILEGE = Privilege.of( "EDIT_ENTITY", - "Edit All", - "The ability to edit any information about an entity. Super user privileges."); + "Edit Entity", + "The ability to edit any information about an entity. Super user privileges for the entity."); public static final Privilege DELETE_ENTITY_PRIVILEGE = Privilege.of( "DELETE_ENTITY", diff --git a/smoke-test/run-quickstart.sh b/smoke-test/run-quickstart.sh index 050b5d2db95c9..d40e4a5e7a4aa 100755 --- a/smoke-test/run-quickstart.sh +++ b/smoke-test/run-quickstart.sh @@ -15,4 +15,4 @@ echo "test_user:test_pass" >> ~/.datahub/plugins/frontend/auth/user.props echo "DATAHUB_VERSION = $DATAHUB_VERSION" DATAHUB_TELEMETRY_ENABLED=false \ DOCKER_COMPOSE_BASE="file://$( dirname "$DIR" )" \ -datahub docker quickstart --version ${DATAHUB_VERSION} --standalone_consumers --dump-logs-on-failure --kafka-setup +datahub docker quickstart --version ${DATAHUB_VERSION} --standalone_consumers --dump-logs-on-failure --kafka-setup \ No newline at end of file diff --git a/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_column_level.js b/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_column_level.js new file mode 100644 index 0000000000000..2a8fe045f154e --- /dev/null +++ b/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_column_level.js @@ -0,0 +1,51 @@ +const DATASET_ENTITY_TYPE = 'dataset'; +const DATASET_URN = 'urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)'; + +describe("column-level lineage graph test", () => { + + it("navigate to lineage graph view and verify that column-level lineage is showing correctly", () => { + cy.login(); + cy.goToEntityLineageGraph(DATASET_ENTITY_TYPE, DATASET_URN); + //verify columns not shown by default + cy.waitTextVisible("SampleCypressHdfs"); + cy.waitTextVisible("SampleCypressHive"); + cy.waitTextVisible("cypress_logging"); + cy.ensureTextNotPresent("shipment_info"); + cy.ensureTextNotPresent("field_foo"); + cy.ensureTextNotPresent("field_baz"); + cy.ensureTextNotPresent("event_name"); + cy.ensureTextNotPresent("event_data"); + cy.ensureTextNotPresent("timestamp"); + cy.ensureTextNotPresent("browser"); + cy.clickOptionWithTestId("column-toggle") + //verify columns appear and belong to the correct dataset + cy.waitTextVisible("shipment_info"); + cy.waitTextVisible("shipment_info.date"); + cy.waitTextVisible("shipment_info.target"); + cy.waitTextVisible("shipment_info.destination"); + cy.waitTextVisible("shipment_info.geo_info"); + cy.waitTextVisible("field_foo"); + cy.waitTextVisible("field_baz"); + cy.waitTextVisible("event_name"); + cy.waitTextVisible("event_data"); + cy.waitTextVisible("timestamp"); + cy.waitTextVisible("browser"); + //verify columns can be hidden and shown again + cy.contains("Hide").click({ force:true }); + cy.ensureTextNotPresent("field_foo"); + cy.ensureTextNotPresent("field_baz"); + cy.get("[aria-label='down']").eq(1).click({ force:true }); + cy.waitTextVisible("field_foo"); + cy.waitTextVisible("field_baz"); + //verify columns can be disabled successfully + cy.clickOptionWithTestId("column-toggle") + 
cy.ensureTextNotPresent("shipment_info"); + cy.ensureTextNotPresent("field_foo"); + cy.ensureTextNotPresent("field_baz"); + cy.ensureTextNotPresent("event_name"); + cy.ensureTextNotPresent("event_data"); + cy.ensureTextNotPresent("timestamp"); + cy.ensureTextNotPresent("browser"); + }); + +}); \ No newline at end of file diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/deprecations.js b/smoke-test/tests/cypress/cypress/e2e/mutations/deprecations.js index 1d41d155440e8..2fa11654a3c3e 100644 --- a/smoke-test/tests/cypress/cypress/e2e/mutations/deprecations.js +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/deprecations.js @@ -1,19 +1,29 @@ -describe("deprecation", () => { +describe("dataset deprecation", () => { it("go to dataset and check deprecation works", () => { const urn = "urn:li:dataset:(urn:li:dataPlatform:hive,cypress_logging_events,PROD)"; const datasetName = "cypress_logging_events"; cy.login(); - cy.goToDataset(urn, datasetName); cy.openThreeDotDropdown(); cy.clickOptionWithText("Mark as deprecated"); cy.addViaFormModal("test deprecation", "Add Deprecation Details"); - - cy.goToDataset(urn, datasetName); - cy.contains("DEPRECATED"); - + cy.waitTextVisible("Deprecation Updated"); + cy.waitTextVisible("DEPRECATED") cy.openThreeDotDropdown(); cy.clickOptionWithText("Mark as un-deprecated"); + cy.waitTextVisible("Deprecation Updated"); + cy.ensureTextNotPresent("DEPRECATED"); + cy.openThreeDotDropdown(); + cy.clickOptionWithText("Mark as deprecated"); + cy.addViaFormModal("test deprecation", "Add Deprecation Details"); + cy.waitTextVisible("Deprecation Updated"); + cy.waitTextVisible("DEPRECATED"); + cy.contains("DEPRECATED").trigger("mouseover", { force: true }); + cy.waitTextVisible("Deprecation note"); + cy.get("[role='tooltip']").contains("Mark as un-deprecated").click(); + cy.waitTextVisible("Confirm Mark as un-deprecated"); + cy.get("button").contains("Yes").click(); + cy.waitTextVisible("Marked assets as un-deprecated!"); cy.ensureTextNotPresent("DEPRECATED"); - }); + }); }); diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js b/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js new file mode 100644 index 0000000000000..e4e5a39ce1100 --- /dev/null +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js @@ -0,0 +1,97 @@ +const test_id = Math.floor(Math.random() * 100000); +const documentation_edited = `This is test${test_id} documentation EDITED`; +const wrong_url = "https://www.linkedincom"; +const correct_url = "https://www.linkedin.com"; + +describe("edit documentation and link to dataset", () => { + it("open test dataset page, edit documentation", () => { + //edit documentation and verify changes saved + cy.loginWithCredentials(); + cy.visit( + "/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema" + ); + cy.get("[role='tab']").contains("Documentation").click(); + cy.waitTextVisible("my hive dataset"); + cy.waitTextVisible("Sample doc"); + cy.clickOptionWithText("Edit"); + cy.focused().clear(); + cy.focused().type(documentation_edited); + cy.get("button").contains("Save").click(); + cy.waitTextVisible("Description Updated"); + cy.waitTextVisible(documentation_edited); + //return documentation to original state + cy.clickOptionWithText("Edit"); + cy.focused().clear().wait(1000); + cy.focused().type("my hive dataset"); + cy.get("button").contains("Save").click(); + cy.waitTextVisible("Description Updated"); + cy.waitTextVisible("my hive dataset"); + }); + + 
it("open test dataset page, remove and add dataset link", () => { + cy.loginWithCredentials(); + cy.visit( + "/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema" + ); + cy.get("[role='tab']").contains("Documentation").click(); + cy.contains("Sample doc").trigger("mouseover", { force: true }); + cy.get('[data-icon="delete"]').click(); + cy.waitTextVisible("Link Removed"); + cy.get("button").contains("Add Link").click(); + cy.get("#addLinkForm_url").type(wrong_url); + cy.waitTextVisible("This field must be a valid url."); + cy.focused().clear(); + cy.waitTextVisible("A URL is required."); + cy.focused().type(correct_url); + cy.ensureTextNotPresent("This field must be a valid url."); + cy.get("#addLinkForm_label").type("Sample doc"); + cy.get('[role="dialog"] button').contains("Add").click(); + cy.waitTextVisible("Link Added"); + cy.get("[role='tab']").contains("Documentation").click(); + cy.get(`[href='${correct_url}']`).should("be.visible"); + }); + + it("open test domain page, remove and add dataset link", () => { + cy.loginWithCredentials(); + cy.visit("/domain/urn:li:domain:marketing/Entities"); + cy.get("[role='tab']").contains("Documentation").click(); + cy.get("button").contains("Add Link").click(); + cy.get("#addLinkForm_url").type(wrong_url); + cy.waitTextVisible("This field must be a valid url."); + cy.focused().clear(); + cy.waitTextVisible("A URL is required."); + cy.focused().type(correct_url); + cy.ensureTextNotPresent("This field must be a valid url."); + cy.get("#addLinkForm_label").type("Sample doc"); + cy.get('[role="dialog"] button').contains("Add").click(); + cy.waitTextVisible("Link Added"); + cy.get("[role='tab']").contains("Documentation").click(); + cy.get(`[href='${correct_url}']`).should("be.visible"); + cy.contains("Sample doc").trigger("mouseover", { force: true }); + cy.get('[data-icon="delete"]').click(); + cy.waitTextVisible("Link Removed"); + }); + + it("edit field documentation", () => { + cy.loginWithCredentials(); + cy.visit( + "/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema" + ); + cy.get("tbody [data-icon='edit']").first().click({ force: true }); + cy.waitTextVisible("Update description"); + cy.waitTextVisible("Foo field description has changed"); + cy.focused().clear().wait(1000); + cy.focused().type(documentation_edited); + cy.get("button").contains("Update").click(); + cy.waitTextVisible("Updated!"); + cy.waitTextVisible(documentation_edited); + cy.waitTextVisible("(edited)"); + cy.get("tbody [data-icon='edit']").first().click({ force: true }); + cy.focused().clear().wait(1000); + cy.focused().type("Foo field description has changed"); + cy.get("button").contains("Update").click(); + cy.waitTextVisible("Updated!"); + cy.waitTextVisible("Foo field description has changed"); + cy.waitTextVisible("(edited)"); + }); +}); diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/managed_ingestion.js b/smoke-test/tests/cypress/cypress/e2e/mutations/managed_ingestion.js index ddda8626fba2f..24a24cc21138d 100644 --- a/smoke-test/tests/cypress/cypress/e2e/mutations/managed_ingestion.js +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/managed_ingestion.js @@ -31,8 +31,7 @@ describe("run managed ingestion", () => { cy.waitTextVisible(testName) cy.contains(testName).parent().within(() => { - // TODO: Skipping until disk size resolved - // cy.contains("Succeeded", {timeout: 30000}) + cy.contains("Succeeded", {timeout: 180000}) cy.clickOptionWithTestId("delete-button"); }) 
cy.clickOptionWithText("Yes") diff --git a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js index 7686acfe50de0..9559435ff01c8 100644 --- a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js +++ b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js @@ -64,6 +64,7 @@ describe("create and manage group", () => { }); it("update group info", () => { + var expected_name = Cypress.env('ADMIN_USERNAME'); cy.loginWithCredentials(); cy.visit("/settings/identities/groups"); cy.clickOptionWithText(group_name); @@ -77,13 +78,13 @@ describe("create and manage group", () => { cy.contains("Test group description EDITED").should("be.visible"); cy.clickOptionWithText("Add Owners"); cy.contains("Search for users or groups...").click({ force: true }); - cy.focused().type(Cypress.env('ADMIN_USERNAME')); - cy.get(".ant-select-item-option").contains(Cypress.env('ADMIN_USERNAME'), { matchCase: false }).click(); + cy.focused().type(expected_name); + cy.get(".ant-select-item-option").contains(expected_name, { matchCase: false }).click(); cy.focused().blur(); - cy.contains(Cypress.env('ADMIN_USERNAME')).should("have.length", 1); + cy.contains(expected_name).should("have.length", 1); cy.get('[role="dialog"] button').contains("Done").click(); cy.waitTextVisible("Owners Added"); - cy.contains(Cypress.env('ADMIN_USERNAME'), { matchCase: false }).should("be.visible"); + cy.contains(expected_name, { matchCase: false }).should("be.visible"); cy.clickOptionWithText("Edit Group"); cy.waitTextVisible("Edit Profile"); cy.get("#email").type(`${test_id}@testemail.com`); diff --git a/smoke-test/tests/cypress/data.json b/smoke-test/tests/cypress/data.json index c6606519e8d73..3b2ee1afaba58 100644 --- a/smoke-test/tests/cypress/data.json +++ b/smoke-test/tests/cypress/data.json @@ -2012,4 +2012,4 @@ }, "systemMetadata": null } -] +] \ No newline at end of file diff --git a/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl b/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl index ed30244c31b17..6dff14133ee60 100644 --- a/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl +++ b/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl @@ -14,7 +14,8 @@ record TestEntityInfo includes CustomProperties { @Searchable = { "fieldName": "textFieldOverride", "fieldType": "TEXT", - "addToFilters": true + "addToFilters": true, + "fieldNameAliases": [ "_entityName" ] } textField: optional string @@ -25,6 +26,11 @@ record TestEntityInfo includes CustomProperties { } textArrayField: optional array[string] + @Searchable = { + "fieldType": "WORD_GRAM" + } + wordGramField: optional string + @Relationship = { "name": "foreignKey", "entityTypes": []