Merge branch 'datahub-project:master' into master
asikowitz authored Oct 6, 2023
2 parents f3c9c3c + 8e7f286 commit 7dabb18
Showing 44 changed files with 584 additions and 500 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/airflow-plugin.yml
@@ -10,7 +10,7 @@ on:
- "metadata-models/**"
pull_request:
branches:
- master
- "**"
paths:
- ".github/**"
- "metadata-ingestion-modules/airflow-plugin/**"
11 changes: 3 additions & 8 deletions .github/workflows/build-and-test.yml
@@ -8,7 +8,7 @@ on:
- "**.md"
pull_request:
branches:
- master
- "**"
paths-ignore:
- "docs/**"
- "**.md"
@@ -24,17 +24,12 @@ jobs:
strategy:
fail-fast: false
matrix:
command:
[
command: [
# metadata-ingestion and airflow-plugin each have dedicated build jobs
"except_metadata_ingestion",
"frontend"
]
timezone:
[
"UTC",
"America/New_York",
]
timezone: ["UTC", "America/New_York"]
runs-on: ubuntu-latest
timeout-minutes: 60
steps:
9 changes: 2 additions & 7 deletions .github/workflows/check-datahub-jars.yml
@@ -10,7 +10,7 @@ on:
- "**.md"
pull_request:
branches:
- master
- "**"
paths-ignore:
- "docker/**"
- "docs/**"
@@ -28,12 +28,7 @@ jobs:
max-parallel: 1
fail-fast: false
matrix:
command:
[
"datahub-client",
"datahub-protobuf",
"spark-lineage"
]
command: ["datahub-client", "datahub-protobuf", "spark-lineage"]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
4 changes: 3 additions & 1 deletion .github/workflows/close-stale-issues.yml
@@ -18,7 +18,9 @@ jobs:
days-before-issue-stale: 30
days-before-issue-close: 30
stale-issue-label: "stale"
stale-issue-message: "This issue is stale because it has been open for 30 days with no activity. If you believe this is still an issue on the latest DataHub release please leave a comment with the version that you tested it with. If this is a question/discussion please head to https://slack.datahubproject.io. For feature requests please use https://feature-requests.datahubproject.io"
stale-issue-message:
"This issue is stale because it has been open for 30 days with no activity. If you believe this is still an issue on the latest DataHub release please leave a comment with the version that you tested it with. If this is a question/discussion please head to https://slack.datahubproject.io.\
\ For feature requests please use https://feature-requests.datahubproject.io"
close-issue-message: "This issue was closed because it has been inactive for 30 days since being marked as stale."
days-before-pr-stale: -1
days-before-pr-close: -1
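The reformatted `stale-issue-message` above relies on YAML's escaped line breaks inside double-quoted scalars: a trailing `\` removes the line break, and a leading `\ ` on the continuation line inserts an explicit space. A small sketch of how such a folded scalar parses back into a single line (assumes PyYAML is installed; the message text is shortened for illustration):

```python
import yaml  # assumes PyYAML is installed

# A double-quoted scalar split across lines: the trailing backslash escapes
# the line break, and "\ " at the start of the continuation is an escaped space.
doc = """\
message:
  "This issue is stale because it has been open for 30 days with no activity.\\
  \\ For feature requests please use the feature request tracker."
"""

parsed = yaml.safe_load(doc)
# Prints a single-line string with one space at the join point.
print(parsed["message"])
```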
13 changes: 4 additions & 9 deletions .github/workflows/code-checks.yml
@@ -10,7 +10,7 @@ on:
- ".github/workflows/code-checks.yml"
pull_request:
branches:
- master
- "**"
paths:
- "metadata-io/**"
- "datahub-web-react/**"
@@ -21,17 +21,12 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true


jobs:
code_check:
strategy:
fail-fast: false
matrix:
command:
[
"check_event_type.py",
"check_policies.py"
]
command: ["check_event_type.py", "check_policies.py"]
name: run code checks
runs-on: ubuntu-latest
steps:
@@ -43,5 +38,5 @@ jobs:
with:
python-version: "3.10"
- name: run check ${{ matrix.command }}
run: |
python .github/scripts/${{ matrix.command }}
run: |-
python .github/scripts/${{ matrix.command }}
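The `run: |` to `run: |-` change above swaps YAML's clipping block scalar for the stripping variant, which drops the trailing newline from the script body. A quick sketch of the difference (assumes PyYAML is installed; the script name is a placeholder):

```python
import yaml  # assumes PyYAML is installed

clipped = yaml.safe_load("run: |\n  python check.py\n")["run"]
stripped = yaml.safe_load("run: |-\n  python check.py\n")["run"]

print(repr(clipped))   # 'python check.py\n'  -> '|' keeps one trailing newline
print(repr(stripped))  # 'python check.py'    -> '|-' strips it
```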
3 changes: 1 addition & 2 deletions .github/workflows/docker-postgres-setup.yml
@@ -8,7 +8,7 @@ on:
- ".github/workflows/docker-postgres-setup.yml"
pull_request:
branches:
- master
- "**"
paths:
- "docker/postgres-setup/**"
- ".github/workflows/docker-postgres-setup.yml"
@@ -61,4 +61,3 @@ jobs:
context: .
file: ./docker/postgres-setup/Dockerfile
platforms: linux/amd64,linux/arm64

7 changes: 3 additions & 4 deletions .github/workflows/docker-unified.yml
@@ -8,7 +8,7 @@ on:
- "**.md"
pull_request:
branches:
- master
- "**"
paths-ignore:
- "docs/**"
- "**.md"
@@ -551,7 +551,6 @@ jobs:
id: tag
run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> $GITHUB_OUTPUT


datahub_ingestion_slim_build:
name: Build and Push DataHub Ingestion Docker Images
runs-on: ubuntu-latest
@@ -815,8 +814,8 @@ jobs:
DATAHUB_VERSION: ${{ needs.setup.outputs.unique_tag }}
DATAHUB_ACTIONS_IMAGE: ${{ env.DATAHUB_INGESTION_IMAGE }}
ACTIONS_VERSION: ${{ needs.datahub_ingestion_slim_build.outputs.tag }}
ACTIONS_EXTRA_PACKAGES: 'acryl-datahub-actions[executor]==0.0.13 acryl-datahub-actions==0.0.13 acryl-datahub==0.10.5'
ACTIONS_CONFIG: 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml'
ACTIONS_EXTRA_PACKAGES: "acryl-datahub-actions[executor]==0.0.13 acryl-datahub-actions==0.0.13 acryl-datahub==0.10.5"
ACTIONS_CONFIG: "https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml"
run: |
./smoke-test/run-quickstart.sh
- name: sleep 60s
2 changes: 1 addition & 1 deletion .github/workflows/documentation.yml
@@ -3,7 +3,7 @@ name: documentation
on:
pull_request:
branches:
- master
- "**"
push:
branches:
- master
4 changes: 3 additions & 1 deletion .github/workflows/lint-actions.yml
@@ -2,8 +2,10 @@ name: Lint actions
on:
pull_request:
paths:
- '.github/workflows/**'
- ".github/workflows/**"

branches:
- "**"
jobs:
actionlint:
runs-on: ubuntu-latest
2 changes: 1 addition & 1 deletion .github/workflows/metadata-ingestion.yml
@@ -9,7 +9,7 @@ on:
- "metadata-models/**"
pull_request:
branches:
- master
- "**"
paths:
- ".github/**"
- "metadata-ingestion/**"
2 changes: 1 addition & 1 deletion .github/workflows/metadata-io.yml
@@ -10,7 +10,7 @@ on:
- "metadata-io/**"
pull_request:
branches:
- master
- "**"
paths:
- "**/*.gradle"
- "li-utils/**"
2 changes: 1 addition & 1 deletion .github/workflows/spark-smoke-test.yml
@@ -12,7 +12,7 @@ on:
- ".github/workflows/spark-smoke-test.yml"
pull_request:
branches:
- master
- "**"
paths:
- "metadata_models/**"
- "metadata-integration/java/datahub-client/**"
4 changes: 2 additions & 2 deletions datahub-web-react/src/app/ingest/source/builder/sources.json
@@ -130,7 +130,7 @@
"name": "dynamodb",
"displayName": "DynamoDB",
"docsUrl": "https://datahubproject.io/docs/metadata-ingestion/",
"recipe": "source:\n type: dynamodb\n config:\n platform_instance: \"AWS_ACCOUNT_ID\"\n aws_access_key_id : '${AWS_ACCESS_KEY_ID}'\n aws_secret_access_key : '${AWS_SECRET_ACCESS_KEY}'\n # User could use the below option to provide a list of primary keys of a table in dynamodb format,\n # those items from given primary keys will be included when we scan the table.\n # For each table we can retrieve up to 16 MB of data, which can contain as many as 100 items.\n # We'll enforce the the primary keys list size not to exceed 100\n # The total items we'll try to retrieve in these two scenarios:\n # 1. If user don't specify include_table_item: we'll retrieve up to 100 items\n # 2. If user specifies include_table_item: we'll retrieve up to 100 items plus user specified items in\n # the table, with a total not more than 200 items\n # include_table_item:\n # table_name:\n # [\n # {\n # 'partition_key_name': { 'attribute_type': 'attribute_value' },\n # 'sort_key_name': { 'attribute_type': 'attribute_value' },\n # },\n # ]"
"recipe": "source:\n type: dynamodb\n config:\n platform_instance: \"AWS_ACCOUNT_ID\"\n aws_access_key_id : '${AWS_ACCESS_KEY_ID}'\n aws_secret_access_key : '${AWS_SECRET_ACCESS_KEY}'\n # If there are items that have most representative fields of the table, users could use the\n # `include_table_item` option to provide a list of primary keys of the table in dynamodb format.\n # For each `region.table`, the list of primary keys can be at most 100.\n # We include these items in addition to the first 100 items in the table when we scan it.\n # include_table_item:\n # region.table_name:\n # [\n # {\n # 'partition_key_name': { 'attribute_type': 'attribute_value' },\n # 'sort_key_name': { 'attribute_type': 'attribute_value' },\n # },\n # ]"
},
{
"urn": "urn:li:dataPlatform:glue",
@@ -223,4 +223,4 @@
"docsUrl": "https://datahubproject.io/docs/metadata-ingestion/",
"recipe": "source:\n type: <source-type>\n config:\n # Source-type specifics config\n <source-configs>"
}
]
]
34 changes: 24 additions & 10 deletions docs-website/download_historical_versions.py
@@ -1,6 +1,7 @@
import json
import os
import tarfile
import time
import urllib.request

repo_url = "https://api.github.com/repos/datahub-project/static-assets"
@@ -16,17 +17,30 @@ def download_file(url, destination):
f.write(chunk)


def fetch_urls(repo_url: str, folder_path: str, file_format: str):
def fetch_urls(
repo_url: str, folder_path: str, file_format: str, max_retries=3, retry_delay=5
):
api_url = f"{repo_url}/contents/{folder_path}"
response = urllib.request.urlopen(api_url)
data = response.read().decode("utf-8")
urls = [
file["download_url"]
for file in json.loads(data)
if file["name"].endswith(file_format)
]
print(urls)
return urls
for attempt in range(max_retries + 1):
try:
response = urllib.request.urlopen(api_url)
if response.status == 403 or (500 <= response.status < 600):
raise Exception(f"HTTP Error {response.status}: {response.reason}")
data = response.read().decode("utf-8")
urls = [
file["download_url"]
for file in json.loads(data)
if file["name"].endswith(file_format)
]
print(urls)
return urls
except Exception as e:
if attempt < max_retries:
print(f"Attempt {attempt + 1}/{max_retries}: {e}")
time.sleep(retry_delay)
else:
print(f"Max retries reached. Unable to fetch data.")
raise


def extract_tar_file(destination_path):
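The retry loop added above treats 403s and 5xx responses as retryable. One detail worth noting: `urllib.request.urlopen` raises `urllib.error.HTTPError` for those status codes rather than returning a response object, so in practice the exception path is what drives the retries. A standalone sketch of the same pattern, with a hypothetical helper name and URL:

```python
import time
import urllib.error
import urllib.request


def fetch_with_retries(url: str, max_retries: int = 3, retry_delay: int = 5) -> bytes:
    """Sketch of the retry pattern: retry rate limits and server errors,
    give up after max_retries additional attempts."""
    for attempt in range(max_retries + 1):
        try:
            with urllib.request.urlopen(url) as response:
                return response.read()
        except urllib.error.HTTPError as e:
            # urlopen raises for 4xx/5xx; only 403 (rate limiting) and 5xx are transient here.
            retryable = e.code == 403 or 500 <= e.code < 600
            if not retryable or attempt == max_retries:
                raise
        except urllib.error.URLError:
            # Network-level failures (DNS, connection reset) are also retried.
            if attempt == max_retries:
                raise
        print(f"Attempt {attempt + 1}/{max_retries} failed; retrying in {retry_delay}s")
        time.sleep(retry_delay)
    raise RuntimeError("unreachable")


# Example call (hypothetical path, mirroring the script's GitHub contents API usage):
# data = fetch_with_retries("https://api.github.com/repos/datahub-project/static-assets/contents/<folder>")
```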
5 changes: 5 additions & 0 deletions docs/how/updating-datahub.md
@@ -9,6 +9,11 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
- #8810 - Removed support for SQLAlchemy 1.3.x. Only SQLAlchemy 1.4.x is supported now.
- #8853 - The Airflow plugin no longer supports Airflow 2.0.x or Python 3.7. See the docs for more details.
- #8853 - Introduced the Airflow plugin v2. If you're using Airflow 2.3+, the v2 plugin will be enabled by default, and so you'll need to switch your requirements to include `pip install 'acryl-datahub-airflow-plugin[plugin-v2]'`. To continue using the v1 plugin, set the `DATAHUB_AIRFLOW_PLUGIN_USE_V1_PLUGIN` environment variable to `true`.
- #8943 - The Unity Catalog ingestion source has a new option `include_metastore`, which will cause all urns to be changed when disabled.
This is currently enabled by default to preserve compatibility, but will be disabled by default and then removed in the future.
If stateful ingestion is enabled, simply setting `include_metastore: false` will perform all required cleanup.
Otherwise, we recommend soft deleting all databricks data via the DataHub CLI:
`datahub delete --platform databricks --soft` and then reingesting with `include_metastore: false`.

### Potential Downtime

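For readers updating Unity Catalog recipes, a minimal sketch of what the migration described above could look like when run programmatically. Only `include_metastore` and the stateful-ingestion flag come from this commit's notes; the workspace URL, token handling, sink, and pipeline name are illustrative assumptions:

```python
import os

from datahub.ingestion.run.pipeline import Pipeline

# Hypothetical recipe: re-ingest Unity Catalog with the new URN format.
# With stateful ingestion enabled, setting include_metastore to False lets
# the next run clean up the old metastore-prefixed URNs automatically.
recipe = {
    "pipeline_name": "unity-catalog-prod",  # required for stateful ingestion; name is an example
    "source": {
        "type": "unity-catalog",
        "config": {
            "workspace_url": "https://example.cloud.databricks.com",  # placeholder
            "token": os.environ["DATABRICKS_TOKEN"],  # placeholder credential source
            "include_metastore": False,
            "stateful_ingestion": {"enabled": True},
        },
    },
    "sink": {
        "type": "datahub-rest",
        "config": {"server": "http://localhost:8080"},  # placeholder GMS address
    },
}

pipeline = Pipeline.create(recipe)
pipeline.run()
pipeline.raise_from_status()
```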
13 changes: 5 additions & 8 deletions metadata-ingestion/docs/sources/dynamodb/dynamodb_post.md
@@ -1,21 +1,18 @@
## Limitations

For each region, the list-tables operation returns at most 100 tables; we need to improve this by implementing pagination for table listing.

## Advanced Configurations

### Using `include_table_item` config

If there are items that have most representative fields of the table, user could use the `include_table_item` option to provide a list of primary keys of a table in dynamodb format, those items from given primary keys will be included when we scan the table.
If there are items that have the most representative fields of the table, users could use the `include_table_item` option to provide a list of primary keys of the table in DynamoDB format. We include these items in addition to the first 100 items in the table when we scan it.

Take [AWS DynamoDB Developer Guide Example tables and data](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/AppendixSampleTables.html) as an example, if user has a table `Reply` with composite primary key `Id` and `ReplyDateTime`, user can use `include_table_item` to include 2 items as following:
Take [AWS DynamoDB Developer Guide Example tables and data](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/AppendixSampleTables.html) as an example: if an account has a table `Reply` in the `us-west-2` region with composite primary key `Id` and `ReplyDateTime`, users can use `include_table_item` to include 2 items as follows:

Example:

```yml
# put the table name and composite key in DynamoDB format
# The table name should be in the format of region.table_name
# The primary keys should be in the DynamoDB format
include_table_item:
Reply:
us-west-2.Reply:
[
{
"ReplyDateTime": { "S": "2015-09-22T19:58:22.947Z" },
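For readers unfamiliar with the "DynamoDB format" mentioned above, the keys in `include_table_item` use the same low-level attribute-value encoding (type tag mapped to value) that the DynamoDB client API expects. A sketch with boto3, reusing the sample `Reply` table; the region and key values follow the AWS sample data, and credentials are assumed to come from the environment:

```python
import boto3

# Low-level DynamoDB client: keys are given as {"attribute_name": {"type": "value"}},
# the same shape used in the include_table_item recipe option.
client = boto3.client("dynamodb", region_name="us-west-2")

response = client.get_item(
    TableName="Reply",
    Key={
        "Id": {"S": "Amazon DynamoDB#DynamoDB Thread 1"},
        "ReplyDateTime": {"S": "2015-09-22T19:58:22.947Z"},
    },
)
print(response.get("Item"))
```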
6 changes: 2 additions & 4 deletions metadata-ingestion/docs/sources/dynamodb/dynamodb_pre.md
@@ -1,8 +1,8 @@
### Prerequisites

In order to execute this source, you will need to create access key and secret keys that have DynamoDB read access. You can create these policies and attach to your account or can ask your account admin to attach these policies to your account.
In order to execute this source, you need to attach the `AmazonDynamoDBReadOnlyAccess` policy to a user in your AWS account. Then create an API access key and secret for the user.

For access key permissions, you can create a policy with permissions below and attach to your account, you can find more details in [Managing access keys for IAM users](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html)
For a user to be able to create an API access key, they need the following access key permissions. Your AWS account admin can create a policy with these permissions and attach it to the user; you can find more details in [Managing access keys for IAM users](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html).

```json
{
@@ -22,5 +22,3 @@ For access key permissions, you can create a policy with permissions below and a
]
}
```

For DynamoDB read access, you can simply attach AWS managed policy `AmazonDynamoDBReadOnlyAccess` to your account, you can find more details in [Attaching a policy to an IAM user group](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_groups_manage_attach-policy.html)
16 changes: 7 additions & 9 deletions metadata-ingestion/docs/sources/dynamodb/dynamodb_recipe.yml
@@ -4,16 +4,14 @@ source:
platform_instance: "AWS_ACCOUNT_ID"
aws_access_key_id: "${AWS_ACCESS_KEY_ID}"
aws_secret_access_key: "${AWS_SECRET_ACCESS_KEY}"
# User could use the below option to provide a list of primary keys of a table in dynamodb format,
# those items from given primary keys will be included when we scan the table.
# For each table we can retrieve up to 16 MB of data, which can contain as many as 100 items.
# We'll enforce the the primary keys list size not to exceed 100
# The total items we'll try to retrieve in these two scenarios:
# 1. If user don't specify include_table_item: we'll retrieve up to 100 items
# 2. If user specifies include_table_item: we'll retrieve up to 100 items plus user specified items in
# the table, with a total not more than 200 items
#
# If there are items that have most representative fields of the table, users could use the
# `include_table_item` option to provide a list of primary keys of the table in dynamodb format.
# For each `region.table`, the list of primary keys can be at most 100.
# We include these items in addition to the first 100 items in the table when we scan it.
#
# include_table_item:
# table_name:
# region.table_name:
# [
# {
# "partition_key_name": { "attribute_type": "attribute_value" },
@@ -4,7 +4,7 @@
from pydantic.fields import Field

from datahub.configuration.common import ConfigModel, ConfigurationError
from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated
from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
from datahub.metadata.schema_classes import FabricTypeClass

DEFAULT_ENV = FabricTypeClass.PROD
@@ -1,20 +1,28 @@
import warnings
from typing import Optional, Type
from typing import Any, Optional, Type

import pydantic

from datahub.configuration.common import ConfigurationWarning
from datahub.utilities.global_warning_util import add_global_warning

_unset = object()

def pydantic_field_deprecated(field: str, message: Optional[str] = None) -> classmethod:

def pydantic_field_deprecated(
field: str,
warn_if_value_is_not: Any = _unset,
message: Optional[str] = None,
) -> classmethod:
if message:
output = message
else:
output = f"{field} is deprecated and will be removed in a future release. Please remove it from your config."

def _validate_deprecated(cls: Type, values: dict) -> dict:
if field in values:
if field in values and (
warn_if_value_is_not is _unset or values[field] != warn_if_value_is_not
):
add_global_warning(output)
warnings.warn(output, ConfigurationWarning, stacklevel=2)
return values
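To illustrate the new `warn_if_value_is_not` guard added above: the deprecation warning now fires only when the deprecated field is present and set to something other than the allowed value. A self-contained sketch of the same check using a plain pydantic root validator; the model and field names are made up, and this is not the repo's actual wiring:

```python
import warnings
from typing import Optional

import pydantic

_ALLOWED = "PROD"  # hypothetical "still fine" value for the deprecated field


class ExampleConfig(pydantic.BaseModel):
    env: Optional[str] = None  # deprecated field in this sketch
    platform_instance: Optional[str] = None

    @pydantic.root_validator(pre=True)
    def _warn_on_deprecated_env(cls, values: dict) -> dict:
        # Mirror of the guard: warn only if the field is set to a non-allowed value.
        if "env" in values and values["env"] != _ALLOWED:
            warnings.warn(
                "env is deprecated and will be removed in a future release.",
                DeprecationWarning,
                stacklevel=2,
            )
        return values


ExampleConfig(env="PROD")  # no warning: value matches the allowed default
ExampleConfig(env="DEV")   # emits the deprecation warning
```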