Merge branch 'datahub-project:master' into master
asikowitz authored Oct 6, 2023
2 parents f3c9c3c + 8e7f286 commit 7dabb18
Showing 44 changed files with 584 additions and 500 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/airflow-plugin.yml
@@ -10,7 +10,7 @@ on:
- "metadata-models/**"
pull_request:
branches:
- master
- "**"
paths:
- ".github/**"
- "metadata-ingestion-modules/airflow-plugin/**"
11 changes: 3 additions & 8 deletions .github/workflows/build-and-test.yml
@@ -8,7 +8,7 @@ on:
- "**.md"
pull_request:
branches:
- master
- "**"
paths-ignore:
- "docs/**"
- "**.md"
@@ -24,17 +24,12 @@ jobs:
strategy:
fail-fast: false
matrix:
command:
[
command: [
# metadata-ingestion and airflow-plugin each have dedicated build jobs
"except_metadata_ingestion",
"frontend"
]
timezone:
[
"UTC",
"America/New_York",
]
timezone: ["UTC", "America/New_York"]
runs-on: ubuntu-latest
timeout-minutes: 60
steps:
9 changes: 2 additions & 7 deletions .github/workflows/check-datahub-jars.yml
@@ -10,7 +10,7 @@ on:
- "**.md"
pull_request:
branches:
- master
- "**"
paths-ignore:
- "docker/**"
- "docs/**"
@@ -28,12 +28,7 @@ jobs:
max-parallel: 1
fail-fast: false
matrix:
command:
[
"datahub-client",
"datahub-protobuf",
"spark-lineage"
]
command: ["datahub-client", "datahub-protobuf", "spark-lineage"]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
4 changes: 3 additions & 1 deletion .github/workflows/close-stale-issues.yml
@@ -18,7 +18,9 @@ jobs:
days-before-issue-stale: 30
days-before-issue-close: 30
stale-issue-label: "stale"
stale-issue-message: "This issue is stale because it has been open for 30 days with no activity. If you believe this is still an issue on the latest DataHub release please leave a comment with the version that you tested it with. If this is a question/discussion please head to https://slack.datahubproject.io. For feature requests please use https://feature-requests.datahubproject.io"
stale-issue-message:
"This issue is stale because it has been open for 30 days with no activity. If you believe this is still an issue on the latest DataHub release please leave a comment with the version that you tested it with. If this is a question/discussion please head to https://slack.datahubproject.io.\
\ For feature requests please use https://feature-requests.datahubproject.io"
close-issue-message: "This issue was closed because it has been inactive for 30 days since being marked as stale."
days-before-pr-stale: -1
days-before-pr-close: -1
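The reformatted `stale-issue-message` above relies on YAML's escaped line breaks inside double-quoted scalars: a trailing `\` removes the line break, and a leading `\ ` on the continuation line inserts an explicit space. A small sketch of how such a folded scalar parses back into a single line (assumes PyYAML is installed; the message text is shortened for illustration):

```python
import yaml  # assumes PyYAML is installed

# A double-quoted scalar split across lines: the trailing backslash escapes
# the line break, and "\ " at the start of the continuation is an escaped space.
doc = """\
message:
  "This issue is stale because it has been open for 30 days with no activity.\\
  \\ For feature requests please use the feature request tracker."
"""

parsed = yaml.safe_load(doc)
# Prints a single-line string with one space at the join point.
print(parsed["message"])
```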
13 changes: 4 additions & 9 deletions .github/workflows/code-checks.yml
@@ -10,7 +10,7 @@ on:
- ".github/workflows/code-checks.yml"
pull_request:
branches:
- master
- "**"
paths:
- "metadata-io/**"
- "datahub-web-react/**"
@@ -21,17 +21,12 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true


jobs:
code_check:
strategy:
fail-fast: false
matrix:
command:
[
"check_event_type.py",
"check_policies.py"
]
command: ["check_event_type.py", "check_policies.py"]
name: run code checks
runs-on: ubuntu-latest
steps:
@@ -43,5 +38,5 @@ jobs:
with:
python-version: "3.10"
- name: run check ${{ matrix.command }}
run: |
python .github/scripts/${{ matrix.command }}
run: |-
python .github/scripts/${{ matrix.command }}
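The `run: |` to `run: |-` change above swaps YAML's clipping block scalar for the stripping variant, which drops the trailing newline from the script body. A quick sketch of the difference (assumes PyYAML is installed; the script name is a placeholder):

```python
import yaml  # assumes PyYAML is installed

clipped = yaml.safe_load("run: |\n  python check.py\n")["run"]
stripped = yaml.safe_load("run: |-\n  python check.py\n")["run"]

print(repr(clipped))   # 'python check.py\n'  -> '|' keeps one trailing newline
print(repr(stripped))  # 'python check.py'    -> '|-' strips it
```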
3 changes: 1 addition & 2 deletions .github/workflows/docker-postgres-setup.yml
@@ -8,7 +8,7 @@ on:
- ".github/workflows/docker-postgres-setup.yml"
pull_request:
branches:
- master
- "**"
paths:
- "docker/postgres-setup/**"
- ".github/workflows/docker-postgres-setup.yml"
@@ -61,4 +61,3 @@ jobs:
context: .
file: ./docker/postgres-setup/Dockerfile
platforms: linux/amd64,linux/arm64

7 changes: 3 additions & 4 deletions .github/workflows/docker-unified.yml
@@ -8,7 +8,7 @@ on:
- "**.md"
pull_request:
branches:
- master
- "**"
paths-ignore:
- "docs/**"
- "**.md"
@@ -551,7 +551,6 @@ jobs:
id: tag
run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> $GITHUB_OUTPUT


datahub_ingestion_slim_build:
name: Build and Push DataHub Ingestion Docker Images
runs-on: ubuntu-latest
@@ -815,8 +814,8 @@ jobs:
DATAHUB_VERSION: ${{ needs.setup.outputs.unique_tag }}
DATAHUB_ACTIONS_IMAGE: ${{ env.DATAHUB_INGESTION_IMAGE }}
ACTIONS_VERSION: ${{ needs.datahub_ingestion_slim_build.outputs.tag }}
ACTIONS_EXTRA_PACKAGES: 'acryl-datahub-actions[executor]==0.0.13 acryl-datahub-actions==0.0.13 acryl-datahub==0.10.5'
ACTIONS_CONFIG: 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml'
ACTIONS_EXTRA_PACKAGES: "acryl-datahub-actions[executor]==0.0.13 acryl-datahub-actions==0.0.13 acryl-datahub==0.10.5"
ACTIONS_CONFIG: "https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml"
run: |
./smoke-test/run-quickstart.sh
- name: sleep 60s
2 changes: 1 addition & 1 deletion .github/workflows/documentation.yml
@@ -3,7 +3,7 @@ name: documentation
on:
pull_request:
branches:
- master
- "**"
push:
branches:
- master
4 changes: 3 additions & 1 deletion .github/workflows/lint-actions.yml
@@ -2,8 +2,10 @@ name: Lint actions
on:
pull_request:
paths:
- '.github/workflows/**'
- ".github/workflows/**"

branches:
- "**"
jobs:
actionlint:
runs-on: ubuntu-latest
2 changes: 1 addition & 1 deletion .github/workflows/metadata-ingestion.yml
@@ -9,7 +9,7 @@ on:
- "metadata-models/**"
pull_request:
branches:
- master
- "**"
paths:
- ".github/**"
- "metadata-ingestion/**"
2 changes: 1 addition & 1 deletion .github/workflows/metadata-io.yml
@@ -10,7 +10,7 @@ on:
- "metadata-io/**"
pull_request:
branches:
- master
- "**"
paths:
- "**/*.gradle"
- "li-utils/**"
2 changes: 1 addition & 1 deletion .github/workflows/spark-smoke-test.yml
@@ -12,7 +12,7 @@ on:
- ".github/workflows/spark-smoke-test.yml"
pull_request:
branches:
- master
- "**"
paths:
- "metadata_models/**"
- "metadata-integration/java/datahub-client/**"
4 changes: 2 additions & 2 deletions datahub-web-react/src/app/ingest/source/builder/sources.json
@@ -130,7 +130,7 @@
"name": "dynamodb",
"displayName": "DynamoDB",
"docsUrl": "https://datahubproject.io/docs/metadata-ingestion/",
"recipe": "source:\n type: dynamodb\n config:\n platform_instance: \"AWS_ACCOUNT_ID\"\n aws_access_key_id : '${AWS_ACCESS_KEY_ID}'\n aws_secret_access_key : '${AWS_SECRET_ACCESS_KEY}'\n # User could use the below option to provide a list of primary keys of a table in dynamodb format,\n # those items from given primary keys will be included when we scan the table.\n # For each table we can retrieve up to 16 MB of data, which can contain as many as 100 items.\n # We'll enforce the the primary keys list size not to exceed 100\n # The total items we'll try to retrieve in these two scenarios:\n # 1. If user don't specify include_table_item: we'll retrieve up to 100 items\n # 2. If user specifies include_table_item: we'll retrieve up to 100 items plus user specified items in\n # the table, with a total not more than 200 items\n # include_table_item:\n # table_name:\n # [\n # {\n # 'partition_key_name': { 'attribute_type': 'attribute_value' },\n # 'sort_key_name': { 'attribute_type': 'attribute_value' },\n # },\n # ]"
"recipe": "source:\n type: dynamodb\n config:\n platform_instance: \"AWS_ACCOUNT_ID\"\n aws_access_key_id : '${AWS_ACCESS_KEY_ID}'\n aws_secret_access_key : '${AWS_SECRET_ACCESS_KEY}'\n # If there are items that have most representative fields of the table, users could use the\n # `include_table_item` option to provide a list of primary keys of the table in dynamodb format.\n # For each `region.table`, the list of primary keys can be at most 100.\n # We include these items in addition to the first 100 items in the table when we scan it.\n # include_table_item:\n # region.table_name:\n # [\n # {\n # 'partition_key_name': { 'attribute_type': 'attribute_value' },\n # 'sort_key_name': { 'attribute_type': 'attribute_value' },\n # },\n # ]"
},
{
"urn": "urn:li:dataPlatform:glue",
@@ -223,4 +223,4 @@
"docsUrl": "https://datahubproject.io/docs/metadata-ingestion/",
"recipe": "source:\n type: <source-type>\n config:\n # Source-type specifics config\n <source-configs>"
}
]
]
34 changes: 24 additions & 10 deletions docs-website/download_historical_versions.py
@@ -1,6 +1,7 @@
import json
import os
import tarfile
import time
import urllib.request

repo_url = "https://api.github.com/repos/datahub-project/static-assets"
@@ -16,17 +17,30 @@ def download_file(url, destination):
f.write(chunk)


def fetch_urls(repo_url: str, folder_path: str, file_format: str):
def fetch_urls(
repo_url: str, folder_path: str, file_format: str, max_retries=3, retry_delay=5
):
api_url = f"{repo_url}/contents/{folder_path}"
response = urllib.request.urlopen(api_url)
data = response.read().decode("utf-8")
urls = [
file["download_url"]
for file in json.loads(data)
if file["name"].endswith(file_format)
]
print(urls)
return urls
for attempt in range(max_retries + 1):
try:
response = urllib.request.urlopen(api_url)
if response.status == 403 or (500 <= response.status < 600):
raise Exception(f"HTTP Error {response.status}: {response.reason}")
data = response.read().decode("utf-8")
urls = [
file["download_url"]
for file in json.loads(data)
if file["name"].endswith(file_format)
]
print(urls)
return urls
except Exception as e:
if attempt < max_retries:
print(f"Attempt {attempt + 1}/{max_retries}: {e}")
time.sleep(retry_delay)
else:
print(f"Max retries reached. Unable to fetch data.")
raise


def extract_tar_file(destination_path):
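The retry loop added above treats 403s and 5xx responses as retryable. One detail worth noting: `urllib.request.urlopen` raises `urllib.error.HTTPError` for those status codes rather than returning a response object, so in practice the exception path is what drives the retries. A standalone sketch of the same pattern, with a hypothetical helper name and URL:

```python
import time
import urllib.error
import urllib.request


def fetch_with_retries(url: str, max_retries: int = 3, retry_delay: int = 5) -> bytes:
    """Sketch of the retry pattern: retry rate limits and server errors,
    give up after max_retries additional attempts."""
    for attempt in range(max_retries + 1):
        try:
            with urllib.request.urlopen(url) as response:
                return response.read()
        except urllib.error.HTTPError as e:
            # urlopen raises for 4xx/5xx; only 403 (rate limiting) and 5xx are transient here.
            retryable = e.code == 403 or 500 <= e.code < 600
            if not retryable or attempt == max_retries:
                raise
        except urllib.error.URLError:
            # Network-level failures (DNS, connection reset) are also retried.
            if attempt == max_retries:
                raise
        print(f"Attempt {attempt + 1}/{max_retries} failed; retrying in {retry_delay}s")
        time.sleep(retry_delay)
    raise RuntimeError("unreachable")


# Example call (hypothetical path, mirroring the script's GitHub contents API usage):
# data = fetch_with_retries("https://api.github.com/repos/datahub-project/static-assets/contents/<folder>")
```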
5 changes: 5 additions & 0 deletions docs/how/updating-datahub.md
@@ -9,6 +9,11 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
- #8810 - Removed support for SQLAlchemy 1.3.x. Only SQLAlchemy 1.4.x is supported now.
- #8853 - The Airflow plugin no longer supports Airflow 2.0.x or Python 3.7. See the docs for more details.
- #8853 - Introduced the Airflow plugin v2. If you're using Airflow 2.3+, the v2 plugin will be enabled by default, and so you'll need to switch your requirements to include `pip install 'acryl-datahub-airflow-plugin[plugin-v2]'`. To continue using the v1 plugin, set the `DATAHUB_AIRFLOW_PLUGIN_USE_V1_PLUGIN` environment variable to `true`.
- #8943 - The Unity Catalog ingestion source has a new option `include_metastore`, which will cause all urns to be changed when disabled.
This is currently enabled by default to preserve compatibility, but will be disabled by default and then removed in the future.
If stateful ingestion is enabled, simply setting `include_metastore: false` will perform all required cleanup.
Otherwise, we recommend soft deleting all databricks data via the DataHub CLI:
`datahub delete --platform databricks --soft` and then reingesting with `include_metastore: false`.

### Potential Downtime

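For readers updating Unity Catalog recipes, a minimal sketch of what the migration described above could look like when run programmatically. Only `include_metastore` and the stateful-ingestion flag come from this commit's notes; the workspace URL, token handling, sink, and pipeline name are illustrative assumptions:

```python
import os

from datahub.ingestion.run.pipeline import Pipeline

# Hypothetical recipe: re-ingest Unity Catalog with the new URN format.
# With stateful ingestion enabled, setting include_metastore to False lets
# the next run clean up the old metastore-prefixed URNs automatically.
recipe = {
    "pipeline_name": "unity-catalog-prod",  # required for stateful ingestion; name is an example
    "source": {
        "type": "unity-catalog",
        "config": {
            "workspace_url": "https://example.cloud.databricks.com",  # placeholder
            "token": os.environ["DATABRICKS_TOKEN"],  # placeholder credential source
            "include_metastore": False,
            "stateful_ingestion": {"enabled": True},
        },
    },
    "sink": {
        "type": "datahub-rest",
        "config": {"server": "http://localhost:8080"},  # placeholder GMS address
    },
}

pipeline = Pipeline.create(recipe)
pipeline.run()
pipeline.raise_from_status()
```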
13 changes: 5 additions & 8 deletions metadata-ingestion/docs/sources/dynamodb/dynamodb_post.md
@@ -1,21 +1,18 @@
## Limitations

For each region, the list-tables operation returns at most 100 tables; we need to improve this by implementing pagination for table listing.

## Advanced Configurations

### Using `include_table_item` config

If there are items that have most representative fields of the table, user could use the `include_table_item` option to provide a list of primary keys of a table in dynamodb format, those items from given primary keys will be included when we scan the table.
If there are items that have the most representative fields of the table, users could use the `include_table_item` option to provide a list of primary keys of the table in DynamoDB format. We include these items in addition to the first 100 items in the table when we scan it.

Take [AWS DynamoDB Developer Guide Example tables and data](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/AppendixSampleTables.html) as an example, if user has a table `Reply` with composite primary key `Id` and `ReplyDateTime`, user can use `include_table_item` to include 2 items as following:
Take [AWS DynamoDB Developer Guide Example tables and data](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/AppendixSampleTables.html) as an example: if an account has a table `Reply` in the `us-west-2` region with composite primary key `Id` and `ReplyDateTime`, users can use `include_table_item` to include 2 items as follows:

Example:

```yml
# put the table name and composite key in DynamoDB format
# The table name should be in the format of region.table_name
# The primary keys should be in the DynamoDB format
include_table_item:
Reply:
us-west-2.Reply:
[
{
"ReplyDateTime": { "S": "2015-09-22T19:58:22.947Z" },
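For readers unfamiliar with the "DynamoDB format" mentioned above, the keys in `include_table_item` use the same low-level attribute-value encoding (type tag mapped to value) that the DynamoDB client API expects. A sketch with boto3, reusing the sample `Reply` table; the region and key values follow the AWS sample data, and credentials are assumed to come from the environment:

```python
import boto3

# Low-level DynamoDB client: keys are given as {"attribute_name": {"type": "value"}},
# the same shape used in the include_table_item recipe option.
client = boto3.client("dynamodb", region_name="us-west-2")

response = client.get_item(
    TableName="Reply",
    Key={
        "Id": {"S": "Amazon DynamoDB#DynamoDB Thread 1"},
        "ReplyDateTime": {"S": "2015-09-22T19:58:22.947Z"},
    },
)
print(response.get("Item"))
```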
6 changes: 2 additions & 4 deletions metadata-ingestion/docs/sources/dynamodb/dynamodb_pre.md
@@ -1,8 +1,8 @@
### Prerequisites

In order to execute this source, you will need to create access key and secret keys that have DynamoDB read access. You can create these policies and attach to your account or can ask your account admin to attach these policies to your account.
In order to execute this source, you need to attach the `AmazonDynamoDBReadOnlyAccess` policy to a user in your AWS account. Then create an API access key and secret for the user.

For access key permissions, you can create a policy with permissions below and attach to your account, you can find more details in [Managing access keys for IAM users](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html)
For a user to be able to create an API access key, they need the following access key permissions. Your AWS account admin can create a policy with these permissions and attach it to the user; you can find more details in [Managing access keys for IAM users](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html).

```json
{
@@ -22,5 +22,3 @@ For access key permissions, you can create a policy with permissions below and a
]
}
```

For DynamoDB read access, you can simply attach AWS managed policy `AmazonDynamoDBReadOnlyAccess` to your account, you can find more details in [Attaching a policy to an IAM user group](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_groups_manage_attach-policy.html)
16 changes: 7 additions & 9 deletions metadata-ingestion/docs/sources/dynamodb/dynamodb_recipe.yml
@@ -4,16 +4,14 @@ source:
platform_instance: "AWS_ACCOUNT_ID"
aws_access_key_id: "${AWS_ACCESS_KEY_ID}"
aws_secret_access_key: "${AWS_SECRET_ACCESS_KEY}"
# User could use the below option to provide a list of primary keys of a table in dynamodb format,
# those items from given primary keys will be included when we scan the table.
# For each table we can retrieve up to 16 MB of data, which can contain as many as 100 items.
# We'll enforce the the primary keys list size not to exceed 100
# The total items we'll try to retrieve in these two scenarios:
# 1. If user don't specify include_table_item: we'll retrieve up to 100 items
# 2. If user specifies include_table_item: we'll retrieve up to 100 items plus user specified items in
# the table, with a total not more than 200 items
#
# If there are items that have most representative fields of the table, users could use the
# `include_table_item` option to provide a list of primary keys of the table in dynamodb format.
# For each `region.table`, the list of primary keys can be at most 100.
# We include these items in addition to the first 100 items in the table when we scan it.
#
# include_table_item:
# table_name:
# region.table_name:
# [
# {
# "partition_key_name": { "attribute_type": "attribute_value" },
@@ -4,7 +4,7 @@
from pydantic.fields import Field

from datahub.configuration.common import ConfigModel, ConfigurationError
from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated
from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
from datahub.metadata.schema_classes import FabricTypeClass

DEFAULT_ENV = FabricTypeClass.PROD
@@ -1,20 +1,28 @@
import warnings
from typing import Optional, Type
from typing import Any, Optional, Type

import pydantic

from datahub.configuration.common import ConfigurationWarning
from datahub.utilities.global_warning_util import add_global_warning

_unset = object()

def pydantic_field_deprecated(field: str, message: Optional[str] = None) -> classmethod:

def pydantic_field_deprecated(
field: str,
warn_if_value_is_not: Any = _unset,
message: Optional[str] = None,
) -> classmethod:
if message:
output = message
else:
output = f"{field} is deprecated and will be removed in a future release. Please remove it from your config."

def _validate_deprecated(cls: Type, values: dict) -> dict:
if field in values:
if field in values and (
warn_if_value_is_not is _unset or values[field] != warn_if_value_is_not
):
add_global_warning(output)
warnings.warn(output, ConfigurationWarning, stacklevel=2)
return values
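To illustrate the new `warn_if_value_is_not` guard added above: the deprecation warning now fires only when the deprecated field is present and set to something other than the allowed value. A self-contained sketch of the same check using a plain pydantic root validator; the model and field names are made up, and this is not the repo's actual wiring:

```python
import warnings
from typing import Optional

import pydantic

_ALLOWED = "PROD"  # hypothetical "still fine" value for the deprecated field


class ExampleConfig(pydantic.BaseModel):
    env: Optional[str] = None  # deprecated field in this sketch
    platform_instance: Optional[str] = None

    @pydantic.root_validator(pre=True)
    def _warn_on_deprecated_env(cls, values: dict) -> dict:
        # Mirror of the guard: warn only if the field is set to a non-allowed value.
        if "env" in values and values["env"] != _ALLOWED:
            warnings.warn(
                "env is deprecated and will be removed in a future release.",
                DeprecationWarning,
                stacklevel=2,
            )
        return values


ExampleConfig(env="PROD")  # no warning: value matches the allowed default
ExampleConfig(env="DEV")   # emits the deprecation warning
```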