From adf8c8db38c56250cb612b208f6e59b04c7258c6 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Tue, 24 Oct 2023 02:59:56 -0400 Subject: [PATCH 1/5] refactor(ingest): Move sqlalchemy import out of sql_types.py (#9065) --- .../src/datahub/ingestion/source/sql/athena.py | 2 +- .../src/datahub/ingestion/source/sql/sql_common.py | 2 +- .../src/datahub/ingestion/source/sql/sql_types.py | 9 +-------- .../src/datahub/utilities/sqlalchemy_type_converter.py | 6 +++++- metadata-ingestion/tests/unit/test_athena_source.py | 2 +- .../unit/utilities/test_sqlalchemy_type_converter.py | 2 +- 6 files changed, 10 insertions(+), 13 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py index dad61e5173166..06b9ad92677a2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py @@ -31,7 +31,6 @@ register_custom_type, ) from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri -from datahub.ingestion.source.sql.sql_types import MapType from datahub.ingestion.source.sql.sql_utils import ( add_table_to_schema_container, gen_database_container, @@ -41,6 +40,7 @@ from datahub.metadata.schema_classes import RecordTypeClass from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column from datahub.utilities.sqlalchemy_type_converter import ( + MapType, get_schema_fields_for_sqlalchemy_column, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 6524eea8222d4..be03858ec3ef9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -37,7 +37,6 @@ DatasetSubTypes, ) from datahub.ingestion.source.sql.sql_config import SQLCommonConfig -from datahub.ingestion.source.sql.sql_types import MapType from datahub.ingestion.source.sql.sql_utils import ( add_table_to_schema_container, downgrade_schema_from_v2, @@ -91,6 +90,7 @@ from datahub.utilities.lossy_collections import LossyList from datahub.utilities.registries.domain_registry import DomainRegistry from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport +from datahub.utilities.sqlalchemy_type_converter import MapType if TYPE_CHECKING: from datahub.ingestion.source.ge_data_profiler import ( diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py index 51626891e9fef..ae47623188f42 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py @@ -1,8 +1,6 @@ import re from typing import Any, Dict, ValuesView -from sqlalchemy import types - from datahub.metadata.com.linkedin.pegasus2avro.schema import ( ArrayType, BooleanType, @@ -17,6 +15,7 @@ TimeType, UnionType, ) +from datahub.utilities.sqlalchemy_type_converter import MapType # these can be obtained by running `select format_type(oid, null),* from pg_type;` # we've omitted the types without a meaningful DataHub type (e.g. postgres-specific types, index vectors, etc.) 
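The refactor above only relocates `MapType`; consumers switch their import to `datahub.utilities.sqlalchemy_type_converter`. A minimal sketch of the intended usage after the move (assumed, mirroring what the Athena source does with `register_custom_type` and `RecordTypeClass` in the hunk above):

```python
# Sketch of post-refactor usage (assumed): import MapType from its new home and
# register it so SQLAlchemy map/struct-like columns resolve to DataHub's
# RecordTypeClass, as the Athena source does in the diff above.
from datahub.ingestion.source.sql.sql_common import register_custom_type
from datahub.metadata.schema_classes import RecordTypeClass
from datahub.utilities.sqlalchemy_type_converter import MapType

register_custom_type(MapType, RecordTypeClass)
```
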
@@ -369,12 +368,6 @@ def resolve_vertica_modified_type(type_string: str) -> Any: "array": ArrayType, } - -class MapType(types.TupleType): - # Wrapper class around SQLalchemy's TupleType to increase compatibility with DataHub - pass - - # https://docs.aws.amazon.com/athena/latest/ug/data-types.html # https://github.com/dbt-athena/dbt-athena/tree/main ATHENA_SQL_TYPES_MAP: Dict[str, Any] = { diff --git a/metadata-ingestion/src/datahub/utilities/sqlalchemy_type_converter.py b/metadata-ingestion/src/datahub/utilities/sqlalchemy_type_converter.py index a431f262a85fd..1d5ec5dae3519 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlalchemy_type_converter.py +++ b/metadata-ingestion/src/datahub/utilities/sqlalchemy_type_converter.py @@ -7,13 +7,17 @@ from sqlalchemy_bigquery import STRUCT from datahub.ingestion.extractor.schema_util import avro_schema_to_mce_fields -from datahub.ingestion.source.sql.sql_types import MapType from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField from datahub.metadata.schema_classes import NullTypeClass, SchemaFieldDataTypeClass logger = logging.getLogger(__name__) +class MapType(types.TupleType): + # Wrapper class around SQLalchemy's TupleType to increase compatibility with DataHub + pass + + class SqlAlchemyColumnToAvroConverter: """Helper class that collects some methods to convert SQLalchemy columns to Avro schema.""" diff --git a/metadata-ingestion/tests/unit/test_athena_source.py b/metadata-ingestion/tests/unit/test_athena_source.py index 6d3ed20eafde2..23dd7dd5a6e45 100644 --- a/metadata-ingestion/tests/unit/test_athena_source.py +++ b/metadata-ingestion/tests/unit/test_athena_source.py @@ -9,7 +9,7 @@ from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.aws.s3_util import make_s3_urn from datahub.ingestion.source.sql.athena import CustomAthenaRestDialect -from datahub.ingestion.source.sql.sql_types import MapType +from datahub.utilities.sqlalchemy_type_converter import MapType FROZEN_TIME = "2020-04-14 07:00:00" diff --git a/metadata-ingestion/tests/unit/utilities/test_sqlalchemy_type_converter.py b/metadata-ingestion/tests/unit/utilities/test_sqlalchemy_type_converter.py index 959da0987a825..6c719d351c4c2 100644 --- a/metadata-ingestion/tests/unit/utilities/test_sqlalchemy_type_converter.py +++ b/metadata-ingestion/tests/unit/utilities/test_sqlalchemy_type_converter.py @@ -3,7 +3,6 @@ from sqlalchemy import types from sqlalchemy_bigquery import STRUCT -from datahub.ingestion.source.sql.sql_types import MapType from datahub.metadata.schema_classes import ( ArrayTypeClass, MapTypeClass, @@ -12,6 +11,7 @@ RecordTypeClass, ) from datahub.utilities.sqlalchemy_type_converter import ( + MapType, get_schema_fields_for_sqlalchemy_column, ) From c849246e63284bc73768ed58a22be62b708a6c48 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 24 Oct 2023 00:09:41 -0700 Subject: [PATCH 2/5] fix(ingest): add releases link (#9014) --- metadata-ingestion/setup.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index c46409ecbf52f..417588a433655 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -282,7 +282,8 @@ # Source plugins # sqlalchemy-bigquery is included here since it provides an implementation of # a SQLalchemy-conform STRUCT type definition - "athena": sql_common | {"PyAthena[SQLAlchemy]>=2.6.0,<3.0.0", "sqlalchemy-bigquery>=1.4.1"}, + "athena": sql_common + | 
{"PyAthena[SQLAlchemy]>=2.6.0,<3.0.0", "sqlalchemy-bigquery>=1.4.1"}, "azure-ad": set(), "bigquery": sql_common | bigquery_common @@ -354,7 +355,11 @@ | {"psycopg2-binary", "pymysql>=1.0.2"}, "pulsar": {"requests"}, "redash": {"redash-toolbelt", "sql-metadata"} | sqllineage_lib, - "redshift": sql_common | redshift_common | usage_common | sqlglot_lib | {"redshift-connector"}, + "redshift": sql_common + | redshift_common + | usage_common + | sqlglot_lib + | {"redshift-connector"}, "redshift-legacy": sql_common | redshift_common, "redshift-usage-legacy": sql_common | usage_common | redshift_common, "s3": {*s3_base, *data_lake_profiling}, @@ -435,7 +440,9 @@ deepdiff_dep = "deepdiff" test_api_requirements = {pytest_dep, deepdiff_dep, "PyYAML"} -debug_requirements = {"memray"} +debug_requirements = { + "memray", +} base_dev_requirements = { *base_requirements, @@ -668,6 +675,7 @@ "Documentation": "https://datahubproject.io/docs/", "Source": "https://github.com/datahub-project/datahub", "Changelog": "https://github.com/datahub-project/datahub/releases", + "Releases": "https://github.com/acryldata/datahub/releases", }, license="Apache License 2.0", description="A CLI to work with DataHub metadata", From eb0b03d2f2f2c9ce88562c32d968d095a59f8547 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Tue, 24 Oct 2023 10:45:09 -0400 Subject: [PATCH 3/5] fix(ingest/bigquery): Correctly apply table pattern to read events; fix end time calculation; deprecate match_fully_qualified_names (#9077) --- .../ingestion/source/bigquery_v2/bigquery_config.py | 7 +++---- .../datahub/ingestion/source/bigquery_v2/lineage.py | 2 +- .../src/datahub/ingestion/source/bigquery_v2/usage.py | 10 +++++++--- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index 944814b6936a4..a6a740385cf5c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -119,8 +119,8 @@ class BigQueryV2Config( ) match_fully_qualified_names: bool = Field( - default=False, - description="Whether `dataset_pattern` is matched against fully qualified dataset name `.`.", + default=True, + description="[deprecated] Whether `dataset_pattern` is matched against fully qualified dataset name `.`.", ) include_external_url: bool = Field( @@ -327,8 +327,7 @@ def backward_compatibility_configs_set(cls, values: Dict) -> Dict: ): logger.warning( "Please update `dataset_pattern` to match against fully qualified schema name `.` and set config `match_fully_qualified_names : True`." - "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. " - "The config option `match_fully_qualified_names` will be deprecated in future and the default behavior will assume `match_fully_qualified_names: True`." + "The config option `match_fully_qualified_names` is deprecated and will be removed in a future release." 
) return values diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index 98c8cbaf85eec..aa462435b8105 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -548,7 +548,7 @@ def _get_parsed_audit_log_events(self, project_id: str) -> Iterable[QueryEvent]: # handle the case where the read happens within our time range but the query # completion event is delayed and happens after the configured end time. corrected_start_time = self.start_time - self.config.max_query_duration - corrected_end_time = self.end_time + -self.config.max_query_duration + corrected_end_time = self.end_time + self.config.max_query_duration self.report.log_entry_start_time = corrected_start_time self.report.log_entry_end_time = corrected_end_time diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py index 201567e104a51..7fc38991e5928 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py @@ -335,8 +335,12 @@ def get_time_window(self) -> Tuple[datetime, datetime]: def _is_table_allowed(self, table_ref: Optional[BigQueryTableRef]) -> bool: return ( table_ref is not None - and self.config.dataset_pattern.allowed(table_ref.table_identifier.dataset) - and self.config.table_pattern.allowed(table_ref.table_identifier.table) + and self.config.dataset_pattern.allowed( + f"{table_ref.table_identifier.project_id}.{table_ref.table_identifier.dataset}" + if self.config.match_fully_qualified_names + else table_ref.table_identifier.dataset + ) + and self.config.table_pattern.allowed(str(table_ref.table_identifier)) ) def _should_ingest_usage(self) -> bool: @@ -844,7 +848,7 @@ def _get_parsed_bigquery_log_events( # handle the case where the read happens within our time range but the query # completion event is delayed and happens after the configured end time. 
corrected_start_time = self.start_time - self.config.max_query_duration - corrected_end_time = self.end_time + -self.config.max_query_duration + corrected_end_time = self.end_time + self.config.max_query_duration self.report.audit_start_time = corrected_start_time self.report.audit_end_time = corrected_end_time From d13553f53ad9e7592256cd88e78eef0ca95832e4 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 24 Oct 2023 12:24:50 -0700 Subject: [PATCH 4/5] feat(sqlparser): extract CLL from `update`s (#9078) --- .../src/datahub/utilities/sqlglot_lineage.py | 68 +++++++++++-- .../test_snowflake_update_from_table.json | 56 +++++++++++ .../test_snowflake_update_hardcoded.json | 35 +++++++ .../unit/sql_parsing/test_sqlglot_lineage.py | 96 +++++++++++++++++++ 4 files changed, 246 insertions(+), 9 deletions(-) create mode 100644 metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_from_table.json create mode 100644 metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_hardcoded.json diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index 97121b368f507..526d90b2a1bfa 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -745,6 +745,47 @@ def _extract_select_from_create( return statement +_UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT: Set[str] = set( + sqlglot.exp.Update.arg_types.keys() +) - set(sqlglot.exp.Select.arg_types.keys()) + + +def _extract_select_from_update( + statement: sqlglot.exp.Update, +) -> sqlglot.exp.Select: + statement = statement.copy() + + # The "SET" expressions need to be converted. + # For the update command, it'll be a list of EQ expressions, but the select + # should contain aliased columns. + new_expressions = [] + for expr in statement.expressions: + if isinstance(expr, sqlglot.exp.EQ) and isinstance( + expr.left, sqlglot.exp.Column + ): + new_expressions.append( + sqlglot.exp.Alias( + this=expr.right, + alias=expr.left.this, + ) + ) + else: + # If we don't know how to convert it, just leave it as-is. If this causes issues, + # they'll get caught later. + new_expressions.append(expr) + + return sqlglot.exp.Select( + **{ + **{ + k: v + for k, v in statement.args.items() + if k not in _UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT + }, + "expressions": new_expressions, + } + ) + + def _is_create_table_ddl(statement: sqlglot.exp.Expression) -> bool: return isinstance(statement, sqlglot.exp.Create) and isinstance( statement.this, sqlglot.exp.Schema @@ -767,6 +808,9 @@ def _try_extract_select( elif isinstance(statement, sqlglot.exp.Insert): # TODO Need to map column renames in the expressions part of the statement. statement = statement.expression + elif isinstance(statement, sqlglot.exp.Update): + # Assumption: the output table is already captured in the modified tables list. + statement = _extract_select_from_update(statement) elif isinstance(statement, sqlglot.exp.Create): # TODO May need to map column renames. # Assumption: the output table is already captured in the modified tables list. @@ -942,19 +986,25 @@ def _sqlglot_lineage_inner( ) # Simplify the input statement for column-level lineage generation. 
- select_statement = _try_extract_select(statement) + try: + select_statement = _try_extract_select(statement) + except Exception as e: + logger.debug(f"Failed to extract select from statement: {e}", exc_info=True) + debug_info.column_error = e + select_statement = None # Generate column-level lineage. column_lineage: Optional[List[_ColumnLineageInfo]] = None try: - column_lineage = _column_level_lineage( - select_statement, - dialect=dialect, - input_tables=table_name_schema_mapping, - output_table=downstream_table, - default_db=default_db, - default_schema=default_schema, - ) + if select_statement is not None: + column_lineage = _column_level_lineage( + select_statement, + dialect=dialect, + input_tables=table_name_schema_mapping, + output_table=downstream_table, + default_db=default_db, + default_schema=default_schema, + ) except UnsupportedStatementTypeError as e: # Inject details about the outer statement type too. e.args = (f"{e.args[0]} (outer statement type: {type(statement)})",) diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_from_table.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_from_table.json new file mode 100644 index 0000000000000..e2baa34e7fe28 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_from_table.json @@ -0,0 +1,56 @@ +{ + "query_type": "UPDATE", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table1,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table2,PROD)" + ], + "out_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)" + ], + "column_lineage": [ + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)", + "column": "col1", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "VARCHAR" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table1,PROD)", + "column": "col1" + }, + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table1,PROD)", + "column": "col2" + } + ] + }, + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)", + "column": "col2", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "VARCHAR" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table1,PROD)", + "column": "col1" + }, + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table2,PROD)", + "column": "col2" + } + ] + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_hardcoded.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_hardcoded.json new file mode 100644 index 0000000000000..b41ed61b37cdb --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_hardcoded.json @@ -0,0 +1,35 @@ +{ + "query_type": "UPDATE", + "in_tables": [], + "out_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)" + ], + "column_lineage": [ + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)", + "column": "orderkey", + "column_type": { + "type": { + 
"com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "INT" + }, + "upstreams": [] + }, + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)", + "column": "totalprice", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "INT" + }, + "upstreams": [] + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py index 059add8db67e4..dfc5b486abd35 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py @@ -3,6 +3,7 @@ import pytest from datahub.testing.check_sql_parser_result import assert_sql_result +from datahub.utilities.sqlglot_lineage import _UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT RESOURCE_DIR = pathlib.Path(__file__).parent / "goldens" @@ -672,3 +673,98 @@ def test_teradata_default_normalization(): }, expected_file=RESOURCE_DIR / "test_teradata_default_normalization.json", ) + + +def test_snowflake_update_hardcoded(): + assert_sql_result( + """ +UPDATE snowflake_sample_data.tpch_sf1.orders +SET orderkey = 1, totalprice = 2 +WHERE orderkey = 3 +""", + dialect="snowflake", + schemas={ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)": { + "orderkey": "NUMBER(38,0)", + "totalprice": "NUMBER(12,2)", + }, + }, + expected_file=RESOURCE_DIR / "test_snowflake_update_hardcoded.json", + ) + + +def test_update_from_select(): + assert _UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT == {"returning", "this"} + + +def test_snowflake_update_from_table(): + # Can create these tables with the following SQL: + """ + -- Create or replace my_table + CREATE OR REPLACE TABLE my_table ( + id INT IDENTITY PRIMARY KEY, + col1 VARCHAR(50), + col2 VARCHAR(50) + ); + + -- Create or replace table1 + CREATE OR REPLACE TABLE table1 ( + id INT IDENTITY PRIMARY KEY, + col1 VARCHAR(50), + col2 VARCHAR(50) + ); + + -- Create or replace table2 + CREATE OR REPLACE TABLE table2 ( + id INT IDENTITY PRIMARY KEY, + col2 VARCHAR(50) + ); + + -- Insert data into my_table + INSERT INTO my_table (col1, col2) + VALUES ('foo', 'bar'), + ('baz', 'qux'); + + -- Insert data into table1 + INSERT INTO table1 (col1, col2) + VALUES ('foo', 'bar'), + ('baz', 'qux'); + + -- Insert data into table2 + INSERT INTO table2 (col2) + VALUES ('bar'), + ('qux'); + """ + + assert_sql_result( + """ +UPDATE my_table +SET + col1 = t1.col1 || t1.col2, + col2 = t1.col1 || t2.col2 +FROM table1 t1 +JOIN table2 t2 ON t1.id = t2.id +WHERE my_table.id = t1.id; +""", + dialect="snowflake", + default_db="my_db", + default_schema="my_schema", + schemas={ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)": { + "id": "NUMBER(38,0)", + "col1": "VARCHAR(16777216)", + "col2": "VARCHAR(16777216)", + }, + "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table1,PROD)": { + "id": "NUMBER(38,0)", + "col1": "VARCHAR(16777216)", + "col2": "VARCHAR(16777216)", + }, + "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table2,PROD)": { + "id": "NUMBER(38,0)", + "col1": "VARCHAR(16777216)", + "col2": "VARCHAR(16777216)", + }, + }, + expected_file=RESOURCE_DIR / "test_snowflake_update_from_table.json", + ) From 378d84a346cff4061f795dd1b296bde3ea5313c1 Mon Sep 17 00:00:00 2001 From: skrydal Date: Tue, 24 Oct 2023 22:12:11 
+0200 Subject: [PATCH 5/5] fix(ui): Fixes handling of resources filters in UI (#9087) --- .../app/permissions/policy/PolicyDetailsModal.tsx | 4 ++-- .../permissions/policy/PolicyPrivilegeForm.tsx | 15 ++++++--------- .../src/app/permissions/policy/policyUtils.ts | 4 ++-- docs/authorization/policies.md | 8 ++++---- metadata-ingestion/tests/unit/serde/test_serde.py | 8 ++++---- 5 files changed, 18 insertions(+), 21 deletions(-) diff --git a/datahub-web-react/src/app/permissions/policy/PolicyDetailsModal.tsx b/datahub-web-react/src/app/permissions/policy/PolicyDetailsModal.tsx index 68e91983babdb..d3e01df3a66e8 100644 --- a/datahub-web-react/src/app/permissions/policy/PolicyDetailsModal.tsx +++ b/datahub-web-react/src/app/permissions/policy/PolicyDetailsModal.tsx @@ -67,8 +67,8 @@ export default function PolicyDetailsModal({ policy, visible, onClose, privilege const isMetadataPolicy = policy?.type === PolicyType.Metadata; const resources = convertLegacyResourceFilter(policy?.resources); - const resourceTypes = getFieldValues(resources?.filter, 'RESOURCE_TYPE') || []; - const resourceEntities = getFieldValues(resources?.filter, 'RESOURCE_URN') || []; + const resourceTypes = getFieldValues(resources?.filter, 'TYPE') || []; + const resourceEntities = getFieldValues(resources?.filter, 'URN') || []; const domains = getFieldValues(resources?.filter, 'DOMAIN') || []; const { diff --git a/datahub-web-react/src/app/permissions/policy/PolicyPrivilegeForm.tsx b/datahub-web-react/src/app/permissions/policy/PolicyPrivilegeForm.tsx index 1520388a5033a..b8e1505fceaec 100644 --- a/datahub-web-react/src/app/permissions/policy/PolicyPrivilegeForm.tsx +++ b/datahub-web-react/src/app/permissions/policy/PolicyPrivilegeForm.tsx @@ -67,8 +67,8 @@ export default function PolicyPrivilegeForm({ } = useAppConfig(); const resources: ResourceFilter = convertLegacyResourceFilter(maybeResources) || EMPTY_POLICY.resources; - const resourceTypes = getFieldValues(resources.filter, 'RESOURCE_TYPE') || []; - const resourceEntities = getFieldValues(resources.filter, 'RESOURCE_URN') || []; + const resourceTypes = getFieldValues(resources.filter, 'TYPE') || []; + const resourceEntities = getFieldValues(resources.filter, 'URN') || []; const getDisplayName = (entity) => { if (!entity) { @@ -145,10 +145,7 @@ export default function PolicyPrivilegeForm({ }; setResources({ ...resources, - filter: setFieldValues(filter, 'RESOURCE_TYPE', [ - ...resourceTypes, - createCriterionValue(selectedResourceType), - ]), + filter: setFieldValues(filter, 'TYPE', [...resourceTypes, createCriterionValue(selectedResourceType)]), }); }; @@ -160,7 +157,7 @@ export default function PolicyPrivilegeForm({ ...resources, filter: setFieldValues( filter, - 'RESOURCE_TYPE', + 'TYPE', resourceTypes?.filter((criterionValue) => criterionValue.value !== deselectedResourceType), ), }); @@ -173,7 +170,7 @@ export default function PolicyPrivilegeForm({ }; setResources({ ...resources, - filter: setFieldValues(filter, 'RESOURCE_URN', [ + filter: setFieldValues(filter, 'URN', [ ...resourceEntities, createCriterionValueWithEntity( resource, @@ -192,7 +189,7 @@ export default function PolicyPrivilegeForm({ ...resources, filter: setFieldValues( filter, - 'RESOURCE_URN', + 'URN', resourceEntities?.filter((criterionValue) => criterionValue.value !== resource), ), }); diff --git a/datahub-web-react/src/app/permissions/policy/policyUtils.ts b/datahub-web-react/src/app/permissions/policy/policyUtils.ts index c7af7342f6efa..2f178fcdeb5c3 100644 --- 
a/datahub-web-react/src/app/permissions/policy/policyUtils.ts +++ b/datahub-web-react/src/app/permissions/policy/policyUtils.ts @@ -99,10 +99,10 @@ export const convertLegacyResourceFilter = (resourceFilter: Maybe(); if (resourceFilter.type) { - criteria.push(createCriterion('RESOURCE_TYPE', [createCriterionValue(resourceFilter.type)])); + criteria.push(createCriterion('TYPE', [createCriterionValue(resourceFilter.type)])); } if (resourceFilter.resources && resourceFilter.resources.length > 0) { - criteria.push(createCriterion('RESOURCE_URN', resourceFilter.resources.map(createCriterionValue))); + criteria.push(createCriterion('URN', resourceFilter.resources.map(createCriterionValue))); } return { filter: { diff --git a/docs/authorization/policies.md b/docs/authorization/policies.md index e3606f2a3e48d..63aa6688d3eec 100644 --- a/docs/authorization/policies.md +++ b/docs/authorization/policies.md @@ -137,7 +137,7 @@ We currently support the following: #### Resources Resource filter defines the set of resources that the policy applies to is defined using a list of criteria. Each -criterion defines a field type (like resource_type, resource_urn, domain), a list of field values to compare, and a +criterion defines a field type (like type, urn, domain), a list of field values to compare, and a condition (like EQUALS). It essentially checks whether the field of a certain resource matches any of the input values. Note, that if there are no criteria or resource is not set, policy is applied to ALL resources. @@ -149,7 +149,7 @@ For example, the following resource filter will apply the policy to datasets, ch "filter": { "criteria": [ { - "field": "RESOURCE_TYPE", + "field": "TYPE", "condition": "EQUALS", "values": [ "dataset", @@ -175,8 +175,8 @@ Supported fields are as follows | Field Type | Description | Example | |---------------|------------------------|-------------------------| -| resource_type | Type of the resource | dataset, chart, dataJob | -| resource_urn | Urn of the resource | urn:li:dataset:... | +| type | Type of the resource | dataset, chart, dataJob | +| urn | Urn of the resource | urn:li:dataset:... | | domain | Domain of the resource | urn:li:domain:domainX | ## Managing Policies diff --git a/metadata-ingestion/tests/unit/serde/test_serde.py b/metadata-ingestion/tests/unit/serde/test_serde.py index d116f1f5473fa..d2d6a0bdda5b9 100644 --- a/metadata-ingestion/tests/unit/serde/test_serde.py +++ b/metadata-ingestion/tests/unit/serde/test_serde.py @@ -238,7 +238,7 @@ def test_missing_optional_simple() -> None: "criteria": [ { "condition": "EQUALS", - "field": "RESOURCE_TYPE", + "field": "TYPE", "values": ["notebook", "dataset", "dashboard"], } ] @@ -252,7 +252,7 @@ def test_missing_optional_simple() -> None: "criteria": [ { "condition": "EQUALS", - "field": "RESOURCE_TYPE", + "field": "TYPE", "values": ["notebook", "dataset", "dashboard"], } ] @@ -267,13 +267,13 @@ def test_missing_optional_simple() -> None: def test_missing_optional_in_union() -> None: # This one doesn't contain any optional fields and should work fine. 
revised_json = json.loads( - '{"lastUpdatedTimestamp":1662356745807,"actors":{"groups":[],"resourceOwners":false,"allUsers":true,"allGroups":false,"users":[]},"privileges":["EDIT_ENTITY_ASSERTIONS","EDIT_DATASET_COL_GLOSSARY_TERMS","EDIT_DATASET_COL_TAGS","EDIT_DATASET_COL_DESCRIPTION"],"displayName":"customtest","resources":{"filter":{"criteria":[{"field":"RESOURCE_TYPE","condition":"EQUALS","values":["notebook","dataset","dashboard"]}]},"allResources":false},"description":"","state":"ACTIVE","type":"METADATA"}' + '{"lastUpdatedTimestamp":1662356745807,"actors":{"groups":[],"resourceOwners":false,"allUsers":true,"allGroups":false,"users":[]},"privileges":["EDIT_ENTITY_ASSERTIONS","EDIT_DATASET_COL_GLOSSARY_TERMS","EDIT_DATASET_COL_TAGS","EDIT_DATASET_COL_DESCRIPTION"],"displayName":"customtest","resources":{"filter":{"criteria":[{"field":"TYPE","condition":"EQUALS","values":["notebook","dataset","dashboard"]}]},"allResources":false},"description":"","state":"ACTIVE","type":"METADATA"}' ) revised = models.DataHubPolicyInfoClass.from_obj(revised_json) # This one is missing the optional filters.allResources field. original_json = json.loads( - '{"privileges":["EDIT_ENTITY_ASSERTIONS","EDIT_DATASET_COL_GLOSSARY_TERMS","EDIT_DATASET_COL_TAGS","EDIT_DATASET_COL_DESCRIPTION"],"actors":{"resourceOwners":false,"groups":[],"allGroups":false,"allUsers":true,"users":[]},"lastUpdatedTimestamp":1662356745807,"displayName":"customtest","description":"","resources":{"filter":{"criteria":[{"field":"RESOURCE_TYPE","condition":"EQUALS","values":["notebook","dataset","dashboard"]}]}},"state":"ACTIVE","type":"METADATA"}' + '{"privileges":["EDIT_ENTITY_ASSERTIONS","EDIT_DATASET_COL_GLOSSARY_TERMS","EDIT_DATASET_COL_TAGS","EDIT_DATASET_COL_DESCRIPTION"],"actors":{"resourceOwners":false,"groups":[],"allGroups":false,"allUsers":true,"users":[]},"lastUpdatedTimestamp":1662356745807,"displayName":"customtest","description":"","resources":{"filter":{"criteria":[{"field":"TYPE","condition":"EQUALS","values":["notebook","dataset","dashboard"]}]}},"state":"ACTIVE","type":"METADATA"}' ) original = models.DataHubPolicyInfoClass.from_obj(original_json)
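

The serde test above exercises the renamed filter fields end-to-end. As a companion, here is a minimal, hedged sketch of building such a policy with the generated Python model classes directly; the payload values are illustrative placeholders, and the nested attribute names assume the `schema_classes` models used in the test:

```python
from datahub.metadata import schema_classes as models

# Illustrative only: a policy payload written with the renamed filter fields
# ("TYPE" / "URN" instead of the legacy "RESOURCE_TYPE" / "RESOURCE_URN").
# The shape mirrors the serde test above; names and values are placeholders.
policy_obj = {
    "displayName": "example",
    "description": "",
    "type": "METADATA",
    "state": "ACTIVE",
    "privileges": ["EDIT_ENTITY_TAGS"],
    "actors": {
        "resourceOwners": False,
        "groups": [],
        "allGroups": False,
        "allUsers": True,
        "users": [],
    },
    "resources": {
        "filter": {
            "criteria": [
                {
                    "field": "TYPE",
                    "condition": "EQUALS",
                    "values": ["dataset", "dashboard"],
                }
            ]
        }
    },
}

policy = models.DataHubPolicyInfoClass.from_obj(policy_obj)
assert policy.resources is not None
assert policy.resources.filter.criteria[0].field == "TYPE"
```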