From aa593c32d8888928864e5eccef4c2f2a44666995 Mon Sep 17 00:00:00 2001 From: Pedro Silva Date: Wed, 16 Mar 2022 23:35:04 +0000 Subject: [PATCH] Flexible search on soft delete (#4405) * Adds filter logic to correct DB * Fix build * Adds documentation & fixes flag typo * apply review comments * Adds test for filtered search * Adds warning log for redundant parameter combo --- docs/how/delete-metadata.md | 3 +- .../src/datahub/cli/cli_utils.py | 10 +++ .../src/datahub/cli/delete_cli.py | 15 +++- .../query/request/SearchRequestHandler.java | 15 +++- .../request/SearchRequestHandlerTest.java | 82 ++++++++++++++++++- 5 files changed, 116 insertions(+), 9 deletions(-) diff --git a/docs/how/delete-metadata.md b/docs/how/delete-metadata.md index 1f9ec42268a06..5a414c92959f8 100644 --- a/docs/how/delete-metadata.md +++ b/docs/how/delete-metadata.md @@ -46,7 +46,8 @@ curl "http://localhost:8080/entities?action=delete" -X POST --data '{"urn": "urn ## Delete using Broader Filters -_Note: All these commands below support the soft-delete option (`-s/--soft`) as well as the dry-run option (`-n/--dry-run`)._ +_Note: All these commands below support the soft-delete option (`-s/--soft`) as well as the dry-run option (`-n/--dry-run`). Additionally, as of v0.8.29 there is a new option: `--include-removed` that deletes softly deleted entities that match the provided filter. + ### Delete all datasets in the DEV environment ``` diff --git a/metadata-ingestion/src/datahub/cli/cli_utils.py b/metadata-ingestion/src/datahub/cli/cli_utils.py index f083a4434e8ec..3c57455605374 100644 --- a/metadata-ingestion/src/datahub/cli/cli_utils.py +++ b/metadata-ingestion/src/datahub/cli/cli_utils.py @@ -301,6 +301,7 @@ def get_urns_by_filter( env: Optional[str], entity_type: str = "dataset", search_query: str = "*", + include_removed: bool = False, ) -> Iterable[str]: session, gms_host = get_session_and_host() endpoint: str = "/entities?action=search" @@ -333,6 +334,15 @@ def get_urns_by_filter( } ) + if include_removed: + filter_criteria.append( + { + "field": "removed", + "value": "true", + "condition": "EQUAL", + } + ) + search_body = { "input": search_query, "entity": entity_type, diff --git a/metadata-ingestion/src/datahub/cli/delete_cli.py b/metadata-ingestion/src/datahub/cli/delete_cli.py index f5845757747bd..86c3869eb2aa5 100644 --- a/metadata-ingestion/src/datahub/cli/delete_cli.py +++ b/metadata-ingestion/src/datahub/cli/delete_cli.py @@ -67,10 +67,7 @@ def delete_for_registry( deletion_result = DeletionResult() deletion_result.num_entities = 1 deletion_result.num_records = UNKNOWN_NUM_RECORDS # Default is unknown - registry_delete = { - "registryId": registry_id, - "dryRun": dry_run, - } + registry_delete = {"registryId": registry_id, "dryRun": dry_run, "soft": soft} ( structured_rows, entities_affected, @@ -93,6 +90,7 @@ def delete_for_registry( @click.option("--query", required=False, type=str) @click.option("--registry-id", required=False, type=str) @click.option("-n", "--dry-run", required=False, is_flag=True) +@click.option("--include-removed", required=False, is_flag=True) @telemetry.with_telemetry def delete( urn: str, @@ -104,6 +102,7 @@ def delete( query: str, registry_id: str, dry_run: bool, + include_removed: bool, ) -> None: """Delete metadata from datahub using a single urn or a combination of filters""" @@ -153,6 +152,11 @@ def delete( registry_id=registry_id, soft=soft, dry_run=dry_run ) else: + # log warn include_removed + hard is the only way to work + if include_removed and soft: + logger.warn( + "A filtered delete including soft deleted entities is redundant, because it is a soft delete by default. Please use --include-removed in conjunction with --hard" + ) # Filter based delete deletion_result = delete_with_filters( env=env, @@ -162,6 +166,7 @@ def delete( entity_type=entity_type, search_query=query, force=force, + include_removed=include_removed, ) if not dry_run: @@ -188,6 +193,7 @@ def delete_with_filters( dry_run: bool, soft: bool, force: bool, + include_removed: bool, search_query: str = "*", entity_type: str = "dataset", env: Optional[str] = None, @@ -207,6 +213,7 @@ def delete_with_filters( platform=platform, search_query=search_query, entity_type=entity_type, + include_removed=include_removed, ) ] logger.info( diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java index 55799330fe0e9..6f09403d1e8b3 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java @@ -64,6 +64,7 @@ public class SearchRequestHandler { private final Set _defaultQueryFieldNames; private final Map _filtersToDisplayName; private final int _maxTermBucketSize = 100; + private static final String REMOVED = "removed"; private SearchRequestHandler(@Nonnull EntitySpec entitySpec) { _entitySpec = entitySpec; @@ -100,8 +101,18 @@ private Set getDefaultQueryFieldNames() { public static BoolQueryBuilder getFilterQuery(@Nullable Filter filter) { BoolQueryBuilder filterQuery = ESUtils.buildFilterQuery(filter); - // Filter out entities that are marked "removed" - filterQuery.mustNot(QueryBuilders.matchQuery("removed", true)); + + boolean removedInOrFilter = false; + if (filter != null) { + removedInOrFilter = filter.getOr().stream().anyMatch( + or -> or.getAnd().stream().anyMatch(criterion -> criterion.getField().equals(REMOVED)) + ); + } + // Filter out entities that are marked "removed" if and only if filter does not contain a criterion referencing it. + if (!removedInOrFilter) { + filterQuery.mustNot(QueryBuilders.matchQuery(REMOVED, true)); + } + return filterQuery; } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java index 63464a01b1bdf..8f01067f40d89 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java @@ -5,15 +5,23 @@ import java.util.List; import java.util.Optional; import java.util.stream.Collectors; + +import com.linkedin.metadata.query.filter.Condition; +import com.linkedin.metadata.query.filter.ConjunctiveCriterion; +import com.linkedin.metadata.query.filter.ConjunctiveCriterionArray; +import com.linkedin.metadata.query.filter.Criterion; +import com.linkedin.metadata.query.filter.CriterionArray; +import com.linkedin.metadata.query.filter.Filter; import org.elasticsearch.action.search.SearchRequest; +import org.elasticsearch.index.query.BoolQueryBuilder; +import org.elasticsearch.index.query.MatchQueryBuilder; import org.elasticsearch.search.aggregations.AggregationBuilder; import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder; import org.elasticsearch.search.builder.SearchSourceBuilder; import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder; import org.testng.annotations.Test; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertTrue; +import static org.testng.Assert.*; public class SearchRequestHandlerTest { @@ -43,4 +51,74 @@ public void testSearchRequestHandler() { assertTrue(fields.contains(field + ".*")); }); } + + @Test + public void testFilteredSearch() { + + final Criterion filterCriterion = new Criterion() + .setField("keyword") + .setCondition(Condition.EQUAL) + .setValue("some value"); + + final Criterion removedCriterion = new Criterion() + .setField("removed") + .setCondition(Condition.EQUAL) + .setValue(String.valueOf(false)); + + final Filter filterWithoutRemovedCondition = new Filter().setOr( + new ConjunctiveCriterionArray( + new ConjunctiveCriterion().setAnd( + new CriterionArray(ImmutableList.of(filterCriterion))) + )); + + final SearchRequestHandler requestHandler = SearchRequestHandler.getBuilder(TestEntitySpecBuilder.getSpec()); + + final BoolQueryBuilder testQuery = (BoolQueryBuilder) requestHandler + .getSearchRequest("testQuery", filterWithoutRemovedCondition, null, 0, 10) + .source() + .query(); + + Optional mustNotHaveRemovedCondition = testQuery.must() + .stream() + .filter(or -> or instanceof BoolQueryBuilder) + .map(or -> (BoolQueryBuilder) or) + .flatMap(or -> { + System.out.println("processing: " + or.mustNot()); + return or.mustNot().stream(); + }) + .filter(and -> and instanceof MatchQueryBuilder) + .map(and -> (MatchQueryBuilder) and) + .filter(match -> match.fieldName().equals("removed")) + .findAny(); + + assertTrue(mustNotHaveRemovedCondition.isPresent(), "Expected must not have removed condition to exist" + + " if filter does not have it"); + + final Filter filterWithRemovedCondition = new Filter().setOr( + new ConjunctiveCriterionArray( + new ConjunctiveCriterion().setAnd( + new CriterionArray(ImmutableList.of(filterCriterion, removedCriterion))) + )); + + final BoolQueryBuilder queryWithRemoved = (BoolQueryBuilder) requestHandler + .getSearchRequest("testQuery", filterWithRemovedCondition, null, 0, 10) + .source() + .query(); + + mustNotHaveRemovedCondition = queryWithRemoved.must() + .stream() + .filter(or -> or instanceof BoolQueryBuilder) + .map(or -> (BoolQueryBuilder) or) + .flatMap(or -> { + System.out.println("processing: " + or.mustNot()); + return or.mustNot().stream(); + }) + .filter(and -> and instanceof MatchQueryBuilder) + .map(and -> (MatchQueryBuilder) and) + .filter(match -> match.fieldName().equals("removed")) + .findAny(); + + assertFalse(mustNotHaveRemovedCondition.isPresent(), "Expect `must not have removed` condition to not" + + " exist because filter already has it a condition for the removed property"); + } }