Skip to content

Commit

Permalink
Flexible search on soft delete (#4405)
Browse files Browse the repository at this point in the history
* Adds filter logic to correct DB

* Fix build

* Adds documentation & fixes flag typo

* apply review comments

* Adds test for filtered search

* Adds warning log for redundant parameter combo
  • Loading branch information
pedro93 authored Mar 16, 2022
1 parent d4d1635 commit aa593c3
Show file tree
Hide file tree
Showing 5 changed files with 116 additions and 9 deletions.
3 changes: 2 additions & 1 deletion docs/how/delete-metadata.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ curl "http://localhost:8080/entities?action=delete" -X POST --data '{"urn": "urn

## Delete using Broader Filters

_Note: All these commands below support the soft-delete option (`-s/--soft`) as well as the dry-run option (`-n/--dry-run`)._
_Note: All these commands below support the soft-delete option (`-s/--soft`) as well as the dry-run option (`-n/--dry-run`). Additionally, as of v0.8.29 there is a new option: `--include-removed` that deletes softly deleted entities that match the provided filter.


### Delete all datasets in the DEV environment
```
Expand Down
10 changes: 10 additions & 0 deletions metadata-ingestion/src/datahub/cli/cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,7 @@ def get_urns_by_filter(
env: Optional[str],
entity_type: str = "dataset",
search_query: str = "*",
include_removed: bool = False,
) -> Iterable[str]:
session, gms_host = get_session_and_host()
endpoint: str = "/entities?action=search"
Expand Down Expand Up @@ -333,6 +334,15 @@ def get_urns_by_filter(
}
)

if include_removed:
filter_criteria.append(
{
"field": "removed",
"value": "true",
"condition": "EQUAL",
}
)

search_body = {
"input": search_query,
"entity": entity_type,
Expand Down
15 changes: 11 additions & 4 deletions metadata-ingestion/src/datahub/cli/delete_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,7 @@ def delete_for_registry(
deletion_result = DeletionResult()
deletion_result.num_entities = 1
deletion_result.num_records = UNKNOWN_NUM_RECORDS # Default is unknown
registry_delete = {
"registryId": registry_id,
"dryRun": dry_run,
}
registry_delete = {"registryId": registry_id, "dryRun": dry_run, "soft": soft}
(
structured_rows,
entities_affected,
Expand All @@ -93,6 +90,7 @@ def delete_for_registry(
@click.option("--query", required=False, type=str)
@click.option("--registry-id", required=False, type=str)
@click.option("-n", "--dry-run", required=False, is_flag=True)
@click.option("--include-removed", required=False, is_flag=True)
@telemetry.with_telemetry
def delete(
urn: str,
Expand All @@ -104,6 +102,7 @@ def delete(
query: str,
registry_id: str,
dry_run: bool,
include_removed: bool,
) -> None:
"""Delete metadata from datahub using a single urn or a combination of filters"""

Expand Down Expand Up @@ -153,6 +152,11 @@ def delete(
registry_id=registry_id, soft=soft, dry_run=dry_run
)
else:
# log warn include_removed + hard is the only way to work
if include_removed and soft:
logger.warn(
"A filtered delete including soft deleted entities is redundant, because it is a soft delete by default. Please use --include-removed in conjunction with --hard"
)
# Filter based delete
deletion_result = delete_with_filters(
env=env,
Expand All @@ -162,6 +166,7 @@ def delete(
entity_type=entity_type,
search_query=query,
force=force,
include_removed=include_removed,
)

if not dry_run:
Expand All @@ -188,6 +193,7 @@ def delete_with_filters(
dry_run: bool,
soft: bool,
force: bool,
include_removed: bool,
search_query: str = "*",
entity_type: str = "dataset",
env: Optional[str] = None,
Expand All @@ -207,6 +213,7 @@ def delete_with_filters(
platform=platform,
search_query=search_query,
entity_type=entity_type,
include_removed=include_removed,
)
]
logger.info(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ public class SearchRequestHandler {
private final Set<String> _defaultQueryFieldNames;
private final Map<String, String> _filtersToDisplayName;
private final int _maxTermBucketSize = 100;
private static final String REMOVED = "removed";

private SearchRequestHandler(@Nonnull EntitySpec entitySpec) {
_entitySpec = entitySpec;
Expand Down Expand Up @@ -100,8 +101,18 @@ private Set<String> getDefaultQueryFieldNames() {

public static BoolQueryBuilder getFilterQuery(@Nullable Filter filter) {
BoolQueryBuilder filterQuery = ESUtils.buildFilterQuery(filter);
// Filter out entities that are marked "removed"
filterQuery.mustNot(QueryBuilders.matchQuery("removed", true));

boolean removedInOrFilter = false;
if (filter != null) {
removedInOrFilter = filter.getOr().stream().anyMatch(
or -> or.getAnd().stream().anyMatch(criterion -> criterion.getField().equals(REMOVED))
);
}
// Filter out entities that are marked "removed" if and only if filter does not contain a criterion referencing it.
if (!removedInOrFilter) {
filterQuery.mustNot(QueryBuilders.matchQuery(REMOVED, true));
}

return filterQuery;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,23 @@
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

import com.linkedin.metadata.query.filter.Condition;
import com.linkedin.metadata.query.filter.ConjunctiveCriterion;
import com.linkedin.metadata.query.filter.ConjunctiveCriterionArray;
import com.linkedin.metadata.query.filter.Criterion;
import com.linkedin.metadata.query.filter.CriterionArray;
import com.linkedin.metadata.query.filter.Filter;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.MatchQueryBuilder;
import org.elasticsearch.search.aggregations.AggregationBuilder;
import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.testng.annotations.Test;

import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;
import static org.testng.Assert.*;


public class SearchRequestHandlerTest {
Expand Down Expand Up @@ -43,4 +51,74 @@ public void testSearchRequestHandler() {
assertTrue(fields.contains(field + ".*"));
});
}

@Test
public void testFilteredSearch() {

final Criterion filterCriterion = new Criterion()
.setField("keyword")
.setCondition(Condition.EQUAL)
.setValue("some value");

final Criterion removedCriterion = new Criterion()
.setField("removed")
.setCondition(Condition.EQUAL)
.setValue(String.valueOf(false));

final Filter filterWithoutRemovedCondition = new Filter().setOr(
new ConjunctiveCriterionArray(
new ConjunctiveCriterion().setAnd(
new CriterionArray(ImmutableList.of(filterCriterion)))
));

final SearchRequestHandler requestHandler = SearchRequestHandler.getBuilder(TestEntitySpecBuilder.getSpec());

final BoolQueryBuilder testQuery = (BoolQueryBuilder) requestHandler
.getSearchRequest("testQuery", filterWithoutRemovedCondition, null, 0, 10)
.source()
.query();

Optional<MatchQueryBuilder> mustNotHaveRemovedCondition = testQuery.must()
.stream()
.filter(or -> or instanceof BoolQueryBuilder)
.map(or -> (BoolQueryBuilder) or)
.flatMap(or -> {
System.out.println("processing: " + or.mustNot());
return or.mustNot().stream();
})
.filter(and -> and instanceof MatchQueryBuilder)
.map(and -> (MatchQueryBuilder) and)
.filter(match -> match.fieldName().equals("removed"))
.findAny();

assertTrue(mustNotHaveRemovedCondition.isPresent(), "Expected must not have removed condition to exist"
+ " if filter does not have it");

final Filter filterWithRemovedCondition = new Filter().setOr(
new ConjunctiveCriterionArray(
new ConjunctiveCriterion().setAnd(
new CriterionArray(ImmutableList.of(filterCriterion, removedCriterion)))
));

final BoolQueryBuilder queryWithRemoved = (BoolQueryBuilder) requestHandler
.getSearchRequest("testQuery", filterWithRemovedCondition, null, 0, 10)
.source()
.query();

mustNotHaveRemovedCondition = queryWithRemoved.must()
.stream()
.filter(or -> or instanceof BoolQueryBuilder)
.map(or -> (BoolQueryBuilder) or)
.flatMap(or -> {
System.out.println("processing: " + or.mustNot());
return or.mustNot().stream();
})
.filter(and -> and instanceof MatchQueryBuilder)
.map(and -> (MatchQueryBuilder) and)
.filter(match -> match.fieldName().equals("removed"))
.findAny();

assertFalse(mustNotHaveRemovedCondition.isPresent(), "Expect `must not have removed` condition to not"
+ " exist because filter already has it a condition for the removed property");
}
}

0 comments on commit aa593c3

Please sign in to comment.