Skip to content

Commit

Permalink
Restrict TypeMetadata reduction to specific cases, i.e. when users
specify include or exclude fields.
Browse files Browse the repository at this point in the history
  • Loading branch information
apmoriarty committed Nov 22, 2024
1 parent 6de9e50 commit 496dd79
Show file tree
Hide file tree
Showing 5 changed files with 300 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2477,29 +2477,59 @@ public void configureTypeMappings(ShardQueryConfiguration config, IteratorSettin
String nonIndexedTypes = QueryOptions.buildFieldNormalizerString(nonIndexedQueryFieldsDatatypes);
String requiredAuthsString = metadataHelper.getUsersMetadataAuthorizationSubset();

TypeMetadata typeMetadata = getTypeMetadata();

if (config.getReduceTypeMetadata() && !isPreload) {
Set<String> fieldsToRetain = ReduceFields.getQueryFields(config.getQueryTree());
typeMetadata = typeMetadata.reduce(fieldsToRetain);
}

String serializedTypeMetadata = typeMetadata.toString();

if (compressMappings) {
nonIndexedTypes = QueryOptions.compressOption(nonIndexedTypes, QueryOptions.UTF8);
requiredAuthsString = QueryOptions.compressOption(requiredAuthsString, QueryOptions.UTF8);
if (!config.getReduceTypeMetadataPerShard()) {
// if we're reducing later, don't compress the type metadata
serializedTypeMetadata = QueryOptions.compressOption(serializedTypeMetadata, QueryOptions.UTF8);
}
}

addOption(cfg, QueryOptions.NON_INDEXED_DATATYPES, nonIndexedTypes, false);
addOption(cfg, QueryOptions.TYPE_METADATA, serializedTypeMetadata, false);
addOption(cfg, QueryOptions.TYPE_METADATA_AUTHS, requiredAuthsString, false);
addOption(cfg, QueryOptions.METADATA_TABLE_NAME, config.getMetadataTableName(), false);

// now handle TypeMetadata
boolean canReduceTypeMetadata = !config.getProjectFields().isEmpty() || !config.getDisallowlistedFields().isEmpty();
if (!canReduceTypeMetadata) {
config.setReduceTypeMetadata(false);
config.setReduceTypeMetadataPerShard(false);
}

if (!isPreload) {
// TypeMetadata is serialized at the end of query planning for two reasons
// First, the metadata is fetched in an async thread so don't wait on that during a preload
// Second, the query model application updates the projection fields, so that needs to happen first
TypeMetadata typeMetadata = getTypeMetadata();

if (canReduceTypeMetadata && (config.getReduceTypeMetadata() || config.getReduceTypeMetadataPerShard())) {
// If per-shard reduction is enabled we still attempt a first-pass reduction here. This reduces
// the amount of future work done by the VisitorFunction and the raw bytes passed around.

Set<String> fieldsToRetain = new HashSet<>();
if (!config.getProjectFields().isEmpty()) {
// sum query fields, projection fields, and composite fields
fieldsToRetain.addAll(ReduceFields.getQueryFields(config.getQueryTree()));
fieldsToRetain.addAll(config.getProjectFields());
fieldsToRetain.addAll(config.getCompositeToFieldMap().keySet());
// might need to include GroupBy, Unique, and/or Excerpt fields
} else {
// sum all fields, remove exclude fields
fieldsToRetain.addAll(typeMetadata.keySet()); // metadata fetch filtered by datatype
// might need to add composite fields here
fieldsToRetain.removeAll(config.getDisallowlistedFields());
// might need to include GroupBy, Unique, and/or Excerpt fields
}

typeMetadata = typeMetadata.reduce(fieldsToRetain);
}

// only compress if enabled AND not reducing per shard
// type metadata will be serialized in the VisitorFunction
String serializedTypeMetadata = typeMetadata.toString();
if (compressMappings && !config.getReduceTypeMetadataPerShard()) {
serializedTypeMetadata = QueryOptions.compressOption(serializedTypeMetadata, QueryOptions.UTF8);
}

addOption(cfg, QueryOptions.TYPE_METADATA, serializedTypeMetadata, false);
}
} catch (IOException e) {
QueryException qe = new QueryException(DatawaveErrorCode.TYPE_MAPPING_CONFIG_ERROR, e);
throw new DatawaveQueryException(qe);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -457,13 +457,52 @@ protected void reduceQueryFields(ASTJexlScript script, IteratorSetting settings)
* @param newIteratorSetting
* the iterator settings
*/
private void reduceTypeMetadata(ASTJexlScript script, IteratorSetting newIteratorSetting) {
protected void reduceTypeMetadata(ASTJexlScript script, IteratorSetting newIteratorSetting) {

String serializedTypeMetadata = newIteratorSetting.removeOption(QueryOptions.TYPE_METADATA);
TypeMetadata typeMetadata = new TypeMetadata(serializedTypeMetadata);

Set<String> fieldsToRetain = ReduceFields.getQueryFields(script);
typeMetadata = typeMetadata.reduce(fieldsToRetain);
Map<String,String> options = newIteratorSetting.getOptions();

Set<String> fieldsToRetain = new HashSet<>();
if (options.containsKey(QueryOptions.PROJECTION_FIELDS)) {
// sum query fields, projection fields, and composite fields
fieldsToRetain.addAll(ReduceFields.getQueryFields(script));

if (options.containsKey(QueryOptions.PROJECTION_FIELDS)) {
String option = options.get(QueryOptions.PROJECTION_FIELDS);
if (org.apache.commons.lang3.StringUtils.isNotBlank(option)) {
fieldsToRetain.addAll(Splitter.on(',').splitToList(option));
}
}

if (options.containsKey(QueryOptions.COMPOSITE_FIELDS)) {
String option = options.get(QueryOptions.COMPOSITE_FIELDS);
if (org.apache.commons.lang3.StringUtils.isNotBlank(option)) {
fieldsToRetain.addAll(Splitter.on(',').splitToList(option));
}
}

} else if (options.containsKey(QueryOptions.DISALLOWLISTED_FIELDS)) {
// sum all fields and remove exclude fields
fieldsToRetain.addAll(typeMetadata.keySet());

String option = options.get(QueryOptions.DISALLOWLISTED_FIELDS);
if (org.apache.commons.lang3.StringUtils.isNotBlank(option)) {
Splitter.on(',').splitToList(option).forEach(fieldsToRetain::remove);
}
} else {
log.trace("Could not reduce type metadata per shard");
}

// we could get really clever and check to see if the query is satisfiable from the field index only,
// in which case all event-only fields could be removed. But sometimes being too clever is bad.
// Such a check could be run in the default query planner, but I'm not sure if natural query pruning via
// the range stream would falsify the field index satisfiability of a query.

if (!fieldsToRetain.isEmpty()) {
typeMetadata = typeMetadata.reduce(fieldsToRetain);
}

serializedTypeMetadata = typeMetadata.toString();

Expand Down
155 changes: 148 additions & 7 deletions warehouse/query-core/src/test/java/datawave/query/ShapesTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertSame;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

Expand All @@ -25,7 +27,6 @@
import org.apache.commons.collections.iterators.IteratorChain;
import org.apache.commons.jexl3.parser.ASTJexlScript;
import org.apache.commons.jexl3.parser.ParseException;
import org.apache.log4j.Logger;
import org.jboss.arquillian.container.test.api.Deployment;
import org.jboss.arquillian.junit.Arquillian;
import org.jboss.shrinkwrap.api.ShrinkWrap;
Expand All @@ -37,13 +38,19 @@
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Joiner;
import com.google.common.collect.Sets;

import datawave.accumulo.inmemory.InMemoryAccumuloClient;
import datawave.accumulo.inmemory.InMemoryInstance;
import datawave.configuration.spring.SpringBean;
import datawave.core.query.configuration.GenericQueryConfiguration;
import datawave.data.type.LcNoDiacriticsType;
import datawave.data.type.NoOpType;
import datawave.data.type.NumberType;
import datawave.helpers.PrintUtility;
import datawave.ingest.data.TypeRegistry;
import datawave.microservice.query.QueryImpl;
Expand Down Expand Up @@ -71,7 +78,7 @@
*/
public abstract class ShapesTest {

private static final Logger log = Logger.getLogger(ShapesTest.class);
private static final Logger log = LoggerFactory.getLogger(ShapesTest.class);
protected Authorizations auths = new Authorizations("ALL");
protected Set<Authorizations> authSet = Collections.singleton(auths);

Expand Down Expand Up @@ -242,7 +249,7 @@ public void planQuery() throws Exception {
GenericQueryConfiguration config = logic.initialize(clientForTest, settings, authSet);
logic.setupQuery(config);
} catch (Exception e) {
log.info("exception while planning query: " + e);
log.info("exception while planning query", e);
throw e;
}
}
Expand Down Expand Up @@ -270,12 +277,12 @@ public ShapesTest assertUuids() {

Set<String> missing = Sets.difference(expected, found);
if (!missing.isEmpty()) {
log.info("missing uuids: " + missing);
log.info("missing uuids: {}", missing);
}

Set<String> extra = Sets.difference(found, expected);
if (!extra.isEmpty()) {
log.info("extra uuids: " + extra);
log.info("extra uuids: {}", extra);
}

assertEquals(expected, found);
Expand All @@ -294,8 +301,8 @@ public void assertPlannedQuery(String query) {
ASTJexlScript expected = JexlASTHelper.parseAndFlattenJexlQuery(query);
ASTJexlScript plannedScript = logic.getConfig().getQueryTree();
if (!TreeEqualityVisitor.isEqual(expected, plannedScript)) {
log.info("expected: " + query);
log.info("planned : " + logic.getConfig().getQueryString());
log.info("expected: {}", query);
log.info("planned : {}", logic.getConfig().getQueryString());
fail("Planned query did not match expectation");
}
} catch (ParseException e) {
Expand Down Expand Up @@ -915,4 +922,138 @@ private void disableAllSortOptions() {
logic.setSortQueryPostIndexWithTermCounts(false);
}

@Test
public void testAttributeNormalizers() throws Exception {
withQuery("SHAPE == 'triangle'");
withExpected(new HashSet<>(triangleUids));
planAndExecuteQuery();

assertAttributeNormalizer("EDGES", NumberType.class);
assertAttributeNormalizer("ONLY_TRI", LcNoDiacriticsType.class);
assertAttributeNormalizer("PROPERTIES", NoOpType.class);
assertAttributeNormalizer("SHAPE", LcNoDiacriticsType.class);
assertAttributeNormalizer("TYPE", LcNoDiacriticsType.class);
assertAttributeNormalizer("UUID", NoOpType.class);
}

// use projection (include fields) to trigger type metadata reduction
@Test
public void testReduceTypeMetadataViaIncludeFields() throws Exception {
    // snapshot the flag so the shared query logic is restored afterwards
    boolean previous = logic.getReduceTypeMetadata();
    try {
        withIncludeFields(Set.of("EDGES", "UUID", "SHAPE"));
        logic.setReduceTypeMetadata(true);

        withQuery("SHAPE == 'triangle'");
        withExpected(new HashSet<>(triangleUids));
        planAndExecuteQuery();

        // projected fields keep their configured normalizers
        assertAttributeNormalizer("EDGES", NumberType.class);
        assertAttributeNormalizer("SHAPE", LcNoDiacriticsType.class);
        assertAttributeNormalizer("UUID", NoOpType.class);

        // fields outside the projection should not appear in the results
        for (String excluded : new String[] {"ONLY_TRI", "PROPERTIES", "TYPE"}) {
            assertFieldNotFound(excluded);
        }
    } finally {
        logic.setReduceTypeMetadata(previous);
    }
}

// use disallow-listed (exclude) fields to trigger type metadata reduction
@Test
public void testReduceTypeMetadataViaExcludeFields() throws Exception {
    // snapshot the flag so the shared query logic is restored afterwards
    boolean previous = logic.getReduceTypeMetadata();
    try {
        withExcludeFields(Set.of("ONLY_TRI", "PROPERTIES", "TYPE"));
        logic.setReduceTypeMetadata(true);

        withQuery("SHAPE == 'triangle'");
        withExpected(new HashSet<>(triangleUids));
        planAndExecuteQuery();

        // remaining fields keep their configured normalizers
        assertAttributeNormalizer("EDGES", NumberType.class);
        assertAttributeNormalizer("SHAPE", LcNoDiacriticsType.class);
        assertAttributeNormalizer("UUID", NoOpType.class);

        // excluded fields should not appear in the results
        for (String excluded : new String[] {"ONLY_TRI", "PROPERTIES", "TYPE"}) {
            assertFieldNotFound(excluded);
        }
    } finally {
        logic.setReduceTypeMetadata(previous);
    }
}

// use projection (include fields) to trigger per-shard type metadata reduction
@Test
public void testReduceTypeMetadataPerShardViaIncludeFields() throws Exception {
    // snapshot the per-shard flag so the shared query logic is restored afterwards
    boolean previous = logic.getReduceTypeMetadataPerShard();
    try {
        withIncludeFields(Set.of("EDGES", "UUID", "SHAPE"));
        logic.setReduceTypeMetadataPerShard(true);

        withQuery("SHAPE == 'triangle'");
        withExpected(new HashSet<>(triangleUids));
        planAndExecuteQuery();

        // projected fields keep their configured normalizers
        assertAttributeNormalizer("EDGES", NumberType.class);
        assertAttributeNormalizer("SHAPE", LcNoDiacriticsType.class);
        assertAttributeNormalizer("UUID", NoOpType.class);

        // fields outside the projection should not appear in the results
        for (String excluded : new String[] {"ONLY_TRI", "PROPERTIES", "TYPE"}) {
            assertFieldNotFound(excluded);
        }
    } finally {
        logic.setReduceTypeMetadataPerShard(previous);
    }
}

// use disallow-listed (exclude) fields to trigger per-shard type metadata reduction
@Test
public void testReduceTypeMetadataPerShardViaExcludeFields() throws Exception {
    // Save the per-shard flag so this test cannot leak state into other tests.
    boolean orig = logic.getReduceTypeMetadataPerShard();
    try {
        withExcludeFields(Set.of("ONLY_TRI", "PROPERTIES", "TYPE"));
        // BUG FIX: this test exercises the per-shard reduction path, so enable
        // reduceTypeMetadataPerShard. The original enabled the non-per-shard
        // reduceTypeMetadata flag while saving the per-shard one, leaving the
        // per-shard path untested — and restored the wrong flag in finally.
        logic.setReduceTypeMetadataPerShard(true);

        withQuery("SHAPE == 'triangle'");
        withExpected(new HashSet<>(triangleUids));
        planAndExecuteQuery();

        // remaining fields keep their configured normalizers
        assertAttributeNormalizer("EDGES", NumberType.class);
        assertAttributeNormalizer("SHAPE", LcNoDiacriticsType.class);
        assertAttributeNormalizer("UUID", NoOpType.class);

        // excluded fields should not appear in the results
        assertFieldNotFound("ONLY_TRI");
        assertFieldNotFound("PROPERTIES");
        assertFieldNotFound("TYPE");
    } finally {
        // restore the flag that was actually saved above
        logic.setReduceTypeMetadataPerShard(orig);
    }
}

/**
 * Configure a projection (include fields) on the query parameters as a comma-delimited list.
 *
 * @param includes
 *            the fields to return
 */
private void withIncludeFields(Set<String> includes) {
    parameters.put(QueryParameters.RETURN_FIELDS, String.join(",", includes));
}

/**
 * Configure disallow-listed (exclude) fields on the query parameters as a comma-delimited list.
 *
 * @param excludes
 *            the fields to exclude
 */
private void withExcludeFields(Set<String> excludes) {
    parameters.put(QueryParameters.DISALLOWLISTED_FIELDS, String.join(",", excludes));
}

/**
 * Assert that, in every result document, each {@code TypeAttribute} present for the given field
 * carries the expected normalizer type. Absent fields and non-TypeAttribute values are skipped.
 *
 * @param field
 *            the field to check
 * @param expectedNormalizer
 *            the expected normalizer class
 */
private void assertAttributeNormalizer(String field, Class<?> expectedNormalizer) {
    for (Document document : results) {
        Attribute<?> attribute = document.get(field);
        if (!(attribute instanceof TypeAttribute<?>)) {
            continue;
        }
        TypeAttribute<?> typed = (TypeAttribute<?>) attribute;
        assertSame(expectedNormalizer, typed.getType().getClass());
    }
}

/**
 * Assert that no result document contains an attribute for the given field.
 *
 * @param field
 *            the field expected to be absent
 */
private void assertFieldNotFound(String field) {
    for (Document document : results) {
        assertNull("Expected null value for field " + field, document.get(field));
    }
}
}
Loading

0 comments on commit 496dd79

Please sign in to comment.