From 496dd79f4624029fa6fccc4f18525dcb0ce92a12 Mon Sep 17 00:00:00 2001 From: Moriarty <22225248+apmoriarty@users.noreply.github.com> Date: Wed, 20 Nov 2024 18:56:47 +0000 Subject: [PATCH] Restrict TypeMetadata reduction to specific cases, i.e. when users specify include or exclude fields. --- .../query/planner/DefaultQueryPlanner.java | 58 +++++-- .../tables/async/event/VisitorFunction.java | 45 ++++- .../test/java/datawave/query/ShapesTest.java | 155 +++++++++++++++++- .../async/event/VisitorFunctionTest.java | 63 +++++++ .../datawave/query/EventQueryLogicFactory.xml | 3 + 5 files changed, 300 insertions(+), 24 deletions(-) diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java index 8fbebd477e8..4f63194503e 100644 --- a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java +++ b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java @@ -2477,29 +2477,59 @@ public void configureTypeMappings(ShardQueryConfiguration config, IteratorSettin String nonIndexedTypes = QueryOptions.buildFieldNormalizerString(nonIndexedQueryFieldsDatatypes); String requiredAuthsString = metadataHelper.getUsersMetadataAuthorizationSubset(); - TypeMetadata typeMetadata = getTypeMetadata(); - - if (config.getReduceTypeMetadata() && !isPreload) { - Set fieldsToRetain = ReduceFields.getQueryFields(config.getQueryTree()); - typeMetadata = typeMetadata.reduce(fieldsToRetain); - } - - String serializedTypeMetadata = typeMetadata.toString(); - if (compressMappings) { nonIndexedTypes = QueryOptions.compressOption(nonIndexedTypes, QueryOptions.UTF8); requiredAuthsString = QueryOptions.compressOption(requiredAuthsString, QueryOptions.UTF8); - if (!config.getReduceTypeMetadataPerShard()) { - // if we're reducing later, don't compress the type metadata - serializedTypeMetadata = QueryOptions.compressOption(serializedTypeMetadata, QueryOptions.UTF8); - } } addOption(cfg, QueryOptions.NON_INDEXED_DATATYPES, nonIndexedTypes, false); - addOption(cfg, QueryOptions.TYPE_METADATA, serializedTypeMetadata, false); addOption(cfg, QueryOptions.TYPE_METADATA_AUTHS, requiredAuthsString, false); addOption(cfg, QueryOptions.METADATA_TABLE_NAME, config.getMetadataTableName(), false); + // now handle TypeMetadata + boolean canReduceTypeMetadata = !config.getProjectFields().isEmpty() || !config.getDisallowlistedFields().isEmpty(); + if (!canReduceTypeMetadata) { + config.setReduceTypeMetadata(false); + config.setReduceTypeMetadataPerShard(false); + } + + if (!isPreload) { + // TypeMetadata is serialized at the end of query planning for two reasons + // First, the metadata is fetched in an async thread so don't wait on that during a preload + // Second, the query model application updates the projection fields, so that needs to happen first + TypeMetadata typeMetadata = getTypeMetadata(); + + if (canReduceTypeMetadata && (config.getReduceTypeMetadata() || config.getReduceTypeMetadataPerShard())) { + // If per-shard reduction is enabled we still attempt a first-pass reduction here. This reduces + // the amount of future work done by the VisitorFunction and the raw bytes passed around. + + Set fieldsToRetain = new HashSet<>(); + if (!config.getProjectFields().isEmpty()) { + // sum query fields, projection fields, and composite fields + fieldsToRetain.addAll(ReduceFields.getQueryFields(config.getQueryTree())); + fieldsToRetain.addAll(config.getProjectFields()); + fieldsToRetain.addAll(config.getCompositeToFieldMap().keySet()); + // might need to include GroupBy, Unique, and/or Excerpt fields + } else { + // sum all fields, remove exclude fields + fieldsToRetain.addAll(typeMetadata.keySet()); // metadata fetch filtered by datatype + // might need to add composite fields here + fieldsToRetain.removeAll(config.getDisallowlistedFields()); + // might need to include GroupBy, Unique, and/or Excerpt fields + } + + typeMetadata = typeMetadata.reduce(fieldsToRetain); + } + + // only compress if enabled AND not reducing per shard + // type metadata will be serialized in the VisitorFunction + String serializedTypeMetadata = typeMetadata.toString(); + if (compressMappings && !config.getReduceTypeMetadataPerShard()) { + serializedTypeMetadata = QueryOptions.compressOption(serializedTypeMetadata, QueryOptions.UTF8); + } + + addOption(cfg, QueryOptions.TYPE_METADATA, serializedTypeMetadata, false); + } } catch (IOException e) { QueryException qe = new QueryException(DatawaveErrorCode.TYPE_MAPPING_CONFIG_ERROR, e); throw new DatawaveQueryException(qe); diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/async/event/VisitorFunction.java b/warehouse/query-core/src/main/java/datawave/query/tables/async/event/VisitorFunction.java index c30a3dfb819..71621d58a5a 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tables/async/event/VisitorFunction.java +++ b/warehouse/query-core/src/main/java/datawave/query/tables/async/event/VisitorFunction.java @@ -457,13 +457,52 @@ protected void reduceQueryFields(ASTJexlScript script, IteratorSetting settings) * @param newIteratorSetting * the iterator settings */ - private void reduceTypeMetadata(ASTJexlScript script, IteratorSetting newIteratorSetting) { + protected void reduceTypeMetadata(ASTJexlScript script, IteratorSetting newIteratorSetting) { String serializedTypeMetadata = newIteratorSetting.removeOption(QueryOptions.TYPE_METADATA); TypeMetadata typeMetadata = new TypeMetadata(serializedTypeMetadata); - Set fieldsToRetain = ReduceFields.getQueryFields(script); - typeMetadata = typeMetadata.reduce(fieldsToRetain); + Map options = newIteratorSetting.getOptions(); + + Set fieldsToRetain = new HashSet<>(); + if (options.containsKey(QueryOptions.PROJECTION_FIELDS)) { + // sum query fields, projection fields, and composite fields + fieldsToRetain.addAll(ReduceFields.getQueryFields(script)); + + if (options.containsKey(QueryOptions.PROJECTION_FIELDS)) { + String option = options.get(QueryOptions.PROJECTION_FIELDS); + if (org.apache.commons.lang3.StringUtils.isNotBlank(option)) { + fieldsToRetain.addAll(Splitter.on(',').splitToList(option)); + } + } + + if (options.containsKey(QueryOptions.COMPOSITE_FIELDS)) { + String option = options.get(QueryOptions.COMPOSITE_FIELDS); + if (org.apache.commons.lang3.StringUtils.isNotBlank(option)) { + fieldsToRetain.addAll(Splitter.on(',').splitToList(option)); + } + } + + } else if (options.containsKey(QueryOptions.DISALLOWLISTED_FIELDS)) { + // sum all fields and remove exclude fields + fieldsToRetain.addAll(typeMetadata.keySet()); + + String option = options.get(QueryOptions.DISALLOWLISTED_FIELDS); + if (org.apache.commons.lang3.StringUtils.isNotBlank(option)) { + Splitter.on(',').splitToList(option).forEach(fieldsToRetain::remove); + } + } else { + log.trace("Could not reduce type metadata per shard"); + } + + // we could get really clever and check to see if the query is satisfiable from the field index only, + // in which case all event-only fields could be removed. But sometimes being too clever is bad. + // Such a check could be run in the default query planner, but I'm not sure if natural query pruning via + // the range stream would falsify the field index satisfiability of a query. + + if (!fieldsToRetain.isEmpty()) { + typeMetadata = typeMetadata.reduce(fieldsToRetain); + } serializedTypeMetadata = typeMetadata.toString(); diff --git a/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java b/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java index a44295201ef..1b5b8fc9a5d 100644 --- a/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java @@ -2,6 +2,8 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertSame; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -25,7 +27,6 @@ import org.apache.commons.collections.iterators.IteratorChain; import org.apache.commons.jexl3.parser.ASTJexlScript; import org.apache.commons.jexl3.parser.ParseException; -import org.apache.log4j.Logger; import org.jboss.arquillian.container.test.api.Deployment; import org.jboss.arquillian.junit.Arquillian; import org.jboss.shrinkwrap.api.ShrinkWrap; @@ -37,13 +38,19 @@ import org.junit.BeforeClass; import org.junit.Test; import org.junit.runner.RunWith; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import com.google.common.base.Joiner; import com.google.common.collect.Sets; import datawave.accumulo.inmemory.InMemoryAccumuloClient; import datawave.accumulo.inmemory.InMemoryInstance; import datawave.configuration.spring.SpringBean; import datawave.core.query.configuration.GenericQueryConfiguration; +import datawave.data.type.LcNoDiacriticsType; +import datawave.data.type.NoOpType; +import datawave.data.type.NumberType; import datawave.helpers.PrintUtility; import datawave.ingest.data.TypeRegistry; import datawave.microservice.query.QueryImpl; @@ -71,7 +78,7 @@ */ public abstract class ShapesTest { - private static final Logger log = Logger.getLogger(ShapesTest.class); + private static final Logger log = LoggerFactory.getLogger(ShapesTest.class); protected Authorizations auths = new Authorizations("ALL"); protected Set authSet = Collections.singleton(auths); @@ -242,7 +249,7 @@ public void planQuery() throws Exception { GenericQueryConfiguration config = logic.initialize(clientForTest, settings, authSet); logic.setupQuery(config); } catch (Exception e) { - log.info("exception while planning query: " + e); + log.info("exception while planning query", e); throw e; } } @@ -270,12 +277,12 @@ public ShapesTest assertUuids() { Set missing = Sets.difference(expected, found); if (!missing.isEmpty()) { - log.info("missing uuids: " + missing); + log.info("missing uuids: {}", missing); } Set extra = Sets.difference(found, expected); if (!extra.isEmpty()) { - log.info("extra uuids: " + extra); + log.info("extra uuids: {}", extra); } assertEquals(expected, found); @@ -294,8 +301,8 @@ public void assertPlannedQuery(String query) { ASTJexlScript expected = JexlASTHelper.parseAndFlattenJexlQuery(query); ASTJexlScript plannedScript = logic.getConfig().getQueryTree(); if (!TreeEqualityVisitor.isEqual(expected, plannedScript)) { - log.info("expected: " + query); - log.info("planned : " + logic.getConfig().getQueryString()); + log.info("expected: {}", query); + log.info("planned : {}", logic.getConfig().getQueryString()); fail("Planned query did not match expectation"); } } catch (ParseException e) { @@ -915,4 +922,138 @@ private void disableAllSortOptions() { logic.setSortQueryPostIndexWithTermCounts(false); } + @Test + public void testAttributeNormalizers() throws Exception { + withQuery("SHAPE == 'triangle'"); + withExpected(new HashSet<>(triangleUids)); + planAndExecuteQuery(); + + assertAttributeNormalizer("EDGES", NumberType.class); + assertAttributeNormalizer("ONLY_TRI", LcNoDiacriticsType.class); + assertAttributeNormalizer("PROPERTIES", NoOpType.class); + assertAttributeNormalizer("SHAPE", LcNoDiacriticsType.class); + assertAttributeNormalizer("TYPE", LcNoDiacriticsType.class); + assertAttributeNormalizer("UUID", NoOpType.class); + } + + // use projection to trigger reduction + @Test + public void testReduceTypeMetadataViaIncludeFields() throws Exception { + boolean orig = logic.getReduceTypeMetadata(); + try { + withIncludeFields(Set.of("EDGES", "UUID", "SHAPE")); + logic.setReduceTypeMetadata(true); + + withQuery("SHAPE == 'triangle'"); + withExpected(new HashSet<>(triangleUids)); + planAndExecuteQuery(); + + assertAttributeNormalizer("EDGES", NumberType.class); + assertAttributeNormalizer("SHAPE", LcNoDiacriticsType.class); + assertAttributeNormalizer("UUID", NoOpType.class); + + assertFieldNotFound("ONLY_TRI"); + assertFieldNotFound("PROPERTIES"); + assertFieldNotFound("TYPE"); + } finally { + logic.setReduceTypeMetadata(orig); + } + } + + // use disallow listed fields to trigger reduction + @Test + public void testReduceTypeMetadataViaExcludeFields() throws Exception { + boolean orig = logic.getReduceTypeMetadata(); + try { + withExcludeFields(Set.of("ONLY_TRI", "PROPERTIES", "TYPE")); + logic.setReduceTypeMetadata(true); + + withQuery("SHAPE == 'triangle'"); + withExpected(new HashSet<>(triangleUids)); + planAndExecuteQuery(); + + assertAttributeNormalizer("EDGES", NumberType.class); + assertAttributeNormalizer("SHAPE", LcNoDiacriticsType.class); + assertAttributeNormalizer("UUID", NoOpType.class); + + assertFieldNotFound("ONLY_TRI"); + assertFieldNotFound("PROPERTIES"); + assertFieldNotFound("TYPE"); + } finally { + logic.setReduceTypeMetadata(orig); + } + } + + // use projection to trigger reduction per shard + @Test + public void testReduceTypeMetadataPerShardViaIncludeFields() throws Exception { + boolean orig = logic.getReduceTypeMetadataPerShard(); + try { + withIncludeFields(Set.of("EDGES", "UUID", "SHAPE")); + logic.setReduceTypeMetadataPerShard(true); + + withQuery("SHAPE == 'triangle'"); + withExpected(new HashSet<>(triangleUids)); + planAndExecuteQuery(); + + assertAttributeNormalizer("EDGES", NumberType.class); + assertAttributeNormalizer("SHAPE", LcNoDiacriticsType.class); + assertAttributeNormalizer("UUID", NoOpType.class); + + assertFieldNotFound("ONLY_TRI"); + assertFieldNotFound("PROPERTIES"); + assertFieldNotFound("TYPE"); + } finally { + logic.setReduceTypeMetadataPerShard(orig); + } + } + + // use disallow listed fields to trigger reduction + @Test + public void testReduceTypeMetadataPerShardViaExcludeFields() throws Exception { + boolean orig = logic.getReduceTypeMetadataPerShard(); + try { + withExcludeFields(Set.of("ONLY_TRI", "PROPERTIES", "TYPE")); + logic.setReduceTypeMetadata(true); + + withQuery("SHAPE == 'triangle'"); + withExpected(new HashSet<>(triangleUids)); + planAndExecuteQuery(); + + assertAttributeNormalizer("EDGES", NumberType.class); + assertAttributeNormalizer("SHAPE", LcNoDiacriticsType.class); + assertAttributeNormalizer("UUID", NoOpType.class); + + assertFieldNotFound("ONLY_TRI"); + assertFieldNotFound("PROPERTIES"); + assertFieldNotFound("TYPE"); + } finally { + logic.setReduceTypeMetadata(orig); + } + } + + private void withIncludeFields(Set includes) { + parameters.put(QueryParameters.RETURN_FIELDS, Joiner.on(',').join(includes)); + } + + private void withExcludeFields(Set excludes) { + parameters.put(QueryParameters.DISALLOWLISTED_FIELDS, Joiner.on(',').join(excludes)); + } + + private void assertAttributeNormalizer(String field, Class expectedNormalizer) { + for (Document result : results) { + Attribute attrs = result.get(field); + if (attrs instanceof TypeAttribute) { + TypeAttribute attr = (TypeAttribute) attrs; + assertSame(expectedNormalizer, attr.getType().getClass()); + } + } + } + + private void assertFieldNotFound(String field) { + for (Document result : results) { + Attribute attrs = result.get(field); + assertNull("Expected null value for field " + field, attrs); + } + } } diff --git a/warehouse/query-core/src/test/java/datawave/query/tables/async/event/VisitorFunctionTest.java b/warehouse/query-core/src/test/java/datawave/query/tables/async/event/VisitorFunctionTest.java index 73111fcdc88..abc4d000f4d 100644 --- a/warehouse/query-core/src/test/java/datawave/query/tables/async/event/VisitorFunctionTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/tables/async/event/VisitorFunctionTest.java @@ -2,11 +2,13 @@ import java.io.File; import java.io.IOException; +import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.util.Collections; import java.util.HashSet; +import java.util.Map; import java.util.Set; import java.util.UUID; @@ -14,6 +16,7 @@ import org.apache.accumulo.core.client.TableNotFoundException; import org.apache.accumulo.core.data.Range; import org.apache.commons.jexl3.parser.ASTJexlScript; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.easymock.EasyMock; @@ -23,6 +26,7 @@ import org.junit.Test; import datawave.core.query.configuration.QueryData; +import datawave.data.type.LcNoDiacriticsType; import datawave.microservice.query.Query; import datawave.query.config.ShardQueryConfiguration; import datawave.query.exceptions.DatawaveFatalQueryException; @@ -34,6 +38,7 @@ import datawave.query.tables.async.ScannerChunk; import datawave.query.util.MetadataHelper; import datawave.query.util.MockMetadataHelper; +import datawave.query.util.TypeMetadata; import datawave.util.TableName; public class VisitorFunctionTest extends EasyMockSupport { @@ -447,4 +452,62 @@ public void testPruneEmptyIteratorOptions() throws Exception { Assert.assertTrue(keys.contains(QueryOptions.QUERY)); Assert.assertFalse(keys.contains(QueryOptions.COMPOSITE_FIELDS)); } + + @Test + public void testTypeMetadataReductionViaIncludeFields() throws Exception { + ShardQueryConfiguration config = new ShardQueryConfiguration(); + MockMetadataHelper helper = new MockMetadataHelper(); + VisitorFunction function = new VisitorFunction(config, helper); + + ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery("FIELD_A == 'a'"); + IteratorSetting settings = new IteratorSetting(10, "itr", QueryIterator.class); + loadSettings(settings); + + // add include fields + settings.addOption(QueryOptions.PROJECTION_FIELDS, "FIELD_A,FIELD_B"); + + function.reduceTypeMetadata(script, settings); + + Map options = settings.getOptions(); + Assert.assertTrue(options.containsKey(QueryOptions.TYPE_METADATA)); + + String option = options.get(QueryOptions.TYPE_METADATA); + Assert.assertTrue(StringUtils.isNotBlank(option)); + + TypeMetadata metadata = new TypeMetadata(option); + Assert.assertEquals(Set.of("FIELD_A", "FIELD_B"), metadata.keySet()); + } + + @Test + public void testTypeMetadataReductionViaExcludeFields() throws Exception { + ShardQueryConfiguration config = new ShardQueryConfiguration(); + MockMetadataHelper helper = new MockMetadataHelper(); + VisitorFunction function = new VisitorFunction(config, helper); + + ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery("FIELD_A == 'a'"); + IteratorSetting settings = new IteratorSetting(10, "itr", QueryIterator.class); + loadSettings(settings); + + // add exclude fields + settings.addOption(QueryOptions.DISALLOWLISTED_FIELDS, "FIELD_B"); + + function.reduceTypeMetadata(script, settings); + + Map options = settings.getOptions(); + Assert.assertTrue(options.containsKey(QueryOptions.TYPE_METADATA)); + + String option = options.get(QueryOptions.TYPE_METADATA); + Assert.assertTrue(StringUtils.isNotBlank(option)); + + TypeMetadata metadata = new TypeMetadata(option); + Assert.assertEquals(Set.of("FIELD_A", "FIELD_C"), metadata.keySet()); + } + + private void loadSettings(IteratorSetting settings) { + TypeMetadata metadata = new TypeMetadata(); + metadata.put("FIELD_A", "type-a", LcNoDiacriticsType.class.getSimpleName()); + metadata.put("FIELD_B", "type-a", LcNoDiacriticsType.class.getSimpleName()); + metadata.put("FIELD_C", "type-a", LcNoDiacriticsType.class.getSimpleName()); + settings.addOption(QueryOptions.TYPE_METADATA, metadata.toString()); + } } diff --git a/warehouse/query-core/src/test/resources/datawave/query/EventQueryLogicFactory.xml b/warehouse/query-core/src/test/resources/datawave/query/EventQueryLogicFactory.xml index f652a4731b5..623a2093b04 100644 --- a/warehouse/query-core/src/test/resources/datawave/query/EventQueryLogicFactory.xml +++ b/warehouse/query-core/src/test/resources/datawave/query/EventQueryLogicFactory.xml @@ -128,6 +128,9 @@ + + +