Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ingest field configuration helper cache #2614

Draft
wants to merge 6 commits into
base: integration
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions warehouse/ingest-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,21 @@
<artifactId>javassist</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-params</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
package datawave.ingest.data.config;

import com.google.common.annotations.VisibleForTesting;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Objects;
import java.util.function.Function;
import java.util.function.Predicate;

/**
 * A {@link FieldConfigHelper} decorator that memoizes the answers of an underlying helper per field name.
 * <p>
 * Results are held in an access-ordered {@link LinkedHashMap} bounded to {@code limit} field names; when a new field name would push the map past that
 * bound, the least-recently-accessed field name is evicted together with all of its memoized attribute results. Each attribute of a field (indexed, stored,
 * tokenized, ...) is evaluated lazily, at most once per cached entry, the first time it is requested.
 * <p>
 * This class performs no synchronization of its own.
 */
public class CachedFieldConfigHelper implements FieldConfigHelper {
    private static final Logger log = LoggerFactory.getLogger(CachedFieldConfigHelper.class);

    private static final float DEFAULT_LRU_LF = 0.75f;
    private static final int EMIT_OVER_LIMIT_THRESHOLD = 100;
    // Upper bound on the number of buckets requested up front. The map still grows on demand up to the
    // configured limit; this only prevents a very large limit from overflowing the capacity arithmetic
    // or forcing an enormous table allocation on the first insert.
    private static final int MAX_LRU_PRESIZE = 1 << 16;

    private final FieldConfigHelper underlyingHelper;
    private final Map<String,CachedEntry> resultCache;
    private final Function<String,CachedEntry> resultEntryFn;

    // Number of cache entries created so far; only maintained when limit debugging is enabled.
    private long fieldComputes;
    // Set once the number of created entries has reached the limit; only maintained when limit debugging is enabled.
    private boolean fieldLimitExceeded;

    /** The field attributes whose results are memoized per field name. */
    enum AttributeType {
        INDEXED_FIELD, REVERSE_INDEXED_FIELD, TOKENIZED_FIELD, REVERSE_TOKENIZED_FIELD, STORED_FIELD, INDEX_ONLY_FIELD
    }

    /**
     * Creates a caching helper with limit debugging disabled.
     *
     * @param helper
     *            the helper whose answers are cached
     * @param limit
     *            the maximum number of field names retained in the cache
     * @throws IllegalArgumentException
     *             if {@code limit} is less than 1
     * @throws NullPointerException
     *             if {@code helper} is null
     */
    public CachedFieldConfigHelper(FieldConfigHelper helper, int limit) {
        this(helper, limit, false);
    }

    /**
     * Creates a caching helper.
     *
     * @param helper
     *            the helper whose answers are cached
     * @param limit
     *            the maximum number of field names retained in the cache
     * @param debugLimitExceeded
     *            when true, entry creations are counted and an info message is logged when the count reaches {@code limit} and on every
     *            {@value #EMIT_OVER_LIMIT_THRESHOLD}th creation thereafter
     * @throws IllegalArgumentException
     *             if {@code limit} is less than 1
     * @throws NullPointerException
     *             if {@code helper} is null
     */
    public CachedFieldConfigHelper(FieldConfigHelper helper, int limit, boolean debugLimitExceeded) {
        if (limit < 1) {
            throw new IllegalArgumentException("Limit must be a positive integer");
        }
        // Fail fast here rather than on the first lookup that dereferences the helper.
        this.underlyingHelper = Objects.requireNonNull(helper, "helper");
        this.resultCache = lruCache(limit);
        if (debugLimitExceeded) {
            this.resultEntryFn = (String f) -> {
                // Counts every entry creation, including re-creation of a previously evicted field name.
                fieldComputes++;
                boolean atLimit = fieldComputes == limit;
                boolean pastLimitOnThreshold = fieldComputes > limit && (fieldComputes % EMIT_OVER_LIMIT_THRESHOLD) == 0;
                if (atLimit || pastLimitOnThreshold) {
                    fieldLimitExceeded = true;
                    log.info("Field cache limit exceeded [val: {}, size={}, limit={}]", f, fieldComputes, limit);
                }
                return new CachedEntry(f);
            };
        } else {
            this.resultEntryFn = CachedEntry::new;
        }
    }

    @Override
    public boolean isStoredField(String fieldName) {
        return getFieldResult(AttributeType.STORED_FIELD, fieldName, underlyingHelper::isStoredField);
    }

    @Override
    public boolean isIndexedField(String fieldName) {
        return getFieldResult(AttributeType.INDEXED_FIELD, fieldName, underlyingHelper::isIndexedField);
    }

    @Override
    public boolean isIndexOnlyField(String fieldName) {
        return getFieldResult(AttributeType.INDEX_ONLY_FIELD, fieldName, underlyingHelper::isIndexOnlyField);
    }

    @Override
    public boolean isReverseIndexedField(String fieldName) {
        return getFieldResult(AttributeType.REVERSE_INDEXED_FIELD, fieldName, underlyingHelper::isReverseIndexedField);
    }

    @Override
    public boolean isTokenizedField(String fieldName) {
        return getFieldResult(AttributeType.TOKENIZED_FIELD, fieldName, underlyingHelper::isTokenizedField);
    }

    @Override
    public boolean isReverseTokenizedField(String fieldName) {
        return getFieldResult(AttributeType.REVERSE_TOKENIZED_FIELD, fieldName, underlyingHelper::isReverseTokenizedField);
    }

    /**
     * Returns the memoized result for the given attribute of the given field, evaluating {@code fn} only if that attribute has not yet been evaluated for the
     * field's current cache entry.
     *
     * @param attributeType
     *            the attribute being queried
     * @param fieldName
     *            the field name used as the cache key and passed to {@code fn}
     * @param fn
     *            the predicate that produces the result on a miss
     * @return the cached or newly evaluated result
     */
    @VisibleForTesting
    boolean getFieldResult(AttributeType attributeType, String fieldName, Predicate<String> fn) {
        CachedEntry entry = resultCache.computeIfAbsent(fieldName, resultEntryFn);
        return entry.get(attributeType).getResultOrEvaluate(fn);
    }

    /**
     * @return true once limit debugging has observed the number of created cache entries reaching the configured limit; always false when limit debugging is
     *         disabled
     */
    @VisibleForTesting
    boolean hasLimitExceeded() {
        return fieldLimitExceeded;
    }

    /**
     * Builds an access-ordered map that evicts its eldest entry whenever its size exceeds {@code maxSize}.
     *
     * @param maxSize
     *            the maximum number of entries retained
     * @return a new, empty LRU map
     */
    private static <K,V> Map<K,V> lruCache(final int maxSize) {
        // Testing showed slightly better or same performance of LRU implementation below
        // when compared to Apache Commons LRUMap

        // Computed in long so that "+ 1" cannot wrap to a negative capacity for very large limits, then
        // capped so that the limit acts as an upper bound on entries rather than an up-front allocation.
        final long wantedCapacity = (long) (maxSize / (double) DEFAULT_LRU_LF) + 1L;
        final int initialCapacity = (int) Math.min(wantedCapacity, MAX_LRU_PRESIZE);
        return new LinkedHashMap<K,V>(initialCapacity, DEFAULT_LRU_LF, true) {
            @Override
            protected boolean removeEldestEntry(Map.Entry<K,V> eldest) {
                return size() > maxSize;
            }
        };
    }

    /** Holds one lazily evaluated result per {@link AttributeType} for a single field name. */
    private static class CachedEntry {
        private final String fieldName;
        private final MemoizedResult indexed;
        private final MemoizedResult reverseIndexed;
        private final MemoizedResult stored;
        private final MemoizedResult indexedOnly;
        private final MemoizedResult tokenized;
        private final MemoizedResult reverseTokenized;

        private CachedEntry(String fieldName) {
            this.fieldName = fieldName;
            this.indexed = new MemoizedResult();
            this.reverseIndexed = new MemoizedResult();
            this.stored = new MemoizedResult();
            this.indexedOnly = new MemoizedResult();
            this.tokenized = new MemoizedResult();
            this.reverseTokenized = new MemoizedResult();
        }

        /**
         * @param attributeType
         *            the attribute whose slot is requested
         * @return the memoization slot for that attribute
         * @throws IllegalArgumentException
         *             if the attribute type has no slot
         */
        private MemoizedResult get(AttributeType attributeType) {
            switch (attributeType) {
                case INDEX_ONLY_FIELD:
                    return indexedOnly;
                case INDEXED_FIELD:
                    return indexed;
                case REVERSE_INDEXED_FIELD:
                    return reverseIndexed;
                case TOKENIZED_FIELD:
                    return tokenized;
                case REVERSE_TOKENIZED_FIELD:
                    return reverseTokenized;
                case STORED_FIELD:
                    return stored;
                default:
                    throw new IllegalArgumentException("Undefined attribute type: " + attributeType);
            }
        }

        /** A boolean evaluated at most once, against the enclosing entry's field name. */
        private class MemoizedResult {
            private boolean resultEvaluated;
            private boolean result;

            private boolean getResultOrEvaluate(Predicate<String> evaluateFn) {
                if (!resultEvaluated) {
                    result = evaluateFn.test(fieldName);
                    // Marked only after the predicate returns, so a throwing predicate is retried on the next call.
                    resultEvaluated = true;
                }
                return result;
            }
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import datawave.ingest.config.IngestConfigurationFactory;
import datawave.ingest.data.Type;
import datawave.ingest.data.TypeRegistry;
import datawave.ingest.data.config.CachedFieldConfigHelper;
import datawave.ingest.data.config.DataTypeHelperImpl;
import datawave.ingest.data.config.FieldConfigHelper;
import datawave.ingest.data.config.MarkingsHelper;
Expand Down Expand Up @@ -137,6 +138,9 @@ public abstract class BaseIngestHelper extends AbstractIngestHelper implements C
public static final String FIELD_FAILED_NORMALIZATION_POLICY = ".data.field.normalization.failure.policy";

public static final String FIELD_CONFIG_FILE = ".data.category.field.config.file";
public static final String FIELD_CONFIG_CACHE_ENABLED = ".data.category.field.config.cache.enabled";
public static final String FIELD_CONFIG_CACHE_KEY_LIMIT = ".data.category.field.config.cache.limit";
public static final String FIELD_CONFIG_CACHE_KEY_LIMIT_DEBUG = ".data.category.field.config.cache.limit.debug";

private static final Logger log = ThreadConfigurableLogger.getLogger(BaseIngestHelper.class);

Expand Down Expand Up @@ -254,10 +258,21 @@ public void setup(Configuration config) {
// Load the field helper, which takes precedence over the individual field configurations
final String fieldConfigFile = config.get(this.getType().typeName() + FIELD_CONFIG_FILE);
if (fieldConfigFile != null) {
final boolean fieldConfigCacheEnabled = config.getBoolean(this.getType().typeName() + FIELD_CONFIG_CACHE_ENABLED, false);
final boolean fieldConfigCacheLimitDebug = config.getBoolean(this.getType().typeName() + FIELD_CONFIG_CACHE_KEY_LIMIT_DEBUG, false);
final int fieldConfigCacheLimit = config.getInt(this.getType().typeName() + FIELD_CONFIG_CACHE_KEY_LIMIT, 100);
if (log.isDebugEnabled()) {
log.debug("Field config file " + fieldConfigFile + " specified for: " + this.getType().typeName() + FIELD_CONFIG_FILE);
log.debug("Field config cache enabled: " + fieldConfigCacheEnabled);
if (fieldConfigCacheEnabled) {
log.debug("Field config cache limit: " + fieldConfigCacheLimit);
log.debug("Field config cache limit debug: " + fieldConfigCacheLimitDebug);
}
}
fieldConfigHelper = XMLFieldConfigHelper.load(fieldConfigFile, this);
if (fieldConfigCacheEnabled) {
fieldConfigHelper = new CachedFieldConfigHelper(fieldConfigHelper, fieldConfigCacheLimit, fieldConfigCacheLimitDebug);
}
this.fieldConfigHelper = XMLFieldConfigHelper.load(fieldConfigFile, this);
}

// Process the indexed fields
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
package datawave.ingest.data.config;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

import java.util.concurrent.atomic.AtomicLong;

import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.EnumSource;
import org.junit.jupiter.params.provider.ValueSource;

/**
 * Unit tests for {@link CachedFieldConfigHelper}: delegation to the wrapped helper, constructor argument validation, per-attribute memoization, and
 * least-recently-used eviction once the configured limit is passed.
 */
public class CachedFieldConfigHelperTest {
    @Test
    public void testCachingBehaviorWillCallBaseMethods() {
        String fieldName = "test";
        FieldConfigHelper mockHelper = mock(FieldConfigHelper.class);
        FieldConfigHelper cachedHelper = new CachedFieldConfigHelper(mockHelper, 1);

        // Every accessor is invoked twice. verify(...) defaults to exactly one invocation, so each check
        // confirms both that the call is delegated and that the repeat is served from the cache.
        cachedHelper.isIndexOnlyField(fieldName);
        cachedHelper.isIndexOnlyField(fieldName);
        verify(mockHelper).isIndexOnlyField(eq(fieldName));

        cachedHelper.isIndexedField(fieldName);
        cachedHelper.isIndexedField(fieldName);
        verify(mockHelper).isIndexedField(eq(fieldName));

        cachedHelper.isTokenizedField(fieldName);
        cachedHelper.isTokenizedField(fieldName);
        verify(mockHelper).isTokenizedField(eq(fieldName));

        cachedHelper.isStoredField(fieldName);
        cachedHelper.isStoredField(fieldName);
        verify(mockHelper).isStoredField(eq(fieldName));

        cachedHelper.isReverseIndexedField(fieldName);
        cachedHelper.isReverseIndexedField(fieldName);
        verify(mockHelper).isReverseIndexedField(eq(fieldName));

        cachedHelper.isReverseTokenizedField(fieldName);
        cachedHelper.isReverseTokenizedField(fieldName);
        verify(mockHelper).isReverseTokenizedField(eq(fieldName));
    }

    @ParameterizedTest
    @ValueSource(ints = {-1, 0})
    public void testConstructorWithNonPositiveLimitWillThrow(int limit) {
        assertThrows(IllegalArgumentException.class, () -> new CachedFieldConfigHelper(mock(FieldConfigHelper.class), limit));
    }

    @SuppressWarnings("ClassEscapesDefinedScope")
    @ParameterizedTest
    @EnumSource(CachedFieldConfigHelper.AttributeType.class)
    public void testAttributeTypesDoNotThrow(CachedFieldConfigHelper.AttributeType attributeType) {
        String fieldName = "test";
        FieldConfigHelper mockHelper = mock(FieldConfigHelper.class);
        CachedFieldConfigHelper cachedHelper = new CachedFieldConfigHelper(mockHelper, 1);
        // Every attribute type must map to a slot, and the predicate's value must be what comes back.
        assertTrue(cachedHelper.getFieldResult(attributeType, fieldName, (f) -> true), "predicate result should be returned for " + attributeType);
    }

    @Test
    public void testCachingLimitsBetweenFieldsAndAttributeTypes() {
        AtomicLong storedCounter = new AtomicLong();
        AtomicLong indexCounter = new AtomicLong();
        FieldConfigHelper innerHelper = mock(FieldConfigHelper.class);
        CachedFieldConfigHelper helper = new CachedFieldConfigHelper(innerHelper, 2, true);

        when(innerHelper.isStoredField(any())).then((a) -> {
            storedCounter.incrementAndGet();
            return true;
        });

        when(innerHelper.isIndexedField(any())).then((a) -> {
            indexCounter.incrementAndGet();
            return true;
        });

        // following ensures that:
        // 1. fields are computed, where appropriate per attribute-type
        // 2. limit allows cache results to return
        // 3. limit blocks results to return if exceeded
        // 4. limit functions across attribute-types

        helper.getFieldResult(CachedFieldConfigHelper.AttributeType.STORED_FIELD, "field1", innerHelper::isStoredField);
        assertEquals(1, storedCounter.get(), "field1 should compute result (new field)");
        assertFalse(helper.hasLimitExceeded());

        helper.getFieldResult(CachedFieldConfigHelper.AttributeType.STORED_FIELD, "field1", innerHelper::isStoredField);
        assertEquals(1, storedCounter.get(), "field1 repeated (existing field)");
        assertFalse(helper.hasLimitExceeded());

        // the debug flag trips when the number of created entries reaches the limit (2 here)
        helper.getFieldResult(CachedFieldConfigHelper.AttributeType.STORED_FIELD, "field2", innerHelper::isStoredField);
        assertEquals(2, storedCounter.get(), "field2 should compute result (new field)");
        assertTrue(helper.hasLimitExceeded());

        helper.getFieldResult(CachedFieldConfigHelper.AttributeType.STORED_FIELD, "field2", innerHelper::isStoredField);
        assertEquals(2, storedCounter.get(), "field2 repeated (existing)");

        helper.getFieldResult(CachedFieldConfigHelper.AttributeType.INDEXED_FIELD, "field1", innerHelper::isIndexedField);
        assertEquals(1, indexCounter.get(), "field1 should compute result (new attribute)");

        helper.getFieldResult(CachedFieldConfigHelper.AttributeType.STORED_FIELD, "field3", innerHelper::isStoredField);
        assertEquals(3, storedCounter.get(), "field3 exceeded limit (new field)");

        helper.getFieldResult(CachedFieldConfigHelper.AttributeType.STORED_FIELD, "field3", innerHelper::isStoredField);
        assertEquals(3, storedCounter.get(), "field3 exceeded limit (existing field)");

        // Inserting field3 evicted field2: field1 was touched after field2 (the INDEXED_FIELD lookup above),
        // which left field2 as the least-recently-used entry. Asking for field2 again must recompute it.
        helper.getFieldResult(CachedFieldConfigHelper.AttributeType.STORED_FIELD, "field2", innerHelper::isStoredField);
        assertEquals(4, storedCounter.get(), "field2 should recompute result (evicted, treated as new field)");
    }
}
Loading