From e8c5daf6225d10d7a172ec1376b40707f43a1ff8 Mon Sep 17 00:00:00 2001 From: Mohammad Qureshi <47198598+qreshi@users.noreply.github.com> Date: Wed, 3 Apr 2024 16:27:21 -0700 Subject: [PATCH] [Derived Fields] Add support for emitting multiple values in DerivedFieldScripts (#12837) --------- Signed-off-by: Mohammad Qureshi Signed-off-by: Rishabh Maurya Co-authored-by: Rishabh Maurya --- .../painless/PainlessModulePlugin.java | 6 + .../painless/spi/org.opensearch.derived.txt | 17 ++ .../painless/DerivedFieldScriptTests.java | 227 ++++++++++++++++++ .../mapper/DerivedFieldValueFetcher.java | 4 +- .../opensearch/script/DerivedFieldScript.java | 67 +++++- .../opensearch/script/ScriptEmitValues.java | 63 +++++ .../mapper/DerivedFieldMapperQueryTests.java | 4 +- .../index/query/DerivedFieldQueryTests.java | 4 +- .../opensearch/script/MockScriptEngine.java | 4 +- 9 files changed, 381 insertions(+), 15 deletions(-) create mode 100644 modules/lang-painless/src/main/resources/org/opensearch/painless/spi/org.opensearch.derived.txt create mode 100644 modules/lang-painless/src/test/java/org/opensearch/painless/DerivedFieldScriptTests.java create mode 100644 server/src/main/java/org/opensearch/script/ScriptEmitValues.java diff --git a/modules/lang-painless/src/main/java/org/opensearch/painless/PainlessModulePlugin.java b/modules/lang-painless/src/main/java/org/opensearch/painless/PainlessModulePlugin.java index c7638b3c41c63..55dc23f665d2e 100644 --- a/modules/lang-painless/src/main/java/org/opensearch/painless/PainlessModulePlugin.java +++ b/modules/lang-painless/src/main/java/org/opensearch/painless/PainlessModulePlugin.java @@ -60,6 +60,7 @@ import org.opensearch.repositories.RepositoriesService; import org.opensearch.rest.RestController; import org.opensearch.rest.RestHandler; +import org.opensearch.script.DerivedFieldScript; import org.opensearch.script.IngestScript; import org.opensearch.script.ScoreScript; import org.opensearch.script.ScriptContext; @@ -108,6 +109,11 @@ public final class PainlessModulePlugin extends Plugin implements ScriptPlugin, ingest.add(AllowlistLoader.loadFromResourceFiles(Allowlist.class, "org.opensearch.ingest.txt")); map.put(IngestScript.CONTEXT, ingest); + // Functions available to derived fields + List derived = new ArrayList<>(Allowlist.BASE_ALLOWLISTS); + derived.add(AllowlistLoader.loadFromResourceFiles(Allowlist.class, "org.opensearch.derived.txt")); + map.put(DerivedFieldScript.CONTEXT, derived); + allowlists = map; } diff --git a/modules/lang-painless/src/main/resources/org/opensearch/painless/spi/org.opensearch.derived.txt b/modules/lang-painless/src/main/resources/org/opensearch/painless/spi/org.opensearch.derived.txt new file mode 100644 index 0000000000000..9a3dd4894b286 --- /dev/null +++ b/modules/lang-painless/src/main/resources/org/opensearch/painless/spi/org.opensearch.derived.txt @@ -0,0 +1,17 @@ +# +# SPDX-License-Identifier: Apache-2.0 +# +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# + +# This file contains an allowlist for functions to be used in derived field context + +class org.opensearch.script.DerivedFieldScript @no_import { +} + +static_import { + void emit(org.opensearch.script.DerivedFieldScript, Object) bound_to org.opensearch.script.ScriptEmitValues$EmitSingle + void emit(org.opensearch.script.DerivedFieldScript, double, double) bound_to org.opensearch.script.ScriptEmitValues$GeoPoint +} diff --git a/modules/lang-painless/src/test/java/org/opensearch/painless/DerivedFieldScriptTests.java b/modules/lang-painless/src/test/java/org/opensearch/painless/DerivedFieldScriptTests.java new file mode 100644 index 0000000000000..2340e5b238ebb --- /dev/null +++ b/modules/lang-painless/src/test/java/org/opensearch/painless/DerivedFieldScriptTests.java @@ -0,0 +1,227 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.painless; + +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.memory.MemoryIndex; +import org.opensearch.common.collect.Tuple; +import org.opensearch.common.geo.GeoPoint; +import org.opensearch.common.settings.Settings; +import org.opensearch.index.fielddata.IndexGeoPointFieldData; +import org.opensearch.index.fielddata.IndexNumericFieldData; +import org.opensearch.index.fielddata.LeafGeoPointFieldData; +import org.opensearch.index.fielddata.LeafNumericFieldData; +import org.opensearch.index.fielddata.MultiGeoPointValues; +import org.opensearch.index.fielddata.SortedNumericDoubleValues; +import org.opensearch.index.fielddata.plain.AbstractLeafGeoPointFieldData; +import org.opensearch.index.fielddata.plain.LeafDoubleFieldData; +import org.opensearch.index.mapper.GeoPointFieldMapper.GeoPointFieldType; +import org.opensearch.index.mapper.MapperService; +import org.opensearch.index.mapper.NumberFieldMapper.NumberFieldType; +import org.opensearch.index.mapper.NumberFieldMapper.NumberType; +import org.opensearch.painless.spi.Allowlist; +import org.opensearch.painless.spi.AllowlistLoader; +import org.opensearch.script.DerivedFieldScript; +import org.opensearch.script.ScriptContext; +import org.opensearch.script.ScriptException; +import org.opensearch.search.lookup.LeafSearchLookup; +import org.opensearch.search.lookup.SearchLookup; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyInt; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class DerivedFieldScriptTests extends ScriptTestCase { + + private static PainlessScriptEngine SCRIPT_ENGINE; + + @Override + public void setUp() throws Exception { + super.setUp(); + + // Adding derived field script to the contexts for the script engine + Map, List> contexts = newDefaultContexts(); + List allowlists = new ArrayList<>(Allowlist.BASE_ALLOWLISTS); + allowlists.add(AllowlistLoader.loadFromResourceFiles(Allowlist.class, "org.opensearch.derived.txt")); + contexts.put(DerivedFieldScript.CONTEXT, allowlists); + + SCRIPT_ENGINE = new PainlessScriptEngine(Settings.EMPTY, contexts); + } + + @Override + public void tearDown() throws Exception { + super.tearDown(); + SCRIPT_ENGINE = null; + } + + @Override + protected PainlessScriptEngine getEngine() { + return SCRIPT_ENGINE; + } + + private DerivedFieldScript.LeafFactory compile(String expression, SearchLookup lookup) { + DerivedFieldScript.Factory factory = SCRIPT_ENGINE.compile( + "derived_script_test", + expression, + DerivedFieldScript.CONTEXT, + Collections.emptyMap() + ); + return factory.newFactory(Collections.emptyMap(), lookup); + } + + public void testEmittingDoubleField() throws IOException { + // Mocking field value to be returned + NumberFieldType fieldType = new NumberFieldType("test_double_field", NumberType.DOUBLE); + MapperService mapperService = mock(MapperService.class); + when(mapperService.fieldType("test_double_field")).thenReturn(fieldType); + + SortedNumericDoubleValues doubleValues = mock(SortedNumericDoubleValues.class); + when(doubleValues.docValueCount()).thenReturn(1); + when(doubleValues.advanceExact(anyInt())).thenReturn(true); + when(doubleValues.nextValue()).thenReturn(2.718); + + LeafNumericFieldData atomicFieldData = mock(LeafDoubleFieldData.class); // SortedNumericDoubleFieldData + when(atomicFieldData.getDoubleValues()).thenReturn(doubleValues); + + IndexNumericFieldData fieldData = mock(IndexNumericFieldData.class); // SortedNumericIndexFieldData + when(fieldData.getFieldName()).thenReturn("test_double_field"); + when(fieldData.load(any())).thenReturn(atomicFieldData); + + SearchLookup lookup = new SearchLookup(mapperService, (ignored, searchLookup) -> fieldData); + + // We don't need a real index, just need to construct a LeafReaderContext which cannot be mocked + MemoryIndex index = new MemoryIndex(); + LeafReaderContext leafReaderContext = index.createSearcher().getIndexReader().leaves().get(0); + + // Execute the script + DerivedFieldScript script = compile("emit(doc['test_double_field'].value)", lookup).newInstance(leafReaderContext); + script.setDocument(1); + script.execute(); + + List result = script.getEmittedValues(); + assertEquals(List.of(2.718), result); + } + + public void testEmittingGeoPoint() throws IOException { + // Mocking field value to be returned + GeoPointFieldType fieldType = new GeoPointFieldType("test_geo_field"); + MapperService mapperService = mock(MapperService.class); + when(mapperService.fieldType("test_geo_field")).thenReturn(fieldType); + + MultiGeoPointValues geoPointValues = mock(MultiGeoPointValues.class); + when(geoPointValues.docValueCount()).thenReturn(1); + when(geoPointValues.advanceExact(anyInt())).thenReturn(true); + when(geoPointValues.nextValue()).thenReturn(new GeoPoint(5, 8)); + + LeafGeoPointFieldData atomicFieldData = mock(AbstractLeafGeoPointFieldData.class); // LatLonPointDVLeafFieldData + when(atomicFieldData.getGeoPointValues()).thenReturn(geoPointValues); + + IndexGeoPointFieldData fieldData = mock(IndexGeoPointFieldData.class); + when(fieldData.getFieldName()).thenReturn("test_geo_field"); + when(fieldData.load(any())).thenReturn(atomicFieldData); + + SearchLookup lookup = new SearchLookup(mapperService, (ignored, searchLookup) -> fieldData); + + // We don't need a real index, just need to construct a LeafReaderContext which cannot be mocked + MemoryIndex index = new MemoryIndex(); + LeafReaderContext leafReaderContext = index.createSearcher().getIndexReader().leaves().get(0); + + // Execute the script + DerivedFieldScript script = compile("emit(doc['test_geo_field'].value.getLat(), doc['test_geo_field'].value.getLon())", lookup) + .newInstance(leafReaderContext); + script.setDocument(1); + script.execute(); + + List result = script.getEmittedValues(); + assertEquals(List.of(new Tuple<>(5.0, 8.0)), result); + } + + public void testEmittingMultipleValues() throws IOException { + SearchLookup lookup = mock(SearchLookup.class); + + // We don't need a real index, just need to construct a LeafReaderContext which cannot be mocked + MemoryIndex index = new MemoryIndex(); + LeafReaderContext leafReaderContext = index.createSearcher().getIndexReader().leaves().get(0); + + LeafSearchLookup leafSearchLookup = mock(LeafSearchLookup.class); + when(lookup.getLeafSearchLookup(leafReaderContext)).thenReturn(leafSearchLookup); + + // Execute the script + DerivedFieldScript script = compile( + "def l = new ArrayList(); l.add('test'); l.add('multiple'); l.add('values'); for (String x : l) emit(x)", + lookup + ).newInstance(leafReaderContext); + script.setDocument(1); + script.execute(); + + List result = script.getEmittedValues(); + assertEquals(List.of("test", "multiple", "values"), result); + } + + public void testExceedingByteSizeLimit() throws IOException { + SearchLookup lookup = mock(SearchLookup.class); + + // We don't need a real index, just need to construct a LeafReaderContext which cannot be mocked + MemoryIndex index = new MemoryIndex(); + LeafReaderContext leafReaderContext = index.createSearcher().getIndexReader().leaves().get(0); + + LeafSearchLookup leafSearchLookup = mock(LeafSearchLookup.class); + when(lookup.getLeafSearchLookup(leafReaderContext)).thenReturn(leafSearchLookup); + + // Emitting a large string to exceed the byte size limit + DerivedFieldScript stringScript = compile("for (int i = 0; i < 1024 * 1024; i++) emit('a' + i);", lookup).newInstance( + leafReaderContext + ); + expectThrows(ScriptException.class, () -> { + stringScript.setDocument(1); + stringScript.execute(); + }); + + // Emitting an integer to check byte size limit + DerivedFieldScript intScript = compile("for (int i = 0; i < 1024 * 1024; i++) emit(42)", lookup).newInstance(leafReaderContext); + expectThrows(ScriptException.class, "Expected IllegalStateException for exceeding byte size limit", () -> { + intScript.setDocument(1); + intScript.execute(); + }); + + // Emitting a long to check byte size limit + DerivedFieldScript longScript = compile("for (int i = 0; i < 1024 * 1024; i++) emit(1234567890123456789L)", lookup).newInstance( + leafReaderContext + ); + expectThrows(ScriptException.class, "Expected IllegalStateException for exceeding byte size limit", () -> { + longScript.setDocument(1); + longScript.execute(); + }); + + // Emitting a double to check byte size limit + DerivedFieldScript doubleScript = compile("for (int i = 0; i < 1024 * 1024; i++) emit(3.14159)", lookup).newInstance( + leafReaderContext + ); + expectThrows(ScriptException.class, "Expected IllegalStateException for exceeding byte size limit", () -> { + doubleScript.setDocument(1); + doubleScript.execute(); + }); + + // Emitting a GeoPoint to check byte size limit + DerivedFieldScript geoPointScript = compile("for (int i = 0; i < 1024 * 1024; i++) emit(1.23, 4.56);", lookup).newInstance( + leafReaderContext + ); + expectThrows(ScriptException.class, "Expected IllegalStateException for exceeding byte size limit", () -> { + geoPointScript.setDocument(1); + geoPointScript.execute(); + }); + } +} diff --git a/server/src/main/java/org/opensearch/index/mapper/DerivedFieldValueFetcher.java b/server/src/main/java/org/opensearch/index/mapper/DerivedFieldValueFetcher.java index f3bf0c613415a..40aa2f9890965 100644 --- a/server/src/main/java/org/opensearch/index/mapper/DerivedFieldValueFetcher.java +++ b/server/src/main/java/org/opensearch/index/mapper/DerivedFieldValueFetcher.java @@ -31,8 +31,8 @@ public DerivedFieldValueFetcher(DerivedFieldScript.LeafFactory derivedFieldScrip @Override public List fetchValues(SourceLookup lookup) { derivedFieldScript.setDocument(lookup.docId()); - // TODO: remove List.of() when derivedFieldScript.execute() returns list of objects. - return List.of(derivedFieldScript.execute()); + derivedFieldScript.execute(); + return derivedFieldScript.getEmittedValues(); } public void setNextReader(LeafReaderContext context) { diff --git a/server/src/main/java/org/opensearch/script/DerivedFieldScript.java b/server/src/main/java/org/opensearch/script/DerivedFieldScript.java index 7f5b991950ec6..e9988ec5aeef2 100644 --- a/server/src/main/java/org/opensearch/script/DerivedFieldScript.java +++ b/server/src/main/java/org/opensearch/script/DerivedFieldScript.java @@ -9,14 +9,17 @@ package org.opensearch.script; import org.apache.lucene.index.LeafReaderContext; -import org.opensearch.common.logging.DeprecationLogger; +import org.opensearch.common.collect.Tuple; import org.opensearch.index.fielddata.ScriptDocValues; import org.opensearch.search.lookup.LeafSearchLookup; import org.opensearch.search.lookup.SearchLookup; import org.opensearch.search.lookup.SourceLookup; import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.function.Function; @@ -30,7 +33,7 @@ public abstract class DerivedFieldScript { public static final String[] PARAMETERS = {}; public static final ScriptContext CONTEXT = new ScriptContext<>("derived_field", Factory.class); - private static final DeprecationLogger deprecationLogger = DeprecationLogger.getLogger(DynamicMap.class); + private static final int MAX_BYTE_SIZE = 1024 * 1024; // Maximum allowed byte size (1 MB) private static final Map> PARAMS_FUNCTIONS = Map.of( "doc", @@ -49,16 +52,27 @@ public abstract class DerivedFieldScript { */ private final LeafSearchLookup leafLookup; + /** + * The field values emitted from the script. + */ + private List emittedValues; + + private int totalByteSize; + public DerivedFieldScript(Map params, SearchLookup lookup, LeafReaderContext leafContext) { Map parameters = new HashMap<>(params); this.leafLookup = lookup.getLeafSearchLookup(leafContext); parameters.putAll(leafLookup.asMap()); this.params = new DynamicMap(parameters, PARAMS_FUNCTIONS); + this.emittedValues = new ArrayList<>(); + this.totalByteSize = 0; } - protected DerivedFieldScript() { - params = null; - leafLookup = null; + public DerivedFieldScript() { + this.params = null; + this.leafLookup = null; + this.emittedValues = new ArrayList<>(); + this.totalByteSize = 0; } /** @@ -75,14 +89,54 @@ public Map> getDoc() { return leafLookup.doc(); } + /** + * Return the emitted values from the script execution. + */ + public List getEmittedValues() { + return emittedValues; + } + /** * Set the current document to run the script on next. + * Clears the emittedValues as well since they should be scoped per document. */ public void setDocument(int docid) { + this.emittedValues = new ArrayList<>(); + this.totalByteSize = 0; leafLookup.setDocument(docid); } - public abstract Object execute(); + public void addEmittedValue(Object o) { + int byteSize = getObjectByteSize(o); + int newTotalByteSize = totalByteSize + byteSize; + if (newTotalByteSize <= MAX_BYTE_SIZE) { + emittedValues.add(o); + totalByteSize = newTotalByteSize; + } else { + throw new IllegalStateException("Exceeded maximum allowed byte size for emitted values"); + } + } + + private int getObjectByteSize(Object obj) { + if (obj instanceof String) { + return ((String) obj).getBytes(StandardCharsets.UTF_8).length; + } else if (obj instanceof Integer) { + return Integer.BYTES; + } else if (obj instanceof Long) { + return Long.BYTES; + } else if (obj instanceof Double) { + return Double.BYTES; + } else if (obj instanceof Boolean) { + return Byte.BYTES; // Assuming 1 byte for boolean + } else if (obj instanceof Tuple) { + // Assuming each element in the tuple is a double for GeoPoint case + return Double.BYTES * 2; + } else { + throw new IllegalArgumentException("Unsupported object type passed in emit()"); + } + } + + public void execute() {} /** * A factory to construct {@link DerivedFieldScript} instances. @@ -95,7 +149,6 @@ public interface LeafFactory { /** * A factory to construct stateful {@link DerivedFieldScript} factories for a specific index. - * * @opensearch.internal */ public interface Factory extends ScriptFactory { diff --git a/server/src/main/java/org/opensearch/script/ScriptEmitValues.java b/server/src/main/java/org/opensearch/script/ScriptEmitValues.java new file mode 100644 index 0000000000000..5d12f36442179 --- /dev/null +++ b/server/src/main/java/org/opensearch/script/ScriptEmitValues.java @@ -0,0 +1,63 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.script; + +import org.opensearch.common.collect.Tuple; + +/** + * Values that can be emitted in a derived field script context. + *

+ * The emit function can be called multiple times within a script definition + * so the function will handle collecting the values over the script execution. + */ +public final class ScriptEmitValues { + + /** + * Takes in a single value and emits it + * Could be a long, double, String, etc. + */ + public static final class EmitSingle { + + private final DerivedFieldScript derivedFieldScript; + + public EmitSingle(DerivedFieldScript derivedFieldScript) { + this.derivedFieldScript = derivedFieldScript; + } + + // TODO: Keeping this generic for the time being due to limitations with + // binding methods with the same name and arity. + // Ideally, we should have an emit signature per derived field type and try to scope + // that to the respective script execution so the other emits aren't allowed. + // One way to do this could be to create implementations of the DerivedFieldScript.LeafFactory + // per field type where they each define their own emit() method and then the engine that executes + // it can have custom compilation logic to perform class bindings on that emit implementation. + public void emit(Object val) { + derivedFieldScript.addEmittedValue(val); + } + + } + + /** + * Emits a GeoPoint value + */ + public static final class GeoPoint { + + private final DerivedFieldScript derivedFieldScript; + + public GeoPoint(DerivedFieldScript derivedFieldScript) { + this.derivedFieldScript = derivedFieldScript; + } + + public void emit(double lat, double lon) { + derivedFieldScript.addEmittedValue(new Tuple<>(lat, lon)); + } + + } + +} diff --git a/server/src/test/java/org/opensearch/index/mapper/DerivedFieldMapperQueryTests.java b/server/src/test/java/org/opensearch/index/mapper/DerivedFieldMapperQueryTests.java index 64f4f1f3e083e..bd6d7b88ade28 100644 --- a/server/src/test/java/org/opensearch/index/mapper/DerivedFieldMapperQueryTests.java +++ b/server/src/test/java/org/opensearch/index/mapper/DerivedFieldMapperQueryTests.java @@ -197,8 +197,8 @@ public void setDocument(int docId) { } @Override - public Object execute() { - return raw_requests[docId][scriptIndex[0]]; + public void execute() { + addEmittedValue(raw_requests[docId][scriptIndex[0]]); } }; diff --git a/server/src/test/java/org/opensearch/index/query/DerivedFieldQueryTests.java b/server/src/test/java/org/opensearch/index/query/DerivedFieldQueryTests.java index 18d117fa8c0f5..1bb303a874b9a 100644 --- a/server/src/test/java/org/opensearch/index/query/DerivedFieldQueryTests.java +++ b/server/src/test/java/org/opensearch/index/query/DerivedFieldQueryTests.java @@ -67,8 +67,8 @@ public void testDerivedField() throws IOException { when(searchLookup.getLeafSearchLookup(ctx)).thenReturn(leafLookup); return new DerivedFieldScript(params, lookup, ctx) { @Override - public Object execute() { - return raw_requests[sourceLookup.docId()][2]; + public void execute() { + addEmittedValue(raw_requests[sourceLookup.docId()][2]); } }; }; diff --git a/test/framework/src/main/java/org/opensearch/script/MockScriptEngine.java b/test/framework/src/main/java/org/opensearch/script/MockScriptEngine.java index 8c7e9718eb0cd..456b55883f91e 100644 --- a/test/framework/src/main/java/org/opensearch/script/MockScriptEngine.java +++ b/test/framework/src/main/java/org/opensearch/script/MockScriptEngine.java @@ -288,10 +288,10 @@ public double execute(Map params1, double[] values) { ctx ) { @Override - public Object execute() { + public void execute() { Map vars = new HashMap<>(derivedFieldsParams); vars.put("params", derivedFieldsParams); - return script.apply(vars); + script.apply(vars); } }; return context.factoryClazz.cast(factory);