From 17e34b549a5fadada8cf0156c08b17903a3d70ae Mon Sep 17 00:00:00 2001
From: yuye-aws <yuyezhu@amazon.com>
Date: Thu, 7 Mar 2024 10:47:59 +0800
Subject: [PATCH] implement chunk with map in document chunking processor

Signed-off-by: yuye-aws <yuyezhu@amazon.com>
---
 .../neuralsearch/plugin/NeuralSearch.java     |   2 +-
 .../processor/DocumentChunkingProcessor.java  | 301 +++++++-----------
 2 files changed, 122 insertions(+), 181 deletions(-)
diff --git a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java
index dc5b6e8f2..80fcf90f4 100644
--- a/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java
+++ b/src/main/java/org/opensearch/neuralsearch/plugin/NeuralSearch.java
@@ -118,7 +118,7 @@ public Map<String, Processor.Factory> getProcessors(Processor.Parameters paramet
             new TextImageEmbeddingProcessorFactory(clientAccessor, parameters.env, parameters.ingestService.getClusterService()),
             DocumentChunkingProcessor.TYPE,
             new DocumentChunkingProcessor.Factory(
-                parameters.env.settings(),
+                parameters.env,
                 parameters.ingestService.getClusterService(),
                 parameters.indicesService,
                 parameters.analysisRegistry
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java
index 290756c12..957652638 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java
@@ -9,17 +9,15 @@
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Objects;
-import java.util.LinkedHashMap;
-import java.util.stream.Collectors;
-import java.util.stream.IntStream;
 
-import com.google.common.annotations.VisibleForTesting;
 import lombok.extern.log4j.Log4j2;
+import org.apache.commons.lang3.StringUtils;
 import org.opensearch.cluster.metadata.IndexMetadata;
+import org.opensearch.env.Environment;
 import org.opensearch.index.IndexService;
 import org.opensearch.cluster.service.ClusterService;
-import org.opensearch.common.settings.Settings;
 import org.opensearch.index.analysis.AnalysisRegistry;
+import org.opensearch.index.mapper.MapperService;
 import org.opensearch.indices.IndicesService;
 import org.opensearch.index.IndexSettings;
 import org.opensearch.ingest.AbstractProcessor;
@@ -45,8 +43,6 @@ public final class DocumentChunkingProcessor extends AbstractProcessor {
 
     private final Set<String> supportedChunkers = ChunkerFactory.getAllChunkers();
 
-    private final Settings settings;
-
     private String chunkerType;
 
     private Map<String, Object> chunkerParameters;
@@ -59,12 +55,14 @@ public final class DocumentChunkingProcessor extends AbstractProcessor {
 
     private final AnalysisRegistry analysisRegistry;
 
+    private final Environment environment;
+
     public DocumentChunkingProcessor(
         String tag,
         String description,
         Map<String, Object> fieldMap,
         Map<String, Object> algorithmMap,
-        Settings settings,
+        Environment environment,
         ClusterService clusterService,
         IndicesService indicesService,
         AnalysisRegistry analysisRegistry
@@ -72,7 +70,7 @@ public DocumentChunkingProcessor(
         super(tag, description);
         validateAndParseAlgorithmMap(algorithmMap);
         this.fieldMap = fieldMap;
-        this.settings = settings;
+        this.environment = environment;
         this.clusterService = clusterService;
         this.indicesService = indicesService;
         this.analysisRegistry = analysisRegistry;
@@ -82,12 +80,6 @@ public String getType() {
         return TYPE;
     }
 
-    private List<String> chunk(String content) {
-        // assume that content is either a map, list or string
-        IFieldChunker chunker = ChunkerFactory.create(chunkerType, analysisRegistry);
-        return chunker.chunk(content, chunkerParameters);
-    }
-
     @SuppressWarnings("unchecked")
     private void validateAndParseAlgorithmMap(Map<String, Object> algorithmMap) {
         if (algorithmMap.size() != 1) {
@@ -120,23 +112,56 @@ private void validateAndParseAlgorithmMap(Map<String, Object> algorithmMap) {
         }
     }
 
-    @Override
-    public IngestDocument execute(IngestDocument ingestDocument) {
-        Map<String, Object> processMap = buildMapWithProcessorKeyAndOriginalValue(ingestDocument);
-        List<String> inferenceList = createInferenceList(processMap);
-        if (inferenceList.isEmpty()) {
-            return ingestDocument;
-        } else {
-            return doExecute(ingestDocument, processMap, inferenceList);
+    @SuppressWarnings("unchecked")
+    private boolean isListString(Object value) {
+        // an empty list is also List<String>
+        if (!(value instanceof List)) {
+            return false;
+        }
+        for (Object element : (List<Object>) value) {
+            if (!(element instanceof String)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    private List<String> chunkString(String content) {
+        // assume that content is either a map, list or string
+        IFieldChunker chunker = ChunkerFactory.create(chunkerType, analysisRegistry);
+        return chunker.chunk(content, chunkerParameters);
+    }
+
+    private List<String> chunkList(List<String> contentList) {
+        // flatten the List<List<String>> output to List<String>
+        List<String> result = new ArrayList<>();
+        for (String content : contentList) {
+            result.addAll(chunkString(content));
+        }
+        return result;
+    }
+
+    @SuppressWarnings("unchecked")
+    private List<String> chunkLeafType(Object value) {
+        // leaf type is either String or List<String>
+        List<String> chunkedResult = null;
+        if (value instanceof String) {
+            chunkedResult = chunkString(String.valueOf(value));
+        } else if (isListString(value)) {
+            chunkedResult = chunkList((List<String>) value);
         }
+        return chunkedResult;
     }
 
-    public IngestDocument doExecute(IngestDocument ingestDocument, Map<String, Object> ProcessMap, List<String> inferenceList) {
+    @Override
+    public IngestDocument execute(IngestDocument ingestDocument) {
+        validateEmbeddingFieldsValue(ingestDocument);
+
         if (Objects.equals(chunkerType, FIXED_LENGTH_ALGORITHM)) {
             // add maxTokenCount setting from index metadata to chunker parameters
             Map<String, Object> sourceAndMetadataMap = ingestDocument.getSourceAndMetadata();
             String indexName = sourceAndMetadataMap.get(IndexFieldMapper.NAME).toString();
-            int maxTokenCount = IndexSettings.MAX_TOKEN_COUNT_SETTING.get(settings);
+            int maxTokenCount = IndexSettings.MAX_TOKEN_COUNT_SETTING.get(environment.settings());
             IndexMetadata indexMetadata = clusterService.state().metadata().index(indexName);
             if (indexMetadata != null) {
                 // if the index exists, read maxTokenCount from the index setting
@@ -146,182 +171,93 @@ public IngestDocument doExecute(IngestDocument ingestDocument, Map<String, Objec
             chunkerParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount);
         }
 
-        List<List<String>> chunkedResults = new ArrayList<>();
-        for (String inferenceString : inferenceList) {
-            chunkedResults.add(chunk(inferenceString));
-        }
-        setTargetFieldsToDocument(ingestDocument, ProcessMap, chunkedResults);
+        Map<String, Object> sourceAndMetadataMap = ingestDocument.getSourceAndMetadata();
+        chunkMapType(sourceAndMetadataMap, fieldMap);
+        sourceAndMetadataMap.forEach(ingestDocument::setFieldValue);
         return ingestDocument;
     }
 
-    private List<?> buildResultForListType(List<Object> sourceValue, List<?> results, InferenceProcessor.IndexWrapper indexWrapper) {
-        Object peek = sourceValue.get(0);
-        if (peek instanceof String) {
-            List<Object> keyToResult = new ArrayList<>();
-            IntStream.range(0, sourceValue.size()).forEachOrdered(x -> keyToResult.add(results.get(indexWrapper.index++)));
-            return keyToResult;
-        } else {
-            List<List<Object>> keyToResult = new ArrayList<>();
-            for (Object nestedList : sourceValue) {
-                List<Object> nestedResult = new ArrayList<>();
-                IntStream.range(0, ((List) nestedList).size()).forEachOrdered(x -> nestedResult.add(results.get(indexWrapper.index++)));
-                keyToResult.add(nestedResult);
-            }
-            return keyToResult;
-        }
-    }
-
-    private Map<String, Object> buildMapWithProcessorKeyAndOriginalValue(IngestDocument ingestDocument) {
+    private void validateEmbeddingFieldsValue(IngestDocument ingestDocument) {
         Map<String, Object> sourceAndMetadataMap = ingestDocument.getSourceAndMetadata();
-        Map<String, Object> mapWithProcessorKeys = new LinkedHashMap<>();
-        for (Map.Entry<String, Object> fieldMapEntry : fieldMap.entrySet()) {
-            String originalKey = fieldMapEntry.getKey();
-            Object targetKey = fieldMapEntry.getValue();
-            if (targetKey instanceof Map) {
-                Map<String, Object> treeRes = new LinkedHashMap<>();
-                buildMapWithProcessorKeyAndOriginalValueForMapType(originalKey, targetKey, sourceAndMetadataMap, treeRes);
-                mapWithProcessorKeys.put(originalKey, treeRes.get(originalKey));
-            } else {
-                mapWithProcessorKeys.put(String.valueOf(targetKey), sourceAndMetadataMap.get(originalKey));
-            }
-        }
-        return mapWithProcessorKeys;
-    }
-
-    private void buildMapWithProcessorKeyAndOriginalValueForMapType(
-        String parentKey,
-        Object processorKey,
-        Map<String, Object> sourceAndMetadataMap,
-        Map<String, Object> treeRes
-    ) {
-        if (processorKey == null || sourceAndMetadataMap == null) return;
-        if (processorKey instanceof Map) {
-            Map<String, Object> next = new LinkedHashMap<>();
-            if (sourceAndMetadataMap.get(parentKey) instanceof Map) {
-                for (Map.Entry<String, Object> nestedFieldMapEntry : ((Map<String, Object>) processorKey).entrySet()) {
-                    buildMapWithProcessorKeyAndOriginalValueForMapType(
-                        nestedFieldMapEntry.getKey(),
-                        nestedFieldMapEntry.getValue(),
-                        (Map<String, Object>) sourceAndMetadataMap.get(parentKey),
-                        next
-                    );
-                }
-            } else if (sourceAndMetadataMap.get(parentKey) instanceof List) {
-                for (Map.Entry<String, Object> nestedFieldMapEntry : ((Map<String, Object>) processorKey).entrySet()) {
-                    List<Map<String, Object>> list = (List<Map<String, Object>>) sourceAndMetadataMap.get(parentKey);
-                    List<Object> listOfStrings = list.stream().map(x -> x.get(nestedFieldMapEntry.getKey())).collect(Collectors.toList());
-                    Map<String, Object> map = new LinkedHashMap<>();
-                    map.put(nestedFieldMapEntry.getKey(), listOfStrings);
-                    buildMapWithProcessorKeyAndOriginalValueForMapType(
-                        nestedFieldMapEntry.getKey(),
-                        nestedFieldMapEntry.getValue(),
-                        map,
-                        next
-                    );
+        for (Map.Entry<String, Object> embeddingFieldsEntry : fieldMap.entrySet()) {
+            Object sourceValue = sourceAndMetadataMap.get(embeddingFieldsEntry.getKey());
+            if (sourceValue != null) {
+                String sourceKey = embeddingFieldsEntry.getKey();
+                Class<?> sourceValueClass = sourceValue.getClass();
+                if (List.class.isAssignableFrom(sourceValueClass) || Map.class.isAssignableFrom(sourceValueClass)) {
+                    validateNestedTypeValue(sourceKey, sourceValue, 1);
+                } else if (!String.class.isAssignableFrom(sourceValueClass)) {
+                    throw new IllegalArgumentException("field [" + sourceKey + "] is neither string nor nested type, cannot process it");
+                } else if (StringUtils.isBlank(sourceValue.toString())) {
+                    throw new IllegalArgumentException("field [" + sourceKey + "] has empty string value, cannot process it");
                 }
             }
-            treeRes.put(parentKey, next);
-        } else {
-            String key = String.valueOf(processorKey);
-            treeRes.put(key, sourceAndMetadataMap.get(parentKey));
         }
     }
 
-    @SuppressWarnings({ "unchecked" })
-    private List<String> createInferenceList(Map<String, Object> knnKeyMap) {
-        List<String> texts = new ArrayList<>();
-        knnKeyMap.entrySet().stream().filter(knnMapEntry -> knnMapEntry.getValue() != null).forEach(knnMapEntry -> {
-            Object sourceValue = knnMapEntry.getValue();
-            if (sourceValue instanceof List) {
-                for (Object nestedValue : (List<Object>) sourceValue) {
-                    if (nestedValue instanceof String) {
-                        texts.add((String) nestedValue);
-                    } else {
-                        texts.addAll((List<String>) nestedValue);
-                    }
-                }
-            } else if (sourceValue instanceof Map) {
-                createInferenceListForMapTypeInput(sourceValue, texts);
-            } else {
-                texts.add(sourceValue.toString());
-            }
-        });
-        return texts;
-    }
-
-    @SuppressWarnings("unchecked")
-    private void createInferenceListForMapTypeInput(Object sourceValue, List<String> texts) {
-        if (sourceValue instanceof Map) {
-            ((Map<String, Object>) sourceValue).forEach((k, v) -> createInferenceListForMapTypeInput(v, texts));
-        } else if (sourceValue instanceof List) {
-            texts.addAll(((List<String>) sourceValue));
-        } else {
-            if (sourceValue == null) return;
-            texts.add(sourceValue.toString());
+    @SuppressWarnings({ "rawtypes", "unchecked" })
+    private void validateNestedTypeValue(String sourceKey, Object sourceValue, int maxDepth) {
+        if (maxDepth > MapperService.INDEX_MAPPING_DEPTH_LIMIT_SETTING.get(environment.settings())) {
+            throw new IllegalArgumentException("map type field [" + sourceKey + "] reached max depth limit, cannot process it");
+        } else if ((List.class.isAssignableFrom(sourceValue.getClass()))) {
+            validateListTypeValue(sourceKey, sourceValue, maxDepth);
+        } else if (Map.class.isAssignableFrom(sourceValue.getClass())) {
+            ((Map) sourceValue).values()
+                .stream()
+                .filter(Objects::nonNull)
+                .forEach(x -> validateNestedTypeValue(sourceKey, x, maxDepth + 1));
+        } else if (!String.class.isAssignableFrom(sourceValue.getClass())) {
+            throw new IllegalArgumentException("map type field [" + sourceKey + "] has non-string type, cannot process it");
         }
     }
 
-    private void setTargetFieldsToDocument(IngestDocument ingestDocument, Map<String, Object> processorMap, List<?> results) {
-        Objects.requireNonNull(results, "embedding failed, inference returns null result!");
-        log.debug("Model inference result fetched, starting build vector output!");
-        Map<String, Object> result = buildResult(processorMap, results, ingestDocument.getSourceAndMetadata());
-        result.forEach(ingestDocument::setFieldValue);
-    }
-
-    @VisibleForTesting
-    Map<String, Object> buildResult(Map<String, Object> processorMap, List<?> results, Map<String, Object> sourceAndMetadataMap) {
-        InferenceProcessor.IndexWrapper indexWrapper = new InferenceProcessor.IndexWrapper(0);
-        Map<String, Object> result = new LinkedHashMap<>();
-        for (Map.Entry<String, Object> knnMapEntry : processorMap.entrySet()) {
-            String knnKey = knnMapEntry.getKey();
-            Object sourceValue = knnMapEntry.getValue();
-            if (sourceValue instanceof String) {
-                result.put(knnKey, results.get(indexWrapper.index++));
-            } else if (sourceValue instanceof List) {
-                result.put(knnKey, buildResultForListType((List<Object>) sourceValue, results, indexWrapper));
-            } else if (sourceValue instanceof Map) {
-                putResultToSourceMapForMapType(knnKey, sourceValue, results, indexWrapper, sourceAndMetadataMap);
+    @SuppressWarnings({ "rawtypes" })
+    private void validateListTypeValue(String sourceKey, Object sourceValue, int maxDepth) {
+        for (Object value : (List) sourceValue) {
+            if (value instanceof Map) {
+                validateNestedTypeValue(sourceKey, value, maxDepth + 1);
+            } else if (value == null) {
+                throw new IllegalArgumentException("list type field [" + sourceKey + "] has null, cannot process it");
+            } else if (!(value instanceof String)) {
+                throw new IllegalArgumentException("list type field [" + sourceKey + "] has non string value, cannot process it");
+            } else if (StringUtils.isBlank(value.toString())) {
+                throw new IllegalArgumentException("list type field [" + sourceKey + "] has empty string, cannot process it");
             }
         }
-        return result;
     }
 
-    @SuppressWarnings({ "unchecked" })
-    private void putResultToSourceMapForMapType(
-        String processorKey,
-        Object sourceValue,
-        List<?> results,
-        InferenceProcessor.IndexWrapper indexWrapper,
-        Map<String, Object> sourceAndMetadataMap
-    ) {
-        if (processorKey == null || sourceAndMetadataMap == null || sourceValue == null) return;
-        if (sourceValue instanceof Map) {
-            for (Map.Entry<String, Object> inputNestedMapEntry : ((Map<String, Object>) sourceValue).entrySet()) {
-                if (sourceAndMetadataMap.get(processorKey) instanceof List) {
-                    // build output for list of nested objects
-                    for (Map<String, Object> nestedElement : (List<Map<String, Object>>) sourceAndMetadataMap.get(processorKey)) {
-                        nestedElement.put(inputNestedMapEntry.getKey(), results.get(indexWrapper.index++));
+    @SuppressWarnings("unchecked")
+    private void chunkMapType(Map<String, Object> sourceAndMetadataMap, Map<String, Object> fieldMap) {
+        for (Map.Entry<String, Object> fieldMapEntry : fieldMap.entrySet()) {
+            String originalKey = fieldMapEntry.getKey();
+            Object targetKey = fieldMapEntry.getValue();
+            if (targetKey instanceof Map) {
+                // call this method recursively when target key is a map
+                Object sourceObject = sourceAndMetadataMap.get(originalKey);
+                if (sourceObject instanceof List) {
+                    List<Object> sourceObjectList = (List<Object>) sourceObject;
+                    for (Object source : sourceObjectList) {
+                        if (source instanceof Map) {
+                            chunkMapType((Map<String, Object>) source, (Map<String, Object>) targetKey);
+                        }
                     }
-                } else {
-                    putResultToSourceMapForMapType(
-                        inputNestedMapEntry.getKey(),
-                        inputNestedMapEntry.getValue(),
-                        results,
-                        indexWrapper,
-                        (Map<String, Object>) sourceAndMetadataMap.get(processorKey)
-                    );
+                } else if (sourceObject instanceof Map) {
+                    chunkMapType((Map<String, Object>) sourceObject, (Map<String, Object>) targetKey);
+                }
+            } else {
+                // chunk the object when target key is a string
+                Object chunkObject = sourceAndMetadataMap.get(originalKey);
+                List<String> chunkedResult = chunkLeafType(chunkObject);
+                if (chunkedResult != null) {
+                    sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult);
                 }
             }
-        } else if (sourceValue instanceof String) {
-            sourceAndMetadataMap.put(processorKey, results.get(indexWrapper.index++));
-        } else if (sourceValue instanceof List) {
-            sourceAndMetadataMap.put(processorKey, buildResultForListType((List<Object>) sourceValue, results, indexWrapper));
         }
     }
 
     public static class Factory implements Processor.Factory {
 
-        private final Settings settings;
+        private final Environment environment;
 
         private final ClusterService clusterService;
 
@@ -329,8 +265,13 @@ public static class Factory implements Processor.Factory {
 
         private final AnalysisRegistry analysisRegistry;
 
-        public Factory(Settings settings, ClusterService clusterService, IndicesService indicesService, AnalysisRegistry analysisRegistry) {
-            this.settings = settings;
+        public Factory(
+            Environment environment,
+            ClusterService clusterService,
+            IndicesService indicesService,
+            AnalysisRegistry analysisRegistry
+        ) {
+            this.environment = environment;
             this.clusterService = clusterService;
             this.indicesService = indicesService;
             this.analysisRegistry = analysisRegistry;
@@ -350,7 +291,7 @@ public DocumentChunkingProcessor create(
                 description,
                 fieldMap,
                 algorithmMap,
-                settings,
+                environment,
                 clusterService,
                 indicesService,
                 analysisRegistry