Fix parseToken array popping (#13620)

* Fix parseToken array popping Signed-off-by: naomichi-y <[email protected]> * Add fix description Signed-off-by: naomichi-y <[email protected]> * Refactor JsonToStringXContentParser#parseToken This method was pretty complicated, hard to follow, and therefore prone to bugs. This change introduces a proper stack (rather than a StringBuilder) to keep track of fields under fields. When ever we parse a field name, we push it onto the stack, recursively parse its value (an object, an array, or a primitive value), then we pop (guaranteeing pushes and pops are balanced). Signed-off-by: Michael Froh <[email protected]> * Remove use of org.apache.logging.log4j.util.Strings Signed-off-by: Michael Froh <[email protected]> * Make sure JsonToStringXContentParser ends on END_OBJECT There is logic in DocumentParser that expects any object parser to start with currentToken() pointing to START_OBJECT, and return with currentToken() pointing to END_OBJECT. My previous commits were stepping over the start/end, and returning on the token following the end. Signed-off-by: Michael Froh <[email protected]> * Throw exception when flat object is null The existing behavior throws an exception on null flat object by advancing the parser beyond the end of the flat object input. We could simply skip a null flat_object field, but this would be a behavioral change from 2.7. Signed-off-by: Michael Froh <[email protected]> --------- Signed-off-by: naomichi-y <[email protected]> Signed-off-by: Michael Froh <[email protected]> Co-authored-by: Michael Froh <[email protected]>
opensearch-project · Aug 13, 2024 · 6e27e5a · 6e27e5a
1 parent 625dd5a
commit 6e27e5a
Show file tree

Hide file tree

Showing 4 changed files with 117 additions and 72 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -46,6 +46,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Fix missing value of FieldSort for unsigned_long ([#14963](https://github.com/opensearch-project/OpenSearch/pull/14963))
 - Fix delete index template failed when the index template matches a data stream but is unused ([#15080](https://github.com/opensearch-project/OpenSearch/pull/15080))
 - Fix array_index_out_of_bounds_exception when indexing documents with field name containing only dot ([#15126](https://github.com/opensearch-project/OpenSearch/pull/15126))
+- Fixed array field name omission in flat_object function for nested JSON ([#13620](https://github.com/opensearch-project/OpenSearch/pull/13620))
 
 ### Security
 

diff --git a/server/src/main/java/org/opensearch/common/xcontent/JsonToStringXContentParser.java b/server/src/main/java/org/opensearch/common/xcontent/JsonToStringXContentParser.java
@@ -9,6 +9,7 @@
 package org.opensearch.common.xcontent;
 
 import org.opensearch.common.xcontent.json.JsonXContent;
+import org.opensearch.core.common.Strings;
 import org.opensearch.core.common.bytes.BytesReference;
 import org.opensearch.core.xcontent.AbstractXContentParser;
 import org.opensearch.core.xcontent.DeprecationHandler;
@@ -23,6 +24,9 @@
 import java.math.BigInteger;
 import java.nio.CharBuffer;
 import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Deque;
+import java.util.LinkedList;
 
 /**
  * JsonToStringParser is the main parser class to transform JSON into stringFields in a XContentParser
@@ -32,21 +36,20 @@
  */
 public class JsonToStringXContentParser extends AbstractXContentParser {
     private final String fieldTypeName;
-    private XContentParser parser;
+    private final XContentParser parser;
 
-    private ArrayList<String> valueList = new ArrayList<>();
-    private ArrayList<String> valueAndPathList = new ArrayList<>();
-    private ArrayList<String> keyList = new ArrayList<>();
+    private final ArrayList<String> valueList = new ArrayList<>();
+    private final ArrayList<String> valueAndPathList = new ArrayList<>();
+    private final ArrayList<String> keyList = new ArrayList<>();
 
-    private XContentBuilder builder = XContentBuilder.builder(JsonXContent.jsonXContent);
+    private final XContentBuilder builder = XContentBuilder.builder(JsonXContent.jsonXContent);
 
-    private NamedXContentRegistry xContentRegistry;
+    private final NamedXContentRegistry xContentRegistry;
 
-    private DeprecationHandler deprecationHandler;
+    private final DeprecationHandler deprecationHandler;
 
     private static final String VALUE_AND_PATH_SUFFIX = "._valueAndPath";
     private static final String VALUE_SUFFIX = "._value";
-    private static final String DOT_SYMBOL = ".";
     private static final String EQUAL_SYMBOL = "=";
 
     public JsonToStringXContentParser(
@@ -63,9 +66,14 @@ public JsonToStringXContentParser(
     }
 
     public XContentParser parseObject() throws IOException {
+        assert currentToken() == Token.START_OBJECT;
+        parser.nextToken(); // Skip the outer START_OBJECT. Need to return on END_OBJECT.
+
         builder.startObject();
-        StringBuilder path = new StringBuilder(fieldTypeName);
-        parseToken(path, null);
+        LinkedList<String> path = new LinkedList<>(Collections.singleton(fieldTypeName));
+        while (currentToken() != Token.END_OBJECT) {
+            parseToken(path);
+        }
         builder.field(this.fieldTypeName, keyList);
         builder.field(this.fieldTypeName + VALUE_SUFFIX, valueList);
         builder.field(this.fieldTypeName + VALUE_AND_PATH_SUFFIX, valueAndPathList);
@@ -74,75 +82,55 @@ public XContentParser parseObject() throws IOException {
         return JsonXContent.jsonXContent.createParser(this.xContentRegistry, this.deprecationHandler, String.valueOf(jString));
     }
 
-    private void parseToken(StringBuilder path, String currentFieldName) throws IOException {
-
-        while (this.parser.nextToken() != Token.END_OBJECT) {
-            if (this.parser.currentName() != null) {
-                currentFieldName = this.parser.currentName();
+    private void parseToken(Deque<String> path) throws IOException {
+        if (this.parser.currentToken() == Token.FIELD_NAME) {
+            String fieldName = this.parser.currentName();
+            path.addLast(fieldName); // Pushing onto the stack *must* be matched by pop
+            String parts = fieldName;
+            while (parts.contains(".")) { // Extract the intermediate keys maybe present in fieldName
+                int dotPos = parts.indexOf('.');
+                String part = parts.substring(0, dotPos);
+                this.keyList.add(part);
+                parts = parts.substring(dotPos + 1);
             }
-            StringBuilder parsedFields = new StringBuilder();
-
-            if (this.parser.currentToken() == Token.FIELD_NAME) {
-                path.append(DOT_SYMBOL).append(currentFieldName);
-                int dotIndex = currentFieldName.indexOf(DOT_SYMBOL);
-                String fieldNameSuffix = currentFieldName;
-                // The field name may be of the form foo.bar.baz
-                // If that's the case, each "part" is a key.
-                while (dotIndex >= 0) {
-                    String fieldNamePrefix = fieldNameSuffix.substring(0, dotIndex);
-                    if (!fieldNamePrefix.isEmpty()) {
-                        this.keyList.add(fieldNamePrefix);
-                    }
-                    fieldNameSuffix = fieldNameSuffix.substring(dotIndex + 1);
-                    dotIndex = fieldNameSuffix.indexOf(DOT_SYMBOL);
-                }
-                if (!fieldNameSuffix.isEmpty()) {
-                    this.keyList.add(fieldNameSuffix);
-                }
-            } else if (this.parser.currentToken() == Token.START_ARRAY) {
-                parseToken(path, currentFieldName);
-                break;
-            } else if (this.parser.currentToken() == Token.END_ARRAY) {
-                // skip
-            } else if (this.parser.currentToken() == Token.START_OBJECT) {
-                parseToken(path, currentFieldName);
-                int dotIndex = path.lastIndexOf(DOT_SYMBOL, path.length());
-
-                if (dotIndex != -1 && path.length() > currentFieldName.length()) {
-                    path.setLength(path.length() - currentFieldName.length() - 1);
-                }
-            } else {
-                if (!path.toString().contains(currentFieldName)) {
-                    path.append(DOT_SYMBOL).append(currentFieldName);
-                }
-                parseValue(parsedFields);
-                this.valueList.add(parsedFields.toString());
-                this.valueAndPathList.add(path + EQUAL_SYMBOL + parsedFields);
-                int dotIndex = path.lastIndexOf(DOT_SYMBOL, path.length());
-                if (dotIndex != -1 && path.length() > currentFieldName.length()) {
-                    path.setLength(path.length() - currentFieldName.length() - 1);
-                }
+            this.keyList.add(parts); // parts has no dot, so either it's the original fieldName or it's the last part
+            this.parser.nextToken(); // advance to the value of fieldName
+            parseToken(path); // parse the value for fieldName (which will be an array, an object, or a primitive value)
+            path.removeLast(); // Here is where we pop fieldName from the stack (since we're done with the value of fieldName)
+            // Note that whichever other branch we just passed through has already ended with nextToken(), so we
+            // don't need to call it.
+        } else if (this.parser.currentToken() == Token.START_ARRAY) {
+            parser.nextToken();
+            while (this.parser.currentToken() != Token.END_ARRAY) {
+                parseToken(path);
             }
-
+            this.parser.nextToken();
+        } else if (this.parser.currentToken() == Token.START_OBJECT) {
+            parser.nextToken();
+            while (this.parser.currentToken() != Token.END_OBJECT) {
+                parseToken(path);
+            }
+            this.parser.nextToken();
+        } else if (this.parser.currentToken().isValue()) {
+            String parsedValue = parseValue();
+            if (parsedValue != null) {
+                this.valueList.add(parsedValue);
+                this.valueAndPathList.add(Strings.collectionToDelimitedString(path, ".") + EQUAL_SYMBOL + parsedValue);
+            }
+            this.parser.nextToken();
         }
     }
 
-    private void parseValue(StringBuilder parsedFields) throws IOException {
+    private String parseValue() throws IOException {
         switch (this.parser.currentToken()) {
             case VALUE_BOOLEAN:
             case VALUE_NUMBER:
             case VALUE_STRING:
             case VALUE_NULL:
-                parsedFields.append(this.parser.textOrNull());
-                break;
+                return this.parser.textOrNull();
             // Handle other token types as needed
-            case FIELD_NAME:
-            case VALUE_EMBEDDED_OBJECT:
-            case END_ARRAY:
-            case START_ARRAY:
-                break;
             default:
-                throw new IOException("Unsupported token type [" + parser.currentToken() + "]");
+                throw new IOException("Unsupported value token type [" + parser.currentToken() + "]");
         }
     }
 

diff --git a/server/src/main/java/org/opensearch/index/mapper/FlatObjectFieldMapper.java b/server/src/main/java/org/opensearch/index/mapper/FlatObjectFieldMapper.java
@@ -568,8 +568,12 @@ protected void parseCreateField(ParseContext context) throws IOException {
         if (context.externalValueSet()) {
             String value = context.externalValue().toString();
             parseValueAddFields(context, value, fieldType().name());
+        } else if (context.parser().currentToken() == XContentParser.Token.VALUE_NULL) {
+            context.parser().nextToken(); // This triggers an exception in DocumentParser.
+            // We could remove the above nextToken() call to skip the null value, but the existing
+            // behavior (since 2.7) throws the exception.
         } else {
-            JsonToStringXContentParser JsonToStringParser = new JsonToStringXContentParser(
+            JsonToStringXContentParser jsonToStringParser = new JsonToStringXContentParser(
                 NamedXContentRegistry.EMPTY,
                 DeprecationHandler.IGNORE_DEPRECATIONS,
                 context.parser(),
@@ -579,7 +583,7 @@ protected void parseCreateField(ParseContext context) throws IOException {
               JsonToStringParser is the main parser class to transform JSON into stringFields in a XContentParser
               It reads the JSON object and parsed to a list of string
              */
-            XContentParser parser = JsonToStringParser.parseObject();
+            XContentParser parser = jsonToStringParser.parseObject();
 
             XContentParser.Token currentToken;
             while ((currentToken = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
@@ -594,7 +598,6 @@ protected void parseCreateField(ParseContext context) throws IOException {
                 }
 
             }
-
         }
 
     }

diff --git a/server/src/test/java/org/opensearch/common/xcontent/JsonToStringXContentParserTests.java b/server/src/test/java/org/opensearch/common/xcontent/JsonToStringXContentParserTests.java
@@ -19,7 +19,6 @@
 public class JsonToStringXContentParserTests extends OpenSearchTestCase {
 
     private String flattenJsonString(String fieldName, String in) throws IOException {
-        String transformed;
         try (
             XContentParser parser = JsonXContent.jsonXContent.createParser(
                 xContentRegistry(),
@@ -33,10 +32,11 @@ private String flattenJsonString(String fieldName, String in) throws IOException
                 parser,
                 fieldName
             );
-            // Skip the START_OBJECT token:
+            // Point to the first token (should be START_OBJECT)
             jsonToStringXContentParser.nextToken();
 
             XContentParser transformedParser = jsonToStringXContentParser.parseObject();
+            assertSame(XContentParser.Token.END_OBJECT, jsonToStringXContentParser.currentToken());
             try (XContentBuilder jsonBuilder = XContentFactory.jsonBuilder()) {
                 jsonBuilder.copyCurrentStructure(transformedParser);
                 return jsonBuilder.toString();
@@ -110,4 +110,57 @@ public void testNestChildObjectWithDotsAndFieldWithDots() throws IOException {
         );
     }
 
+    public void testArrayOfObjects() throws IOException {
+        String jsonExample = "{"
+            + "\"field\": {"
+            + "  \"detail\": {"
+            + "    \"foooooooooooo\": ["
+            + "      {\"name\":\"baz\"},"
+            + "      {\"name\":\"baz\"}"
+            + "    ]"
+            + "  }"
+            + "}}";
+
+        assertEquals(
+            "{"
+                + "\"flat\":[\"field\",\"detail\",\"foooooooooooo\",\"name\",\"name\"],"
+                + "\"flat._value\":[\"baz\",\"baz\"],"
+                + "\"flat._valueAndPath\":["
+                + "\"flat.field.detail.foooooooooooo.name=baz\","
+                + "\"flat.field.detail.foooooooooooo.name=baz\""
+                + "]}",
+            flattenJsonString("flat", jsonExample)
+        );
+    }
+
+    public void testArraysOfObjectsAndValues() throws IOException {
+        String jsonExample = "{"
+            + "\"field\": {"
+            + "  \"detail\": {"
+            + "    \"foooooooooooo\": ["
+            + "      {\"name\":\"baz\"},"
+            + "      {\"name\":\"baz\"}"
+            + "    ]"
+            + "  },"
+            + "  \"numbers\" : ["
+            + "    1,"
+            + "    2,"
+            + "    3"
+            + "  ]"
+            + "}}";
+
+        assertEquals(
+            "{"
+                + "\"flat\":[\"field\",\"detail\",\"foooooooooooo\",\"name\",\"name\",\"numbers\"],"
+                + "\"flat._value\":[\"baz\",\"baz\",\"1\",\"2\",\"3\"],"
+                + "\"flat._valueAndPath\":["
+                + "\"flat.field.detail.foooooooooooo.name=baz\","
+                + "\"flat.field.detail.foooooooooooo.name=baz\","
+                + "\"flat.field.numbers=1\","
+                + "\"flat.field.numbers=2\","
+                + "\"flat.field.numbers=3\""
+                + "]}",
+            flattenJsonString("flat", jsonExample)
+        );
+    }
 }