From 59b5423c00163b3a099d45cf50ceb8a6251ba541 Mon Sep 17 00:00:00 2001
From: Zhe Liu <770120041@qq.com>
Date: Tue, 5 Dec 2023 17:52:57 -0500
Subject: [PATCH] Fix NPE when reading dataset with nested array (#151)

* Fix NPE when reading dataset with nested array

* Resolve comments

* Fix test

* Add tests
---
 .../java/org/apache/iceberg/SchemaParser.java |   2 +-
 .../org/apache/iceberg/avro/TypeToSchema.java |  37 ++-
 .../iceberg/avro/TestSchemaConversions.java   | 101 +++++++++
 ...rkAvroReaderForFieldsWithDefaultValue.java | 214 ++++++++++++++++++
 4 files changed, 352 insertions(+), 2 deletions(-)

diff --git a/core/src/main/java/org/apache/iceberg/SchemaParser.java b/core/src/main/java/org/apache/iceberg/SchemaParser.java
index 3c5ce348f..90692d99a 100644
--- a/core/src/main/java/org/apache/iceberg/SchemaParser.java
+++ b/core/src/main/java/org/apache/iceberg/SchemaParser.java
@@ -303,7 +303,7 @@ private static String defaultValueToJsonString(byte[] bytes) {

   private static String defaultValueToJsonString(Object value) {
     try {
-      return JsonUtil.mapper().writeValueAsString(value);
+      return JsonUtil.mapper().writeValueAsString(AvroSchemaUtil.convertToJavaDefaultValue(value));
     } catch (JsonProcessingException e) {
       throw new RuntimeException(e);
     }

diff --git a/core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java b/core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java
index a3cc60eba..7c5f3bf7a 100644
--- a/core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java
+++ b/core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java
@@ -19,6 +19,8 @@

 package org.apache.iceberg.avro;

+import java.util.ArrayList;
+import java.util.Collection;
 import java.util.Deque;
 import java.util.List;
 import java.util.Map;
@@ -103,7 +105,8 @@ public Schema struct(Types.StructType struct, List<Schema> fieldSchemas) {
       String fieldName = isValidFieldName ? origFieldName : AvroSchemaUtil.sanitize(origFieldName);
       Object defaultValue = structField.hasDefaultValue() ? structField.getDefaultValue() :
           (structField.isOptional() ? JsonProperties.NULL_VALUE : null);
-      Schema.Field field = new Schema.Field(fieldName, fieldSchemas.get(i), structField.doc(), defaultValue);
+      Schema.Field field = new Schema.Field(fieldName, fieldSchemas.get(i), structField.doc(),
+          convertComplexNullToJsonNull(defaultValue));
       if (!isValidFieldName) {
         field.addProp(AvroSchemaUtil.ICEBERG_FIELD_NAME_PROP, origFieldName);
       }
@@ -232,4 +235,36 @@ public Schema primitive(Type.PrimitiveType primitive) {

     return primitiveSchema;
   }
+
+  // This function ensures that all nested nulls are converted to JsonProperties.NULL_VALUE
+  // so that JacksonUtils.toJsonNode() converts them properly.
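+  // For example, a struct default parsed from {"InnerField1Param": null} arrives here as a
+  // Map containing a bare Java null; rewriting it to JsonProperties.NULL_VALUE lets Avro
+  // serialize the default instead of failing with the NPE this patch addresses.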
+  private Object convertComplexNullToJsonNull(Object defaultValue) {
+    if (defaultValue instanceof Map) {
+      // Map defaults are fixed up in place.
+      for (Map.Entry<Object, Object> entry : ((Map<Object, Object>) defaultValue).entrySet()) {
+        if (entry.getValue() instanceof Map || entry.getValue() instanceof Collection) {
+          entry.setValue(convertComplexNullToJsonNull(entry.getValue()));
+        } else if (entry.getValue() == null) {
+          entry.setValue(JsonProperties.NULL_VALUE);
+        }
+      }
+      return defaultValue;
+    } else if (defaultValue instanceof List) {
+      // List defaults are copied, replacing nested nulls along the way.
+      List<Object> originalList = (List<Object>) defaultValue;
+      List<Object> copiedList = new ArrayList<>();
+      for (Object element : originalList) {
+        if (element instanceof Map || element instanceof Collection) {
+          copiedList.add(convertComplexNullToJsonNull(element));
+        } else if (element == null) {
+          copiedList.add(JsonProperties.NULL_VALUE);
+        } else {
+          copiedList.add(element);
+        }
+      }
+      return copiedList;
+    }
+    return defaultValue;
+  }
 }
diff --git a/core/src/test/java/org/apache/iceberg/avro/TestSchemaConversions.java b/core/src/test/java/org/apache/iceberg/avro/TestSchemaConversions.java
index b1ebe5bd3..9b86b1230 100644
--- a/core/src/test/java/org/apache/iceberg/avro/TestSchemaConversions.java
+++ b/core/src/test/java/org/apache/iceberg/avro/TestSchemaConversions.java
@@ -516,4 +516,105 @@ public void testVariousTypesDefaultValues() {
     Assert.assertTrue(IntStream.range(0, roundTripiSchema.columns().size())
         .allMatch(i -> roundTripiSchema.columns().get(i).equals(iSchema.columns().get(i))));
   }
+
+  @Test
+  public void testConversionOfRecordWithNestedSubElement() {
+    String schemaString = "{\n" +
+        "  \"type\": \"record\",\n" +
+        "  \"name\": \"Root\",\n" +
+        "  \"fields\": [\n" +
+        "    {\n" +
+        "      \"name\": \"OuterRecord1\",\n" +
+        "      \"type\": [\n" +
+        "        \"null\",\n" +
+        "        {\n" +
+        "          \"type\": \"record\",\n" +
+        "          \"name\": \"InnerElement\",\n" +
+        "          \"fields\": [\n" +
+        "            {\n" +
+        "              \"name\": \"InnerField\",\n" +
+        "              \"type\": {\n" +
+        "                \"type\": \"record\",\n" +
+        "                \"name\": \"InnerField1\",\n" +
+        "                \"fields\": [\n" +
+        "                  {\n" +
+        "                    \"name\": \"InnerField1Param\",\n" +
+        "                    \"type\": [\n" +
+        "                      \"null\",\n" +
+        "                      \"string\"\n" +
+        "                    ],\n" +
+        "                    \"default\": null\n" +
+        "                  }\n" +
+        "                ]\n" +
+        "              },\n" +
+        "              \"default\": {\n" +
+        "                \"InnerField1Param\": null\n" +
+        "              }\n" +
+        "            }\n" +
+        "          ]\n" +
+        "        }\n" +
+        "      ],\n" +
+        "      \"default\": null\n" +
+        "    },\n" +
+        "    {\n" +
+        "      \"name\": \"InnerElementV2\",\n" +
+        "      \"type\": [\n" +
+        "        \"null\",\n" +
+        "        {\n" +
+        "          \"type\": \"record\",\n" +
+        "          \"name\": \"InnerElementV2\",\n" +
+        "          \"fields\": [\n" +
+        "            {\n" +
+        "              \"name\": \"InnerField2\",\n" +
+        "              \"type\": {\n" +
+        "                \"type\": \"array\",\n" +
+        "                \"items\": \"InnerElement\"\n" +
+        "              },\n" +
+        "              \"default\": []\n" +
+        "            }\n" +
+        "          ]\n" +
+        "        }\n" +
+        "      ],\n" +
+        "      \"default\": null\n" +
+        "    }\n" +
+        "  ]\n" +
+        "}";
+    Schema schema = new Schema.Parser().parse(schemaString);
+    org.apache.iceberg.Schema iSchema = AvroSchemaUtil.toIceberg(schema);
+    String jSchema = SchemaParser.toJson(iSchema);
+    // The round trip should succeed without throwing.
+    org.apache.iceberg.Schema roundTripiSchema = SchemaParser.fromJson(jSchema);
+  }
+
+  @Test
+  public void testConversionOfRecordWithNestedSubElementWithNotNullDefaultValue() {
+    String schemaString = "{\n" +
+        "  \"type\": \"record\",\n" +
+        "  \"name\": \"OuterRecord\",\n" +
+        "  \"fields\": [\n" +
+        "    {\n" +
+        "      \"name\": \"nestedRecord\",\n" +
+        "      \"type\": {\n" +
+        "        \"type\": \"record\",\n" +
+        "        \"name\": \"InnerRecord\",\n" +
+        "        \"fields\": [\n" +
+        "          {\n" +
+        "            \"name\": \"myArray\",\n" +
+        "            \"type\": {\n" +
+        "              \"type\": \"array\",\n" +
\"items\": \"int\"\n" + + " },\n" + + " \"default\": [1, 2, 3]\n" + + " }\n" + + " ],\n" + + " \"default\": {\"myArray\": [1, 2, 3]}\n" + + " },\n" + + " \"default\": {\"myArray\": [1, 2, 3]}\n" + + " }\n" + + " ],\n" + + " \"default\": {\"nestedRecord\": {\"myArray\": [1, 2, 3]}}\n" + + "}"; + Schema schema = new Schema.Parser().parse(schemaString); + org.apache.iceberg.Schema iSchema = AvroSchemaUtil.toIceberg(schema); + String jSchema = SchemaParser.toJson(iSchema); + org.apache.iceberg.Schema roundTripiSchema = SchemaParser.fromJson(jSchema); + } } diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReaderForFieldsWithDefaultValue.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReaderForFieldsWithDefaultValue.java index a26181fdb..66881d95a 100644 --- a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReaderForFieldsWithDefaultValue.java +++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReaderForFieldsWithDefaultValue.java @@ -198,5 +198,219 @@ public void testAvroDefaultValues() throws IOException { } } + + /* + * Test nested array with default null on complex types + * if the table contains non-primitive Avro types (InnerElement in the test below) + * as the first field and arrays of InnerElement as the second field, + * it leads to a NullPointerException when operating on the table. + */ + @Test + public void testNestedArrayWithDefaultNullOnComplexTypes() throws IOException { + File testFile = temp.newFile(); + String writeSchemaString = "{\n" + + " \"type\": \"record\",\n" + + " \"name\": \"Root\",\n" + + " \"fields\": [\n" + + " {\n" + + " \"name\": \"OuterRecord1\",\n" + + " \"type\": [\n" + + " \"null\",\n" + + " {\n" + + " \"type\": \"record\",\n" + + " \"name\": \"InnerElement\",\n" + + " \"fields\": [\n" + + " {\n" + + " \"name\": \"InnerField\",\n" + + " \"type\": {\n" + + " \"type\": \"record\",\n" + + " \"name\": \"InnerField1\",\n" + + " \"fields\": [\n" + + " {\n" + + " \"name\": \"InnerField1Param\",\n" + + " \"type\": [\n" + + " \"null\",\n" + + " \"string\"\n" + + " ],\n" + + " \"default\": null\n" + + " }\n" + + " ]\n" + + " },\n" + + " \"default\": {\n" + + " \"InnerField1Param\": null\n" + + " }\n" + + " }\n" + + " ]\n" + + " }\n" + + " ],\n" + + " \"default\": null\n" + + " },\n" + + " {\n" + + " \"name\": \"InnerElementV2\",\n" + + " \"type\": [\n" + + " \"null\",\n" + + " {\n" + + " \"type\": \"record\",\n" + + " \"name\": \"InnerElementV2\",\n" + + " \"fields\": [\n" + + " {\n" + + " \"name\": \"InnerField2\",\n" + + " \"type\": {\n" + + " \"type\": \"array\",\n" + + " \"items\": \"InnerElement\"\n" + + " },\n" + + " \"default\": []\n" + + " }\n" + + " ]\n" + + " }\n" + + " ],\n" + + " \"default\": null\n" + + " }\n" + + " ]\n" + + "}"; + org.apache.avro.Schema writeSchema = new org.apache.avro.Schema.Parser().parse(writeSchemaString); + org.apache.iceberg.Schema icebergWriteSchema = AvroSchemaUtil.toIceberg(writeSchema); + List expected = RandomData.generateList(icebergWriteSchema, 2, 0L); + Assert.assertTrue("Delete should succeed", testFile.delete()); + + // write records with initial writeSchema + try (FileAppender writer = Avro.write(Files.localOutput(testFile)) + .schema(icebergWriteSchema) + .named("test") + .build()) { + for (GenericData.Record rec : expected) { + writer.add(rec); + } + } + } + + + /* + * Test nested array with default null on complex types. 
+   * This test differs from testNestedArrayWithDefaultNullOnComplexTypes in the type of
+   * InnerField1Param: when it is a primitive type, no NPE is thrown when operating on the table.
+   */
+  @Test
+  public void testNestedArrayWithDefaultNullOnPrimitiveTypes() throws IOException {
+    File testFile = temp.newFile();
+    String writeSchemaString = "{\n" +
+        "  \"type\": \"record\",\n" +
+        "  \"name\": \"Root\",\n" +
+        "  \"fields\": [\n" +
+        "    {\n" +
+        "      \"name\": \"OuterRecord1\",\n" +
+        "      \"type\": [\n" +
+        "        \"null\",\n" +
+        "        {\n" +
+        "          \"type\": \"record\",\n" +
+        "          \"name\": \"InnerElement\",\n" +
+        "          \"fields\": [\n" +
+        "            {\n" +
+        "              \"name\": \"InnerField\",\n" +
+        "              \"type\": {\n" +
+        "                \"type\": \"record\",\n" +
+        "                \"name\": \"InnerField1\",\n" +
+        "                \"fields\": [\n" +
+        "                  {\n" +
+        "                    \"name\": \"InnerField1Param\",\n" +
+        "                    \"type\": \"int\",\n" +
+        "                    \"default\": 1\n" +
+        "                  }\n" +
+        "                ]\n" +
+        "              }\n" +
+        "            }\n" +
+        "          ]\n" +
+        "        }\n" +
+        "      ],\n" +
+        "      \"default\": null\n" +
+        "    },\n" +
+        "    {\n" +
+        "      \"name\": \"InnerElementV2\",\n" +
+        "      \"type\": [\n" +
+        "        \"null\",\n" +
+        "        {\n" +
+        "          \"type\": \"record\",\n" +
+        "          \"name\": \"InnerElementV2\",\n" +
+        "          \"fields\": [\n" +
+        "            {\n" +
+        "              \"name\": \"InnerField2\",\n" +
+        "              \"type\": {\n" +
+        "                \"type\": \"array\",\n" +
+        "                \"items\": \"InnerElement\"\n" +
+        "              },\n" +
+        "              \"default\": []\n" +
+        "            }\n" +
+        "          ]\n" +
+        "        }\n" +
+        "      ],\n" +
+        "      \"default\": null\n" +
+        "    }\n" +
+        "  ]\n" +
+        "}";
+    org.apache.avro.Schema writeSchema = new org.apache.avro.Schema.Parser().parse(writeSchemaString);
+    org.apache.iceberg.Schema icebergWriteSchema = AvroSchemaUtil.toIceberg(writeSchema);
+
+    List<GenericData.Record> expected = RandomData.generateList(icebergWriteSchema, 2, 0L);
+
+    Assert.assertTrue("Delete should succeed", testFile.delete());
+
+    // write records with initial writeSchema
+    try (FileAppender<GenericData.Record> writer = Avro.write(Files.localOutput(testFile))
+        .schema(icebergWriteSchema)
+        .named("test")
+        .build()) {
+      for (GenericData.Record rec : expected) {
+        writer.add(rec);
+      }
+    }
+  }
+
+  @Test
+  public void testNestedArrayWithDefaultNullOnArrayTypes() throws IOException {
+    String writeSchemaString = "{\n" +
+        "  \"type\": \"record\",\n" +
+        "  \"name\": \"OuterRecord\",\n" +
+        "  \"fields\": [\n" +
+        "    {\n" +
+        "      \"name\": \"nestedRecord\",\n" +
+        "      \"type\": {\n" +
+        "        \"type\": \"record\",\n" +
+        "        \"name\": \"InnerRecord\",\n" +
+        "        \"fields\": [\n" +
+        "          {\n" +
+        "            \"name\": \"myArray\",\n" +
+        "            \"type\": {\n" +
+        "              \"type\": \"array\",\n" +
+        "              \"items\": \"int\"\n" +
+        "            },\n" +
+        "            \"default\": [1, 2, 3]\n" +
+        "          }\n" +
+        "        ],\n" +
+        "        \"default\": {\"myArray\": [1, 2, 3]}\n" +
+        "      },\n" +
+        "      \"default\": {\"myArray\": [1, 2, 3]}\n" +
+        "    }\n" +
+        "  ],\n" +
+        "  \"default\": {\"nestedRecord\": {\"myArray\": [1, 2, 3]}}\n" +
+        "}";
+    org.apache.avro.Schema writeSchema = new org.apache.avro.Schema.Parser().parse(writeSchemaString);
+    org.apache.iceberg.Schema icebergWriteSchema = AvroSchemaUtil.toIceberg(writeSchema);
+
+    List<GenericData.Record> expected = RandomData.generateList(icebergWriteSchema, 2, 0L);
+
+    File testFile = temp.newFile();
+    Assert.assertTrue("Delete should succeed", testFile.delete());
+
+    // write records with initial writeSchema
+    try (FileAppender<GenericData.Record> writer = Avro.write(Files.localOutput(testFile))
+        .schema(icebergWriteSchema)
+        .named("test")
+        .build()) {
+      for (GenericData.Record rec : expected) {
+        writer.add(rec);
+      }
+    }
+  }
 }
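---

Illustration (not part of the patch): a minimal, standalone sketch of the situation the new
convertComplexNullToJsonNull helper handles. A record default such as the
{"InnerField1Param": null} used in the tests reaches TypeToSchema as a Map whose value is a
bare Java null; the helper rewrites that null to JsonProperties.NULL_VALUE so Avro can
serialize the default when the Schema.Field is built. The class name below is hypothetical,
and the schema is built with Avro's SchemaBuilder rather than taken from the patch.

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import org.apache.avro.JsonProperties;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;

public class NestedNullDefaultSketch {
  public static void main(String[] args) {
    // InnerField1 as in the tests: a record with one optional string field.
    Schema inner = SchemaBuilder.record("InnerField1").fields()
        .optionalString("InnerField1Param")
        .endRecord();

    // The field default with the nested null already expressed as Avro's JSON-null
    // marker, i.e. what convertComplexNullToJsonNull produces from a bare null.
    Map<String, Object> fieldDefault = new HashMap<>();
    fieldDefault.put("InnerField1Param", JsonProperties.NULL_VALUE);

    Schema.Field field = new Schema.Field("InnerField", inner, null, fieldDefault);
    Schema outer = Schema.createRecord("InnerElement", null, null, false,
        Collections.singletonList(field));

    // The default round-trips to JSON as {"InnerField1Param": null}.
    System.out.println(outer.toString(true));
  }
}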