From 8bbf1d1c14bc0cbf2ab4bd2c9b4f7a937dc35e0a Mon Sep 17 00:00:00 2001
From: Zhe Liu <770120041@qq.com>
Date: Wed, 29 Nov 2023 17:43:36 -0500
Subject: [PATCH 1/4] Fix NPE when reading datasets with nested arrays

---
 .../org/apache/iceberg/avro/TypeToSchema.java | 33 ++++++++-
 .../iceberg/avro/TestSchemaConversions.java   | 68 +++++++++++++++++++
 2 files changed, 100 insertions(+), 1 deletion(-)

diff --git a/core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java b/core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java
index a3cc60eba..b9a7c5faa 100644
--- a/core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java
+++ b/core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java
@@ -19,6 +19,8 @@
 
 package org.apache.iceberg.avro;
 
+import java.util.ArrayList;
+import java.util.Collection;
 import java.util.Deque;
 import java.util.List;
 import java.util.Map;
@@ -103,7 +105,8 @@ public Schema struct(Types.StructType struct, List<Schema> fieldSchemas) {
       String fieldName = isValidFieldName ? origFieldName : AvroSchemaUtil.sanitize(origFieldName);
       Object defaultValue = structField.hasDefaultValue() ? structField.getDefaultValue() :
           (structField.isOptional() ? JsonProperties.NULL_VALUE : null);
-      Schema.Field field = new Schema.Field(fieldName, fieldSchemas.get(i), structField.doc(), defaultValue);
+      Schema.Field field = new Schema.Field(fieldName, fieldSchemas.get(i), structField.doc(),
+          convertToJsonNull(defaultValue));
       if (!isValidFieldName) {
         field.addProp(AvroSchemaUtil.ICEBERG_FIELD_NAME_PROP, origFieldName);
       }
@@ -232,4 +235,32 @@ public Schema primitive(Type.PrimitiveType primitive) {
 
     return primitiveSchema;
   }
+
+  // This function ensures that all nested null values are converted to JsonProperties.NULL_VALUE
+  // to make sure JacksonUtils.toJsonNode() converts them properly.
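+  // (Avro treats a plain Java null default as "no default"; JsonProperties.NULL_VALUE is the
+  // sentinel for an explicit JSON null, so nested nulls inside map and list defaults must be
+  // rewritten here before the field is constructed.)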
+  private Object convertToJsonNull(Object defaultValue) {
+    if (defaultValue instanceof Map) {
+      for (Map.Entry entry : ((Map) defaultValue).entrySet()) {
+        if (entry.getValue() instanceof Map || entry.getValue() instanceof Collection) {
+          entry.setValue(convertToJsonNull(entry.getValue()));
+        } else {
+          entry.setValue(JsonProperties.NULL_VALUE);
+        }
+      }
+      return defaultValue;
+    } else if (defaultValue instanceof List) {
+      List originalList = (List) defaultValue;
+      List copiedList = new ArrayList<>();
+
+      for (Object element : originalList) {
+        if (element instanceof Map || element instanceof Collection) {
+          copiedList.add(convertToJsonNull(element));
+        } else {
+          copiedList.add(JsonProperties.NULL_VALUE);
+        }
+      }
+      return copiedList;
+    }
+    return defaultValue;
+  }
 }

diff --git a/core/src/test/java/org/apache/iceberg/avro/TestSchemaConversions.java b/core/src/test/java/org/apache/iceberg/avro/TestSchemaConversions.java
index b1ebe5bd3..560a54b70 100644
--- a/core/src/test/java/org/apache/iceberg/avro/TestSchemaConversions.java
+++ b/core/src/test/java/org/apache/iceberg/avro/TestSchemaConversions.java
@@ -516,4 +516,72 @@ public void testVariousTypesDefaultValues() {
     Assert.assertTrue(IntStream.range(0, roundTripiSchema.columns().size())
         .allMatch(i -> roundTripiSchema.columns().get(i).equals(iSchema.columns().get(i))));
   }
+
+  @Test
+  public void testConversionOfRecordWithNestedSubElement() {
+    String schemaString = "{\n" +
+        "  \"type\": \"record\",\n" +
+        "  \"name\": \"Root\",\n" +
+        "  \"fields\": [\n" +
+        "    {\n" +
+        "      \"name\": \"OuterRecord1\",\n" +
+        "      \"type\": [\n" +
+        "        \"null\",\n" +
+        "        {\n" +
+        "          \"type\": \"record\",\n" +
+        "          \"name\": \"InnerElement\",\n" +
+        "          \"fields\": [\n" +
+        "            {\n" +
+        "              \"name\": \"InnerField\",\n" +
+        "              \"type\": {\n" +
+        "                \"type\": \"record\",\n" +
+        "                \"name\": \"InnerField1\",\n" +
+        "                \"fields\": [\n" +
+        "                  {\n" +
+        "                    \"name\": \"InnerField1Param\",\n" +
+        "                    \"type\": [\n" +
+        "                      \"null\",\n" +
+        "                      \"string\"\n" +
+        "                    ],\n" +
+        "                    \"default\": null\n" +
+        "                  }\n" +
+        "                ]\n" +
+        "              },\n" +
+        "              \"default\": {\n" +
+        "                \"InnerField1Param\": null\n" +
+        "              }\n" +
+        "            }\n" +
+        "          ]\n" +
+        "        }\n" +
+        "      ],\n" +
+        "      \"default\": null\n" +
+        "    },\n" +
+        "    {\n" +
+        "      \"name\": \"InnerElementV2\",\n" +
+        "      \"type\": [\n" +
+        "        \"null\",\n" +
+        "        {\n" +
+        "          \"type\": \"record\",\n" +
+        "          \"name\": \"InnerElementV2\",\n" +
+        "          \"fields\": [\n" +
+        "            {\n" +
+        "              \"name\": \"InnerField2\",\n" +
+        "              \"type\": {\n" +
+        "                \"type\": \"array\",\n" +
+        "                \"items\": \"InnerElement\"\n" +
+        "              },\n" +
+        "              \"default\": []\n" +
+        "            }\n" +
+        "          ]\n" +
+        "        }\n" +
+        "      ],\n" +
+        "      \"default\": null\n" +
+        "    }\n" +
+        "  ]\n" +
+        "}";
+    Schema schema = new Schema.Parser().parse(schemaString);
+    org.apache.iceberg.Schema iSchema = AvroSchemaUtil.toIceberg(schema);
+    String jSchema = SchemaParser.toJson(iSchema);
+    org.apache.iceberg.Schema roundTripiSchema = SchemaParser.fromJson(jSchema);
+  }
 }

From 6b30810d02d6fd3c7bc0f4ec9820e39436700a1e Mon Sep 17 00:00:00 2001
From: Zhe Liu <770120041@qq.com>
Date: Thu, 30 Nov 2023 14:50:11 -0500
Subject: [PATCH 2/4] Resolve comments

---
 ...rkAvroReaderForFieldsWithDefaultValue.java | 157 ++++++++++++++++++
 1 file changed, 157 insertions(+)

diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReaderForFieldsWithDefaultValue.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReaderForFieldsWithDefaultValue.java
index a26181fdb..f7a3c772e 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReaderForFieldsWithDefaultValue.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReaderForFieldsWithDefaultValue.java
@@ -198,5 +198,162 @@ public void testAvroDefaultValues() throws IOException {
     }
   }
+
+  /*
+   * Test nested array with default null on complex types.
+   * If the table contains non-primitive Avro types (InnerElement in the test below)
+   * as the first field and arrays of InnerElement as the second field,
+   * it leads to a NullPointerException when operating on the table.
+   */
+  @Test
+  public void testNestedArrayWithDefaultNullOnComplexTypes() throws IOException {
+    File testFile = temp.newFile();
+    String writeSchemaString = "{\n" +
+        "  \"type\": \"record\",\n" +
+        "  \"name\": \"Root\",\n" +
+        "  \"fields\": [\n" +
+        "    {\n" +
+        "      \"name\": \"OuterRecord1\",\n" +
+        "      \"type\": [\n" +
+        "        \"null\",\n" +
+        "        {\n" +
+        "          \"type\": \"record\",\n" +
+        "          \"name\": \"InnerElement\",\n" +
+        "          \"fields\": [\n" +
+        "            {\n" +
+        "              \"name\": \"InnerField\",\n" +
+        "              \"type\": {\n" +
+        "                \"type\": \"record\",\n" +
+        "                \"name\": \"InnerField1\",\n" +
+        "                \"fields\": [\n" +
+        "                  {\n" +
+        "                    \"name\": \"InnerField1Param\",\n" +
+        "                    \"type\": [\n" +
+        "                      \"null\",\n" +
+        "                      \"string\"\n" +
+        "                    ],\n" +
+        "                    \"default\": null\n" +
+        "                  }\n" +
+        "                ]\n" +
+        "              },\n" +
+        "              \"default\": {\n" +
+        "                \"InnerField1Param\": null\n" +
+        "              }\n" +
+        "            }\n" +
+        "          ]\n" +
+        "        }\n" +
+        "      ],\n" +
+        "      \"default\": null\n" +
+        "    },\n" +
+        "    {\n" +
+        "      \"name\": \"InnerElementV2\",\n" +
+        "      \"type\": [\n" +
+        "        \"null\",\n" +
+        "        {\n" +
+        "          \"type\": \"record\",\n" +
+        "          \"name\": \"InnerElementV2\",\n" +
+        "          \"fields\": [\n" +
+        "            {\n" +
+        "              \"name\": \"InnerField2\",\n" +
+        "              \"type\": {\n" +
+        "                \"type\": \"array\",\n" +
+        "                \"items\": \"InnerElement\"\n" +
+        "              },\n" +
+        "              \"default\": []\n" +
+        "            }\n" +
+        "          ]\n" +
+        "        }\n" +
+        "      ],\n" +
+        "      \"default\": null\n" +
+        "    }\n" +
+        "  ]\n" +
+        "}";
+    org.apache.avro.Schema writeSchema = new org.apache.avro.Schema.Parser().parse(writeSchemaString);
+    org.apache.iceberg.Schema icebergWriteSchema = AvroSchemaUtil.toIceberg(writeSchema);
+    Assert.assertThrows(
+        NullPointerException.class, () -> RandomData.generateList(icebergWriteSchema, 2, 0L));
+  }
+
+
+  /*
+   * Test nested array with default null on complex types.
+   * This test differs from testNestedArrayWithDefaultNullOnComplexTypes in the type
+   * of InnerField1Param; when it is a primitive type, no NPE is thrown when operating on the table.
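+   * Primitive defaults are handled by JacksonUtils directly, so no sentinel conversion is needed.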
+   */
+  @Test
+  public void testNestedArrayWithDefaultNullOnPrimitiveTypes() throws IOException {
+    File testFile = temp.newFile();
+    String writeSchemaString = "{\n" +
+        "  \"type\": \"record\",\n" +
+        "  \"name\": \"Root\",\n" +
+        "  \"fields\": [\n" +
+        "    {\n" +
+        "      \"name\": \"OuterRecord1\",\n" +
+        "      \"type\": [\n" +
+        "        \"null\",\n" +
+        "        {\n" +
+        "          \"type\": \"record\",\n" +
+        "          \"name\": \"InnerElement\",\n" +
+        "          \"fields\": [\n" +
+        "            {\n" +
+        "              \"name\": \"InnerField\",\n" +
+        "              \"type\": {\n" +
+        "                \"type\": \"record\",\n" +
+        "                \"name\": \"InnerField1\",\n" +
+        "                \"fields\": [\n" +
+        "                  {\n" +
+        "                    \"name\": \"InnerField1Param\",\n" +
+        "                    \"type\": \"int\",\n" +
+        "                    \"default\": 1\n" +
+        "                  }\n" +
+        "                ]\n" +
+        "              }\n" +
+        "            }\n" +
+        "          ]\n" +
+        "        }\n" +
+        "      ],\n" +
+        "      \"default\": null\n" +
+        "    },\n" +
+        "    {\n" +
+        "      \"name\": \"InnerElementV2\",\n" +
+        "      \"type\": [\n" +
+        "        \"null\",\n" +
+        "        {\n" +
+        "          \"type\": \"record\",\n" +
+        "          \"name\": \"InnerElementV2\",\n" +
+        "          \"fields\": [\n" +
+        "            {\n" +
+        "              \"name\": \"InnerField2\",\n" +
+        "              \"type\": {\n" +
+        "                \"type\": \"array\",\n" +
+        "                \"items\": \"InnerElement\"\n" +
+        "              },\n" +
+        "              \"default\": []\n" +
+        "            }\n" +
+        "          ]\n" +
+        "        }\n" +
+        "      ],\n" +
+        "      \"default\": null\n" +
+        "    }\n" +
+        "  ]\n" +
+        "}";
+    org.apache.avro.Schema writeSchema = new org.apache.avro.Schema.Parser().parse(writeSchemaString);
+    org.apache.iceberg.Schema icebergWriteSchema = AvroSchemaUtil.toIceberg(writeSchema);
+
+    List<GenericData.Record> expected = RandomData.generateList(icebergWriteSchema, 2, 0L);
+
+
+    Assert.assertTrue("Delete should succeed", testFile.delete());
+
+    // write records with initial writeSchema
+    try (FileAppender<GenericData.Record> writer = Avro.write(Files.localOutput(testFile))
+        .schema(icebergWriteSchema)
+        .named("test")
+        .build()) {
+      for (GenericData.Record rec : expected) {
+        writer.add(rec);
+      }
+    }
+  }
 }

From 7f685e63046417877d65a2d990348f3f2391005e Mon Sep 17 00:00:00 2001
From: Zhe Liu <770120041@qq.com>
Date: Thu, 30 Nov 2023 15:34:59 -0500
Subject: [PATCH 3/4] Fix test

---
 ...rkAvroReaderForFieldsWithDefaultValue.java | 21 +++++++------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReaderForFieldsWithDefaultValue.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReaderForFieldsWithDefaultValue.java
index f7a3c772e..b35bc63d1 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReaderForFieldsWithDefaultValue.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReaderForFieldsWithDefaultValue.java
@@ -203,7 +203,7 @@ public void testAvroDefaultValues() throws IOException {
    * Test nested array with default null on complex types.
    * If the table contains non-primitive Avro types (InnerElement in the test below)
    * as the first field and arrays of InnerElement as the second field,
-   * it leads to a NullPointerException when operating on the table.
+   * it previously led to a NullPointerException when operating on the table.
    */
   @Test
   public void testNestedArrayWithDefaultNullOnComplexTypes() throws IOException {
@@ -270,15 +270,18 @@ public void testNestedArrayWithDefaultNullOnComplexTypes() throws IOException {
         "}";
     org.apache.avro.Schema writeSchema = new org.apache.avro.Schema.Parser().parse(writeSchemaString);
     org.apache.iceberg.Schema icebergWriteSchema = AvroSchemaUtil.toIceberg(writeSchema);
-    Assert.assertThrows(
-        NullPointerException.class, () -> RandomData.generateList(icebergWriteSchema, 2, 0L));
+
+    List<GenericData.Record> expected = RandomData.generateList(icebergWriteSchema, 2, 0L);
+
+
+    Assert.assertTrue("Delete should succeed", testFile.delete());
   }
 
   /*
    * Test nested array with default null on complex types.
    * This test differs from testNestedArrayWithDefaultNullOnComplexTypes in the type
-   * of InnerField1Param; when it is a primitive type, no NPE is thrown when operating on the table.
+   * of InnerField1Param, which is a primitive type in this test.
    * Primitive defaults are handled by JacksonUtils directly, so no sentinel conversion is needed.
    */
   @Test
   public void testNestedArrayWithDefaultNullOnPrimitiveTypes() throws IOException {
@@ -344,16 +347,6 @@ public void testNestedArrayWithDefaultNullOnPrimitiveTypes() throws IOException
 
     Assert.assertTrue("Delete should succeed", testFile.delete());
-
-    // write records with initial writeSchema
-    try (FileAppender<GenericData.Record> writer = Avro.write(Files.localOutput(testFile))
-        .schema(icebergWriteSchema)
-        .named("test")
-        .build()) {
-      for (GenericData.Record rec : expected) {
-        writer.add(rec);
-      }
-    }
   }
 }

From f4b36e3b74b4194dccb48a05f8c96d0fa9870b8e Mon Sep 17 00:00:00 2001
From: Zhe Liu <770120041@qq.com>
Date: Tue, 5 Dec 2023 14:24:20 -0500
Subject: [PATCH 4/4] Add tests

---
 .../java/org/apache/iceberg/SchemaParser.java |  2 +-
 .../org/apache/iceberg/avro/TypeToSchema.java | 16 ++--
 .../iceberg/avro/TestSchemaConversions.java   | 33 +++++++
 ...rkAvroReaderForFieldsWithDefaultValue.java | 74 +++++++++++++++++--
 4 files changed, 113 insertions(+), 12 deletions(-)

diff --git a/core/src/main/java/org/apache/iceberg/SchemaParser.java b/core/src/main/java/org/apache/iceberg/SchemaParser.java
index 3c5ce348f..90692d99a 100644
--- a/core/src/main/java/org/apache/iceberg/SchemaParser.java
+++ b/core/src/main/java/org/apache/iceberg/SchemaParser.java
@@ -303,7 +303,7 @@ private static String defaultValueToJsonString(byte[] bytes) {
 
   private static String defaultValueToJsonString(Object value) {
     try {
-      return JsonUtil.mapper().writeValueAsString(value);
+      return JsonUtil.mapper().writeValueAsString(AvroSchemaUtil.convertToJavaDefaultValue(value));
     } catch (JsonProcessingException e) {
       throw new RuntimeException(e);
     }

diff --git a/core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java b/core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java
index b9a7c5faa..7c5f3bf7a 100644
--- a/core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java
+++ b/core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java
@@ -106,7 +106,7 @@ public Schema struct(Types.StructType struct, List<Schema> fieldSchemas) {
       Object defaultValue = structField.hasDefaultValue() ? structField.getDefaultValue() :
           (structField.isOptional() ? JsonProperties.NULL_VALUE : null);
       Schema.Field field = new Schema.Field(fieldName, fieldSchemas.get(i), structField.doc(),
-          convertToJsonNull(defaultValue));
+          convertComplexNullToJsonNull(defaultValue));
       if (!isValidFieldName) {
         field.addProp(AvroSchemaUtil.ICEBERG_FIELD_NAME_PROP, origFieldName);
       }
@@ -238,13 +238,15 @@ public Schema primitive(Type.PrimitiveType primitive) {
 
   // This function ensures that all nested null values are converted to JsonProperties.NULL_VALUE
   // to make sure JacksonUtils.toJsonNode() converts them properly.
   // (Avro treats a plain Java null default as "no default"; JsonProperties.NULL_VALUE is the
   // sentinel for an explicit JSON null, so nested nulls inside map and list defaults must be
   // rewritten here before the field is constructed.)
-  private Object convertToJsonNull(Object defaultValue) {
+  private Object convertComplexNullToJsonNull(Object defaultValue) {
     if (defaultValue instanceof Map) {
       for (Map.Entry entry : ((Map) defaultValue).entrySet()) {
         if (entry.getValue() instanceof Map || entry.getValue() instanceof Collection) {
-          entry.setValue(convertToJsonNull(entry.getValue()));
+          entry.setValue(convertComplexNullToJsonNull(entry.getValue()));
         } else {
-          entry.setValue(JsonProperties.NULL_VALUE);
+          if (entry.getValue() == null) {
+            entry.setValue(JsonProperties.NULL_VALUE);
+          }
         }
       }
       return defaultValue;
     } else if (defaultValue instanceof List) {
@@ -254,9 +256,11 @@ private Object convertToJsonNull(Object defaultValue) {
 
       for (Object element : originalList) {
         if (element instanceof Map || element instanceof Collection) {
-          copiedList.add(convertToJsonNull(element));
-        } else {
+          copiedList.add(convertComplexNullToJsonNull(element));
+        } else if (element == null) {
           copiedList.add(JsonProperties.NULL_VALUE);
+        } else {
+          copiedList.add(element);
         }
       }
       return copiedList;

diff --git a/core/src/test/java/org/apache/iceberg/avro/TestSchemaConversions.java b/core/src/test/java/org/apache/iceberg/avro/TestSchemaConversions.java
index 560a54b70..9b86b1230 100644
--- a/core/src/test/java/org/apache/iceberg/avro/TestSchemaConversions.java
+++ b/core/src/test/java/org/apache/iceberg/avro/TestSchemaConversions.java
@@ -584,4 +584,37 @@ public void testConversionOfRecordWithNestedSubElement() {
     String jSchema = SchemaParser.toJson(iSchema);
     org.apache.iceberg.Schema roundTripiSchema = SchemaParser.fromJson(jSchema);
   }
+
+  @Test
+  public void testConversionOfRecordWithNestedSubElementWithNotNullDefaultValue() {
+    String schemaString = "{\n" +
+        "  \"type\": \"record\",\n" +
+        "  \"name\": \"OuterRecord\",\n" +
+        "  \"fields\": [\n" +
+        "    {\n" +
+        "      \"name\": \"nestedRecord\",\n" +
+        "      \"type\": {\n" +
+        "        \"type\": \"record\",\n" +
+        "        \"name\": \"InnerRecord\",\n" +
+        "        \"fields\": [\n" +
+        "          {\n" +
+        "            \"name\": \"myArray\",\n" +
+        "            \"type\": {\n" +
+        "              \"type\": \"array\",\n" +
+        "              \"items\": \"int\"\n" +
+        "            },\n" +
+        "            \"default\": [1, 2, 3]\n" +
+        "          }\n" +
+        "        ],\n" +
+        "        \"default\": {\"myArray\": [1, 2, 3]}\n" +
+        "      },\n" +
+        "      \"default\": {\"myArray\": [1, 2, 3]}\n" +
+        "    }\n" +
+        "  ],\n" +
+        "  \"default\": {\"nestedRecord\": {\"myArray\": [1, 2, 3]}}\n" +
+        "}";
+    Schema schema = new Schema.Parser().parse(schemaString);
+    org.apache.iceberg.Schema iSchema = AvroSchemaUtil.toIceberg(schema);
+    String jSchema = SchemaParser.toJson(iSchema);
+    org.apache.iceberg.Schema roundTripiSchema = SchemaParser.fromJson(jSchema);
+  }
 }

diff --git a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReaderForFieldsWithDefaultValue.java b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReaderForFieldsWithDefaultValue.java
index b35bc63d1..66881d95a 100644
--- a/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReaderForFieldsWithDefaultValue.java
+++ b/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkAvroReaderForFieldsWithDefaultValue.java
@@ -203,7 +203,7 @@ public void testAvroDefaultValues() throws IOException {
    * Test nested array with default null on complex types.
    * If the table contains non-primitive Avro types (InnerElement in the test below)
    * as the first field and arrays of InnerElement as the second field,
-   * it previously led to a NullPointerException when operating on the table.
+   * it leads to a NullPointerException when operating on the table.
    */
   @Test
   public void testNestedArrayWithDefaultNullOnComplexTypes() throws IOException {
@@ -270,18 +270,25 @@ public void testNestedArrayWithDefaultNullOnComplexTypes() throws IOException {
         "}";
     org.apache.avro.Schema writeSchema = new org.apache.avro.Schema.Parser().parse(writeSchemaString);
     org.apache.iceberg.Schema icebergWriteSchema = AvroSchemaUtil.toIceberg(writeSchema);
-
     List<GenericData.Record> expected = RandomData.generateList(icebergWriteSchema, 2, 0L);
-
-
     Assert.assertTrue("Delete should succeed", testFile.delete());
+
+    // write records with initial writeSchema
+    try (FileAppender<GenericData.Record> writer = Avro.write(Files.localOutput(testFile))
+        .schema(icebergWriteSchema)
+        .named("test")
+        .build()) {
+      for (GenericData.Record rec : expected) {
+        writer.add(rec);
+      }
+    }
   }
 
   /*
    * Test nested array with default null on complex types.
    * This test differs from testNestedArrayWithDefaultNullOnComplexTypes in the type
-   * of InnerField1Param, which is a primitive type in this test.
+   * of InnerField1Param; when it is a primitive type, no NPE is thrown when operating on the table.
    * Primitive defaults are handled by JacksonUtils directly, so no sentinel conversion is needed.
    */
   @Test
   public void testNestedArrayWithDefaultNullOnPrimitiveTypes() throws IOException {
@@ -344,6 +354,63 @@ public void testNestedArrayWithDefaultNullOnPrimitiveTypes() throws IOException
 
     Assert.assertTrue("Delete should succeed", testFile.delete());
+
+    // write records with initial writeSchema
+    try (FileAppender<GenericData.Record> writer = Avro.write(Files.localOutput(testFile))
+        .schema(icebergWriteSchema)
+        .named("test")
+        .build()) {
+      for (GenericData.Record rec : expected) {
+        writer.add(rec);
+      }
+    }
   }
+
+  @Test
+  public void testNestedArrayWithDefaultNullOnArrayTypes() throws IOException {
+    String writeSchemaString = "{\n" +
+        "  \"type\": \"record\",\n" +
+        "  \"name\": \"OuterRecord\",\n" +
+        "  \"fields\": [\n" +
+        "    {\n" +
+        "      \"name\": \"nestedRecord\",\n" +
+        "      \"type\": {\n" +
+        "        \"type\": \"record\",\n" +
+        "        \"name\": \"InnerRecord\",\n" +
+        "        \"fields\": [\n" +
+        "          {\n" +
+        "            \"name\": \"myArray\",\n" +
+        "            \"type\": {\n" +
+        "              \"type\": \"array\",\n" +
+        "              \"items\": \"int\"\n" +
+        "            },\n" +
+        "            \"default\": [1, 2, 3]\n" +
+        "          }\n" +
+        "        ],\n" +
+        "        \"default\": {\"myArray\": [1, 2, 3]}\n" +
+        "      },\n" +
+        "      \"default\": {\"myArray\": [1, 2, 3]}\n" +
+        "    }\n" +
+        "  ],\n" +
+        "  \"default\": {\"nestedRecord\": {\"myArray\": [1, 2, 3]}}\n" +
+        "}";
+    org.apache.avro.Schema writeSchema = new org.apache.avro.Schema.Parser().parse(writeSchemaString);
+    org.apache.iceberg.Schema icebergWriteSchema = AvroSchemaUtil.toIceberg(writeSchema);
+
+    List<GenericData.Record> expected = RandomData.generateList(icebergWriteSchema, 2, 0L);
+
+    File testFile = temp.newFile();
+    Assert.assertTrue("Delete should succeed", testFile.delete());
+
+    // write records with initial writeSchema
+    try (FileAppender<GenericData.Record> writer = Avro.write(Files.localOutput(testFile))
+        .schema(icebergWriteSchema)
+        .named("test")
+        .build()) {
+      for (GenericData.Record rec : expected) {
+        writer.add(rec);
+      }
+    }
+  }
 }
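
---

Note: a minimal sketch of the schema round trip this series exercises. It assumes the
patched TypeToSchema and SchemaParser from these patches are on the classpath; the
schema literal and variable names are illustrative, not taken from the patches.

    // An Avro record whose field default contains a nested JSON null.
    String json =
        "{\"type\": \"record\", \"name\": \"R\", \"fields\": [{\"name\": \"f\", " +
        "\"type\": {\"type\": \"record\", \"name\": \"I\", \"fields\": [{\"name\": \"p\", " +
        "\"type\": [\"null\", \"string\"], \"default\": null}]}, " +
        "\"default\": {\"p\": null}}]}";
    org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(json);

    // Convert to an Iceberg schema and round-trip it through JSON. Before this series,
    // a nested null default on a complex type could surface as a NullPointerException
    // on this path; with the patches applied, the round trip completes.
    org.apache.iceberg.Schema icebergSchema = AvroSchemaUtil.toIceberg(avroSchema);
    String icebergJson = SchemaParser.toJson(icebergSchema);
    org.apache.iceberg.Schema roundTrip = SchemaParser.fromJson(icebergJson);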