From c87faf82a410a71751fdf85391c0dca20088b7d6 Mon Sep 17 00:00:00 2001 From: Curt Hagenlocher Date: Sun, 10 Dec 2023 18:01:35 -0800 Subject: [PATCH] GH-36588: [C#] Support blank column names and enable more integration tests. (#39167) ### What changes are included in this PR? Allows field names to be blank (but not null). Enables schema metadata to be read from JSON integration tests. Enables integration tests for cases that are now working. ### Are these changes tested? Yes. * Closes: #36588 Authored-by: Curt Hagenlocher Signed-off-by: Curt Hagenlocher --- csharp/src/Apache.Arrow/Field.Builder.cs | 3 +- csharp/src/Apache.Arrow/Field.cs | 11 ++--- .../src/Apache.Arrow/Ipc/MessageSerializer.cs | 9 ++-- csharp/src/Apache.Arrow/Types/MapType.cs | 48 +++++++++++++++++-- .../Apache.Arrow.IntegrationTest/JsonFile.cs | 6 +++ .../CDataInterfacePythonTests.cs | 44 +++++++++++++++++ dev/archery/archery/integration/datagen.py | 6 +-- 7 files changed, 105 insertions(+), 22 deletions(-) diff --git a/csharp/src/Apache.Arrow/Field.Builder.cs b/csharp/src/Apache.Arrow/Field.Builder.cs index 1e7aa192e08b4..f7f33383e2adc 100644 --- a/csharp/src/Apache.Arrow/Field.Builder.cs +++ b/csharp/src/Apache.Arrow/Field.Builder.cs @@ -30,14 +30,13 @@ public class Builder public Builder() { - _name = string.Empty; _type = NullType.Default; _nullable = true; } public Builder Name(string value) { - if (string.IsNullOrWhiteSpace(value)) + if (value == null) { throw new ArgumentNullException(nameof(value)); } diff --git a/csharp/src/Apache.Arrow/Field.cs b/csharp/src/Apache.Arrow/Field.cs index 4fddd1bc4e2de..ac3cafac93e59 100644 --- a/csharp/src/Apache.Arrow/Field.cs +++ b/csharp/src/Apache.Arrow/Field.cs @@ -35,24 +35,23 @@ public partial class Field public Field(string name, IArrowType dataType, bool nullable, IEnumerable> metadata = default) - : this(name, dataType, nullable, false) + : this(name, dataType, nullable) { Metadata = metadata?.ToDictionary(kv => kv.Key, kv => kv.Value); - } internal Field(string name, IArrowType dataType, bool nullable, - IReadOnlyDictionary metadata, bool copyCollections, bool allowBlankName) - : this(name, dataType, nullable, allowBlankName) + IReadOnlyDictionary metadata, bool copyCollections) + : this(name, dataType, nullable) { Debug.Assert(copyCollections == false, "This internal constructor is to not copy the collections."); Metadata = metadata; } - private Field(string name, IArrowType dataType, bool nullable, bool allowBlankName) + private Field(string name, IArrowType dataType, bool nullable) { - if (name == null || (!allowBlankName && string.IsNullOrWhiteSpace(name))) + if (name == null) { throw new ArgumentNullException(nameof(name)); } diff --git a/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs b/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs index 3f504cf3b975a..633554fc53261 100644 --- a/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs +++ b/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs @@ -59,7 +59,7 @@ internal static Schema GetSchema(Flatbuf.Schema schema, ref DictionaryMemo dicti for (int i = 0; i < schema.FieldsLength; i++) { Flatbuf.Field field = schema.Fields(i).GetValueOrDefault(); - fields.Add(FieldFromFlatbuffer(field, ref dictionaryMemo, allowBlankName: false)); + fields.Add(FieldFromFlatbuffer(field, ref dictionaryMemo)); } Dictionary metadata = schema.CustomMetadataLength > 0 ? new Dictionary() : null; @@ -73,14 +73,13 @@ internal static Schema GetSchema(Flatbuf.Schema schema, ref DictionaryMemo dicti return new Schema(fields, metadata, copyCollections: false); } - private static Field FieldFromFlatbuffer(Flatbuf.Field flatbufField, ref DictionaryMemo dictionaryMemo, bool allowBlankName) + private static Field FieldFromFlatbuffer(Flatbuf.Field flatbufField, ref DictionaryMemo dictionaryMemo) { - bool allowBlankNameChild = flatbufField.ChildrenLength == 1 && flatbufField.TypeType == Flatbuf.Type.FixedSizeList; Field[] childFields = flatbufField.ChildrenLength > 0 ? new Field[flatbufField.ChildrenLength] : null; for (int i = 0; i < flatbufField.ChildrenLength; i++) { Flatbuf.Field? childFlatbufField = flatbufField.Children(i); - childFields[i] = FieldFromFlatbuffer(childFlatbufField.Value, ref dictionaryMemo, allowBlankNameChild); + childFields[i] = FieldFromFlatbuffer(childFlatbufField.Value, ref dictionaryMemo); } Flatbuf.DictionaryEncoding? dictionaryEncoding = flatbufField.Dictionary; @@ -104,7 +103,7 @@ private static Field FieldFromFlatbuffer(Flatbuf.Field flatbufField, ref Diction metadata[keyValue.Key] = keyValue.Value; } - var arrowField = new Field(flatbufField.Name, type, flatbufField.Nullable, metadata, copyCollections: false, allowBlankName); + var arrowField = new Field(flatbufField.Name, type, flatbufField.Nullable, metadata, copyCollections: false); if (dictionaryEncoding.HasValue) { diff --git a/csharp/src/Apache.Arrow/Types/MapType.cs b/csharp/src/Apache.Arrow/Types/MapType.cs index 73112c815bfbf..47e0be48f9f78 100644 --- a/csharp/src/Apache.Arrow/Types/MapType.cs +++ b/csharp/src/Apache.Arrow/Types/MapType.cs @@ -13,12 +13,17 @@ // See the License for the specific language governing permissions and // limitations under the License. +using System; using System.Collections.Generic; namespace Apache.Arrow.Types { public sealed class MapType : NestedType // MapType = ListType(StructType("key", "value")) { + private const string EntriesKey = "entries"; + private const string KeyKey = "key"; + private const string ValueKey = "value"; + public override ArrowTypeId TypeId => ArrowTypeId.Map; public override string Name => "map"; public readonly bool KeySorted; @@ -28,20 +33,23 @@ public sealed class MapType : NestedType // MapType = ListType(StructType("key", public Field ValueField => KeyValueType.Fields[1]; public MapType(IArrowType key, IArrowType value, bool nullable = true, bool keySorted = false) - : this(new Field("key", key, false), new Field("value", value, nullable), keySorted) + : base(Entries(key, value, nullable)) { + KeySorted = keySorted; } public MapType(Field key, Field value, bool keySorted = false) - : this(new StructType(new List() { key, value }), keySorted) + : base(Entries(key, value)) { + KeySorted = keySorted; } - public MapType(StructType entries, bool keySorted = false) : this(new Field("entries", entries, false), keySorted) + public MapType(StructType entries, bool keySorted = false) : base(Entries(entries)) { + KeySorted = keySorted; } - public MapType(Field entries, bool keySorted = false) : base(entries) + public MapType(Field entries, bool keySorted = false) : base(Entries(entries)) { KeySorted = keySorted; } @@ -54,5 +62,37 @@ public MapType UnsortedKey() return new MapType(Fields[0], keySorted: false); } + + private static Field Entries(IArrowType key, IArrowType value, bool nullable) => + new Field(EntriesKey, NewStruct(new Field(KeyKey, key, false), new Field(ValueKey, value, nullable)), false); + + private static Field Entries(Field key, Field value) => + new Field(EntriesKey, NewStruct(NamedField(KeyKey, key), NamedField(ValueKey, value)), false); + + private static Field Entries(StructType entries) + { + return new Field(EntriesKey, Struct(entries), false); + } + + private static StructType NewStruct(Field key, Field value) => new StructType(new[] { key, value }); + + private static StructType Struct(StructType entries) + { + Field key = NamedField(KeyKey, entries.Fields[0]); + Field value = NamedField(ValueKey, entries.Fields[1]); + return object.ReferenceEquals(key, entries.Fields[0]) && object.ReferenceEquals(value, entries.Fields[1]) ? entries : NewStruct(key, value); + } + + private static Field Entries(Field entries) + { + StructType structType = (StructType)entries.DataType; + StructType adjustedStruct = Struct(structType); + return StringComparer.Ordinal.Equals(entries.Name, EntriesKey) && object.ReferenceEquals(structType, adjustedStruct) ? entries : new Field(EntriesKey, adjustedStruct, false); + } + + private static Field NamedField(string name, Field field) + { + return StringComparer.Ordinal.Equals(name, field.Name) ? field : new Field(name, field.DataType, field.IsNullable, field.Metadata); + } } } diff --git a/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs b/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs index 51bcf6dd7583e..f3fe73588a7bb 100644 --- a/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs +++ b/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs @@ -120,6 +120,12 @@ private static Schema CreateSchema(JsonSchema jsonSchema, Dictionary CreateField(f, jsonSchema.Fields[i], dictionaryIndexes)); } + + if (jsonSchema.Metadata != null) + { + builder.Metadata(jsonSchema.Metadata); + } + return builder.Build(); } diff --git a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs index a3b53a40db064..83902d8d93c70 100644 --- a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs +++ b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs @@ -829,6 +829,50 @@ public unsafe void ExportArrayStream() CArrowArrayStream.Free(cArrayStream); } + [SkippableFact] + public unsafe void ImportRecordBatchFromBuffer() + { + using (Py.GIL()) + { + dynamic pa = Py.Import("pyarrow"); + dynamic batch = pa.record_batch( + new PyList(new PyObject[] + { + pa.array(new long?[] { 1, 2, 3, null, 5 }), + pa.array(new[] { "hello", "world", null, "foo", "bar" }), + pa.array(new[] { 0.0, 1.4, 2.5, 3.6, 4.7 }), + pa.array(new bool?[] { true, false, null, false, true }), + }), + new[] { null, "", "column", "column" }); + + dynamic sink = pa.BufferOutputStream(); + dynamic writer = pa.ipc.new_stream(sink, batch.schema); + writer.write_batch(batch); + ((IDisposable)writer).Dispose(); + + dynamic buf = sink.getvalue(); + // buf.address, buf.size + + IntPtr address = (IntPtr)(long)buf.address; + int size = buf.size; + byte[] buffer = new byte[size]; + byte* ptr = (byte*)address.ToPointer(); + for (int i = 0; i < size; i++) + { + buffer[i] = ptr[i]; + } + + using (ArrowStreamReader reader = new ArrowStreamReader(new ReadOnlyMemory(buffer))) + { + RecordBatch batch2 = reader.ReadNextRecordBatch(); + Assert.Equal("None", batch2.Schema.FieldsList[0].Name); + Assert.Equal("", batch2.Schema.FieldsList[1].Name); + Assert.Equal("column", batch2.Schema.FieldsList[2].Name); + Assert.Equal("column", batch2.Schema.FieldsList[3].Name); + } + } + } + private static PyObject List(params int?[] values) { return new PyList(values.Select(i => i == null ? PyObject.None : new PyInt(i.Value)).ToArray()); diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index eca1effff6fbe..29b203ae130c6 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1812,7 +1812,6 @@ def _temp_path(): generate_map_case(), generate_non_canonical_map_case() - .skip_tester('C#') .skip_tester('Java') # TODO(ARROW-8715) # Canonical map names are restored on import, so the schemas are unequal .skip_format(SKIP_C_SCHEMA, 'C++'), @@ -1827,11 +1826,9 @@ def _temp_path(): generate_unions_case(), - generate_custom_metadata_case() - .skip_tester('C#'), + generate_custom_metadata_case(), generate_duplicate_fieldnames_case() - .skip_tester('C#') .skip_tester('JS'), generate_dictionary_case(), @@ -1856,7 +1853,6 @@ def _temp_path(): .skip_tester('Rust'), generate_extension_case() - .skip_tester('C#') # TODO: ensure the extension is registered in the C++ entrypoint .skip_format(SKIP_C_SCHEMA, 'C++') .skip_format(SKIP_C_ARRAY, 'C++'),