Skip to content

Commit

Permalink
apacheGH-41231: [C#] Slice values array when writing a sliced list vi…
Browse files Browse the repository at this point in the history
…ew array to IPC format (apache#41255)

### Rationale for this change

Reduces IPC file sizes when writing sliced list view arrays.

### What changes are included in this PR?

Updates `ArrowSreamWriter` so it only writes the required range of values for a list view array, and adjusts the offset values accordingly.

### Are these changes tested?

Yes, this is covered by existing tests and I've also added a new test to verify the behaviour with list view arrays that have unordered offsets.

### Are there any user-facing changes?

Yes, this might reduce IPC file sizes for users writing sliced data.
* GitHub Issue: apache#41231

Lead-authored-by: Adam Reeve <[email protected]>
Co-authored-by: Curt Hagenlocher <[email protected]>
Signed-off-by: Curt Hagenlocher <[email protected]>
  • Loading branch information
adamreeve and CurtHagenlocher authored Apr 19, 2024
1 parent bfc0dcb commit 2d977e4
Show file tree
Hide file tree
Showing 3 changed files with 153 additions and 8 deletions.
64 changes: 62 additions & 2 deletions csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -184,11 +184,19 @@ public void Visit(ListArray array)

public void Visit(ListViewArray array)
{
var (valueOffsetsBuffer, minOffset, maxEnd) = GetZeroBasedListViewOffsets(array);

_buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length));
_buffers.Add(CreateSlicedBuffer<int>(array.ValueOffsetsBuffer, array.Offset, array.Length));
_buffers.Add(CreateBuffer(valueOffsetsBuffer));
_buffers.Add(CreateSlicedBuffer<int>(array.SizesBuffer, array.Offset, array.Length));

VisitArray(array.Values);
IArrowArray values = array.Values;
if (minOffset != 0 || values.Length != maxEnd)
{
values = ArrowArrayFactory.Slice(values, minOffset, maxEnd - minOffset);
}

VisitArray(values);
}

public void Visit(FixedSizeListArray array)
Expand Down Expand Up @@ -319,6 +327,58 @@ private ArrowBuffer GetZeroBasedValueOffsets(ArrowBuffer valueOffsetsBuffer, int
}
}

private (ArrowBuffer Buffer, int minOffset, int maxEnd) GetZeroBasedListViewOffsets(ListViewArray array)
{
if (array.Length == 0)
{
return (ArrowBuffer.Empty, 0, 0);
}

var offsets = array.ValueOffsets;
var sizes = array.Sizes;

int minOffset = offsets[0];
int maxEnd = offsets[array.Length - 1] + sizes[array.Length - 1];

// Min possible offset is zero, and max possible end is the values length.
// If these match the first offset and last end we don't need to do anything further,
// but otherwise we need to iterate over each index in case the offsets aren't ordered.
if (minOffset != 0 || maxEnd != array.Values.Length)
{
for (int i = 0; i < array.Length; ++i)
{
minOffset = Math.Min(minOffset, offsets[i]);
maxEnd = Math.Max(maxEnd, offsets[i] + sizes[i]);
}
}

var requiredBytes = CalculatePaddedBufferLength(sizeof(int) * array.Length);

if (minOffset == 0)
{
// No need to adjust the offsets, but we may need to slice the offsets buffer.
ArrowBuffer buffer = array.ValueOffsetsBuffer;
if (array.Offset != 0 || buffer.Length > requiredBytes)
{
var byteOffset = sizeof(int) * array.Offset;
var sliceLength = Math.Min(requiredBytes, buffer.Length - byteOffset);
buffer = new ArrowBuffer(buffer.Memory.Slice(byteOffset, sliceLength));
}

return (buffer, minOffset, maxEnd);
}

// Compute shifted offsets
var newOffsetsBuffer = _allocator.Allocate(requiredBytes);
var newOffsets = newOffsetsBuffer.Memory.Span.CastTo<int>();
for (int i = 0; i < array.Length; ++i)
{
newOffsets[i] = offsets[i] - minOffset;
}

return (new ArrowBuffer(newOffsetsBuffer), minOffset, maxEnd);
}

private Buffer CreateBitmapBuffer(ArrowBuffer buffer, int offset, int length)
{
if (buffer.IsEmpty)
Expand Down
77 changes: 77 additions & 0 deletions csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,71 @@ public async Task WriteSlicedArrays(int sliceOffset, int sliceLength)
await ValidateRecordBatchFile(stream, slicedBatch, strictCompare: false);
}

[Theory]
[InlineData(0, 100)]
[InlineData(0, 50)]
[InlineData(50, 50)]
[InlineData(25, 50)]
public async Task WriteListViewDataWithUnorderedOffsets(int sliceOffset, int sliceLength)
{
// A list-view array doesn't require that offsets are ordered,
// so verify that we can round trip a list-view array with out-of-order offsets.
const int length = 100;
var random = new Random();

var randomizedIndices = Enumerable.Range(0, length).ToArray();
Shuffle(randomizedIndices, random);

var offsetsBuilder = new ArrowBuffer.Builder<int>().Resize(length);
var sizesBuilder = new ArrowBuffer.Builder<int>().Resize(length);
var validityBuilder = new ArrowBuffer.BitmapBuilder().Reserve(length);

var valuesLength = 0;
for (int i = 0; i < length; ++i)
{
var index = randomizedIndices[i];
var listLength = random.Next(0, 10);
offsetsBuilder.Span[index] = valuesLength;
sizesBuilder.Span[index] = listLength;
valuesLength += listLength;

validityBuilder.Append(random.NextDouble() < 0.9);
}

var valuesBuilder = new Int64Array.Builder().Reserve(valuesLength);
for (int i = 0; i < valuesLength; ++i)
{
valuesBuilder.Append(random.Next(0, 1_000));
}

var type = new ListViewType(new Int64Type());
var offsets = offsetsBuilder.Build();
var sizes = sizesBuilder.Build();
var values = valuesBuilder.Build();
var nullCount = validityBuilder.UnsetBitCount;
var validityBuffer = validityBuilder.Build();

IArrowArray listViewArray = new ListViewArray(
type, length, offsets, sizes, values, validityBuffer, nullCount);

if (sliceOffset != 0 || sliceLength != length)
{
listViewArray = ArrowArrayFactory.Slice(listViewArray, sliceOffset, sliceLength);
}

var recordBatch = new RecordBatch.Builder().Append("x", true, listViewArray).Build();

var stream = new MemoryStream();
var writer = new ArrowFileWriter(stream, recordBatch.Schema, leaveOpen: true);

await writer.WriteRecordBatchAsync(recordBatch);
await writer.WriteEndAsync();

stream.Position = 0;

await ValidateRecordBatchFile(stream, recordBatch, strictCompare: false);
}

private async Task ValidateRecordBatchFile(Stream stream, RecordBatch recordBatch, bool strictCompare = true)
{
var reader = new ArrowFileReader(stream);
Expand Down Expand Up @@ -245,5 +310,17 @@ public async Task WriteListArrayWithEmptyOffsets()

await ValidateRecordBatchFile(stream, recordBatch, strictCompare: false);
}

private static void Shuffle(int[] values, Random random)
{
var length = values.Length;
for (int i = 0; i < length - 1; ++i)
{
var j = random.Next(i, length);
var tmp = values[i];
values[i] = values[j];
values[j] = tmp;
}
}
}
}
20 changes: 14 additions & 6 deletions csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
Original file line number Diff line number Diff line change
Expand Up @@ -449,16 +449,24 @@ private void CompareArrays(ListViewArray actualArray)
Assert.Equal(expectedArray.Offset, actualArray.Offset);
Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span));
Assert.True(expectedArray.SizesBuffer.Span.SequenceEqual(actualArray.SizesBuffer.Span));
actualArray.Values.Accept(new ArrayComparer(expectedArray.Values, _strictCompare));
}
else
{
int start = expectedArray.Offset * sizeof(int);
int length = expectedArray.Length * sizeof(int);
Assert.True(expectedArray.ValueOffsetsBuffer.Span.Slice(start, length).SequenceEqual(actualArray.ValueOffsetsBuffer.Span.Slice(0, length)));
Assert.True(expectedArray.SizesBuffer.Span.Slice(start, length).SequenceEqual(actualArray.SizesBuffer.Span.Slice(0, length)));
for (int i = 0; i < actualArray.Length; ++i)
{
if (expectedArray.IsNull(i))
{
Assert.True(actualArray.IsNull(i));
}
else
{
var expectedList = expectedArray.GetSlicedValues(i);
var actualList = actualArray.GetSlicedValues(i);
actualList.Accept(new ArrayComparer(expectedList, _strictCompare));
}
}
}

actualArray.Values.Accept(new ArrayComparer(expectedArray.Values, _strictCompare));
}

private void CompareArrays(FixedSizeListArray actualArray)
Expand Down

0 comments on commit 2d977e4

Please sign in to comment.