From 4f772b327f707388b70ac280318ab95f60f0403b Mon Sep 17 00:00:00 2001 From: Phillip LeBlanc Date: Wed, 5 Jun 2024 17:02:10 +0900 Subject: [PATCH] Add support for BinaryArray in arrow-vtab (#324) * Add support for BinaryArray in arrow-vtab * Fix lint --- crates/duckdb/src/vtab/arrow.rs | 54 +++++++++++++++++++++++++++----- crates/duckdb/src/vtab/vector.rs | 20 ++++++++++-- 2 files changed, 65 insertions(+), 9 deletions(-) diff --git a/crates/duckdb/src/vtab/arrow.rs b/crates/duckdb/src/vtab/arrow.rs index f1b8e9fe..fa92e64c 100644 --- a/crates/duckdb/src/vtab/arrow.rs +++ b/crates/duckdb/src/vtab/arrow.rs @@ -6,9 +6,9 @@ use std::ptr::null_mut; use crate::vtab::vector::Inserter; use arrow::array::{ - as_boolean_array, as_large_list_array, as_list_array, as_primitive_array, as_string_array, as_struct_array, Array, - ArrayData, AsArray, BooleanArray, Decimal128Array, FixedSizeListArray, GenericListArray, OffsetSizeTrait, - PrimitiveArray, StringArray, StructArray, + as_boolean_array, as_generic_binary_array, as_large_list_array, as_list_array, as_primitive_array, as_string_array, + as_struct_array, Array, ArrayData, AsArray, BinaryArray, BooleanArray, Decimal128Array, FixedSizeListArray, + GenericListArray, OffsetSizeTrait, PrimitiveArray, StringArray, StructArray, }; use arrow::{ @@ -230,6 +230,9 @@ pub fn record_batch_to_duckdb_data_chunk( DataType::Utf8 => { string_array_to_vector(as_string_array(col.as_ref()), &mut chunk.flat_vector(i)); } + DataType::Binary => { + binary_array_to_vector(as_generic_binary_array(col.as_ref()), &mut chunk.flat_vector(i)); + } DataType::List(_) => { list_array_to_vector(as_list_array(col.as_ref()), &mut chunk.list_vector(i))?; } @@ -430,6 +433,15 @@ fn string_array_to_vector(array: &StringArray, out: &mut FlatVector) { } } +fn binary_array_to_vector(array: &BinaryArray, out: &mut FlatVector) { + assert!(array.len() <= out.capacity()); + + for i in 0..array.len() { + let s = array.value(i); + out.insert(i, s); + } +} + fn list_array_to_vector>( array: &GenericListArray, out: &mut ListVector, @@ -443,6 +455,9 @@ fn list_array_to_vector>( DataType::Utf8 => { string_array_to_vector(as_string_array(value_array.as_ref()), &mut child); } + DataType::Binary => { + binary_array_to_vector(as_generic_binary_array(value_array.as_ref()), &mut child); + } _ => { return Err("Nested list is not supported yet.".into()); } @@ -469,6 +484,9 @@ fn fixed_size_list_array_to_vector( DataType::Utf8 => { string_array_to_vector(as_string_array(value_array.as_ref()), &mut child); } + DataType::Binary => { + binary_array_to_vector(as_generic_binary_array(value_array.as_ref()), &mut child); + } _ => { return Err("Nested array is not supported yet.".into()); } @@ -493,6 +511,9 @@ fn struct_array_to_vector(array: &StructArray, out: &mut StructVector) -> Result DataType::Utf8 => { string_array_to_vector(as_string_array(column.as_ref()), &mut out.child(i)); } + DataType::Binary => { + binary_array_to_vector(as_generic_binary_array(column.as_ref()), &mut out.child(i)); + } DataType::List(_) => { list_array_to_vector(as_list_array(column.as_ref()), &mut out.list_vector_child(i))?; } @@ -560,10 +581,10 @@ mod test { use crate::{Connection, Result}; use arrow::{ array::{ - Array, ArrayRef, AsArray, Date32Array, Date64Array, Decimal256Array, FixedSizeListArray, Float64Array, - GenericListArray, Int32Array, ListArray, OffsetSizeTrait, PrimitiveArray, StringArray, StructArray, - Time32SecondArray, Time64MicrosecondArray, TimestampMicrosecondArray, TimestampMillisecondArray, - TimestampNanosecondArray, TimestampSecondArray, + Array, ArrayRef, AsArray, BinaryArray, Date32Array, Date64Array, Decimal256Array, FixedSizeListArray, + Float64Array, GenericListArray, Int32Array, ListArray, OffsetSizeTrait, PrimitiveArray, StringArray, + StructArray, Time32SecondArray, Time64MicrosecondArray, TimestampMicrosecondArray, + TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, }, buffer::{OffsetBuffer, ScalarBuffer}, datatypes::{i256, ArrowPrimitiveType, DataType, Field, Fields, Schema}, @@ -924,4 +945,23 @@ mod test { ) ); } + + #[test] + fn test_arrow_binary() { + let byte_array = BinaryArray::from_iter_values([b"test"].iter()); + let arc: ArrayRef = Arc::new(byte_array); + let batch = RecordBatch::try_from_iter(vec![("x", arc)]).unwrap(); + + let db = Connection::open_in_memory().unwrap(); + db.register_table_function::("arrow").unwrap(); + + let mut stmt = db.prepare("SELECT * FROM arrow(?, ?)").unwrap(); + + let mut arr = stmt.query_arrow(arrow_recordbatch_to_query_params(batch)).unwrap(); + let rb = arr.next().expect("no record batch"); + + let column = rb.column(0).as_any().downcast_ref::().unwrap(); + assert_eq!(column.len(), 1); + assert_eq!(column.value(0), b"test"); + } } diff --git a/crates/duckdb/src/vtab/vector.rs b/crates/duckdb/src/vtab/vector.rs index 030cf6ee..7de18578 100644 --- a/crates/duckdb/src/vtab/vector.rs +++ b/crates/duckdb/src/vtab/vector.rs @@ -7,8 +7,9 @@ use crate::ffi::{ duckdb_list_entry, duckdb_list_vector_get_child, duckdb_list_vector_get_size, duckdb_list_vector_reserve, duckdb_list_vector_set_size, duckdb_struct_type_child_count, duckdb_struct_type_child_name, duckdb_struct_vector_get_child, duckdb_validity_set_row_invalid, duckdb_vector, - duckdb_vector_assign_string_element, duckdb_vector_ensure_validity_writable, duckdb_vector_get_column_type, - duckdb_vector_get_data, duckdb_vector_get_validity, duckdb_vector_size, + duckdb_vector_assign_string_element, duckdb_vector_assign_string_element_len, + duckdb_vector_ensure_validity_writable, duckdb_vector_get_column_type, duckdb_vector_get_data, + duckdb_vector_get_validity, duckdb_vector_size, }; /// Vector trait. @@ -113,6 +114,21 @@ impl Inserter<&str> for FlatVector { } } +impl Inserter<&[u8]> for FlatVector { + fn insert(&self, index: usize, value: &[u8]) { + let value_size = value.len(); + unsafe { + // This function also works for binary data. https://duckdb.org/docs/api/c/api#duckdb_vector_assign_string_element_len + duckdb_vector_assign_string_element_len( + self.ptr, + index as u64, + value.as_ptr() as *const ::std::os::raw::c_char, + value_size as u64, + ); + } + } +} + /// A list vector. pub struct ListVector { /// ListVector does not own the vector pointer.