Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add and/or compute functions #481

Merged
merged 21 commits into from
Jul 22, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 8 additions & 6 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,12 @@ resolver = "2"
version = "0.1.0"
homepage = "https://github.com/fulcrum-so/vortex"
repository = "https://github.com/fulcrum-so/vortex"
authors = ["Robert Kruszewski <[email protected]>, Nicholas Gates <[email protected]>, Will Manning <[email protected]>"]
authors = [
"Robert Kruszewski <[email protected]>, Nicholas Gates <[email protected]>, Will Manning <[email protected]>",
]
license = "Apache-2.0"
keywords = ["vortex"]
include = [
"benches/*.rs",
"src/**/*.rs",
"Cargo.toml",
]
include = ["benches/*.rs", "src/**/*.rs", "Cargo.toml"]
edition = "2021"
rust-version = "1.76"

Expand All @@ -36,6 +34,7 @@ ahash = "0.8.11"
allocator-api2 = "0.2.16"
arrayref = "0.3.7"
arrow = { version = "52.0.0", features = ["pyarrow"] }
arrow-arith = "52.0.0"
arrow-array = "52.0.0"
arrow-buffer = "52.0.0"
arrow-cast = "52.0.0"
Expand Down Expand Up @@ -116,3 +115,6 @@ warnings = "deny"
[workspace.lints.clippy]
all = { level = "deny", priority = -1 }
or_fun_call = "deny"

[profile.release]
debug = true
AdamGS marked this conversation as resolved.
Show resolved Hide resolved
7 changes: 6 additions & 1 deletion vortex-array/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ arrow-cast = { workspace = true }
arrow-select = { workspace = true }
arrow-schema = { workspace = true }
arrow-ord = { workspace = true }
arrow-arith = { workspace = true }
bytes = "1"
AdamGS marked this conversation as resolved.
Show resolved Hide resolved
enum-iterator = { workspace = true }
flatbuffers = { workspace = true }
flexbuffers = { workspace = true }
Expand All @@ -43,7 +45,10 @@ vortex-dtype = { path = "../vortex-dtype", features = ["flatbuffers", "serde"] }
vortex-error = { path = "../vortex-error", features = ["flexbuffers"] }
vortex-expr = { path = "../vortex-expr" }
vortex-flatbuffers = { path = "../vortex-flatbuffers" }
vortex-scalar = { path = "../vortex-scalar", features = ["flatbuffers", "serde"] }
vortex-scalar = { path = "../vortex-scalar", features = [
"flatbuffers",
"serde",
] }
serde = { workspace = true, features = ["derive"] }

[target.'cfg(target_arch = "wasm32")'.dependencies]
Expand Down
56 changes: 54 additions & 2 deletions vortex-array/src/array/constant/compute.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
use std::cmp::Ordering;

use vortex_error::VortexResult;
use vortex_dtype::Nullability;
use vortex_error::{vortex_bail, VortexResult};
use vortex_scalar::Scalar;

use crate::array::constant::ConstantArray;
use crate::compute::boolean::{and, AndFn, OrFn};
use crate::compute::unary::scalar_at::ScalarAtFn;
use crate::compute::{ArrayCompute, SliceFn, TakeFn};
use crate::compute::{SearchResult, SearchSortedFn, SearchSortedSide};
use crate::{Array, IntoArray};
use crate::{Array, ArrayDType, AsArray, IntoArray, IntoArrayVariant};

impl ArrayCompute for ConstantArray {
fn scalar_at(&self) -> Option<&dyn ScalarAtFn> {
Expand All @@ -25,6 +27,14 @@ impl ArrayCompute for ConstantArray {
fn take(&self) -> Option<&dyn TakeFn> {
Some(self)
}

fn and(&self) -> Option<&dyn AndFn> {
Some(self)
}

fn or(&self) -> Option<&dyn OrFn> {
Some(self)
}
}

impl ScalarAtFn for ConstantArray {
Expand Down Expand Up @@ -58,6 +68,48 @@ impl SearchSortedFn for ConstantArray {
}
}

impl AndFn for ConstantArray {
fn and(&self, array: &Array) -> VortexResult<Array> {
constant_array_bool_impl(self, array, |(l, r)| l & r)
}
}

impl OrFn for ConstantArray {
fn or(&self, array: &Array) -> VortexResult<Array> {
constant_array_bool_impl(self, array, |(l, r)| l | r)
}
}

fn constant_array_bool_impl(
constant_array: &ConstantArray,
other: &Array,
bool_op: impl Fn((bool, bool)) -> bool,
) -> VortexResult<Array> {
if constant_array.dtype().is_boolean()
&& other.dtype().is_boolean()
&& constant_array.len() == other.len()
{
if let Ok(array) = ConstantArray::try_from(other.clone()) {
AdamGS marked this conversation as resolved.
Show resolved Hide resolved
let lhs = constant_array.scalar().value().as_bool()?;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe first check if either scalar is_null, and then I think the more canonical conversion would be bool::try_from(array.scalar())?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should chat through the scalar API next week, it's a bit weird, you might have some ideas to improve

let rhs = array.scalar().value().as_bool()?;

let scalar = match lhs.zip(rhs).map(bool_op) {
Some(b) => Scalar::bool(b, Nullability::Nullable),
None => Scalar::null(constant_array.dtype().as_nullable()),
};

Ok(ConstantArray::new(scalar, constant_array.len()).into_array())
} else {
let lhs = constant_array.clone().into_bool()?;
let rhs = other.clone().into_bool()?;

and(lhs.as_array_ref(), rhs.as_array_ref())
}
} else {
vortex_bail!("Boolean operations aren't supported on arrays of different lengths")
AdamGS marked this conversation as resolved.
Show resolved Hide resolved
}
}

#[cfg(test)]
mod test {
use crate::array::constant::ConstantArray;
Expand Down
13 changes: 6 additions & 7 deletions vortex-array/src/array/varbin/builder.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use std::mem;

use arrow_buffer::NullBufferBuilder;
use bytes::BytesMut;
use vortex_dtype::DType;
use vortex_dtype::NativePType;

Expand All @@ -11,7 +10,7 @@ use crate::IntoArray;

pub struct VarBinBuilder<O: NativePType> {
offsets: Vec<O>,
data: Vec<u8>,
data: BytesMut,
validity: NullBufferBuilder,
}

Expand All @@ -21,7 +20,7 @@ impl<O: NativePType> VarBinBuilder<O> {
offsets.push(O::zero());
Self {
offsets,
data: Vec::new(),
data: BytesMut::new(),
validity: NullBufferBuilder::new(len),
}
}
Expand All @@ -48,9 +47,9 @@ impl<O: NativePType> VarBinBuilder<O> {
self.validity.append_null();
}

pub fn finish(&mut self, dtype: DType) -> VarBinArray {
let offsets = PrimitiveArray::from(mem::take(&mut self.offsets));
let data = PrimitiveArray::from(mem::take(&mut self.data));
pub fn finish(mut self, dtype: DType) -> VarBinArray {
let offsets = PrimitiveArray::from(self.offsets);
let data = PrimitiveArray::from(Vec::from(self.data.freeze()));

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't zero copy is it? We can add a function to PrimitiveArray to construct directly from a buffer (for PType==u8?)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alternatively, @robert3005 did we discuss VarBin data being a buffer instead of a child array?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we thought about it but then we wanted to have something like ZstdEncoding

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I’m starting to think general purpose compression can be configured on buffers at write-time though; using the layouts mechanism.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not zero-copy, but still pretty cheap IMO. Constructing things from Bytes is hard because there's no guarantee the instance is exclusive

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But PrimitiveArray wraps a vortex-buffer, which itself wraps Bytes. So this copy is purely because the right API isn't exposed / isn't used

Copy link
Contributor Author

@AdamGS AdamGS Jul 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

true, fixing. Added a from_bytes function.

let nulls = self.validity.finish();

Expand Down
6 changes: 2 additions & 4 deletions vortex-array/src/canonical.rs
Original file line number Diff line number Diff line change
Expand Up @@ -215,10 +215,8 @@ fn varbin_to_arrow(varbin_array: VarBinArray) -> ArrayRef {
PType::I32 | PType::I64 => offsets,
// Unless it's u64, everything else can be converted into an i32.
// FIXME(ngates): do not copy offsets again
PType::U64 => try_cast(&offsets.to_array(), PType::I64.into())
.expect("cast to i64")
.into_primitive()
.expect("flatten_primitive"),
PType::U64 => offsets.reinterpret_cast(PType::I64),
PType::U32 => offsets.reinterpret_cast(PType::I32),
_ => try_cast(&offsets.to_array(), PType::I32.into())
.expect("cast to i32")
.into_primitive()
Expand Down
44 changes: 44 additions & 0 deletions vortex-array/src/compute/boolean.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
use arrow_array::cast::AsArray;
use vortex_error::VortexResult;

use crate::{arrow::FromArrowArray, Array, ArrayData, IntoArray, IntoArrayVariant, IntoCanonical};

/// Implemented by array encodings that can compute a boolean AND against another array
/// themselves, without going through the generic fallback in [`and`].
pub trait AndFn {
/// Computes the boolean AND of `self` and `array`, returning the result as a new array.
fn and(&self, array: &Array) -> VortexResult<Array>;
}

/// Implemented by array encodings that can compute a boolean OR against another array
/// themselves, without going through the generic fallback in [`or`].
pub trait OrFn {
/// Computes the boolean OR of `self` and `array`, returning the result as a new array.
fn or(&self, array: &Array) -> VortexResult<Array>;
}

pub fn and(lhs: &Array, rhs: &Array) -> VortexResult<Array> {
if let Some(selection) = lhs.with_dyn(|lhs| lhs.and().map(|lhs| lhs.and(rhs))) {
gatesn marked this conversation as resolved.
Show resolved Hide resolved
gatesn marked this conversation as resolved.
Show resolved Hide resolved
return selection;
}

let lhs = lhs.clone().into_bool()?.into_canonical()?.into_arrow();
AdamGS marked this conversation as resolved.
Show resolved Hide resolved
let lhs_bool = lhs.as_boolean();
let rhs = rhs.clone().into_bool()?.into_canonical()?.into_arrow();
let rhs_bool = rhs.as_boolean();

let data =
ArrayData::from_arrow(&arrow_arith::boolean::and(lhs_bool, rhs_bool)?, true).into_array();

Ok(data)
}

/// Computes the boolean OR of `lhs` and `rhs`.
///
/// If the encoding of `lhs` provides an [`OrFn`], that implementation is used. Otherwise both
/// inputs are converted to canonical bool arrays and combined with `arrow_arith::boolean::or`.
///
/// # Errors
///
/// Propagates any error from the encoding-specific implementation, from the bool/canonical
/// conversions, or from the Arrow kernel.
pub fn or(lhs: &Array, rhs: &Array) -> VortexResult<Array> {
    // Dispatch through `OrFn`; this previously went through `and()` / `AndFn::and`, which made
    // `or` return AND results for every encoding with a native AND implementation.
    if let Some(selection) = lhs.with_dyn(|lhs| lhs.or().map(|lhs| lhs.or(rhs))) {
        return selection;
    }

    let lhs = lhs.clone().into_bool()?.into_canonical()?.into_arrow();
    let lhs_bool = lhs.as_boolean();
    let rhs = rhs.clone().into_bool()?.into_canonical()?.into_arrow();
    let rhs_bool = rhs.as_boolean();

    let data =
        ArrayData::from_arrow(&arrow_arith::boolean::or(lhs_bool, rhs_bool)?, true).into_array();

    Ok(data)
}
18 changes: 17 additions & 1 deletion vortex-array/src/compute/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
//! implementations of these operators, else we will decode, and perform the equivalent operator
//! from Arrow.

use boolean::{AndFn, OrFn};
AdamGS marked this conversation as resolved.
Show resolved Hide resolved
pub use compare::{compare, CompareFn};
pub use filter::{filter, FilterFn};
pub use filter_indices::{filter_indices, FilterIndicesFn};
Expand All @@ -21,10 +22,11 @@ use unary::scalar_subtract::SubtractScalarFn;
mod compare;
mod filter;
mod filter_indices;
mod search_sorted;
mod slice;
mod take;

mod search_sorted;
pub mod boolean;
pub mod unary;

/// Trait providing compute functions on top of Vortex arrays.
Expand Down Expand Up @@ -98,4 +100,18 @@ pub trait ArrayCompute {
fn take(&self) -> Option<&dyn TakeFn> {
None
}

/// Perform a boolean AND operation over two arrays.
///
/// The default returns `None`, meaning the encoding has no specialized implementation.
///
/// See: [`AndFn`].
fn and(&self) -> Option<&dyn AndFn> {
None
}

/// Perform a boolean OR operation over two arrays.
///
/// The default returns `None`, meaning the encoding has no specialized implementation.
///
/// See: [`OrFn`].
fn or(&self) -> Option<&dyn OrFn> {
None
}
}
24 changes: 9 additions & 15 deletions vortex-datafusion/src/eval.rs
Original file line number Diff line number Diff line change
@@ -1,35 +1,29 @@
use datafusion_expr::{Expr, Operator as DFOperator};
use vortex::{
array::{bool::BoolArray, constant::ConstantArray},
array::constant::ConstantArray,
compute::boolean::{and, or},
compute::compare,
Array, IntoArray, IntoArrayVariant,
Array, IntoArray,
};
use vortex_error::{vortex_bail, vortex_err, VortexResult};
use vortex_expr::Operator;

use crate::can_be_pushed_down;

pub struct ExpressionEvaluator;

impl ExpressionEvaluator {
pub fn eval(array: Array, expr: &Expr) -> VortexResult<Array> {
debug_assert!(can_be_pushed_down(expr));

match expr {
Expr::BinaryExpr(expr) => {
let lhs = ExpressionEvaluator::eval(array.clone(), expr.left.as_ref())?;
let rhs = ExpressionEvaluator::eval(array, expr.right.as_ref())?;

// TODO(adamg): turn and/or into more general compute functions
match expr.op {
DFOperator::And => {
let lhs = lhs.into_bool()?;
let rhs = rhs.into_bool()?;
let buffer = &lhs.boolean_buffer() & &rhs.boolean_buffer();
Ok(BoolArray::from(buffer).into_array())
}
DFOperator::Or => {
let lhs = lhs.into_bool()?;
let rhs = rhs.into_bool()?;
let buffer = &lhs.boolean_buffer() | &rhs.boolean_buffer();
Ok(BoolArray::from(buffer).into_array())
}
DFOperator::And => and(&lhs, &rhs),
DFOperator::Or => or(&lhs, &rhs),
DFOperator::Eq => compare(&lhs, &rhs, Operator::Eq),
DFOperator::Gt => compare(&lhs, &rhs, Operator::Gt),
DFOperator::GtEq => compare(&lhs, &rhs, Operator::Gte),
Expand Down
11 changes: 9 additions & 2 deletions vortex-datafusion/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -299,9 +299,10 @@ fn make_filter_then_take_plan(

fn supported_data_types(dt: DataType) -> bool {
dt.is_integer()
|| dt.is_floating()
|| dt.is_signed_integer()
|| dt.is_floating()
|| dt.is_null()
|| dt == DataType::Boolean
|| dt == DataType::Binary
|| dt == DataType::Utf8
|| dt == DataType::Binary
Expand Down Expand Up @@ -474,7 +475,7 @@ mod test {
use datafusion::functions_aggregate::count::count_distinct;
use datafusion::prelude::SessionContext;
use datafusion_common::{Column, TableReference};
use datafusion_expr::{col, lit, BinaryExpr, Expr, Operator};
use datafusion_expr::{and, col, lit, BinaryExpr, Expr, Operator};
use vortex::array::primitive::PrimitiveArray;
use vortex::array::struct_::StructArray;
use vortex::array::varbin::VarBinArray;
Expand Down Expand Up @@ -618,4 +619,10 @@ mod test {

assert!(!can_be_pushed_down(&e));
}

/// A conjunction of an integer comparison and a boolean comparison must be pushable.
#[test]
fn test_can_be_pushed_down4() {
    let numeric_eq = col("a").eq(lit(2u64));
    let boolean_eq = col("b").eq(lit(true));
    let conjunction = and(numeric_eq, boolean_eq);

    assert!(can_be_pushed_down(&conjunction));
}
}
4 changes: 4 additions & 0 deletions vortex-dtype/src/dtype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ impl DType {
pub fn is_struct(&self) -> bool {
matches!(self, Struct(_, _))
}

/// Returns `true` if this dtype is the `Bool` variant, whatever its payload.
pub fn is_boolean(&self) -> bool {
matches!(self, Bool(_))
}
}

impl Display for DType {
Expand Down
Loading