From ae401e9d4cf1caf7fb0d40bb6b9857bb0da3fdd5 Mon Sep 17 00:00:00 2001 From: Josh Casale Date: Wed, 15 May 2024 16:22:36 +0100 Subject: [PATCH] Minimal expressions API for vortex (#318) Minimal set of expressions and operators for defining predicates over vortex arrays. This does **_not_** express recursive expression trees, but instead restricts the algebra to disjunction-of-conjunction (OR-of-AND) of field/value comparison operations. I tried to restrict this to a manageable set of expressions that we will definitely need so as to avoid bloat; the expectation here is that this may grow over time. Subsequent changes will introduce: - [ ] A pushdown API for vortex arrays - [ ] Array pushdown implementations for the various expressions Supersedes https://github.com/spiraldb/vortex/pull/308 --- Cargo.lock | 10 +++ Cargo.toml | 1 + vortex-dtype/src/dtype.rs | 3 +- vortex-expr/Cargo.toml | 28 +++++++++ vortex-expr/README.md | 6 ++ vortex-expr/src/display.rs | 98 +++++++++++++++++++++++++++++ vortex-expr/src/expressions.rs | 112 +++++++++++++++++++++++++++++++++ vortex-expr/src/lib.rs | 6 ++ vortex-expr/src/operators.rs | 35 +++++++++++ 9 files changed, 298 insertions(+), 1 deletion(-) create mode 100644 vortex-expr/Cargo.toml create mode 100644 vortex-expr/README.md create mode 100644 vortex-expr/src/display.rs create mode 100644 vortex-expr/src/expressions.rs create mode 100644 vortex-expr/src/lib.rs create mode 100644 vortex-expr/src/operators.rs diff --git a/Cargo.lock b/Cargo.lock index eab7e174fd..d0c1c520d1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5145,6 +5145,16 @@ dependencies = [ "worker", ] +[[package]] +name = "vortex-expr" +version = "0.1.0" +dependencies = [ + "serde", + "vortex-dtype", + "vortex-error", + "vortex-scalar", +] + [[package]] name = "vortex-fastlanes" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 86849aa003..6083f64720 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,7 @@ members = [ "vortex-datetime-parts", 
"vortex-dict", "vortex-error", + "vortex-expr", "vortex-fastlanes", "vortex-flatbuffers", "vortex-ipc", diff --git a/vortex-dtype/src/dtype.rs b/vortex-dtype/src/dtype.rs index c3befde2f8..a308020fa1 100644 --- a/vortex-dtype/src/dtype.rs +++ b/vortex-dtype/src/dtype.rs @@ -8,7 +8,8 @@ use DType::*; use crate::nullability::Nullability; use crate::{ExtDType, PType}; -pub type FieldNames = Arc<[Arc]>; +pub type FieldName = Arc; +pub type FieldNames = Arc<[FieldName]>; pub type Metadata = Vec; diff --git a/vortex-expr/Cargo.toml b/vortex-expr/Cargo.toml new file mode 100644 index 0000000000..874b0cb563 --- /dev/null +++ b/vortex-expr/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "vortex-expr" +version = { workspace = true } +description = "Vortex Expressions" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } +keywords = { workspace = true } +include = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } + +[lints] +workspace = true + +[dependencies] +vortex-dtype = { path = "../vortex-dtype" } +vortex-error = { path = "../vortex-error" } +vortex-scalar = { path = "../vortex-scalar" } +serde = { workspace = true, optional = true, features = ["derive"] } + + +[dev-dependencies] + + +[features] +serde = ["dep:serde", "vortex-dtype/serde", "vortex-scalar/serde"] \ No newline at end of file diff --git a/vortex-expr/README.md b/vortex-expr/README.md new file mode 100644 index 0000000000..4dcaec21d6 --- /dev/null +++ b/vortex-expr/README.md @@ -0,0 +1,6 @@ +# Vortex Expressions + +A crate defining serializable predicate expressions. Used predominantly for filter push-down. 
+ +Takes inspiration from postgres https://www.postgresql.org/docs/current/sql-expressions.html +and datafusion https://github.com/apache/datafusion/tree/5fac581efbaffd0e6a9edf931182517524526afd/datafusion/expr diff --git a/vortex-expr/src/display.rs b/vortex-expr/src/display.rs new file mode 100644 index 0000000000..78803b576e --- /dev/null +++ b/vortex-expr/src/display.rs @@ -0,0 +1,98 @@ +use core::fmt; +use std::fmt::{Display, Formatter}; + +use crate::expressions::{Conjunction, Disjunction, Predicate, Value}; +use crate::operators::Operator; + +impl Display for Disjunction { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + self.conjunctions + .iter() + .map(|v| format!("{}", v)) + .intersperse("\nOR \n".to_string()) + .try_for_each(|s| write!(f, "{}", s)) + } +} + +impl Display for Conjunction { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + self.predicates + .iter() + .map(|v| format!("{}", v)) + .intersperse(" AND ".to_string()) + .try_for_each(|s| write!(f, "{}", s)) + } +} + +impl Display for Predicate { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "({} {} {})", self.left, self.op, self.right) + } +} + +impl Display for Value { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + Value::Field(expr) => std::fmt::Display::fmt(expr, f), + Value::Literal(scalar) => scalar.fmt(f), + } + } +} + +impl Display for Operator { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + let display = match &self { + Operator::EqualTo => "=", + Operator::NotEqualTo => "!=", + Operator::GreaterThan => ">", + Operator::GreaterThanOrEqualTo => ">=", + Operator::LessThan => "<", + Operator::LessThanOrEqualTo => "<=", + }; + write!(f, "{display}") + } +} + +#[cfg(test)] +mod tests { + use crate::expressions::{lit, Conjunction, Disjunction}; + + #[test] + fn test_predicate_formatting() { + // And + assert_eq!(format!("{}", lit(1u32).lt(lit(2u32))), "(1 < 2)"); + // Or + assert_eq!(format!("{}", 
lit(1u32).gte(lit(2u32))), "(1 >= 2)"); + // Not + assert_eq!(format!("{}", !lit(1u32).lte(lit(2u32))), "(1 > 2)"); + } + + #[test] + fn test_dnf_formatting() { + let d1 = Conjunction { + predicates: vec![ + lit(1u32).lt(lit(2u32)), + lit(1u32).gte(lit(2u32)), + !lit(1u32).lte(lit(2u32)), + ], + }; + let d2 = Conjunction { + predicates: vec![ + lit(2u32).lt(lit(3u32)), + lit(3u32).gte(lit(4u32)), + !lit(5u32).lte(lit(6u32)), + ], + }; + + let dnf = Disjunction { + conjunctions: vec![d1, d2], + }; + + let string = format!("{}", dnf); + print!("{}", string); + assert_eq!( + string, + "(1 < 2) AND (1 >= 2) AND (1 > 2)\nOR \n(2 < 3) AND (3 >= 4) AND (5 > 6)" + ); + } +} diff --git a/vortex-expr/src/expressions.rs b/vortex-expr/src/expressions.rs new file mode 100644 index 0000000000..342a1a76f4 --- /dev/null +++ b/vortex-expr/src/expressions.rs @@ -0,0 +1,112 @@ +use vortex_dtype::FieldName; +use vortex_scalar::Scalar; + +use crate::expressions::Value::Field; +use crate::operators::Operator; + +#[cfg_attr( + feature = "serde", + derive(serde::Serialize, serde::Deserialize), + serde(transparent) +)] +pub struct Disjunction { + pub conjunctions: Vec, +} + +#[derive(Clone, Debug, PartialEq)] +#[cfg_attr( + feature = "serde", + derive(serde::Serialize, serde::Deserialize), + serde(transparent) +)] +pub struct Conjunction { + pub predicates: Vec, +} + +#[derive(Clone, Debug, PartialEq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub enum Value { + /// A named reference to a qualified field in a dtype. + Field(FieldName), + /// A constant scalar value. 
+ Literal(Scalar), +} + +impl Value { + pub fn field(field_name: impl Into) -> Value { + Field(field_name.into()) + } + // comparisons + pub fn eq(self, other: Value) -> Predicate { + Predicate { + left: self, + op: Operator::EqualTo, + right: other, + } + } + + pub fn not_eq(self, other: Value) -> Predicate { + Predicate { + left: self, + op: Operator::NotEqualTo, + right: other, + } + } + + pub fn gt(self, other: Value) -> Predicate { + Predicate { + left: self, + op: Operator::GreaterThan, + right: other, + } + } + + pub fn gte(self, other: Value) -> Predicate { + Predicate { + left: self, + op: Operator::GreaterThanOrEqualTo, + right: other, + } + } + + pub fn lt(self, other: Value) -> Predicate { + Predicate { + left: self, + op: Operator::LessThan, + right: other, + } + } + + pub fn lte(self, other: Value) -> Predicate { + Predicate { + left: self, + op: Operator::LessThanOrEqualTo, + right: other, + } + } +} + +#[derive(Clone, Debug, PartialEq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct Predicate { + pub left: Value, + pub op: Operator, + pub right: Value, +} + +pub fn lit>(n: T) -> Value { + Value::Literal(n.into()) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_lit() { + let scalar: Scalar = 1.into(); + let rhs: Value = lit(scalar); + let expr = Value::field("id").eq(rhs); + assert_eq!(format!("{}", expr), "(id = 1)"); + } +} diff --git a/vortex-expr/src/lib.rs b/vortex-expr/src/lib.rs new file mode 100644 index 0000000000..7856326d86 --- /dev/null +++ b/vortex-expr/src/lib.rs @@ -0,0 +1,6 @@ +#![feature(iter_intersperse)] +extern crate core; + +mod display; +pub mod expressions; +pub mod operators; diff --git a/vortex-expr/src/operators.rs b/vortex-expr/src/operators.rs new file mode 100644 index 0000000000..185a79a0c9 --- /dev/null +++ b/vortex-expr/src/operators.rs @@ -0,0 +1,35 @@ +use std::ops; + +use crate::expressions::Predicate; + +#[derive(Copy, Clone, Debug, Eq, PartialEq, 
PartialOrd)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub enum Operator { + // comparison + EqualTo, + NotEqualTo, + GreaterThan, + GreaterThanOrEqualTo, + LessThan, + LessThanOrEqualTo, +} + +impl ops::Not for Predicate { + type Output = Self; + + fn not(self) -> Self::Output { + let inverse_op = match self.op { + Operator::EqualTo => Operator::NotEqualTo, + Operator::NotEqualTo => Operator::EqualTo, + Operator::GreaterThan => Operator::LessThanOrEqualTo, + Operator::GreaterThanOrEqualTo => Operator::LessThan, + Operator::LessThan => Operator::GreaterThanOrEqualTo, + Operator::LessThanOrEqualTo => Operator::GreaterThan, + }; + Predicate { + left: self.left, + op: inverse_op, + right: self.right, + } + } +}