Skip to content

Commit

Permalink
Implement basic streaming validation
Browse files Browse the repository at this point in the history
Located in `simdutf8::basic::imp` with the public_imp feature flag
  • Loading branch information
hkratz authored May 14, 2021
1 parent 0c763e1 commit 5f020de
Show file tree
Hide file tree
Showing 9 changed files with 480 additions and 41 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ jobs:
profile: minimal
override: true
- name: Run tests
run: cargo test --no-default-features ${{ matrix.features }} --verbose
run: cargo test --no-default-features ${{ matrix.features }} --all-targets --verbose
env:
RUSTFLAGS: ${{ matrix.rustflags }}

Expand Down
9 changes: 4 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,15 +83,14 @@ For no-std support (compiled with `--no-default-features`) the implementation is
the targeted CPU. Use `RUSTFLAGS="-C target-feature=+avx2"` for the AVX 2 implementation or `RUSTFLAGS="-C target-feature=+sse4.2"`
for the SSE 4.2 implementation.

If you want to be able to call a SIMD implementation directly, use the `public_imp` feature flag. The validation
implementations are then accessible via `simdutf8::{basic, compat}::imp::x86::{avx2, sse42}::validate_utf8()`.

### ARM64
For ARM64 support Nightly Rust is needed and the crate feature `aarch64_neon` needs to be enabled. CAVE: If this feature is
not turned on, the non-SIMD std library implementation is used.

If you want to be able to call a SIMD implementation directly, use the `public_imp` feature flag. The validation implementations
are then accessible via `simdutf8::{basic, compat}::imp::aarch64::neon::validate_utf8()`.
### Access to low-level functionality

If you want to be able to call a SIMD implementation directly, use the `public_imp` feature flag. The validation implementations are then accessible in the `simdutf8::{basic, compat}::imp` hierarchy. Traits
facilitating streaming validation are available there as well.

## Optimisation flags
Do not use [`opt-level = "z"`](https://doc.rust-lang.org/cargo/reference/profiles.html), which prevents inlining and makes
Expand Down
36 changes: 36 additions & 0 deletions examples/streaming.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#[cfg(feature = "public_imp")]
use simdutf8::basic::imp::Utf8Validator;

#[allow(unused_imports)]
use std::io::{stdin, Read, Result};

/// Streams standard input through the AVX 2 UTF-8 validator in 8 KiB reads and
/// prints whether the complete input was valid UTF-8.
///
/// # Errors
/// Returns any I/O error reported while reading from standard input.
///
/// # Panics
/// Panics if the CPU does not support AVX 2.
#[cfg(feature = "public_imp")]
fn main() -> Result<()> {
    // The feature check itself is safe; only the validator calls below need `unsafe`.
    if !std::is_x86_feature_detected!("avx2") {
        panic!("This example only works with CPUs supporting AVX 2");
    }

    // SAFETY: AVX 2 availability was verified at runtime just above, which is the
    // CPU feature the `avx2` validator implementation requires.
    let is_valid = unsafe {
        let mut validator = simdutf8::basic::imp::x86::avx2::Utf8ValidatorImp::new();
        let mut buf = vec![0; 8192];
        loop {
            let bytes_read = stdin().read(&mut buf)?;
            if bytes_read == 0 {
                break;
            }
            // Only the first `bytes_read` bytes were filled by this read. The rest of
            // the buffer holds zeroes or stale data from an earlier read and must not
            // be fed to the validator.
            validator.update(&buf[..bytes_read]);
        }

        validator.finalize().is_ok()
    };

    if is_valid {
        println!("Input is valid UTF-8");
    } else {
        println!("Input is not valid UTF-8");
    }

    Ok(())
}

/// Dummy main. This example requires the crate feature `public_imp`.
///
/// Without that feature the `simdutf8::basic::imp` module is not compiled, so this
/// empty entry point only keeps the example building and does nothing when run.
#[cfg(not(feature = "public_imp"))]
fn main() {}
147 changes: 145 additions & 2 deletions src/basic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,27 +57,170 @@ pub fn from_utf8_mut(input: &mut [u8]) -> Result<&mut str, Utf8Error> {

/// Allows direct access to the platform-specific unsafe validation implementations.
#[cfg(feature = "public_imp")]
#[cfg_attr(docsrs, doc(cfg(feature = "public_imp")))]
pub mod imp {
/// A low-level interface for streaming validation of UTF-8 data. It is meant to be integrated
/// in high-performance data processing pipelines.
///
/// Data can be streamed in arbitrarily-sized chunks using the [`Self::update()`] method. There is
/// no way to find out if the input so far was valid UTF-8 during the validation. Only when
/// the validation is completed with the [`Self::finalize()`] method the result of the validation is
/// returned. Use [`ChunkedUtf8Validator`] if possible for highest performance.
///
/// This implementation requires CPU SIMD features specified by the module it resides in.
/// It is undefined behavior to call it if the required CPU features are not available which
/// is why all trait methods are `unsafe`.
///
/// General usage:
/// ```rust
/// use simdutf8::basic::imp::Utf8Validator;
/// use std::io::{stdin, Read, Result};
///
/// fn main() -> Result<()> {
/// unsafe {
/// if !std::is_x86_feature_detected!("avx2") {
/// panic!("This example only works with CPUs supporting AVX 2");
/// }
///
/// let mut validator = simdutf8::basic::imp::x86::avx2::Utf8ValidatorImp::new();
/// let mut buf = vec![0; 8192];
/// loop {
/// let bytes_read = stdin().read(buf.as_mut())?;
/// if bytes_read == 0 {
/// break;
/// }
/// // Only pass the bytes that were actually read, not the whole buffer.
/// validator.update(&buf[..bytes_read]);
/// }
///
/// if validator.finalize().is_ok() {
/// println!("Input is valid UTF-8");
/// } else {
/// println!("Input is not valid UTF-8");
/// }
/// }
///
/// Ok(())
/// }
/// ```
///
pub trait Utf8Validator {
/// Creates a new validator.
///
/// # Safety
/// This implementation requires CPU SIMD features specified by the module it resides in.
/// It is undefined behavior to call it if the required CPU features are not available.
#[must_use]
unsafe fn new() -> Self
where
Self: Sized;

/// Updates the validator with `input`. May be called any number of times with
/// arbitrarily-sized slices.
///
/// # Safety
/// This implementation requires CPU SIMD features specified by the module it resides in.
/// It is undefined behavior to call it if the required CPU features are not available.
unsafe fn update(&mut self, input: &[u8]);

/// Finishes the validation and returns `Ok(())` if the input was valid UTF-8.
///
/// Consumes the validator, so it cannot be updated any further afterwards.
///
/// # Errors
/// A [`crate::basic::Utf8Error`] is returned if the input was not valid UTF-8. No
/// further information about the location of the error is provided.
///
/// # Safety
/// This implementation requires CPU SIMD features specified by the module it resides in.
/// It is undefined behavior to call it if the required CPU features are not available.
unsafe fn finalize(self) -> core::result::Result<(), crate::basic::Utf8Error>;
}

/// Like [`Utf8Validator`] this low-level API is for streaming validation of UTF-8 data.
/// It has additional restrictions imposed on how the input is passed in to allow
/// validation with as little overhead as possible.
///
/// To feed it data you need to call the [`Self::update_from_chunks()`] method, which takes slices
/// whose length has to be a multiple of 64 bytes. The method will panic otherwise. There is
/// no way to find out if the input so far was valid UTF-8 during the validation. Only when
/// the validation is completed with the [`Self::finalize()`] method the result of the validation is
/// returned.
///
/// The [`Self::finalize()`] method can be fed the rest of the data. There is no restriction on the
/// data passed to it.
///
/// This implementation requires CPU SIMD features specified by the module it resides in.
/// It is undefined behavior to call it if the required CPU features are not available which
/// is why all trait methods are `unsafe`.
pub trait ChunkedUtf8Validator {
/// Creates a new validator.
///
/// # Safety
/// This implementation requires CPU SIMD features specified by the module it resides in.
/// It is undefined behavior to call it if the required CPU features are not available.
#[must_use]
unsafe fn new() -> Self
where
Self: Sized;

/// Updates the validator with `input`, whose length must be a multiple of 64 bytes.
///
/// # Panics
/// If `input.len()` is not a multiple of 64.
///
/// # Safety
/// This implementation requires CPU SIMD features specified by the module it resides in.
/// It is undefined behavior to call it if the required CPU features are not available.
unsafe fn update_from_chunks(&mut self, input: &[u8]);

/// Updates the validator with remaining input if any. There is no restriction on the
/// data provided. Pass `None` if there is no remaining input.
///
/// Finishes the validation and returns `Ok(())` if the input was valid UTF-8.
///
/// # Errors
/// A [`crate::basic::Utf8Error`] is returned if the input was not valid UTF-8. No
/// further information about the location of the error is provided.
///
/// # Safety
/// This implementation requires CPU SIMD features specified by the module it resides in.
/// It is undefined behavior to call it if the required CPU features are not available.
unsafe fn finalize(
self,
remaining_input: core::option::Option<&[u8]>,
) -> core::result::Result<(), crate::basic::Utf8Error>;
}

/// Includes the x86/x86-64 SIMD implementations.
///
/// Each submodule re-exports the one-shot `validate_utf8` function together with the
/// streaming validator types for one specific CPU feature level.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub mod x86 {
    /// Includes the validation implementation for AVX 2-compatible CPUs.
    ///
    /// Using the provided functionality on CPUs which do not support AVX 2 is undefined
    /// behavior and will very likely cause a crash.
    pub mod avx2 {
        pub use crate::implementation::x86::avx2::validate_utf8_basic as validate_utf8;
        pub use crate::implementation::x86::avx2::ChunkedUtf8ValidatorImp;
        pub use crate::implementation::x86::avx2::Utf8ValidatorImp;
    }
    /// Includes the validation implementation for SSE 4.2-compatible CPUs.
    ///
    /// Using the provided functionality on CPUs which do not support SSE 4.2 is undefined
    /// behavior and will very likely cause a crash.
    pub mod sse42 {
        pub use crate::implementation::x86::sse42::validate_utf8_basic as validate_utf8;
        pub use crate::implementation::x86::sse42::ChunkedUtf8ValidatorImp;
        pub use crate::implementation::x86::sse42::Utf8ValidatorImp;
    }
}

/// Includes the aarch64 SIMD implementations.
#[cfg(all(feature = "aarch64_neon", target_arch = "aarch64"))]
pub mod aarch64 {
/// Includes the Neon-based validation implementation for aarch64 CPUs.
///
/// Should be supported on all ARM64 CPUs. If it is not supported by the operating
/// system, using it is undefined behavior and will likely cause a crash.
pub mod neon {
pub use crate::implementation::aarch64::neon::validate_utf8_basic as validate_utf8;
pub use crate::implementation::aarch64::neon::ChunkedUtf8ValidatorImp;
pub use crate::implementation::aarch64::neon::Utf8ValidatorImp;
}
}
}
1 change: 0 additions & 1 deletion src/compat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,6 @@ pub fn from_utf8_mut(input: &mut [u8]) -> Result<&mut str, Utf8Error> {

/// Allows direct access to the platform-specific unsafe validation implementations.
#[cfg(feature = "public_imp")]
#[cfg_attr(docsrs, doc(cfg(feature = "public_imp")))]
pub mod imp {
/// Includes the x86/x86-64 SIMD implementations.
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64")))]
Expand Down
1 change: 1 addition & 0 deletions src/implementation/aarch64/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#[cfg(all(feature = "aarch64_neon", target_feature = "neon"))]
#[allow(dead_code)]
pub(crate) mod neon;

#[inline]
Expand Down
Loading

0 comments on commit 5f020de

Please sign in to comment.