From 9c0a7017b85cb43cdd161456d4f7be7b3e38463f Mon Sep 17 00:00:00 2001
From: sarah
Date: Sat, 24 Aug 2024 00:36:50 +0000
Subject: [PATCH] oops: restore book guide

---
 book/Cargo.toml     |   17 +
 book/brand-rust.svg |   57 +++
 book/dev_guide.typ  | 1034 +++++++++++++++++++++++++++++++++++++++++++
 book/user_guide.typ |  512 +++++++++++++++++++++
 4 files changed, 1620 insertions(+)
 create mode 100644 book/Cargo.toml
 create mode 100644 book/brand-rust.svg
 create mode 100644 book/dev_guide.typ
 create mode 100644 book/user_guide.typ

diff --git a/book/Cargo.toml b/book/Cargo.toml
new file mode 100644
index 00000000..334a8178
--- /dev/null
+++ b/book/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "faer-book"
+version = "0.0.0"
+edition = "2021"
+
+[dependencies]
+faer = "0.15.0"
+faer-core = "0.15.0"
+faer-cholesky = "0.15.0"
+faer-qr = "0.15.0"
+faer-lu = "0.15.0"
+faer-svd = "0.15.0"
+faer-evd = "0.15.0"
+
+[[bin]]
+name = "intro"
+path = "src/intro.rs"
diff --git a/book/brand-rust.svg b/book/brand-rust.svg
new file mode 100644
index 00000000..dd830c26
--- /dev/null
+++ b/book/brand-rust.svg
@@ -0,0 +1,57 @@
+
+
+
\ No newline at end of file
diff --git a/book/dev_guide.typ b/book/dev_guide.typ
new file mode 100644
index 00000000..131a0968
--- /dev/null
+++ b/book/dev_guide.typ
@@ -0,0 +1,1034 @@
+#set text(font: "New Computer Modern")
+
+#show raw: set text(font: "New Computer Modern Mono", size: 1.2em)
+
+#show par: set block(spacing: 0.55em)
+
+#show heading: set block(above: 1.4em, below: 1em)
+
+#show link: underline
+
+#set page(numbering: "1")
+
+#set par(leading: 0.55em, justify: true)
+
+#set heading(numbering: "1.1")
+
+#show heading.where(level: 1): it => pagebreak(weak:true) + block({
+  set text(font: "New Computer Modern", weight: "black")
+  v(2cm)
+  block(text(18pt)[Chapter #counter(heading).display()])
+  v(1cm)
+  block(text(22pt)[#it.body])
+  v(1cm)
+})
+
+#import "@preview/codly:0.1.0"
+#import "@preview/tablex:0.0.6": tablex, rowspanx, colspanx, gridx, hlinex, vlinex
+#import "@preview/colorful-boxes:1.2.0": colorbox
+
+#let icon(codepoint) = {
+  box(
+    height: 0.8em,
+    baseline: 0.05em,
+    image(codepoint)
+  )
+  h(0.1em)
+}
+
+#show: codly.codly-init.with()
+
+#codly.codly(
+  languages: (
+    rust: (name: "Rust", icon: icon("brand-rust.svg"), color: rgb("#CE412B")),
+  ),
+  breakable: false,
+  width-numbers: none,
+)
+
+#outline()
+
+== Introduction
+_`faer-rs`_ is a general-purpose linear algebra library for the Rust
+programming language, with a focus on correctness, portability, and
+performance.
+In this book, we'll be assuming version `0.16.0` of the library.
+
+_`faer`_ is designed around a high level API that sacrifices some amount of
+performance and customizability in exchange for ease of use, as well as a low
+level API that offers more control over memory allocations and multithreading
+capabilities. The two APIs share the same data structures and can be used
+together or separately, depending on the user's needs.
+
+This book assumes some level of familiarity with Rust, linear algebra and
+_`faer`_'s API.
+Users who are new to the library are encouraged to get started by taking a
+look at the user guide, the library's examples directory
+#footnote[`faer-rs/faer-libs/faer/examples`] and browsing the `docs.rs`
+documentation #footnote[https://docs.rs/faer/0.16.0/faer/index.html].
+
+We will go into detail over the various operations and matrix decompositions
+that are provided by the library, as well as their implementation details.
+We will also explain the architecture of _`faer`_'s data structures and how
+low level operations are handled using vectorized SIMD instructions.
+
+#pagebreak()
+
+= Data layout and the `Entity` trait
+
+In most linear algebra libraries, matrix data is stored contiguously in
+memory, regardless of the scalar type. This can be done in one of two ways:
+either a row-major layout or a column-major layout.
+
+Consider the matrix
+$ mat(
+  a_11, a_12;
+  a_21, a_22;
+  a_31, a_32;
+) $
+Storing it in row-major layout would place the values in memory in the
+following order:
+$ (
+  a_11, a_12,
+  a_21, a_22,
+  a_31, a_32
+), $
+while storing it in column-major order would place the values in memory in
+this order:
+$ (
+  a_11, a_21, a_31,
+  a_12, a_22, a_32
+). $
+
+_`faer`_, on the other hand, first splits each scalar into its atomic units,
+then stores each unit matrix separately in a contiguous fashion. The library
+does not mandate the usage of one layout or the other, but heavily prefers to
+receive data in column-major layout, with the notable exception of matrix
+multiplication, which we try to optimize for both column-major and row-major
+layouts.
+
+The way in which a scalar can be split is chosen by the scalar type itself.
+For example, a complex floating point type may choose to either be stored as
+one unit or as a group of two units.
+
+Given the following complex matrix:
+$ mat(
+  a_11 + i b_11, a_12 + i b_12;
+  a_21 + i b_21, a_22 + i b_22;
+  a_31 + i b_31, a_32 + i b_32;
+), $
+and assuming column-major layout, we can either choose the following storage
+scheme, in which the full number is considered a single unit:
+$ (
+  a_11, b_11, a_21, b_21, a_31, b_31,
+  a_12, b_12, a_22, b_22, a_32, b_32
+), $
+
+or the following scheme, in which the real and imaginary parts are considered
+two distinct units:
+$ (
+  a_11, a_21, a_31,
+  a_12, a_22, a_32
+),\
+(
+  b_11, b_21, b_31,
+  b_12, b_22, b_32
+). $
+
+The former is commonly referred to as AoS layout (array of structures), while
+the latter is called SoA (structure of arrays). The choice of which one to
+use depends on the context. As a general rule, types that are natively
+vectorizable (have direct CPU support for arithmetic operations) prefer to be
+laid out in AoS layout. On the other hand, types that do not have native
+vectorization support but can still be vectorized by combining more primitive
+operations prefer to be laid out in SoA layout.
+
+Types that are not vectorizable may be in either one, but the AoS layout is
+typically easier to work with in that scenario.
+
+== `Entity` trait
+The `Entity` trait determines how a type prefers to be stored in memory,
+through its associated type `Group`.
+
+Given some type `E` that implements `Entity`, we can manipulate groups of
+arbitrary types in a generic way.
+
+For example, `faer_core::GroupFor<E, E::Unit>` is an `E`-group of `E::Unit`,
+which can be thought of as a raw representation of `E`.
+
+Pre-existing data can be referred to using a reference to a slice or a raw
+pointer, for example `GroupFor<E, &[E::Unit]>` or
+`GroupFor<E, *const E::Unit>`.
+
+The `Entity` trait requires associated functions to convert from one
+`E`-group type to another. For example, we can take a reference to each
+element in a group with `E::faer_as_ref`, or `E::faer_as_mut`.
+
+```rust
+use faer_core::{Entity, GroupFor};
+
+fn value_to_unit_references<E: Entity>(value: E) {
+    let units: GroupFor<E, E::Unit> = value.faer_into_units();
+    let references: GroupFor<E, &E::Unit> = E::faer_as_ref(&units);
+}
+```
+
+We can map one group type to another using `E::faer_map`.
+```rust
+use faer_core::{Entity, GroupFor};
+
+fn slice_to_ptr<E: Entity>(
+    slice: GroupFor<E, &[E::Unit]>,
+) -> GroupFor<E, *const E::Unit> {
+    E::faer_map(slice, |slice| slice.as_ptr())
+}
+```
+
+We can also zip and unzip groups of values with `E::faer_zip` and
+`E::faer_unzip`.
+```rust
+use faer_core::{Entity, GroupFor};
+
+unsafe fn ptr_to_slice<'a, E: Entity>(
+    ptr: GroupFor<E, *const E::Unit>,
+    len: GroupFor<E, usize>,
+) -> GroupFor<E, &'a [E::Unit]> {
+    let zipped: GroupFor<E, (*const E::Unit, usize)> = E::faer_zip(ptr, len);
+    E::faer_map(zipped, |(ptr, len)| std::slice::from_raw_parts(ptr, len))
+}
+
+fn split_at<E: Entity>(
+    slice: GroupFor<E, &[E::Unit]>,
+    mid: usize,
+) -> (GroupFor<E, &[E::Unit]>, GroupFor<E, &[E::Unit]>) {
+    E::faer_unzip(E::faer_map(slice, |slice| slice.split_at(mid)))
+}
+```
+
+== Matrix layout
+Matrices in _`faer`_ fall into two broad categories with respect to layout:
+owned matrices (`Mat`), which are always stored in column-major layout, and
+matrix views (`MatRef`/`MatMut`), which allow any strided layout.
+
+Note that even though matrix views allow for any row and column stride, they
+are still typically optimized for column-major layout, since that happens to
+be the preferred layout for most matrix decompositions.
+
+Matrix views are roughly defined as:
+```rust
+struct MatRef<'a, E: Entity> {
+    ptr: GroupFor<E, *const E::Unit>,
+    nrows: usize,
+    ncols: usize,
+    row_stride: isize,
+    col_stride: isize,
+    __marker: PhantomData<&'a E>,
+}
+
+struct MatMut<'a, E: Entity> {
+    ptr: GroupFor<E, *mut E::Unit>,
+    nrows: usize,
+    ncols: usize,
+    row_stride: isize,
+    col_stride: isize,
+    __marker: PhantomData<&'a mut E>,
+}
+```
+
+The actual implementation is slightly different in order to allow `MatRef` to
+have `Copy` semantics, as well as make use of the fact that `ptr` is never
+null to allow for niche optimizations (such as `Option<MatRef<'_, E>>` having
+the same layout as `MatRef<'_, E>`).
+
+`ptr` is a group of non-null pointers to units, each pointing to a matrix
+with an underlying contiguous allocation. In other words, even though the
+data itself is strided, it has to have a contiguous underlying storage in
+order for pointer arithmetic to be valid.
+
+`nrows`, `ncols`, `row_stride` and `col_stride` are the matrix dimensions and
+strides, which must be the same for every unit matrix in the group.
+
+Finally, `__marker` imbues `MatRef` and `MatMut` with the correct variance.
+This allows `MatRef<'long_lifetime, E>` to be a subtype of
+`MatRef<'short_lifetime, E>`, which allows for better ergonomics.
+
+In addition to `Copy` semantics for `MatRef`, both `MatRef` and `MatMut`
+naturally provide `Move` semantics, as do most Rust types. On top of that,
+they also provide `Reborrow` semantics, which currently need to be used
+explicitly, unlike native references which are implicitly reborrowed.
+
+#pagebreak()
+
+Reborrowing is the act of temporarily borrowing a matrix view as another
+matrix view with a shorter lifetime. For example, given a `MatMut<'a, E>`, we
+would like to pass it to functions taking `MatMut<'_, E>` by value without
+having to consume our object. Unlike `MatRef<'a, E>`, this is not done
+automatically, as `MatMut` is not `Copy`.
+The solution is to mutably reborrow our `MatMut` object like this:
+```rust
+use faer_core::{Entity, MatMut, MatRef};
+use reborrow::*;
+
+fn function_taking_mat_ref<E: Entity>(mat: MatRef<'_, E>) {}
+fn function_taking_mat_mut<E: Entity>(mat: MatMut<'_, E>) {}
+
+fn mutable_reborrow_example<E: Entity>(mut mat: MatMut<'_, E>) {
+    function_taking_mat_mut(mat.rb_mut());
+    function_taking_mat_mut(mat.rb_mut());
+    function_taking_mat_ref(mat.rb());
+    function_taking_mat_ref(mat.rb());
+    function_taking_mat_mut(mat);
+
+    // does not compile, since `mat` was moved in the previous call
+    // function_taking_mat_mut(mat);
+}
+```
+
+Owned matrices on the other hand are roughly defined as:
+```rust
+struct Mat<E: Entity> {
+    ptr: GroupFor<E, *mut E::Unit>,
+    nrows: usize,
+    ncols: usize,
+    row_capacity: usize,
+    col_capacity: usize,
+    __marker: PhantomData<E>,
+}
+
+impl<E: Entity> Drop for Mat<E> {
+    fn drop(&mut self) {
+        // deallocate the storage
+    }
+}
+```
+Unlike matrix views, we don't need to explicitly store the strides. We know
+that the row stride is equal to `1`, since the layout is column-major, and
+the column stride is equal to `row_capacity`.
+
+We also have two new fields: `row_capacity` and `col_capacity`, which
+represent how much storage we have for resizing the matrix without having to
+reallocate.
+
+`Mat` can be converted to `MatRef` using `Mat::as_ref(&self)`, or to `MatMut`
+using `Mat::as_mut(&mut self)`.
+
+= Vector operations
+== Componentwise operations
+Componentwise operations are operations that take $n$ matrices with matching
+dimensions, producing an output of the same shape. Addition and subtraction
+are examples of commonly used componentwise operations.
+
+Componentwise operations can be expressed in _`faer`_ using the `zipped!`
+macro, followed by a call to `for_each` (for in-place iteration) or `map`
+(for producing an output value).
+
+```rust
+use faer_core::{zipped, unzipped, Mat, MatMut, MatRef};
+
+fn a_plus_3b(a: MatRef<'_, f64>, b: MatRef<'_, f64>) -> Mat<f64> {
+    zipped!(a, b).map(|unzipped!(a, b)| {
+        *a + 3.0 * *b
+    })
+}
+
+fn swap_a_b(a: MatMut<'_, f64>, b: MatMut<'_, f64>) {
+    zipped!(a, b).for_each(|unzipped!(mut a, mut b)| {
+        (*a, *b) = (*b, *a);
+    })
+}
+```
+
+`zipped!` calls can be more efficient than naive nested loops. The reason for
+this is that `zipped!` analyzes the layout of the input matrices in order to
+determine the optimal iteration order: for example, whether it should iterate
+over rows first, before columns, or whether the iteration should happen in
+reverse order (starting from the last row/column) instead of the forward
+order.
+
+Currently, `zipped!` determines the iteration order based on the preferred
+iteration order of the first matrix, but this may change in a future release.
+
+== Vectorized operations
+SIMD (Single Instruction, Multiple Data) refers to the usage of CPU
+instructions that take vectors of inputs, packed together in CPU registers,
+and perform the same operation on all of them. As an example, classic
+addition takes two scalars as inputs and produces one output, while SIMD
+addition could take two vectors, each containing 4 scalars, and add them
+componentwise, producing an output vector of 4 scalars. Correct SIMD usage is
+a crucial part of any linear algebra library, given that most linear algebra
+operations lend themselves well to vectorization.
+
+== SIMD with _`pulp`_
+
+_`faer`_ provides a common interface for generic and composable SIMD, using
+the _`pulp`_ crate as a backend.
+_`pulp`_'s high level API abstracts away the differences between various
+instruction sets and provides a common API that's generic over them (but not
+over the scalar type). This allows users to write a generic implementation
+that gets turned into several functions, one for each possible instruction
+set among a predetermined subset. Finally, the generic implementation can be
+used along with an `Arch` structure that determines the best implementation
+at runtime.
+
+Here's an example of how _`pulp`_ could be used to compute the expression
+$x + 2y - |z|$, and store it into an output vector.
+
+```rust
+use core::iter::zip;
+
+fn compute_expr(out: &mut [f64], x: &[f64], y: &[f64], z: &[f64]) {
+    struct Impl<'a> {
+        out: &'a mut [f64],
+        x: &'a [f64],
+        y: &'a [f64],
+        z: &'a [f64],
+    }
+
+    impl pulp::WithSimd for Impl<'_> {
+        type Output = ();
+
+        #[inline(always)]
+        fn with_simd<S: pulp::Simd>(self, simd: S) {
+            let Self { out, x, y, z } = self;
+
+            let (out_head, out_tail) = S::f64s_as_mut_simd(out);
+            let (x_head, x_tail) = S::f64s_as_simd(x);
+            let (y_head, y_tail) = S::f64s_as_simd(y);
+            let (z_head, z_tail) = S::f64s_as_simd(z);
+
+            let two = simd.f64s_splat(2.0);
+            for (out, (&x, (&y, &z))) in zip(
+                out_head,
+                zip(x_head, zip(y_head, z_head)),
+            ) {
+                *out = simd.f64s_add(
+                    x,
+                    simd.f64s_sub(simd.f64s_mul(two, y), simd.f64s_abs(z)),
+                );
+            }
+
+            for (out, (&x, (&y, &z))) in zip(
+                out_tail,
+                zip(x_tail, zip(y_tail, z_tail)),
+            ) {
+                *out = x + 2.0 * y - z.abs();
+            }
+        }
+    }
+
+    pulp::Arch::new().dispatch(Impl { out, x, y, z });
+}
+```
+
+There is a lot going on in this code example. Let us go over it step by step.
+
+_`pulp`_'s generic SIMD implementation happens through the `WithSimd` trait,
+which takes `self` by value to pass in the function parameters. It
+additionally provides another parameter to `with_simd` describing the
+instruction set being used. `WithSimd::with_simd` *must* be marked with the
+`#[inline(always)]` attribute. Forgetting to do so could lead to a
+significant performance drop.
+
+Inside the body of the function, we split up each of `out`, `x`, `y` and `z`
+into two parts using `S::f64s_as[_mut]_simd`. The first part (`head`) is a
+slice of `S::f64s`, representing the vectorizable part of the original slice.
+The second part (`tail`) contains the remainder that doesn't fit into a
+vector register.
+
+Handling the head section is done using vectorized operations. These
+currently need to take `simd` as a parameter, in order to guarantee its
+availability in a sound way. This is what allows the API to be safe. The tail
+section is handled using scalar operations.
+
+The final step is actually calling into our SIMD implementation. This is done
+by creating an instance of `pulp::Arch` that performs the runtime detection
+(and caches the result, so that future invocations are as fast as possible),
+then calling `Arch::dispatch`, which takes a type that implements `WithSimd`,
+and chooses the best SIMD implementation for it.
+
+=== Memory alignment
+
+Instead of splitting the input and output slices into two sections
+(vectorizable head + non-vectorizable tail), an alternative approach would be
+to split them up into three sections instead (vectorizable head +
+vectorizable body + vectorizable tail). This can be accomplished using masked
+loads and stores, which can speed things up if the slices are _similarly
+aligned_.
+
+Similarly aligned slices are slices which have the same base address modulo
+the byte size of the CPU's vector registers.
+The simplest way to guarantee this is to allocate the slices in aligned
+memory (such that the base address is a multiple of the register size in
+bytes), in which case the slices are similarly aligned, and any subslices of
+them (with a shared offset and size) will also be similarly aligned. Aligned
+allocation is done automatically for matrices in _`faer`_, which helps uphold
+these guarantees for maximum performance.
+
+Here's an example of how one might write an implementation that makes use of
+memory alignment, using _`pulp`_.
+
+```rust
+use core::iter::zip;
+use pulp::{Read, Write};
+
+#[inline(always)]
+fn compute_expr_register<S: pulp::Simd>(
+    simd: S,
+    mut out: impl Write<Output = S::f64s>,
+    x: impl Read<Output = S::f64s>,
+    y: impl Read<Output = S::f64s>,
+    z: impl Read<Output = S::f64s>,
+) {
+    let zero = simd.f64s_splat(0.0);
+    let x = x.read_or(zero);
+    let y = y.read_or(zero);
+    let z = z.read_or(zero);
+    let two = simd.f64s_splat(2.0);
+    out.write(simd.f64s_add(
+        x,
+        simd.f64s_sub(simd.f64s_mul(two, y), simd.f64s_abs(z)),
+    ));
+}
+
+impl pulp::WithSimd for Impl<'_> {
+    type Output = ();
+
+    #[inline(always)]
+    fn with_simd<S: pulp::Simd>(self, simd: S) {
+        let Self { out, x, y, z } = self;
+        let offset = simd.f64s_align_offset(out.as_ptr(), out.len());
+
+        let (out_head, out_body, out_tail) =
+            simd.f64s_as_aligned_mut_simd(out, offset);
+        let (x_head, x_body, x_tail) = simd.f64s_as_aligned_simd(x, offset);
+        let (y_head, y_body, y_tail) = simd.f64s_as_aligned_simd(y, offset);
+        let (z_head, z_body, z_tail) = simd.f64s_as_aligned_simd(z, offset);
+
+        compute_expr_register(simd, out_head, x_head, y_head, z_head);
+        for (out, (x, (y, z))) in zip(
+            out_body,
+            zip(x_body, zip(y_body, z_body)),
+        ) {
+            compute_expr_register(simd, out, x, y, z);
+        }
+        compute_expr_register(simd, out_tail, x_tail, y_tail, z_tail);
+    }
+}
+```
+
+_`faer`_ adds one more abstraction layer on top of _`pulp`_, in order to make
+the SIMD operations generic over the scalar type. This is done using the
+`faer_core::group_helpers::SimdFor<E, S>` struct, which is effectively a thin
+wrapper over `S` that only exposes operations specific to the type `E`.
+
+Here's how one might implement the previous operation for a generic real
+scalar type.
+
+```rust
+use faer_core::group_helpers::{SliceGroup, SliceGroupMut};
+use faer_core::RealField;
+
+struct Impl<'a, E: RealField> {
+    out: SliceGroupMut<'a, E>,
+    x: SliceGroup<'a, E>,
+    y: SliceGroup<'a, E>,
+    z: SliceGroup<'a, E>,
+}
+```
+
+`&[f64]` and `&mut [f64]` are replaced by `SliceGroup<'_, E>` and
+`SliceGroupMut<'_, E>`, to accommodate the fact that `E` might be an SoA type
+that wants to be decomposed into multiple units. Aside from that change, most
+of the code looks similar to what we had before.
+
+```rust
+use core::iter::zip;
+use faer_core::{RealField, SimdGroupFor, group_helpers::SimdFor};
+use pulp::{Read, Write};
+use reborrow::*;
+
+#[inline(always)]
+fn compute_expr_register<E: RealField, S: pulp::Simd>(
+    simd: SimdFor<E, S>,
+    mut out: impl Write<Output = SimdGroupFor<E, S>>,
+    x: impl Read<Output = SimdGroupFor<E, S>>,
+    y: impl Read<Output = SimdGroupFor<E, S>>,
+    z: impl Read<Output = SimdGroupFor<E, S>>,
+) {
+    let zero = simd.splat(E::faer_zero());
+    let two = simd.splat(E::faer_from_f64(2.0));
+    let x = x.read_or(zero);
+    let y = y.read_or(zero);
+    let z = z.read_or(zero);
+    out.write(simd.add(x, simd.sub(simd.mul(two, y), simd.abs(z))));
+}
+```
+
+```rust
+impl<E: RealField> pulp::WithSimd for Impl<'_, E> {
+    type Output = ();
+
+    #[inline(always)]
+    fn with_simd<S: pulp::Simd>(self, simd: S) {
+        let Self { out, x, y, z } = self;
+        let simd = SimdFor::<E, S>::new(simd);
+        let offset = simd.align_offset(out.rb());
+        let (out_head, out_body, out_tail) =
+            simd.as_aligned_simd_mut(out, offset);
+        let (x_head, x_body, x_tail) = simd.as_aligned_simd(x, offset);
+        let (y_head, y_body, y_tail) = simd.as_aligned_simd(y, offset);
+        let (z_head, z_body, z_tail) = simd.as_aligned_simd(z, offset);
+        compute_expr_register(simd, out_head, x_head, y_head, z_head);
+        for (out, (x, (y, z))) in zip(
+            out_body.into_mut_iter(),
+            zip(
+                x_body.into_ref_iter(),
+                zip(y_body.into_ref_iter(), z_body.into_ref_iter()),
+            ),
+        ) {
+            compute_expr_register(simd, out, x, y, z);
+        }
+        compute_expr_register(simd, out_tail, x_tail, y_tail, z_tail);
+    }
+}
+```
+
+== SIMD reductions
+The previous examples focused on _vertical_ operations, which compute the
+output of a componentwise operation and store the result in an output vector.
+Another interesting kind of operation is _horizontal_ ones, which accumulate
+the result of one or more vectors into one or more scalar values.
+
+One example of this is the dot product, which takes two vectors $a$ and $b$
+of size $n$ and computes $sum_(i = 1)^n a_i b_i$.
+
+One way to implement it would be like this:
+
+```rust
+use core::iter::zip;
+use faer_core::{RealField, SimdGroupFor};
+use faer_core::group_helpers::{SimdFor, SliceGroup};
+use pulp::Read;
+
+struct Impl<'a, E: RealField> {
+    a: SliceGroup<'a, E>,
+    b: SliceGroup<'a, E>,
+}
+
+#[inline(always)]
+fn dot_register<E: RealField, S: pulp::Simd>(
+    simd: SimdFor<E, S>,
+    acc: SimdGroupFor<E, S>,
+    a: impl Read<Output = SimdGroupFor<E, S>>,
+    b: impl Read<Output = SimdGroupFor<E, S>>,
+) -> SimdGroupFor<E, S> {
+    let zero = simd.splat(E::faer_zero());
+    let a = a.read_or(zero);
+    let b = b.read_or(zero);
+    simd.mul_add(a, b, acc)
+}
+
+impl<E: RealField> pulp::WithSimd for Impl<'_, E> {
+    type Output = E;
+
+    #[inline(always)]
+    fn with_simd<S: pulp::Simd>(self, simd: S) -> E {
+        let Self { a, b } = self;
+        let simd = SimdFor::<E, S>::new(simd);
+        let offset = simd.align_offset(a);
+
+        let (a_head, a_body, a_tail) = simd.as_aligned_simd(a, offset);
+        let (b_head, b_body, b_tail) = simd.as_aligned_simd(b, offset);
+
+        let mut acc = simd.splat(E::faer_zero());
+        acc = dot_register(simd, acc, a_head, b_head);
+        for (a, b) in zip(a_body.into_ref_iter(), b_body.into_ref_iter()) {
+            acc = dot_register(simd, acc, a, b);
+        }
+        acc = dot_register(simd, acc, a_tail, b_tail);
+
+        simd.reduce_add(simd.rotate_left(acc, offset.rotate_left_amount()))
+    }
+}
+```
+The code looks similar to what we've written before. An interesting addition
+is the use of `simd.rotate_left` in the last line. The reason for this is to
+make sure our computed reduction doesn't depend on the memory offset, which
+can help avoid variations in the output due to the non-associativity of
+floating point arithmetic.
+
+For example, suppose our register size is 4 elements, and we want to compute
+the dot product of 13 elements from each of $a$ and $b$.
+
+In the case where the memory is aligned, this is what the head, body and
+tail of $a$ and $b$ look like:
+
+$
+a_("head") &= (a_1, a_2, a_3, a_4),\
+b_("head") &= (b_1, b_2, b_3, b_4),\
+\
+a_("body") &= [(a_5, a_6, a_7, a_8), (a_9, a_10, a_11, a_12)],\
+b_("body") &= [(b_5, b_6, b_7, b_8), (b_9, b_10, b_11, b_12)],\
+\
+a_("tail") &= (a_13, 0, 0, 0),\
+b_("tail") &= (b_13, 0, 0, 0).\
+$
+
+Right before we perform the rotation, the accumulator contains the following
+result:
+$
+  "acc"_1 &= a_1 b_1 + a_5 b_5 + a_9 b_9 + a_13 b_13,\
+  "acc"_2 &= a_2 b_2 + a_6 b_6 + a_10 b_10,\
+  "acc"_3 &= a_3 b_3 + a_7 b_7 + a_11 b_11,\
+  "acc"_4 &= a_4 b_4 + a_8 b_8 + a_12 b_12.\
+$
+
+If we assume the reduction operation `simd.reduce_add` sums the elements
+sequentially, we get the final result:
+$
+  "acc"_"aligned" = &(a_1 b_1 + a_5 b_5 + a_9 b_9 + a_13 b_13)\
+  &+ (a_2 b_2 + a_6 b_6 + a_10 b_10)\
+  &+ (a_3 b_3 + a_7 b_7 + a_11 b_11)\
+  &+ (a_4 b_4 + a_8 b_8 + a_12 b_12).\
+$
+
+Now let's take a look at the case where the memory is unaligned, for example
+with an offset of 1. In this case $a$ and $b$ look like:
+
+$
+a'_("head") &= (0, a_1, a_2, a_3),\
+b'_("head") &= (0, b_1, b_2, b_3),\
+\
+a'_("body") &= [(a_4, a_5, a_6, a_7), (a_8, a_9, a_10, a_11)],\
+b'_("body") &= [(b_4, b_5, b_6, b_7), (b_8, b_9, b_10, b_11)],\
+\
+a'_("tail") &= (a_12, a_13, 0, 0),\
+b'_("tail") &= (b_12, b_13, 0, 0).\
+$
+
+Right before we perform the rotation, the accumulator contains the following
+result:
+$
+  "acc'"_1 &= a_4 b_4 + a_8 b_8 + a_12 b_12,\
+  "acc'"_2 &= a_1 b_1 + a_5 b_5 + a_9 b_9 + a_13 b_13,\
+  "acc'"_3 &= a_2 b_2 + a_6 b_6 + a_10 b_10,\
+  "acc'"_4 &= a_3 b_3 + a_7 b_7 + a_11 b_11.\
+$
+
+If we use `simd.reduce_add` directly, without going through
+`simd.rotate_left` first, we get this result:
+
+$
+  "result"_("unaligned"(1)) = &(a_4 b_4 + a_8 b_8 + a_12 b_12)\
+  +&(a_1 b_1 + a_5 b_5 + a_9 b_9 + a_13 b_13)\
+  +&(a_2 b_2 + a_6 b_6 + a_10 b_10)\
+  +&(a_3 b_3 + a_7 b_7 + a_11 b_11).\
+$
+
+Mathematically, the result is equivalent, but since floating point operations
+round the result, we would get slightly different results for the aligned and
+unaligned cases.
+
+Our solution is to first rotate the accumulator to the left by the alignment
+offset. Doing this right before the reduction gives us the rotated
+accumulator:
+$
+  "rotate"_1 &=& "acc'"_2 &= a_1 b_1 + a_5 b_5 + a_9 b_9 + a_13 b_13 &&= "acc"_1,\
+  "rotate"_2 &=& "acc'"_3 &= a_2 b_2 + a_6 b_6 + a_10 b_10 &&= "acc"_2,\
+  "rotate"_3 &=& "acc'"_4 &= a_3 b_3 + a_7 b_7 + a_11 b_11 &&= "acc"_3,\
+  "rotate"_4 &=& "acc'"_1 &= a_4 b_4 + a_8 b_8 + a_12 b_12 &&= "acc"_4.\
+$
+
+Summing these sequentially then gives us exactly the same result as
+$"acc"_"aligned"$.
+
+#pagebreak()
+
+= Matrix multiplication
+In this section we will give a detailed overview of the techniques used to
+speed up matrix multiplication in _`faer`_. The approach we use is a
+reimplementation of BLIS's matrix multiplication algorithm, with some
+modifications.
+
+Consider three matrices $A$, $B$ and $C$, such that we want to perform the
+operation
+$ C "+=" A B. $
+
+We can chunk $A$, $B$ and $C$ in a way that is compatible with matrix
+multiplication:
+$
+A = mat(
+  A_11    , A_12    , ...      , A_(1 k);
+  A_21    , A_22    , ...      , A_(2 k);
+  dots.v  , dots.v  , dots.down, dots.v ;
+  A_(m 1) , A_(m 2) , ...      , A_(m k);
+),\
+B = mat(
+  B_11    , B_12    , ...      , B_(1 n);
+  B_21    , B_22    , ...      , B_(2 n);
+  dots.v  , dots.v  , dots.down, dots.v ;
+  B_(k 1) , B_(k 2) , ...      , B_(k n);
+),\
+C = mat(
+  C_11    , C_12    , ...      , C_(1 n);
+  C_21    , C_22    , ...      , C_(2 n);
+  dots.v  , dots.v  , dots.down, dots.v ;
+  C_(m 1) , C_(m 2) , ...      , C_(m n);
+).
+$
+
+Then the $C "+=" A B$ operation may be decomposed into:
+#set math.mat(delim: none, column-gap: 2.0em)
+$
+mat(
+  C_(1 1) "+=" sum_(p = 1)^(k) A_(1 p) B_(p 1), C_(1 2) "+=" sum_(p = 1)^(k) A_(1 p) B_(p 2), ..., C_(1 n) "+=" sum_(p = 1)^(k) A_(1 p) B_(p n);
+  C_(2 1) "+=" sum_(p = 1)^(k) A_(2 p) B_(p 1), C_(2 2) "+=" sum_(p = 1)^(k) A_(2 p) B_(p 2), ..., C_(2 n) "+=" sum_(p = 1)^(k) A_(2 p) B_(p n);
+  dots.v, dots.v, dots.down, dots.v;
+  C_(m 1) "+=" sum_(p = 1)^(k) A_(m p) B_(p 1), C_(m 2) "+=" sum_(p = 1)^(k) A_(m p) B_(p 2), ..., C_(m n) "+=" sum_(p = 1)^(k) A_(m p) B_(p n);
+).
+$
+
+#set math.mat(delim: "(", column-gap: 0.5em)
+
+Doing so does not decrease the number of flops (floating point operations),
+but this restructuring step can lead to a large speedup if done correctly, by
+making use of cache locality on modern CPUs/GPUs.
+
+The general idea revolves around memory reuse. For now, let us consider the
+case of a single thread executing the entire operation. The multithreaded
+case can be handled with a few adjustments.
+
+ - The algorithm we use computes $C$ sequentially by column blocks. In other
+   words, we first compute $C_(: 1)$, then $C_(: 2)$, and so on.
+ - For each column block $j$, we sequentially iterate over $p$, computing all
+   the terms $A_(: p) B_(p j)$, and accumulate them into the output.
+ - Then for each row block $i$, we compute $A_(i p) B_(p j)$ and accumulate
+   it into $C_(i j)$.
+
+Since most modern CPUs have a hierarchical cache structure (usually ranging
+from L1 (smallest) to L3 (largest)), we would like to make use of this in our
+algorithm for maximum efficiency.
+
+The way we exploit this is by choosing the chunk dimensions so that $B_(p j)$
+remains in the L3 cache during each iteration of the second loop, and
+$A_(i p)$ remains in the L2 cache during each iteration of the third loop.
+This leaves us with one more cache level to use: the L1 cache.
+
+We make the most of it by chunking the product once again, resulting in two
+more loop levels:
+$
+A_(i p) = mat(
+  A'_(1 p) ;
+  A'_(2 p) ;
+  dots.v   ;
+  A'_(m' p);
+),\
+B_(p j) = mat(
+  B'_(p 1) , B'_(p 2) , ... , B'_(p n');
+),\
+C_(i j) = mat(
+  C'_(1 1)  , C'_(1 2)  , ...      , C'_(1 n') ;
+  C'_(2 1)  , C'_(2 2)  , ...      , C'_(2 n') ;
+  dots.v    , dots.v    , dots.down, dots.v    ;
+  C'_(m' 1) , C'_(m' 2) , ...      , C'_(m' n');
+).
+$
+
+We iterate over each column block $B'_(p j')$, then over each row block
+$A'_(i' p)$, and accumulate the product $A'_(i' p) B'_(p j')$ into
+$C'_(i' j')$.
+
+In the outer loop, $B'_(p j')$ is brought from the L3 cache into the L1
+cache, and stays there until the outer loop iteration is done.
+Then in the inner loop, it is brought once again from the L1 cache into
+registers, while $A'_(i' p)$ is brought from the L2 cache into registers,
+so that the computation can be performed.
+
+This last step is done using a vectorized microkernel, which heavily uses
+SIMD instructions to maximize efficiency.
+The number of registers limits the size of the microkernel.
+
+During each iteration $p'$ of the microkernel, we can bring in $m_r$ elements
+from $A'_(i' p')$, and $n_r$ elements from $B'_(p' j')$, where $m_r$ and
+$n_r$ are the dimensions of the microkernel (as well as the dimensions of
+each block $C'_(i' j')$).
We use the following algorithm:
+
+ - Load one element from $B'$,
+ - Load $m_r$ elements from $A'$,
+ - Multiply the $m_r$ elements from $A'$ by the element from $B'$, and
+   accumulate the result into $C'$.
+
+Consider x86-64 as an example, with the AVX2+FMA instruction set (256-bit
+registers), and suppose the scalar type has a size of 64 bits, so that each
+register can hold $N = 256/64 = 4$ scalars. We have 16 available registers in
+total; this means we can load one element from $B'$ into one register, and
+$m_r$ elements from $A'$ into $m_r / N$ registers.
+
+Since we don't want to constantly read and write $C'$ to and from main
+memory, we use a local accumulator that occupies $(m_r / N) n_r$ registers.
+
+In total, we then need $1 + m_r / N + (m_r / N) n_r$ registers, which must
+not exceed the 16 available ones. A good choice for our case is $m_r = 3N$,
+$n_r = 4$, which requires exactly 16 registers.
+
+To determine the chunk sizes, we compute them starting from the innermost
+loop to the outermost loop.
+
+Given that we've already computed $m_r$ and $n_r$, we determine $k_c$ (the
+number of columns of $A_(i p)$, and also the number of rows of $B_(p j)$) so
+that $B'_(p j')$ fits into the L1 cache. Then we determine $m_c$ (the number
+of rows of $A_(i p)$) so that $A_(i p)$ fits into the L2 cache. And finally,
+we determine $n_c$ (the number of columns of $B_(p j)$) so that $B_(: j)$
+fits into the L3 cache.
+
+Note that bringing data into the cache is typically done automatically by the
+CPU. However, in our case, we want to perform that explicitly, by storing
+each L3/L2 chunk into packed storage, which allows for contiguous access
+that's friendly to the CPU's hardware prefetcher and minimizes TLB
+(Translation Lookaside Buffer) misses.
+
+In order to parallelize the algorithm, we have a few options. The second loop
+can't easily be parallelized without allocating extra storage for the
+accumulators, since its iterations carry a data dependency. The microkernel
+also doesn't perform enough work to compensate for the overhead of core
+synchronization, so it too makes for a poor multithreading candidate.
+
+This leaves us with the first loop, as well as the third, fourth and fifth
+loops. The first loop stores its data in the L3 cache, which is typically
+shared between cores, so it's not usually a very attractive candidate for
+parallelization.
+
+Loops three through five, however, can be parallelized in a straightforward
+way, and make good use of each core's separate L1 (and often L2) cache, which
+leads to a significant speedup.
+
+Since data is packed explicitly during matrix multiplication, the original
+layout of the input matrices has little effect on efficiency when the
+dimensions are medium or large. This has the side-effect of making matrix
+multiplication highly efficient regardless of the matrix layout.
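+
+To summarize, here is a hypothetical scalar sketch of the blocking structure
+described above, with the packing and the SIMD microkernel omitted for
+clarity (the loop numbering matches the earlier discussion):
+
+```rust
+// C: m×n, A: m×k, B: k×n, all column-major.
+// (mc, nc, kc) are the chunk sizes tuned for the L2/L3/L1 caches.
+fn gemm_blocked_sketch(
+    c: &mut [f64],
+    a: &[f64],
+    b: &[f64],
+    (m, n, k): (usize, usize, usize),
+    (mc, nc, kc): (usize, usize, usize),
+) {
+    for jc in (0..n).step_by(nc) {         // loop 1: column blocks of C
+        for pc in (0..k).step_by(kc) {     // loop 2: accumulation over p
+            for ic in (0..m).step_by(mc) { // loop 3: row blocks of A
+                // loops 4 and 5 + microkernel, as plain scalar loops
+                for j in jc..(jc + nc).min(n) {
+                    for p in pc..(pc + kc).min(k) {
+                        let b_pj = b[p + j * k];
+                        for i in ic..(ic + mc).min(m) {
+                            c[i + j * m] += a[i + p * m] * b_pj;
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+```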
+
+== Special cases
+In this section, we will refer to matrix multiplication with dimensions
+$(m, k) × (k, n)$ by the shorthand $(m, n, k)$.
+
+For special common matrix dimensions, we do not usually want to go through
+the aforementioned strategy, because the packing and unpacking steps, as well
+as the microkernel indirection, can add considerable overhead.
+
+Such cases include:
+- inner product: $(1, 1, k)$,
+- outer product: $(m, n, 1)$,
+- matrix-vector: $(m, 1, k)$,
+- vector-matrix: $(1, n, k)$, which can be rewritten in terms of
+  matrix-vector by transposing, if we assume that the scalar multiplication
+  is commutative (which _`faer`_ generally does), since
+  $C "+=" A B <=> C^top "+=" B^top A^top$.
+
+The $(1, 1, k)$ case can be optimized for when $A$ is row-major and $B$ is
+column-major, and is written similarly to our previous dot product example,
+with one difference: instead of using one accumulator for the result, we use
+multiple accumulators, which are then summed together at the end (see the
+sketch at the end of this section). This can speed up the computation by
+making use of instruction level parallelism, since each accumulator can be
+computed independently from the others.
+
+For the $(m, n, 1)$ case, we assume $C$ is column-major. If it is row-major,
+we can implicitly transpose the matrix multiply operation. The algorithm we
+use consists of computing $C$ column by column, which is equivalent to
+$C_(: j) "+=" A b_(1 j)$. This can be vectorized as a vertical operation if
+$A$ is column-major. If it is not, we can store it to contiguous temporary
+storage before performing the computation. Note that this is a relatively
+cheap operation, since its dimensions are $(m, 1)$, which is usually much
+smaller than the size of $C$: $(m, n)$.
+
+For the $(m, 1, k)$ case, there are two interesting cases. The first one is
+when $A$ is column-major. In this case we assume $C$ is column-major
+(otherwise, we can compute the result in a temporary vector and accumulate it
+into $C$ afterwards). For each column of $A$, we multiply it by the
+corresponding element of $B$ and accumulate it into $C$. The inner kernel for
+this operation is $C "+=" A_(: k) b_(k 1)$, which is essentially the same as
+the one from the outer product.
+
+When $A$ is row-major, we assume $B$ is column-major, and compute
+$C_(i 1) "+=" A_(i :) B_(: 1)$, which uses the same kernel as the
+$(1, 1, k)$ case.
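+
+As an illustration of the multiple-accumulator technique mentioned above for
+the $(1, 1, k)$ case, here is a hypothetical scalar sketch (the actual kernel
+uses SIMD registers rather than scalars):
+
+```rust
+fn dot_multi_acc(a: &[f64], b: &[f64]) -> f64 {
+    // four independent partial sums: each accumulator depends only on
+    // itself, so the CPU can overlap the four multiply-add chains
+    let mut acc = [0.0_f64; 4];
+
+    let mut a_chunks = a.chunks_exact(4);
+    let mut b_chunks = b.chunks_exact(4);
+    for (ca, cb) in (&mut a_chunks).zip(&mut b_chunks) {
+        for i in 0..4 {
+            acc[i] += ca[i] * cb[i];
+        }
+    }
+
+    // handle the remainder, then combine the partial sums at the end
+    let mut tail = 0.0;
+    for (&x, &y) in a_chunks.remainder().iter().zip(b_chunks.remainder()) {
+        tail += x * y;
+    }
+    (acc[0] + acc[1]) + (acc[2] + acc[3]) + tail
+}
+```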
+
+== Triangular matrix products
+In some cases, one of the matrices $A$ and $B$ is triangular (with a possibly
+implicit zero or one diagonal), or we only want to compute the lower or upper
+half of the output. _`faer`_ currently uses recursive implementations that
+are padded and handled as rectangular matrix multiplication in the base case.
+For example, we may want to compute $A B$ where $A$ is lower triangular and
+$B$ is upper triangular:
+
+$
+mat(
+  C_(1 1), C_(1 2);
+  C_(2 1), C_(2 2);
+) "+="
+mat(
+  A_(1 1), 0;
+  A_(2 1), A_(2 2);
+)
+mat(
+  B_(1 1), B_(1 2);
+  0      , B_(2 2);
+)
+$
+
+This can be split up into a sequence of products:
+
+$
+C_(1 1) & "+=" A_(1 1) B_(1 1),\
+C_(1 2) & "+=" A_(1 1) B_(1 2),\
+C_(2 1) & "+=" A_(2 1) B_(1 1),\
+C_(2 2) & "+=" A_(2 1) B_(1 2) + A_(2 2) B_(2 2).
+$
+
+The steps $C_(1 1) "+=" A_(1 1) B_(1 1)$ and $C_(2 2) "+=" A_(2 2) B_(2 2)$
+are also matrix products where the LHS is lower triangular and the RHS is
+upper triangular, so we call the algorithm recursively for them.
+
+All of these products can be performed either sequentially or in parallel.
+In the parallel case, we group them as follows to avoid load imbalances
+between threads.
+
+#set align(center)
+#gridx(
+  columns: (auto, auto),
+  align: center,
+  "Thread 1", vlinex(), "Thread 2",
+  $
+  C_(1 1) & "+=" A_(1 1) B_(1 1)\
+  C_(2 2) & "+=" A_(2 1) B_(1 2) + A_(2 2) B_(2 2)
+  $, (),
+  $
+  C_(1 2) & "+=" A_(1 1) B_(1 2)\
+  C_(2 1) & "+=" A_(2 1) B_(1 1)\
+  $,
+)
+#set align(left)
+
+This way each thread performs roughly the same number of flops, which helps
+avoid idling threads that spend time waiting for the others to finish.
+Every time we recurse into another triangular matrix multiplication, we can
+split up the work again, and if we perform a rectangular matrix
+multiplication, we can rely on its inherent parallelism.
+
+This is one of the scenarios where _`faer`_'s fine control over
+multithreading shines, as we can provide a hint for each nested operation
+that it doesn't need to use all the available cores, which can reduce
+synchronization overhead and conflict over shared resources by the different
+threads.
+
+#colorbox(
+  title: "PERF",
+  color: "blue",
+  radius: 2pt,
+  width: auto
+)[
+  The current strategy doesn't take advantage of the CPU's cache hierarchy
+  for deciding how to split up the work between threads. This could lead to
+  multiple threads contending over the L3 cache for very large matrices.
+
+  In that case it could be worth investigating whether it's better to only
+  use triangular threading when the sizes fall below a certain threshold.
+]
+
+#colorbox(
+  title: "PERF",
+  color: "blue",
+  radius: 2pt,
+  width: auto
+)[
+  Add specialized implementations for specific dimensions, similar to what
+  is done for the rectangular matrix multiply. For example, a lower
+  triangular matrix times a column vector.
+]
+
+== Triangular matrix solve
+Solving systems of the form $A X = B$ (where $A$ is a triangular matrix) is
+another primitive with a recursive implementation that relies on matrix
+multiplication. For simplicity, we assume $A$ is lower triangular. The cases
+where $A$ is unit lower triangular or [unit] upper triangular are handled
+similarly.
+
+If we decompose $A$, $B$ and $X$ as follows:
+$
+A = mat(A_(1 1), ; A_(2 1), A_(2 2)),
+B = mat(B_1; B_2),
+X = mat(X_1; X_2),
+$
+then the system can be reformulated as
+$
+A_(1 1) X_1 &= B_1,\
+A_(2 2) X_2 &= B_2 - A_(2 1) X_1.
+$
+This system can be solved sequentially, by first solving the first equation,
+then substituting its solution into the second equation. Moreover, the
+system can be solved in place, by taking an inout parameter that contains $B$
+on input, and $X$ on output.
+
+Once the recursion reaches a certain threshold, we fall back to a sequential
+implementation to avoid the recursion overhead.
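+
+To make the recursion concrete, here is a minimal sketch of this scheme for
+`f64` matrices. This is not the actual _`faer`_ implementation: the base-case
+threshold of 16 and the helper name are illustrative, and the base case uses
+naive forward substitution rather than an optimized kernel.
+
+```rust
+use faer_core::mul::matmul;
+use faer_core::{MatMut, MatRef, Parallelism};
+use reborrow::*;
+
+// Solves A X = B in place, where `a` is lower triangular:
+// `rhs` contains B on input, and X on output.
+fn solve_lower_triangular_sketch(a: MatRef<'_, f64>, mut rhs: MatMut<'_, f64>) {
+    let n = a.nrows();
+    if n <= 16 {
+        // base case: sequential forward substitution
+        for j in 0..rhs.ncols() {
+            for i in 0..n {
+                let mut x = rhs.read(i, j);
+                for k in 0..i {
+                    x -= a.read(i, k) * rhs.read(k, j);
+                }
+                rhs.write(i, j, x / a.read(i, i));
+            }
+        }
+        return;
+    }
+
+    let mid = n / 2;
+    let (a11, _, a21, a22) = a.split_at(mid, mid);
+    let (mut x1, mut x2) = rhs.split_at_row_mut(mid);
+
+    // A11 X1 = B1
+    solve_lower_triangular_sketch(a11, x1.rb_mut());
+    // B2 <- B2 - A21 X1
+    matmul(x2.rb_mut(), a21, x1.rb(), Some(1.0), -1.0, Parallelism::None);
+    // A22 X2 = B2
+    solve_lower_triangular_sketch(a22, x2);
+}
+```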
diff --git a/book/user_guide.typ b/book/user_guide.typ
new file mode 100644
index 00000000..2aaacaf2
--- /dev/null
+++ b/book/user_guide.typ
@@ -0,0 +1,512 @@
+#set text(font: "New Computer Modern")
+
+#show raw: set text(font: "New Computer Modern Mono", size: 1.2em)
+
+#show par: set block(spacing: 0.55em)
+
+#show heading: set block(above: 1.4em, below: 1em)
+
+#show link: underline
+
+#set page(numbering: "1")
+
+#set par(leading: 0.55em, justify: true)
+
+#set heading(numbering: "1.1")
+
+#show heading.where(level: 1): it => pagebreak(weak:true) + block({
+  set text(font: "New Computer Modern", weight: "black")
+  v(2cm)
+  block(text(18pt)[Chapter #counter(heading).display()])
+  v(1cm)
+  block(text(22pt)[#it.body])
+  v(1cm)
+})
+
+#import "@preview/codly:0.1.0"
+#import "@preview/tablex:0.0.6": tablex, rowspanx, colspanx, gridx, hlinex, vlinex
+#import "@preview/colorful-boxes:1.2.0": colorbox
+
+#let icon(codepoint) = {
+  box(
+    height: 0.8em,
+    baseline: 0.05em,
+    image(codepoint)
+  )
+  h(0.1em)
+}
+
+#show: codly.codly-init.with()
+
+#codly.codly(
+  languages: (
+    rust: (name: "Rust", icon: icon("brand-rust.svg"), color: rgb("#CE412B")),
+  ),
+  breakable: false,
+  width-numbers: none,
+)
+
+#outline()
+
+== Introduction
+_`faer-rs`_ is a general-purpose linear algebra library for the Rust
+programming language, with a focus on correctness, portability, and
+performance.
+In this book, we'll be assuming version `0.16.0` of the library.
+
+A matrix is a 2-dimensional array of numerical values, which can represent
+different things depending on the context. In the context of linear algebra,
+it is often used to represent a linear transformation mapping vectors from
+one finite-dimensional space to another.
+
+Column vectors are typically elements of the vector space, but may also be
+used interchangeably with $n×1$ matrices. Row vectors are similarly used
+interchangeably with $1×n$ matrices.
+
+= Dense linear algebra
+== Creating a matrix
+_`faer`_ provides several ways to create matrices and matrix views.
+
+The main matrix types are `faer_core::Mat`, `faer_core::MatRef` and
+`faer_core::MatMut`, which can be thought of as being analogous to `Vec`,
+`&[_]` and `&mut [_]`.
+
+The most flexible way to initialize a matrix is to initialize a zero matrix,
+then fill out the values by hand.
+
+```rust
+let mut a = Mat::<f64>::zeros(4, 3);
+
+for j in 0..a.ncols() {
+    for i in 0..a.nrows() {
+        a[(i, j)] = 9.0;
+    }
+}
+```
+
+Given a callable object that outputs the matrix elements, `Mat::from_fn` can
+also be used.
+
+```rust
+let a = Mat::from_fn(3, 4, |i, j| (i + j) as f64);
+```
+
+For common matrices such as the zero matrix and the identity matrix,
+shorthands are provided.
+
+```rust
+use faer_core::Mat;
+
+// creates a 10×4 matrix whose values are all `0.0`.
+let a = Mat::<f64>::zeros(10, 4);
+
+// creates a 5×4 matrix containing `0.0`, except on the main diagonal,
+// which contains `1.0` instead.
+let a = Mat::<f64>::identity(5, 4);
+```
+#colorbox(
+  title: "Note",
+  color: "green",
+  radius: 2pt,
+  width: auto
+)[
+  In some cases, users may wish to avoid the cost of initializing the matrix
+  to zero, in which case unsafe code may be used to allocate an uninitialized
+  matrix, which can then be filled out before it's used.
+  ```rust
+  // `a` is initially a 0×0 matrix.
+  let mut a = Mat::<f64>::with_capacity(4, 3);
+
+  // `a` is now a 4×3 matrix, whose values are uninitialized.
+  unsafe { a.set_dims(4, 3) };
+
+  for j in 0..a.ncols() {
+      for i in 0..a.nrows() {
+          // we cannot write `a[(i, j)] = 9.0`, as that would
+          // create a reference to uninitialized data,
+          // which is currently disallowed by Rust.
+          a.write(i, j, 9.0);
+      }
+  }
+  ```
+]
+
+== Creating a matrix view
+In some situations, it may be desirable to create a matrix view over
+existing data.
+In that case, we can use `faer_core::MatRef` (or `faer_core::MatMut` for
+mutable views).
+
+They can be created in a safe way using:
+
+`faer_core::mat::{from_column_major_slice, from_row_major_slice}`,
+
+`faer_core::mat::{from_column_major_slice_mut, from_row_major_slice_mut}`,
+
+for contiguous matrix storage, or:
+
+`{from_column_major_slice_with_stride, from_row_major_slice_with_stride}`,
+
+`{from_column_major_slice_with_stride_mut, from_row_major_slice_with_stride_mut}`,
+
+for strided matrix storage.
+
+#colorbox(
+  title: "Note",
+  color: "green",
+  radius: 2pt,
+  width: auto
+)[
+  A lower level pointer API is also provided for handling uninitialized data
+  or arbitrary strides: `{from_raw_parts, from_raw_parts_mut}`.
+]
+
+== Converting to a view
+
+A `Mat` instance `m` can be converted to `MatRef` or `MatMut` by writing
+`m.as_ref()` or `m.as_mut()`.
+
+== Reborrowing a mutable view
+
+Immutable matrix views can be freely copied around, since they are non-owning
+wrappers around a pointer and the matrix dimensions/strides.
+
+Mutable matrices however are limited by Rust's borrow checker. Copying them
+would be unsound, since only a single active mutable view is allowed at a
+time.
+
+This means the following code does not compile.
+
+```rust
+use faer::{Mat, MatMut};
+
+fn takes_view_mut(m: MatMut<'_, f64>) {}
+
+let mut a = Mat::<f64>::new();
+let view = a.as_mut();
+
+takes_view_mut(view);
+
+// This would have failed to compile since `MatMut` is never `Copy`
+// takes_view_mut(view);
+```
+
+The alternative is to temporarily give up ownership over the data, by
+creating a view with a shorter lifetime, then recovering the ownership when
+the view is no longer being used.
+
+This is also called reborrowing.
+
+```rust
+use faer::{Mat, MatMut, MatRef};
+use reborrow::*;
+
+fn takes_view(m: MatRef<'_, f64>) {}
+fn takes_view_mut(m: MatMut<'_, f64>) {}
+
+let mut a = Mat::<f64>::new();
+let mut view = a.as_mut();
+
+takes_view_mut(view.rb_mut());
+takes_view_mut(view.rb_mut());
+takes_view(view.rb()); // We can also reborrow immutably
+
+{
+    let short_view = view.rb_mut();
+
+    // This would have failed to compile since we can't use the original
+    // view while the reborrowed view is still being actively used
+    // takes_view_mut(view);
+
+    takes_view_mut(short_view);
+}
+
+// We can once again use the original view
+takes_view_mut(view.rb_mut());
+
+// Or consume it to convert it to an immutable view
+takes_view(view.into_const());
+```
+
+== Splitting a matrix view, slicing a submatrix
+A matrix view can be split up along its row axis, column axis, or both.
+This is done using `MatRef::split_at_row`, `MatRef::split_at_col` or
+`MatRef::split_at` (or `MatMut::split_at_row_mut`, `MatMut::split_at_col_mut`
+or `MatMut::split_at_mut`).
+
+These functions take the middle index at which the split is performed, and
+return the two sides (in top/bottom or left/right order) or the four corners
+(top left, top right, bottom left, bottom right).
+
+We can also take a submatrix using `MatRef::subrows`, `MatRef::subcols` or
+`MatRef::submatrix` (or `MatMut::subrows_mut`, `MatMut::subcols_mut` or
+`MatMut::submatrix_mut`).
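+
+For example, here is a short sketch of these splitting and slicing operations
+on a 6×6 matrix (the four corners are returned in top left, top right,
+bottom left, bottom right order):
+
+```rust
+use faer::Mat;
+
+let a = Mat::<f64>::from_fn(6, 6, |i, j| (10 * i + j) as f64);
+let view = a.as_ref();
+
+// Split into four corners around row 2 and column 3.
+let (top_left, top_right, bottom_left, bottom_right) = view.split_at(2, 3);
+
+// The 3 rows starting at row 1, i.e. rows 1, 2 and 3.
+let rows = view.subrows(1, 3);
+
+// The 2×2 submatrix whose top left corner is at row 1, column 2.
+let sub = view.submatrix(1, 2, 2, 2);
+```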
+
+Alternatively, we can also use `MatRef::get` or `MatMut::get_mut`, which take
+as parameters the row and column ranges.
+
+#colorbox(
+  title: "Warning",
+  color: "red",
+  radius: 2pt,
+  width: auto
+)[
+  Note that `MatRef::submatrix` (and `MatRef::subrows`, `MatRef::subcols`)
+  takes as parameters the first row and column of the submatrix, followed by
+  the number of rows and columns of the submatrix.
+
+  On the other hand, `MatRef::get` takes a range from the first row and
+  column to the last row and column.
+]
+
+== Matrix arithmetic operations
+_`faer`_ matrices implement most of the arithmetic operators, so two matrices
+can be added simply by writing `&a + &b`. The result of the expression is a
+`faer::Mat`, which allows chaining operations (e.g. `(&a + &b) * &c`),
+although at the cost of allocating temporary matrices.
+
+#colorbox(
+  title: "Note",
+  color: "green",
+  radius: 2pt,
+  width: auto
+)[
+  Temporary allocations can be avoided by using the zip API:
+```rust
+use faer::{Mat, zipped, unzipped};
+
+let a = Mat::<f64>::zeros(4, 3);
+let b = Mat::<f64>::zeros(4, 3);
+let mut c = Mat::<f64>::zeros(4, 3);
+
+// Sums `a` and `b` and stores the result in `c`.
+zipped!(&mut c, &a, &b).for_each(|unzipped!(c, a, b)| *c = *a + *b);
+
+// Sums `a`, `b` and `c` into a new matrix `d`.
+let d = zipped!(&mut c, &a, &b).map(|unzipped!(c, a, b)| *a + *b + *c);
+```
+  For matrix multiplication, the non-allocating API is provided in the
+  `faer_core::mul` module.
+
+```rust
+use faer::{Mat, Parallelism};
+use faer_core::mul::matmul;
+
+let a = Mat::<f64>::zeros(4, 3);
+let b = Mat::<f64>::zeros(3, 5);
+
+let mut c = Mat::<f64>::zeros(4, 5);
+
+// Computes `faer::scale(3.0) * &a * &b` and stores the result in `c`.
+matmul(c.as_mut(), a.as_ref(), b.as_ref(), None, 3.0, Parallelism::None);
+
+// Computes `faer::scale(3.0) * &a * &b + 5.0 * &c` and stores the result
+// in `c`.
+matmul(c.as_mut(), a.as_ref(), b.as_ref(), Some(5.0), 3.0, Parallelism::None);
+```
+]
+
+== Solving a linear system
+Several applications require solving a linear system of the form $A x = b$.
+The recommended method can vary depending on the properties of $A$, and the
+desired numerical accuracy.
+
+=== $A$ is triangular
+In this case, one can use $A$ and $b$ directly to find $x$, using the
+functions provided in `faer_core::solve`.
+
+```rust
+use faer::{Mat, Parallelism};
+use faer_core::solve::solve_lower_triangular_in_place;
+
+let a = Mat::<f64>::from_fn(4, 4, |i, j| if i >= j { 1.0 } else { 0.0 });
+let b = Mat::<f64>::from_fn(4, 2, |i, j| i as f64 - j as f64);
+
+let mut x = Mat::<f64>::zeros(4, 2);
+x.copy_from(&b);
+solve_lower_triangular_in_place(a.as_ref(), x.as_mut(), Parallelism::None);
+
+// x now contains the approximate solution
+```
+
+In the case where $A$ has a unit diagonal, one can use
+`solve_unit_lower_triangular_in_place`, which avoids reading the diagonal,
+and instead implicitly uses the value `1.0` as a replacement.
+
+=== $A$ is real-symmetric/complex-Hermitian
+If $A$ is Hermitian and positive definite, users can use the Cholesky LLT
+decomposition.
+
+```rust
+use faer::{mat, Side};
+use faer::prelude::*;
+
+let a = mat![
+    [10.0, 2.0],
+    [2.0, 10.0f64],
+];
+let b = mat![[15.0], [-3.0f64]];
+
+// Compute the Cholesky decomposition,
+// reading only the lower triangular half of the matrix.
+let llt = a.cholesky(Side::Lower).unwrap();
+
+let x = llt.solve(&b);
+```
+
+Alternatively, a lower-level API could be used to avoid temporary
+allocations.
+The corresponding code for other decompositions follows the same pattern, so
+we will avoid repeating it.
+
+```rust
+use faer::{mat, Conj, Mat, Parallelism};
+use faer_cholesky::llt::compute::cholesky_in_place_req;
+use faer_cholesky::llt::compute::{cholesky_in_place, LltRegularization, LltParams};
+use faer_cholesky::llt::solve::solve_in_place_req;
+use faer_cholesky::llt::solve::solve_in_place_with_conj;
+use dyn_stack::{PodStack, GlobalPodBuffer};
+use reborrow::*;
+
+let a = mat![
+    [10.0, 2.0],
+    [2.0, 10.0f64],
+];
+let mut b = mat![[15.0], [-3.0f64]];
+
+let mut llt = Mat::<f64>::zeros(2, 2);
+let no_par = Parallelism::None;
+
+// Compute the size and alignment of the required scratch space
+let cholesky_memory = cholesky_in_place_req::<f64>(
+    a.nrows(),
+    no_par,
+    LltParams::default(),
+).unwrap();
+let solve_memory = solve_in_place_req::<f64>(
+    a.nrows(),
+    b.ncols(),
+    no_par,
+).unwrap();
+
+// Allocate the scratch space
+let mut memory = GlobalPodBuffer::new(cholesky_memory.or(solve_memory));
+let mut stack = PodStack::new(&mut memory);
+
+// Compute the decomposition
+llt.copy_from(&a);
+cholesky_in_place(
+    llt.as_mut(),
+    LltRegularization::default(), // no regularization
+    no_par,
+    stack.rb_mut(),               // scratch space
+    LltParams::default(),         // default settings
+);
+// Solve the linear system
+solve_in_place_with_conj(llt.as_ref(), Conj::No, b.as_mut(), no_par, stack);
+```
+
+If $A$ is not positive definite, the Bunch-Kaufman LBLT decomposition is
+recommended instead.
+```rust
+use faer::{mat, Side};
+use faer::prelude::*;
+
+let a = mat![
+    [10.0, 2.0],
+    [2.0, -10.0f64],
+];
+let b = mat![[15.0], [-3.0f64]];
+
+// Compute the Bunch-Kaufman LBLT decomposition,
+// reading only the lower triangular half of the matrix.
+let lblt = a.lblt(Side::Lower);
+
+let x = lblt.solve(&b);
+```
+
+=== $A$ is square
+For a square matrix $A$, we can use the LU decomposition with partial
+pivoting, or the full pivoting variant, which is slower but can be more
+accurate when the matrix is nearly singular.
+
+```rust
+use faer::mat;
+use faer::prelude::*;
+
+let a = mat![
+    [10.0, 3.0],
+    [2.0, -10.0f64],
+];
+let b = mat![[15.0], [-3.0f64]];
+
+// Compute the LU decomposition with partial pivoting,
+let plu = a.partial_piv_lu();
+let x1 = plu.solve(&b);
+
+// or the LU decomposition with full pivoting.
+let flu = a.full_piv_lu();
+let x2 = flu.solve(&b);
+```
+
+=== $A$ is a tall matrix (least squares solution)
+When the linear system is over-determined, an exact solution may not
+necessarily exist, in which case we can get a best-effort result by computing
+the least squares solution. That is, the solution that minimizes
+$||A x - b||$.
+
+This can be done using the QR decomposition.
+
+```rust
+use faer::mat;
+use faer::prelude::*;
+
+let a = mat![
+    [10.0, 3.0],
+    [2.0, -10.0],
+    [3.0, -45.0f64],
+];
+let b = mat![[15.0], [-3.0], [13.1f64]];
+
+// Compute the QR decomposition.
+let qr = a.qr();
+let x = qr.solve_lstsq(&b);
+```
+
+== Computing the singular value decomposition
+```rust
+use faer::mat;
+use faer::prelude::*;
+
+let a = mat![
+    [10.0, 3.0],
+    [2.0, -10.0],
+    [3.0, -45.0f64],
+];
+
+// Compute the full SVD.
+let svd = a.svd();
+// Compute the thin SVD.
+let svd = a.thin_svd();
+// Compute the singular values.
+let svd = a.singular_values();
+```
+
+== Computing the eigenvalue decomposition
+```rust
+use faer::mat;
+use faer::prelude::*;
+use faer::complex_native::c64;
+
+let a = mat![
+    [10.0, 3.0],
+    [2.0, -10.0f64],
+];
+
+// Compute the eigendecomposition.
+let evd = a.eigendecomposition::<c64>();
+
+// Compute the eigenvalues.
+let evd = a.eigenvalues::<c64>();
+
+// Compute the eigendecomposition assuming `a` is Hermitian.
+let evd = a.selfadjoint_eigendecomposition();
+
+// Compute the eigenvalues assuming `a` is Hermitian.
+let evd = a.selfadjoint_eigenvalues();
+```
+
+= Sparse linear algebra