diff --git a/Cargo.toml b/Cargo.toml index 7c5cbc8f8..99abdc86b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,6 +43,11 @@ repository.workspace = true [features] default = ["wgpu"] +# Enables GPU memory usage estimation. This performs additional computations +# in order to estimate the minimum required allocations for buffers backing +# bump-allocated GPU memory. +# TODO: Turn this into a runtime option used at resolve time and remove the feature. +bump_estimate = ["vello_encoding/bump_estimate"] hot_reload = [] buffer_labels = [] diff --git a/crates/encoding/Cargo.toml b/crates/encoding/Cargo.toml index f7868a8b3..c9032f7df 100644 --- a/crates/encoding/Cargo.toml +++ b/crates/encoding/Cargo.toml @@ -9,10 +9,16 @@ repository.workspace = true [features] default = ["full"] + # Enables support for the full pipeline including late-bound # resources (gradients, images and glyph runs) full = ["skrifa", "guillotiere"] +# Enables an optional GPU memory usage estimation utility. This can be used to +# perform additional computations in order to estimate the minimum required allocations +# for buffers backing bump-allocated GPU memory. 
+bump_estimate = [] + [lints] workspace = true diff --git a/crates/encoding/src/config.rs b/crates/encoding/src/config.rs index 37b1906ac..b6ef8857a 100644 --- a/crates/encoding/src/config.rs +++ b/crates/encoding/src/config.rs @@ -37,6 +37,73 @@ pub struct BumpAllocators { pub lines: u32, } +#[derive(Default)] +pub struct BumpAllocatorMemory { + pub total: u32, + pub binning: BufferSize, + pub ptcl: BufferSize, + pub tile: BufferSize, + pub seg_counts: BufferSize, + pub segments: BufferSize, + pub lines: BufferSize, +} + +impl BumpAllocators { + pub fn memory(&self) -> BumpAllocatorMemory { + let binning = BufferSize::new(self.binning); + let ptcl = BufferSize::new(self.ptcl); + let tile = BufferSize::new(self.tile); + let seg_counts = BufferSize::new(self.seg_counts); + let segments = BufferSize::new(self.segments); + let lines = BufferSize::new(self.lines); + BumpAllocatorMemory { + total: binning.size_in_bytes() + + ptcl.size_in_bytes() + + tile.size_in_bytes() + + seg_counts.size_in_bytes() + + segments.size_in_bytes() + + lines.size_in_bytes(), + binning, + ptcl, + tile, + seg_counts, + segments, + lines, + } + } +} + +impl std::fmt::Display for BumpAllocatorMemory { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "\n \ + \tTotal:\t\t\t{} bytes ({:.2} KB | {:.2} MB)\n\ + \tBinning\t\t\t{} elements ({} bytes)\n\ + \tPTCL\t\t\t{} elements ({} bytes)\n\ + \tTile:\t\t\t{} elements ({} bytes)\n\ + \tSegment Counts:\t\t{} elements ({} bytes)\n\ + \tSegments:\t\t{} elements ({} bytes)\n\ + \tLines:\t\t\t{} elements ({} bytes)", + self.total, + self.total as f32 / (1 << 10) as f32, + self.total as f32 / (1 << 20) as f32, + self.binning.len(), + self.binning.size_in_bytes(), + self.ptcl.len(), + self.ptcl.size_in_bytes(), + self.tile.len(), + self.tile.size_in_bytes(), + self.seg_counts.len(), + self.seg_counts.size_in_bytes(), + self.segments.len(), + self.segments.size_in_bytes(), + self.lines.len(), + 
+            self.lines.size_in_bytes()
+        )
+    }
+}
+
 /// Storage of indirect dispatch size values.
 ///
 /// The original plan was to reuse [`BumpAllocators`], but the WebGPU compatible
diff --git a/crates/encoding/src/estimate.rs b/crates/encoding/src/estimate.rs
new file mode 100644
index 000000000..29dc31bf6
--- /dev/null
+++ b/crates/encoding/src/estimate.rs
@@ -0,0 +1,323 @@
+// Copyright 2024 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+//! This utility provides conservative size estimation for buffer allocations backing
+//! GPU bump memory. This estimate relies on heuristics and naturally overestimates.
+
+use super::{BufferSize, BumpAllocatorMemory, Transform};
+use peniko::kurbo::{Cap, Join, PathEl, Stroke, Vec2};
+
+/// Reciprocal square root of the flattening tolerance passed to the `wang` helpers.
+const RSQRT_OF_TOL: f64 = 2.2360679775; // tol = 0.2
+
+/// Accumulates line counts for encoded paths, from which [`BumpEstimator::tally`] derives a
+/// size estimate for the `lines` buffer.
+#[derive(Clone, Default)]
+pub struct BumpEstimator {
+    // TODO: support binning
+    // TODO: support ptcl
+    // TODO: support tile
+    // TODO: support segment counts
+    // TODO: support segments
+    lines: LineSoup,
+}
+
+impl BumpEstimator {
+    /// Creates an estimator with all counts at zero.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Discards all accumulated counts.
+    pub fn reset(&mut self) {
+        *self = Self::default();
+    }
+
+    /// Combine the counts of this estimator with `other` after applying an optional `transform`.
+    pub fn append(&mut self, other: &Self, transform: Option<&Transform>) {
+        self.lines.add(&other.lines, transform_scale(transform));
+    }
+
+    /// Adds the estimated line counts for `path`, drawn under transform `t`.
+    ///
+    /// With `stroke` set to `None` the path is counted as a fill: one closing line is added per
+    /// subpath (and at least one overall). With `Some(style)` the segment counts are doubled to
+    /// cover both offset curves, and lines for the style's caps and joins are added on top.
+    pub fn count_path(
+        &mut self,
+        path: impl Iterator<Item = PathEl>,
+        t: &Transform,
+        stroke: Option<&Stroke>,
+    ) {
+        let mut caps = 1;
+        let mut joins: u32 = 0;
+        let mut lineto_lines = 0;
+        let mut fill_close_lines = 1;
+        let mut curve_lines = 0;
+        let mut curve_count = 0;
+
+        // Track the path state to correctly count empty paths and close joins.
+        let mut first_pt = None;
+        let mut last_pt = None;
+        for el in path {
+            match el {
+                PathEl::MoveTo(p0) => {
+                    first_pt = Some(p0);
+                    // Only a `MoveTo` that follows a drawn segment starts another subpath.
+                    if last_pt.is_none() {
+                        continue;
+                    }
+                    caps += 1;
+                    joins = joins.saturating_sub(1);
+                    last_pt = None;
+                    fill_close_lines += 1;
+                }
+                PathEl::ClosePath => {
+                    if last_pt.is_some() {
+                        joins += 1;
+                        lineto_lines += 1;
+                    }
+                    last_pt = first_pt;
+                }
+                PathEl::LineTo(p0) => {
+                    last_pt = Some(p0);
+                    joins += 1;
+                    lineto_lines += 1;
+                }
+                PathEl::QuadTo(p1, p2) => {
+                    let Some(p0) = last_pt.or(first_pt) else {
+                        continue;
+                    };
+                    curve_count += 1;
+                    curve_lines +=
+                        wang::quadratic(RSQRT_OF_TOL, p0.to_vec2(), p1.to_vec2(), p2.to_vec2(), t);
+                    last_pt = Some(p2);
+                    joins += 1;
+                }
+                PathEl::CurveTo(p1, p2, p3) => {
+                    let Some(p0) = last_pt.or(first_pt) else {
+                        continue;
+                    };
+                    curve_count += 1;
+                    curve_lines += wang::cubic(
+                        RSQRT_OF_TOL,
+                        p0.to_vec2(),
+                        p1.to_vec2(),
+                        p2.to_vec2(),
+                        p3.to_vec2(),
+                        t,
+                    );
+                    last_pt = Some(p3);
+                    joins += 1;
+                }
+            }
+        }
+        let Some(style) = stroke else {
+            self.lines.linetos += lineto_lines + fill_close_lines;
+            self.lines.curves += curve_lines;
+            self.lines.curve_count += curve_count;
+            return;
+        };
+
+        // For strokes, double-count the lines to estimate offset curves.
+        self.lines.linetos += 2 * lineto_lines;
+        self.lines.curves += 2 * curve_lines;
+        self.lines.curve_count += 2 * curve_count;
+
+        let round_scale = transform_scale(Some(t));
+        let width = style.width as f32;
+        self.count_stroke_caps(style.start_cap, width, caps, round_scale);
+        self.count_stroke_caps(style.end_cap, width, caps, round_scale);
+        self.count_stroke_joins(style.join, width, joins, round_scale);
+    }
+
+    /// Produce the final total, applying an optional transform to all content.
+    ///
+    /// Only the `lines` buffer is estimated so far; the other buffers are constructed with
+    /// `BufferSize::new(0)`. `total` is the sum of the byte sizes of all six buffers.
+    pub fn tally(&self, transform: Option<&Transform>) -> BumpAllocatorMemory {
+        let scale = transform_scale(transform);
+        let binning = BufferSize::new(0);
+        let ptcl = BufferSize::new(0);
+        let tile = BufferSize::new(0);
+        let seg_counts = BufferSize::new(0);
+        let segments = BufferSize::new(0);
+        let lines = BufferSize::new(self.lines.tally(scale));
+        BumpAllocatorMemory {
+            total: binning.size_in_bytes()
+                + ptcl.size_in_bytes()
+                + tile.size_in_bytes()
+                + seg_counts.size_in_bytes()
+                + segments.size_in_bytes()
+                + lines.size_in_bytes(),
+            binning,
+            ptcl,
+            tile,
+            seg_counts,
+            segments,
+            lines,
+        }
+    }
+
+    /// Adds the lines for `count` caps of the given `style`: one line per butt cap, three per
+    /// square cap, and an arc estimate (see [`estimate_arc_lines`]) per round cap.
+    fn count_stroke_caps(&mut self, style: Cap, width: f32, count: u32, scale: f32) {
+        match style {
+            Cap::Butt => self.lines.linetos += count,
+            Cap::Square => self.lines.linetos += 3 * count,
+            Cap::Round => {
+                self.lines.curves += count * estimate_arc_lines(width, scale);
+                self.lines.curve_count += 1;
+            }
+        }
+    }
+
+    /// Adds the lines for `count` joins of the given `style`: one line per bevel join, two per
+    /// miter join, and an arc estimate (see [`estimate_arc_lines`]) per round join.
+    fn count_stroke_joins(&mut self, style: Join, width: f32, count: u32, scale: f32) {
+        match style {
+            Join::Bevel => self.lines.linetos += count,
+            Join::Miter => self.lines.linetos += 2 * count,
+            Join::Round => {
+                self.lines.curves += count * estimate_arc_lines(width, scale);
+                self.lines.curve_count += 1;
+            }
+        }
+    }
+}
+
+/// Estimates how many line segments a round cap or join arc is flattened into for the given
+/// `stroke_width` and transform `scale`. The result is always at least 1.
+fn estimate_arc_lines(stroke_width: f32, scale: f32) -> u32 {
+    // These constants need to be kept consistent with the definitions in `flatten_arc` in
+    // flatten.wgsl.
+    const MIN_THETA: f32 = 1e-4;
+    const TOL: f32 = 0.1;
+    let radius = TOL.max(scale * stroke_width * 0.5);
+    let theta = (2. * (1. - TOL / radius).acos()).max(MIN_THETA);
+    ((std::f32::consts::FRAC_PI_2 / theta).ceil() as u32).max(1)
+}
+
+#[derive(Clone, Default)]
+struct LineSoup {
+    // Explicit lines (such as linetos and non-round stroke caps/joins) and Bezier curves
+    // get tracked separately to ensure that explicit lines remain scale invariant.
+    linetos: u32,
+    curves: u32,
+
+    // Curve count is simply used to ensure a minimum number of lines get counted for each curve
+    // at very small scales to reduce the chance of under-allocating.
+    curve_count: u32,
+}
+
+impl LineSoup {
+    /// Total line estimate at `scale`: explicit lines plus the scaled curve lines, where the
+    /// curve part is never less than 5 lines per counted curve.
+    fn tally(&self, scale: f32) -> u32 {
+        let curves = self
+            .scaled_curve_line_count(scale)
+            .max(5 * self.curve_count);
+
+        self.linetos + curves
+    }
+
+    /// Curve line count multiplied by `sqrt(scale)`, rounded up.
+    fn scaled_curve_line_count(&self, scale: f32) -> u32 {
+        (self.curves as f32 * scale.sqrt()).ceil() as u32
+    }
+
+    /// Accumulates `other` into `self`, scaling `other`'s curve lines by `scale` first.
+    fn add(&mut self, other: &LineSoup, scale: f32) {
+        self.linetos += other.linetos;
+        self.curves += other.scaled_curve_line_count(scale);
+        self.curve_count += other.curve_count;
+    }
+}
+
+// TODO: The 32-bit Vec2 definition from cpu_shaders/util.rs could come in handy here.
+/// Applies the 2x2 `matrix` of `t` to `v`; no other part of `t` is used.
+fn transform(t: &Transform, v: Vec2) -> Vec2 {
+    Vec2::new(
+        t.matrix[0] as f64 * v.x + t.matrix[2] as f64 * v.y,
+        t.matrix[1] as f64 * v.x + t.matrix[3] as f64 * v.y,
+    )
+}
+
+/// Scale factor derived from the 2x2 `matrix` of `t`, or 1 when no transform is given.
+///
+/// NOTE(review): for the identity matrix the `Some` branch evaluates to 2 while `None` yields 1;
+/// confirm that this factor of two is an intended part of the overestimate.
+fn transform_scale(t: Option<&Transform>) -> f32 {
+    match t {
+        Some(t) => {
+            let m = t.matrix;
+            let v1x = m[0] + m[3];
+            let v2x = m[0] - m[3];
+            let v1y = m[1] - m[2];
+            let v2y = m[1] + m[2];
+            (v1x * v1x + v1y * v1y).sqrt() + (v2x * v2x + v2y * v2y).sqrt()
+        }
+        None => 1.,
+    }
+}
+
+/// Wang's Formula (as described in Pyramid Algorithms by Ron Goldman, 2003, Chapter 5, Section
+/// 5.6.3 on Bezier Approximation) is a fast method for computing a lower bound on the number of
+/// recursive subdivisions required to approximate a Bezier curve within a certain tolerance. The
+/// formula for a Bezier curve of degree `n`, control points p[0]...p[n], and number of levels of
+/// subdivision `l`, and flattening tolerance `tol` is defined as follows:
+///
+/// ```ignore
+///     m = max([length(p[k+2] - 2 * p[k+1] + p[k]) for (0 <= k <= n-2)])
+///     l >= log_4((n * (n - 1) * m) / (8 * tol))
+/// ```
+///
+/// For recursive subdivisions that split a curve into 2 segments at each level, the minimum number
+/// of segments is given by 2^l. From the formula above it follows that:
+///
+/// ```ignore
+///       segments >= 2^l >= 2^log_4(x)                      (1)
+///     segments^2 >= 2^(2*log_4(x)) >= 4^log_4(x)           (2)
+///     segments^2 >= x
+///       segments >= sqrt((n * (n - 1) * m) / (8 * tol))    (3)
+/// ```
+///
+/// Wang's formula computes an error bound on recursive subdivision based on the second derivative
+/// which tends to result in a suboptimal estimate when the curvature within the curve has a lot of
+/// variation. This is expected to frequently overshoot the flattening formula used in vello, which
+/// is closer to optimal (vello uses a method based on a numerical approximation of the integral
+/// over the continuous change in the number of flattened segments, with an error expressed in terms
+/// of curvature and infinitesimal arclength).
+mod wang {
+    use super::*;
+
+    // The curve degree term sqrt(n * (n - 1) / 8) specialized for cubics:
+    //
+    //    sqrt(3 * (3 - 1) / 8)
+    //
+    const SQRT_OF_DEGREE_TERM_CUBIC: f64 = 0.86602540378;
+
+    // The curve degree term sqrt(n * (n - 1) / 8) specialized for quadratics:
+    //
+    //    sqrt(2 * (2 - 1) / 8)
+    //
+    const SQRT_OF_DEGREE_TERM_QUAD: f64 = 0.5;
+
+    /// Wang's formula segment estimate for the quadratic Bezier `p0, p1, p2` transformed by `t`.
+    pub fn quadratic(rsqrt_of_tol: f64, p0: Vec2, p1: Vec2, p2: Vec2, t: &Transform) -> u32 {
+        let v = -2. * p1 + p0 + p2;
+        let v = transform(t, v); // transform is distributive
+        let m = v.length();
+        (SQRT_OF_DEGREE_TERM_QUAD * m.sqrt() * rsqrt_of_tol).ceil() as u32
+    }
+
+    /// Wang's formula segment estimate for the cubic Bezier `p0, p1, p2, p3` transformed by `t`.
+    pub fn cubic(rsqrt_of_tol: f64, p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2, t: &Transform) -> u32 {
+        let v1 = -2. * p1 + p0 + p2;
+        let v2 = -2. * p2 + p1 + p3;
+        let v1 = transform(t, v1);
+        let v2 = transform(t, v2);
+        let m = v1.length().max(v2.length());
+        (SQRT_OF_DEGREE_TERM_CUBIC * m.sqrt() * rsqrt_of_tol).ceil() as u32
+    }
+}
diff --git a/crates/encoding/src/lib.rs b/crates/encoding/src/lib.rs
index 2ace2819a..30db95000 100644
--- a/crates/encoding/src/lib.rs
+++ b/crates/encoding/src/lib.rs
@@ -8,6 +8,8 @@ mod clip;
 mod config;
 mod draw;
 mod encoding;
+#[cfg(feature = "bump_estimate")]
+mod estimate;
 #[cfg(feature = "full")]
 mod glyph;
 #[cfg(feature = "full")]
@@ -25,8 +27,8 @@ mod resolve;
 pub use binning::BinHeader;
 pub use clip::{Clip, ClipBbox, ClipBic, ClipElement};
 pub use config::{
-    BufferSize, BufferSizes, BumpAllocators, ConfigUniform, IndirectCount, RenderConfig,
-    WorkgroupCounts, WorkgroupSize,
+    BufferSize, BufferSizes, BumpAllocatorMemory, BumpAllocators, ConfigUniform, IndirectCount,
+    RenderConfig, WorkgroupCounts, WorkgroupSize,
 };
 pub use draw::{
     DrawBbox, DrawBeginClip, DrawColor, DrawImage, DrawLinearGradient, DrawMonoid,
@@ -49,3 +51,6 @@ pub use {
     ramp_cache::Ramps,
     resolve::{Patch, Resolver},
 };
+
+#[cfg(feature = "bump_estimate")]
+pub use estimate::BumpEstimator;
diff --git a/src/scene.rs b/src/scene.rs
index 6fffd358d..4e2c0d7fc 100644
--- a/src/scene.rs
+++ b/src/scene.rs
@@ -4,6 +4,8 @@
 use peniko::kurbo::{Affine, Rect, Shape, Stroke};
 use peniko::{BlendMode, BrushRef, Color, Fill, Font, Image, StyleRef};
 use skrifa::instance::NormalizedCoord;
+#[cfg(feature = "bump_estimate")]
+use vello_encoding::BumpAllocatorMemory;
 use vello_encoding::{Encoding, Glyph, GlyphRun, Patch, Transform};
 
 // TODO - Document invariants and edge cases (#470)
@@ -17,6 +19,8 @@
use vello_encoding::{Encoding, Glyph, GlyphRun, Patch, Transform};
 #[derive(Clone, Default)]
 pub struct Scene {
     encoding: Encoding,
+    #[cfg(feature = "bump_estimate")]
+    estimator: vello_encoding::BumpEstimator,
 }
 
 impl Scene {
@@ -28,6 +32,16 @@ impl Scene {
     /// Removes all content from the scene.
     pub fn reset(&mut self) {
         self.encoding.reset();
+        #[cfg(feature = "bump_estimate")]
+        self.estimator.reset();
+    }
+
+    /// Tally up the bump allocator estimate for the current state of the encoding,
+    /// taking into account an optional `transform` applied to the entire scene.
+    #[cfg(feature = "bump_estimate")]
+    pub fn bump_estimate(&self, transform: Option<Affine>) -> BumpAllocatorMemory {
+        self.estimator
+            .tally(transform.as_ref().map(Transform::from_kurbo).as_ref())
     }
 
     /// Returns the underlying raw encoding.
@@ -50,14 +64,17 @@ impl Scene {
         clip: &impl Shape,
     ) {
         let blend = blend.into();
-        self.encoding
-            .encode_transform(Transform::from_kurbo(&transform));
+        let t = Transform::from_kurbo(&transform);
+        self.encoding.encode_transform(t);
         self.encoding.encode_fill_style(Fill::NonZero);
         if !self.encoding.encode_shape(clip, true) {
             // If the layer shape is invalid, encode a valid empty path. This suppresses
             // all drawing until the layer is popped.
             self.encoding
                 .encode_shape(&Rect::new(0.0, 0.0, 0.0, 0.0), true);
+        } else {
+            #[cfg(feature = "bump_estimate")]
+            self.estimator.count_path(clip.path_elements(0.1), &t, None);
         }
         self.encoding
             .encode_begin_clip(blend, alpha.clamp(0.0, 1.0));
@@ -77,8 +94,8 @@ impl Scene {
         brush_transform: Option<Affine>,
         shape: &impl Shape,
     ) {
-        self.encoding
-            .encode_transform(Transform::from_kurbo(&transform));
+        let t = Transform::from_kurbo(&transform);
+        self.encoding.encode_transform(t);
         self.encoding.encode_fill_style(style);
         if self.encoding.encode_shape(shape, true) {
             if let Some(brush_transform) = brush_transform {
@@ -90,6 +107,9 @@ impl Scene {
                 }
             }
             self.encoding.encode_brush(brush, 1.0);
+            #[cfg(feature = "bump_estimate")]
+            self.estimator
+                .count_path(shape.path_elements(0.1), &t, None);
         }
     }
 
@@ -118,22 +138,35 @@ impl Scene {
 
         const GPU_STROKES: bool = false; // Set this to `true` to enable GPU-side stroking
         if GPU_STROKES {
-            self.encoding
-                .encode_transform(Transform::from_kurbo(&transform));
+            let t = Transform::from_kurbo(&transform);
+            self.encoding.encode_transform(t);
             self.encoding.encode_stroke_style(style);
 
             // We currently don't support dashing on the GPU. If the style has a dash pattern, then
             // we convert it into stroked paths on the CPU and encode those as individual draw
             // objects.
             let encode_result = if style.dash_pattern.is_empty() {
+                #[cfg(feature = "bump_estimate")]
+                self.estimator
+                    .count_path(shape.path_elements(SHAPE_TOLERANCE), &t, Some(style));
                 self.encoding.encode_shape(shape, false)
             } else {
+                // TODO: We currently collect the output of the dash iterator because
+                // `encode_path_elements` wants to consume the iterator. We want to avoid calling
+                // `dash` twice when `bump_estimate` is enabled because it internally allocates.
+                // Bump estimation will move to resolve time rather than scene construction time,
+                // so we can revert this back to not collecting when that happens.
                 let dashed = peniko::kurbo::dash(
                     shape.path_elements(SHAPE_TOLERANCE),
                     style.dash_offset,
                     &style.dash_pattern,
-                );
-                self.encoding.encode_path_elements(dashed, false)
+                )
+                .collect::<Vec<_>>();
+                #[cfg(feature = "bump_estimate")]
+                self.estimator
+                    .count_path(dashed.iter().copied(), &t, Some(style));
+                self.encoding
+                    .encode_path_elements(dashed.into_iter(), false)
             };
             if encode_result {
                 if let Some(brush_transform) = brush_transform {
@@ -170,6 +203,7 @@ impl Scene {
 
     /// Returns a builder for encoding a glyph run.
     pub fn draw_glyphs(&mut self, font: &Font) -> DrawGlyphs {
+        // TODO: Integrate `BumpEstimator` with the glyph cache.
         DrawGlyphs::new(&mut self.encoding, font)
     }
 
@@ -178,10 +212,10 @@ impl Scene {
     /// The given transform is applied to every transform in the child.
     /// This is an O(N) operation.
     pub fn append(&mut self, other: &Scene, transform: Option<Affine>) {
-        self.encoding.append(
-            &other.encoding,
-            &transform.map(|xform| Transform::from_kurbo(&xform)),
-        );
+        let t = transform.as_ref().map(Transform::from_kurbo);
+        self.encoding.append(&other.encoding, &t);
+        #[cfg(feature = "bump_estimate")]
+        self.estimator.append(&other.estimator, t.as_ref());
     }
 }
 
@@ -283,8 +317,7 @@ impl<'a> DrawGlyphs<'a> {
         self
     }
 
-    /// Encodes a fill or stroke for for the given sequence of glyphs and consumes
-    /// the builder.
+    /// Encodes a fill or stroke for the given sequence of glyphs and consumes the builder.
     ///
     /// The `style` parameter accepts either `Fill` or `&Stroke` types.
     pub fn draw(mut self, style: impl Into<StyleRef<'a>>, glyphs: impl Iterator<Item = Glyph>) {