From eead0121975aa5521bf32d6e0d0801ec597ab0a4 Mon Sep 17 00:00:00 2001 From: Ross MacArthur Date: Fri, 17 Jan 2025 11:23:25 +0200 Subject: [PATCH] Rework custom syntax - Implement a dedicated optimized searcher for the default syntax. - Use the `aho-corasick` crate for custom syntax. - Put custom syntax behind a feature. --- .github/workflows/build.yaml | 6 +- .vscode/settings.json | 2 + Cargo.lock | 2 + Cargo.toml | 5 + benches/Cargo.toml | 4 +- benches/benchdata/syntax/minijinja.html | 23 ++ benches/benchdata/syntax/upon.html | 23 ++ benches/benches/engines.rs | 20 ++ benches/src/lib.rs | 53 +++- benches/src/testdata/syntax_minijinja.golden | 25 ++ benches/src/testdata/syntax_upon.golden | 25 ++ benches/src/tests.rs | 46 ++- src/compile/lex.rs | 83 +++-- src/compile/search/aho_corasick.rs | 41 +++ src/compile/search/ahocorasick/build.rs | 301 ------------------- src/compile/search/ahocorasick/mod.rs | 291 ------------------ src/compile/search/ahocorasick/state.rs | 41 --- src/compile/search/mod.rs | 124 ++++++-- src/lib.rs | 24 +- src/syntax.rs | 4 +- src/types/delimiter.rs | 15 + src/types/mod.rs | 2 + src/types/syntax.rs | 100 ++---- tests/lex.rs | 20 +- 24 files changed, 499 insertions(+), 781 deletions(-) create mode 100644 benches/benchdata/syntax/minijinja.html create mode 100644 benches/benchdata/syntax/upon.html create mode 100644 benches/src/testdata/syntax_minijinja.golden create mode 100644 benches/src/testdata/syntax_upon.golden create mode 100644 src/compile/search/aho_corasick.rs delete mode 100644 src/compile/search/ahocorasick/build.rs delete mode 100644 src/compile/search/ahocorasick/mod.rs delete mode 100644 src/compile/search/ahocorasick/state.rs create mode 100644 src/types/delimiter.rs diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index d10323e..c175f53 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -71,9 +71,13 @@ jobs: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@1.60 
- - name: Check (no filters) + - name: Check (no filters, no syntax) run: cargo check --no-default-features --features serde,unicode + - uses: dtolnay/rust-toolchain@1.61 + - name: Check (no filters) + run: cargo check --no-default-features --features serde,syntax,unicode + - uses: dtolnay/rust-toolchain@1.65 - name: Test run: cargo test diff --git a/.vscode/settings.json b/.vscode/settings.json index 9bd2ca4..3988348 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,4 +1,6 @@ { + "rust-analyzer.check.features": "all", + "rust-analyzer.cargo.features": "all", "rust-analyzer.server.extraEnv": { "RUSTFLAGS": "--cfg internal_debug" }, diff --git a/Cargo.lock b/Cargo.lock index 25c7267..30c6ea2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -747,6 +747,7 @@ version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f98b09920c8be9ff96a5625aca5b5db7a4f4ba025132ff7d7aacb72c0244a45" dependencies = [ + "aho-corasick", "serde", ] @@ -1329,6 +1330,7 @@ dependencies = [ name = "upon" version = "0.8.1" dependencies = [ + "aho-corasick", "serde", "unicode-ident", "unicode-width", diff --git a/Cargo.toml b/Cargo.toml index 96ba98c..7e6e7c4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,7 @@ include = ["src/**/*", "LICENSE-*", "README.md"] rustdoc-args = ["--cfg", "docsrs"] [dependencies] +aho-corasick = { version = "1.1.2", optional = true } serde = { version = "1.0.137", optional = true } unicode-ident = { version = "1.0.5", optional = true } unicode-width = { version = "0.1.9", optional = true } @@ -35,6 +36,10 @@ filters = [] # the context using `Value`'s '`From` impls. serde = ["dep:serde"] +# Enables support for configuring custom delimiters in templates and pulls in +# the `aho-corasick` crate. +syntax = ["dep:aho-corasick"] + # Allows unicode identifiers in templates and enables improved error # formatting. 
unicode = ["dep:unicode-ident", "dep:unicode-width"] diff --git a/benches/Cargo.toml b/benches/Cargo.toml index 3e7e1f3..396fc3e 100644 --- a/benches/Cargo.toml +++ b/benches/Cargo.toml @@ -7,13 +7,13 @@ publish = false [dependencies] handlebars = "4.3.7" liquid = "0.26.4" -minijinja = "1.0.5" +minijinja = { version = "1.0.5", features = ["custom_syntax"] } rand = "0.8.5" serde = { version = "1.0.137", features = ["derive"] } serde_json = "1.0.103" tera = "1.19.0" tinytemplate = "1.2.1" -upon = { path = ".." } +upon = { path = "..", features = ["syntax"] } [dev-dependencies] criterion = { version = "0.5.1", features = ["html_reports"] } diff --git a/benches/benchdata/syntax/minijinja.html b/benches/benchdata/syntax/minijinja.html new file mode 100644 index 0000000..d234a79 --- /dev/null +++ b/benches/benchdata/syntax/minijinja.html @@ -0,0 +1,23 @@ + + + + { title } + + + + + + + + <%- for user in users -%> + <% if not user.is_disabled %> + <# This is a comment #> + + + + + <% endif %> + <%- endfor -%> +
NameAge
{ user.name }{ user.age }
+ + diff --git a/benches/benchdata/syntax/upon.html b/benches/benchdata/syntax/upon.html new file mode 100644 index 0000000..d234a79 --- /dev/null +++ b/benches/benchdata/syntax/upon.html @@ -0,0 +1,23 @@ + + + + { title } + + + + + + + + <%- for user in users -%> + <% if not user.is_disabled %> + <# This is a comment #> + + + + + <% endif %> + <%- endfor -%> +
NameAge
{ user.name }{ user.age }
+ + diff --git a/benches/benches/engines.rs b/benches/benches/engines.rs index 24ceb45..c336e16 100644 --- a/benches/benches/engines.rs +++ b/benches/benches/engines.rs @@ -11,6 +11,7 @@ criterion_group! { benches, bench_init, bench_compile, + bench_syntax, bench_render, bench_filters, } @@ -57,6 +58,25 @@ pub fn bench_compile(c: &mut Criterion) { bench!(Upon, "../benchdata/basic/upon.html"); } +/// Benchmarks the time taken to compile a template with custom syntax. +pub fn bench_syntax(c: &mut Criterion) { + let mut g = c.benchmark_group("syntax"); + + macro_rules! bench { + ($E:ty, $source:literal) => {{ + g.bench_function(<$E as Engine>::name(), |b| { + let source = repeat(include_str!($source), 50); + let mut engine = + <$E as Engine>::with_syntax(("{", "}"), ("<%", "%>"), ("<#", "#>")); + b.iter(|| engine.add_template("bench", &source)); + }); + }}; + } + + bench!(Minijinja, "../benchdata/syntax/minijinja.html"); + bench!(Upon, "../benchdata/syntax/upon.html"); +} + /// Benchmarks the time taken to render a template as a string. pub fn bench_render(c: &mut Criterion) { let mut g = c.benchmark_group("render"); diff --git a/benches/src/lib.rs b/benches/src/lib.rs index 92ea83e..5a54631 100644 --- a/benches/src/lib.rs +++ b/benches/src/lib.rs @@ -5,10 +5,19 @@ mod tests; use std::collections::HashMap; /// Abstraction for a template engine. 
-pub trait Engine<'a> { +pub trait Engine<'a>: Sized { fn name() -> &'static str; fn new() -> Self; - fn add_filters(&mut self); + fn with_syntax( + _expr: (&'static str, &'static str), + _block: (&'static str, &'static str), + _comment: (&'static str, &'static str), + ) -> Self { + unimplemented!() + } + fn add_filters(&mut self) { + unimplemented!() + } fn add_template(&mut self, name: &'static str, source: &'a str); fn render(&self, name: &'static str, ctx: &S) -> String where @@ -83,9 +92,6 @@ impl<'engine> Engine<'engine> for Liquid { } } - #[inline] - fn add_filters(&mut self) {} - #[inline] fn add_template(&mut self, name: &'static str, source: &'engine str) { let template = self.parser.parse(source).unwrap(); @@ -123,6 +129,24 @@ impl<'engine> Engine<'engine> for Minijinja<'engine> { } #[inline] + fn with_syntax( + (variable_start, variable_end): (&'static str, &'static str), + (block_start, block_end): (&'static str, &'static str), + (comment_start, comment_end): (&'static str, &'static str), + ) -> Self { + let mut env = minijinja::Environment::new(); + env.set_syntax(minijinja::Syntax { + block_start: block_start.into(), + block_end: block_end.into(), + variable_start: variable_start.into(), + variable_end: variable_end.into(), + comment_start: comment_start.into(), + comment_end: comment_end.into(), + }) + .unwrap(); + env + } + fn add_filters(&mut self) {} #[inline] @@ -156,7 +180,6 @@ impl<'engine> Engine<'engine> for Tera { tera::Tera::default() } - #[inline] fn add_filters(&mut self) {} #[inline] @@ -195,9 +218,6 @@ impl<'engine> Engine<'engine> for TinyTemplate<'engine> { tt } - #[inline] - fn add_filters(&mut self) {} - #[inline] fn add_template(&mut self, name: &'static str, source: &'engine str) { self.add_template(name, source).unwrap(); @@ -229,6 +249,21 @@ impl<'engine> Engine<'engine> for upon::Engine<'engine> { upon::Engine::new() } + #[inline] + fn with_syntax( + (begin_expr, end_expr): (&'static str, &'static str), + (begin_block, 
end_block): (&'static str, &'static str), + (begin_comment, end_comment): (&'static str, &'static str), + ) -> Self { + upon::Engine::with_syntax( + upon::Syntax::builder() + .expr(begin_expr, end_expr) + .block(begin_block, end_block) + .comment(begin_comment, end_comment) + .build(), + ) + } + #[inline] fn add_filters(&mut self) { self.add_filter("lower", str::to_lowercase); diff --git a/benches/src/testdata/syntax_minijinja.golden b/benches/src/testdata/syntax_minijinja.golden new file mode 100644 index 0000000..d46e5f1 --- /dev/null +++ b/benches/src/testdata/syntax_minijinja.golden @@ -0,0 +1,25 @@ + + + + My awesome webpage! + + + + + + + + + + + + + + + + + + +
NameAge
Nancy Wheeler17
Steve Harrington18
+ + \ No newline at end of file diff --git a/benches/src/testdata/syntax_upon.golden b/benches/src/testdata/syntax_upon.golden new file mode 100644 index 0000000..006f59b --- /dev/null +++ b/benches/src/testdata/syntax_upon.golden @@ -0,0 +1,25 @@ + + + + My awesome webpage! + + + + + + + + + + + + + + + + + + +
NameAge
Nancy Wheeler17
Steve Harrington18
+ + diff --git a/benches/src/tests.rs b/benches/src/tests.rs index aefabfc..02eb7d3 100644 --- a/benches/src/tests.rs +++ b/benches/src/tests.rs @@ -3,7 +3,21 @@ use crate::{Engine, Handlebars, Liquid, Minijinja, Tera, TinyTemplate, Upon}; macro_rules! t { ($E:ty, $source:literal) => {{ - let result = render::<$E>(include_str!($source)); + let result = render::<$E>(include_str!($source), false, false); + goldie::assert!(result); + }}; +} + +macro_rules! t_filters { + ($E:ty, $source:literal) => {{ + let result = render::<$E>(include_str!($source), false, true); + goldie::assert!(result); + }}; +} + +macro_rules! t_syntax { + ($E:ty, $source:literal) => {{ + let result = render::<$E>(include_str!($source), true, false); goldie::assert!(result); }}; } @@ -39,25 +53,35 @@ fn basic_upon() { #[test] fn filters_handlebars() { - t!(Handlebars, "../benchdata/filters/handlebars.html"); + t_filters!(Handlebars, "../benchdata/filters/handlebars.html"); } #[test] fn filters_minijinja() { - t!(Minijinja, "../benchdata/filters/minijinja.html"); + t_filters!(Minijinja, "../benchdata/filters/minijinja.html"); } #[test] fn filters_tera() { - t!(Tera, "../benchdata/filters/tera.html"); + t_filters!(Tera, "../benchdata/filters/tera.html"); } #[test] fn filters_upon() { - t!(Upon, "../benchdata/filters/upon.html"); + t_filters!(Upon, "../benchdata/filters/upon.html"); +} + +#[test] +fn syntax_minijinja() { + t_syntax!(Minijinja, "../benchdata/syntax/minijinja.html"); } -fn render<'a, E: Engine<'a>>(source: &'a str) -> String { +#[test] +fn syntax_upon() { + t_syntax!(Upon, "../benchdata/syntax/upon.html"); +} + +fn render<'a, E: Engine<'a>>(source: &'a str, syntax: bool, filters: bool) -> String { let ctx = Context { title: "My awesome webpage!".to_owned(), users: vec![ @@ -79,8 +103,14 @@ fn render<'a, E: Engine<'a>>(source: &'a str) -> String { ], }; - let mut engine = E::new(); - engine.add_filters(); + let mut engine = if syntax { + E::with_syntax(("{", "}"), ("<%", "%>"), ("<#", 
"#>")) + } else { + E::new() + }; + if filters { + engine.add_filters(); + } engine.add_template("bench", source); engine.render("bench", &ctx) } diff --git a/src/compile/lex.rs b/src/compile/lex.rs index c5ae961..18bd26e 100644 --- a/src/compile/lex.rs +++ b/src/compile/lex.rs @@ -1,6 +1,6 @@ use crate::compile::parse::Keyword; +use crate::types::delimiter::Delimiter; use crate::types::span::Span; -use crate::types::syntax; use crate::{Engine, Error, Result}; /// A lexer that tokenizes the template source into distinct chunks so that the @@ -196,8 +196,8 @@ impl<'engine, 'source> Lexer<'engine, 'source> { }; match self.engine.searcher.find_at(self.source, i) { - Some((kind, j, k)) => { - let (tk, trim) = Token::from_kind(kind); + Some((delimiter, j, k)) => { + let (tk, trim) = Token::from_delimiter(delimiter); if !tk.is_begin_tag() { return Err(self.err_unexpected_token(tk, j..k)); @@ -247,8 +247,8 @@ impl<'engine, 'source> Lexer<'engine, 'source> { // for the corresponding end tag `end`. 
let (tk, j) = match self.engine.searcher.starts_with(self.source, i) { - Some((kind, j)) => { - let (tk, trim) = Token::from_kind(kind); + Some((delimiter, j)) => { + let (tk, trim) = Token::from_delimiter(delimiter); if tk.is_begin_tag() { return Err(self.err_unclosed(begin, end)); @@ -364,8 +364,8 @@ impl<'engine, 'source> Lexer<'engine, 'source> { // i j k match self.engine.searcher.find_at(self.source, i) { - Some((kind, j, k)) => { - let (tk, trim) = Token::from_kind(kind); + Some((delimiter, j, k)) => { + let (tk, trim) = Token::from_delimiter(delimiter); if tk.is_begin_tag() { return Err(self.err_unclosed(begin, end)); @@ -566,20 +566,20 @@ impl Token { matches!(self, Self::Whitespace) } - fn from_kind(tk: syntax::Kind) -> (Self, bool) { - match tk { - syntax::Kind::BeginExpr => (Self::BeginExpr, false), - syntax::Kind::EndExpr => (Self::EndExpr, false), - syntax::Kind::BeginExprTrim => (Self::BeginExpr, true), - syntax::Kind::EndExprTrim => (Self::EndExpr, true), - syntax::Kind::BeginBlock => (Self::BeginBlock, false), - syntax::Kind::EndBlock => (Self::EndBlock, false), - syntax::Kind::BeginBlockTrim => (Self::BeginBlock, true), - syntax::Kind::EndBlockTrim => (Self::EndBlock, true), - syntax::Kind::BeginComment => (Self::BeginComment, false), - syntax::Kind::EndComment => (Self::EndComment, false), - syntax::Kind::BeginCommentTrim => (Self::BeginComment, true), - syntax::Kind::EndCommentTrim => (Self::EndComment, true), + fn from_delimiter(d: Delimiter) -> (Self, bool) { + match d { + Delimiter::BeginExpr => (Self::BeginExpr, false), + Delimiter::EndExpr => (Self::EndExpr, false), + Delimiter::BeginExprTrim => (Self::BeginExpr, true), + Delimiter::EndExprTrim => (Self::EndExpr, true), + Delimiter::BeginBlock => (Self::BeginBlock, false), + Delimiter::EndBlock => (Self::EndBlock, false), + Delimiter::BeginBlockTrim => (Self::BeginBlock, true), + Delimiter::EndBlockTrim => (Self::EndBlock, true), + Delimiter::BeginComment => (Self::BeginComment, false), + 
Delimiter::EndComment => (Self::EndComment, false), + Delimiter::BeginCommentTrim => (Self::BeginComment, true), + Delimiter::EndCommentTrim => (Self::EndComment, true), } } } @@ -641,6 +641,15 @@ mod tests { ); } + #[test] + fn lex_begin_expr_trickery() { + let tokens = lex("lorem { ipsum {{").unwrap(); + assert_eq!( + tokens, + [(Token::Raw, "lorem { ipsum "), (Token::BeginExpr, "{{"),] + ); + } + #[test] fn lex_begin_expr_trim() { let tokens = lex("lorem ipsum \t\n{{-").unwrap(); @@ -836,6 +845,23 @@ mod tests { ); } + #[test] + fn lex_block_trim() { + let tokens = lex("lorem ipsum {%- dolor -%} sit").unwrap(); + assert_eq!( + tokens, + [ + (Token::Raw, "lorem ipsum"), + (Token::BeginBlock, "{%-"), + (Token::Whitespace, " "), + (Token::Ident, "dolor"), + (Token::Whitespace, " "), + (Token::EndBlock, "-%}"), + (Token::Raw, "sit"), + ] + ); + } + #[test] fn lex_block_and_expr() { let tokens = @@ -899,6 +925,21 @@ mod tests { ); } + #[test] + fn lex_end_comment() { + let tokens = lex("lorem ipsum {# dolor #} sit amet").unwrap(); + assert_eq!( + tokens, + [ + (Token::Raw, "lorem ipsum "), + (Token::BeginComment, "{#"), + (Token::Raw, " dolor "), + (Token::EndComment, "#}"), + (Token::Raw, " sit amet"), + ] + ); + } + #[test] fn lex_end_comment_trim() { let tokens = lex("lorem ipsum {# -#} \t\ndolor sit amet").unwrap(); diff --git a/src/compile/search/aho_corasick.rs b/src/compile/search/aho_corasick.rs new file mode 100644 index 0000000..e52ef30 --- /dev/null +++ b/src/compile/search/aho_corasick.rs @@ -0,0 +1,41 @@ +use crate::types::delimiter::Delimiter; +use crate::types::syntax::Syntax; +use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind}; + +#[cfg_attr(internal_debug, derive(Debug))] +pub struct AhoCorasickSearcher { + imp: AhoCorasick, + kinds: Vec, +} + +impl AhoCorasickSearcher { + pub fn new(syntax: Syntax) -> Self { + let imp = AhoCorasickBuilder::new() + .match_kind(MatchKind::LeftmostLongest) + .build(syntax.patterns) + .expect("failed to 
build AhoCorasick"); + Self { + imp, + kinds: syntax.marks, + } + } + + #[inline] + pub fn find_at(&self, source: &str, at: usize) -> Option<(Delimiter, usize, usize)> { + let sb = source.as_bytes(); + self.imp.find(&sb[at..]).map(|m| { + let kind = self.kinds[m.pattern()]; + (kind, at + m.start(), at + m.end()) + }) + } + + #[inline] + pub fn starts_with(&self, source: &str, at: usize) -> Option<(Delimiter, usize)> { + let (kind, i, j) = self.find_at(source, at)?; + if at == i { + Some((kind, j)) + } else { + None + } + } +} diff --git a/src/compile/search/ahocorasick/build.rs b/src/compile/search/ahocorasick/build.rs deleted file mode 100644 index 2afffcc..0000000 --- a/src/compile/search/ahocorasick/build.rs +++ /dev/null @@ -1,301 +0,0 @@ -//! A builder for an Aho-Corasick automaton. -//! -//! From the given set of patterns we build a state machine with a series of -//! states that encode a transition for every possible byte. This state machine -//! can then used to simultaneously search a string for the patterns. -//! -//! Consider building an Aho-Corasick automaton with the following patterns: -//! 'ab' and 'cd', the trie would look the following. Where the states are -//! represented as `S?` and have an asterisk (`*`) if there any matches at that -//! state. -//! -//! ```text -//! a - S1 - b - S2* -//! / -//! S0 - c - S3 - d - S4* -//! ``` -//! -//! In the above state machine there are no bytes that are the same between the -//! patterns. Now consider the following patterns: 'abe' and 'bcd'. In the case -//! of an input text of 'abcd', when at S2 we would end up failing to transition -//! to S3. But we can encode the failure in the automaton as a transition from -//! S2 to S4 and continue the search. What is not shown in these diagrams is that -//! *all* states have a failure transition, but only S2 has a *non-trivial* -//! failure transition. That is, all other states have a failure transition back -//! to the start state. -//! -//! ```text -//! 
a - S1 - b - S2 - e - S3* -//! / / -//! / ------- -//! / / -//! S0 - b - S4 - c - S5 - d - S6* -//! ``` -//! -//! Encoding the failure transitions is the most complex part of building the -//! automaton. Traditionally, this is implemented using a breadth-first search -//! starting with all transitions from the start state. For each state and for -//! every input transition at that state we follow the failure transitions -//! backward until we find a failure state that has a forward transition for -//! that input. That state must be the fail state for the original state. -//! -//! In order to support leftmost-longest match first semantics we also need -//! to make a few modifications to the way the failure transitions are built. - -use std::collections::VecDeque; - -use super::{AhoCorasick, Pattern, State, DEAD, FAIL, S, START}; - -#[derive(Default)] -pub struct Builder { - states: Vec, -} - -impl Builder { - pub fn build(mut self, patterns: I) -> AhoCorasick - where - I: IntoIterator, - X: Into, - P: AsRef<[u8]>, - { - self.push_state(0); // the fail state - self.push_state(0); // the dead state - self.push_state(0); // the start state - self.build_initial_trie(patterns); - - // Set the failure transitions in the start state to loop back to the - // start state. - let start = self.start_mut(); - for byte in all() { - if start.next_state(byte) == FAIL { - start.set_transition(byte, START); - } - } - - // Set the failure transitions in the dead state to loop back to the - // dead state. - let dead = self.state_mut(DEAD); - for byte in all() { - if dead.next_state(byte) == FAIL { - dead.set_transition(byte, DEAD); - } - } - - self.fill_failure_transitions(); - - // Remove the start state loop by rewriting any transitions on the start - // state back to the start state with transitions to the dead state. 
- if self.start().is_match() { - let start = self.start_mut(); - for byte in all() { - if start.next_state(byte) == START { - start.set_transition(byte, DEAD); - } - } - } - - let Self { states } = self; - AhoCorasick { states } - } - - /// Build the initial trie where each pattern has a path from the start - /// state until the end of the pattern. - fn build_initial_trie(&mut self, patterns: I) - where - I: IntoIterator, - X: Into, - P: AsRef<[u8]>, - { - for (pattern_id, pattern) in patterns.into_iter() { - let pattern = pattern.as_ref(); - - let mut id = START; - for (depth, &byte) in pattern.iter().enumerate() { - let next = self.state(id).next_state(byte); - if next == FAIL { - let next = self.push_state(depth + 1); - self.state_mut(id).set_transition(byte, next); - id = next; - } else { - id = next; - } - } - - let p = Pattern::new(pattern_id.into(), pattern.len()); - self.state_mut(id).push_match(p); - } - } - - fn fill_failure_transitions(&mut self) { - // Initialize the queue for breadth first search with all transitions - // out of the start state. We handle the start state specially because - // we only want to follow non-self transitions. If we followed self - // transitions, then this would never terminate. - let mut queue = VecDeque::new(); - for byte in all() { - let next = self.start().next_state(byte); - if next != START { - let match_depth = if self.start().is_match() { - Some(0) - } else { - None - }; - queue.push_back((next, match_depth)); - - // If a state immediately following the start state is a match - // state, then we never want to follow its failure transition - // since the failure transition necessarily leads back to the - // start state, which we never want to do for leftmost matching - // after a match has been found. - // - // N.B. This is a special case of the more general handling - // found below. 
- if self.state(next).is_match() { - self.state_mut(next).fail = DEAD; - } - } - } - - while let Some((curr, match_depth)) = queue.pop_front() { - let prev_len = queue.len(); - - for byte in all() { - let next = self.state(curr).next_state(byte); - if next == FAIL { - continue; - } - - let next_match_depth = match match_depth { - Some(d) => Some(d), - None if self.state(next).is_match() => { - let depth = self.state(next).depth - - self.state(next).get_longest_match_len().unwrap() - + 1; - Some(depth) - } - None => None, - }; - - queue.push_back((next, next_match_depth)); - - let fail = { - let mut id = self.state(curr).fail; - while self.state(id).next_state(byte) == FAIL { - id = self.state(id).fail; - } - self.state(id).next_state(byte) - }; - - // Thanks Andrew Gallant - if let Some(match_depth) = next_match_depth { - let fail_depth = self.state(fail).depth; - let next_depth = self.state(next).depth; - if next_depth - match_depth + 1 > fail_depth { - self.state_mut(next).fail = DEAD; - continue; - } - assert_ne!( - self.state(next).fail, - START, - "states that are match states or follow match \ - states should never have a failure transition \ - back to the start state in leftmost searching", - ); - } - - self.state_mut(next).fail = fail; - self.copy_matches(fail, next); - } - - // If there are no transitions for this state and if it's a match - // state, then we must set its failure transition to the dead - // state since we never want it to restart the search. - if queue.len() == prev_len && self.state(curr).is_match() { - self.state_mut(curr).fail = DEAD; - } - - // We don't need to copy empty matches from the start state here - // because that's only necessary for overlapping matches and - // leftmost match kinds don't support overlapping matches. - } - } - - fn copy_matches(&mut self, src: S, dst: S) { - assert!(src != dst, "src {src} must not be equal to dst {dst}"); - - // Simply gets a mutable reference to both states. 
- let i = src; - let j = dst; - let (src, dst) = if i < j { - let (left, right) = self.states.split_at_mut(j); - (&mut left[i], &mut right[0]) - } else { - let (left, right) = self.states.split_at_mut(i); - (&mut right[0], &mut left[j]) - }; - - dst.matches.extend_from_slice(&src.matches); - } - - fn push_state(&mut self, depth: usize) -> S { - let id = self.states.len(); - self.states.push(State { - depth, - fail: START, - trans: [FAIL; 256], - matches: vec![], - }); - id - // match id.try_into() { - // Ok(id) => id, - // Err(_) => { - // panic!( - // "state id type `{}` too small for the \ - // number of states in the automaton", - // std::any::type_name::() - // ); - // } - // } - } - - fn state(&self, id: S) -> &State { - &self.states[id] - } - - fn state_mut(&mut self, id: S) -> &mut State { - &mut self.states[id] - } - - fn start(&self) -> &State { - self.state(START) - } - - fn start_mut(&mut self) -> &mut State { - self.state_mut(START) - } -} - -impl State { - fn push_match(&mut self, p: Pattern) { - self.matches.push(p); - } - - fn set_transition(&mut self, byte: u8, to: S) { - self.trans[byte as usize] = to; - } - - fn get_longest_match_len(&self) -> Option { - // Why is this true? Because the first match in any matching state - // will always correspond to the match added to it during trie - // construction (since when we copy matches due to failure transitions, - // we always append them). Therefore, it follows that the first match - // must always be longest since any subsequent match must be from a - // failure transition, and a failure transition by construction points - // to a proper suffix. A proper suffix is, by definition, smaller. - self.matches.first().map(|&p| p.len) - } -} - -fn all() -> impl Iterator { - 0..=255 -} diff --git a/src/compile/search/ahocorasick/mod.rs b/src/compile/search/ahocorasick/mod.rs deleted file mode 100644 index 6b9772c..0000000 --- a/src/compile/search/ahocorasick/mod.rs +++ /dev/null @@ -1,291 +0,0 @@ -//! 
A fast, multi-pattern searcher based on [Aho-Corasick algorithm][wikipedia]. -//! -//! The design presented here mostly implements the standard algorithm as well -//! as some unique ideas from the excellent [`aho-corasick`][aho-corasick] -//! crate. This implementation only supports non-overlapping, leftmost-longest -//! match first semantics. -//! -//! [aho-corasick]: https://crates.io/crates/aho-corasick -//! [wikipedia]: https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm - -mod build; -mod state; - -use self::build::Builder; -use self::state::{State, DEAD, FAIL, S, START}; - -#[cfg_attr(internal_debug, derive(Debug))] -pub struct AhoCorasick { - states: Vec, -} - -#[cfg_attr(internal_debug, derive(Debug))] -pub struct Match { - pattern: Pattern, - end: usize, -} - -#[derive(Clone, Copy)] -#[cfg_attr(internal_debug, derive(Debug))] -pub struct Pattern { - id: usize, - len: usize, -} - -impl AhoCorasick { - pub fn new(patterns: I) -> Self - where - I: IntoIterator, - X: Into, - P: AsRef<[u8]>, - { - Builder::default().build(patterns) - } - - pub fn find_at(&self, haystack: T, mut at: usize) -> Option - where - T: AsRef<[u8]>, - { - let haystack = haystack.as_ref(); - - let mut state = START; - let mut last_match = self.get_match(state, 0, at); - while at < haystack.len() { - state = self.next_state(state, haystack[at]); - debug_assert!( - state != FAIL, - "an automaton should never return fail state for next state" - ); - at += 1; - - if state == DEAD { - debug_assert!( - last_match.is_some(), - "an automaton should never return a dead state without a prior match" - ); - return last_match; - } - - if let Some(m) = self.get_match(state, 0, at) { - last_match = Some(m); - } - } - last_match - } - - fn get_match(&self, id: S, match_id: usize, end: usize) -> Option { - self.state(id) - .matches - .get(match_id) - .map(|&pattern| Match { pattern, end }) - } - - fn next_state(&self, mut id: S, byte: u8) -> S { - loop { - let state = self.state(id); - let next 
= state.next_state(byte); - if next != FAIL { - return next; - } - id = state.fail; - } - } - - fn state(&self, id: S) -> &State { - &self.states[id] - } -} - -impl Match { - pub fn pattern_id(&self) -> usize { - self.pattern.id - } - - /// The starting position of the match. - pub fn start(&self) -> usize { - self.end - self.pattern.len - } - - /// The ending position of the match. - pub fn end(&self) -> usize { - self.end - } -} - -impl Pattern { - fn new(id: usize, len: usize) -> Self { - Self { id, len } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn aho_corasick_basics() { - t(&[], "", &[]); - t(&["a"], "", &[]); - t(&["a"], "a", &[(0, 0, 1)]); - t(&["a"], "aa", &[(0, 0, 1), (0, 1, 2)]); - t(&["a"], "aaa", &[(0, 0, 1), (0, 1, 2), (0, 2, 3)]); - t(&["a"], "aba", &[(0, 0, 1), (0, 2, 3)]); - t(&["a"], "bba", &[(0, 2, 3)]); - t(&["a"], "bbb", &[]); - t(&["a"], "bababbbba", &[(0, 1, 2), (0, 3, 4), (0, 8, 9)]); - t(&["aa"], "", &[]); - t(&["aa"], "aa", &[(0, 0, 2)]); - t(&["aa"], "aabbaa", &[(0, 0, 2), (0, 4, 6)]); - t(&["aa"], "abbab", &[]); - t(&["aa"], "abbabaa", &[(0, 5, 7)]); - t(&["abc"], "abc", &[(0, 0, 3)]); - t(&["abc"], "zazabzabcz", &[(0, 6, 9)]); - t(&["abc"], "zazabczabcz", &[(0, 3, 6), (0, 7, 10)]); - t(&["a", "b"], "", &[]); - t(&["a", "b"], "z", &[]); - t(&["a", "b"], "b", &[(1, 0, 1)]); - t(&["a", "b"], "a", &[(0, 0, 1)]); - t( - &["a", "b"], - "abba", - &[(0, 0, 1), (1, 1, 2), (1, 2, 3), (0, 3, 4)], - ); - t( - &["b", "a"], - "abba", - &[(1, 0, 1), (0, 1, 2), (0, 2, 3), (1, 3, 4)], - ); - t(&["abc", "bc"], "xbc", &[(1, 1, 3)]); - t(&["foo", "bar"], "", &[]); - t(&["foo", "bar"], "foobar", &[(0, 0, 3), (1, 3, 6)]); - t(&["foo", "bar"], "barfoo", &[(1, 0, 3), (0, 3, 6)]); - t(&["foo", "bar"], "foofoo", &[(0, 0, 3), (0, 3, 6)]); - t(&["foo", "bar"], "barbar", &[(1, 0, 3), (1, 3, 6)]); - t(&["foo", "bar"], "bafofoo", &[(0, 4, 7)]); - t(&["bar", "foo"], "bafofoo", &[(1, 4, 7)]); - t(&["foo", "bar"], "fobabar", &[(1, 4, 7)]); - 
t(&["bar", "foo"], "fobabar", &[(0, 4, 7)]); - t(&[""], "", &[(0, 0, 0)]); - t(&[""], "a", &[(0, 0, 0), (0, 1, 1)]); - t(&[""], "abc", &[(0, 0, 0), (0, 1, 1), (0, 2, 2), (0, 3, 3)]); - t(&["yabcdef", "abcdezghi"], "yabcdefghi", &[(0, 0, 7)]); - t(&["yabcdef", "abcdezghi"], "yabcdezghi", &[(1, 1, 10)]); - t( - &["yabcdef", "bcdeyabc", "abcdezghi"], - "yabcdezghi", - &[(2, 1, 10)], - ); - } - - #[test] - fn aho_corasick_non_overlapping() { - t(&["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4)]); - t(&["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4)]); - t(&["abc", "bc"], "zazabcz", &[(0, 3, 6)]); - t(&["ab", "ba"], "abababa", &[(0, 0, 2), (0, 2, 4), (0, 4, 6)]); - t(&["foo", "foo"], "foobarfoo", &[(0, 0, 3), (0, 6, 9)]); - t(&["", ""], "", &[(0, 0, 0)]); - t(&["", ""], "a", &[(0, 0, 0), (0, 1, 1)]); - } - - #[test] - fn aho_corasick_leftmost() { - t(&["ab", "ab"], "abcd", &[(0, 0, 2)]); - t(&["a", ""], "a", &[(0, 0, 1), (1, 1, 1)]); - t(&["", ""], "a", &[(0, 0, 0), (0, 1, 1)]); - t(&["a", "ab"], "aa", &[(0, 0, 1), (0, 1, 2)]); - t(&["ab", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]); - t(&["ab", "a"], "xayabbbz", &[(1, 1, 2), (0, 3, 5)]); - t(&["abcd", "bce", "b"], "abce", &[(1, 1, 4)]); - t(&["abcd", "ce", "bc"], "abce", &[(2, 1, 3)]); - t(&["abcd", "bce", "ce", "b"], "abce", &[(1, 1, 4)]); - t(&["abcd", "bce", "cz", "bc"], "abcz", &[(3, 1, 3)]); - t(&["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]); - t(&["abc", "bd", "ab"], "abd", &[(2, 0, 2)]); - t(&["abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(2, 0, 8)]); - t( - &["abcdefghi", "cde", "hz", "abcdefgh"], - "abcdefghz", - &[(3, 0, 8)], - ); - t( - &["abcdefghi", "hz", "abcdefgh", "a"], - "abcdefghz", - &[(2, 0, 8)], - ); - t( - &["b", "abcdefghi", "hz", "abcdefgh"], - "abcdefghz", - &[(3, 0, 8)], - ); - t( - &["h", "abcdefghi", "hz", "abcdefgh"], - "abcdefghz", - &[(3, 0, 8)], - ); - t( - &["z", "abcdefghi", "hz", "abcdefgh"], - "abcdefghz", - &[(3, 0, 8), (0, 8, 9)], - ); - } - - #[test] - fn aho_corasick_leftmost_longest() { - t(&["ab", 
"abcd"], "abcd", &[(1, 0, 4)]); - t(&["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4)]); - t(&["", "a"], "a", &[(1, 0, 1), (0, 1, 1)]); - t(&["", "a", ""], "a", &[(1, 0, 1), (0, 1, 1)]); - t(&["a", "", ""], "a", &[(0, 0, 1), (1, 1, 1)]); - t(&["", "", "a"], "a", &[(2, 0, 1), (0, 1, 1)]); - t(&["", "a"], "aa", &[(1, 0, 1), (1, 1, 2), (0, 2, 2)]); - t(&["a", "ab"], "a", &[(0, 0, 1)]); - t(&["a", "ab"], "ab", &[(1, 0, 2)]); - t(&["ab", "a"], "a", &[(1, 0, 1)]); - t(&["ab", "a"], "ab", &[(0, 0, 2)]); - t(&["abcdefg", "bcde", "bcdef"], "abcdef", &[(2, 1, 6)]); - t(&["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]); - t(&["abcd", "b", "bce"], "abce", &[(2, 1, 4)]); - t( - &["a", "abcdefghi", "hz", "abcdefgh"], - "abcdefghz", - &[(3, 0, 8)], - ); - t(&["a", "abab"], "abab", &[(1, 0, 4)]); - t(&["abcd", "b", "ce"], "abce", &[(1, 1, 2), (2, 2, 4)]); - t(&["a", "ab"], "xayabbbz", &[(0, 1, 2), (1, 3, 5)]); - } - - #[track_caller] - fn t(patterns: &[&str], haystack: &str, exp: &[(usize, usize, usize)]) { - let ac = AhoCorasick::new(patterns.iter().enumerate()); - let matches: Vec<_> = ac - .find_iter(haystack.as_ref()) - .map(|m| (m.pattern_id(), m.start(), m.end())) - .take(10) - .collect(); - assert_eq!(matches, exp); - } - - impl AhoCorasick { - pub fn find_iter<'a>(&'a self, haystack: &'a [u8]) -> impl Iterator + 'a { - let mut pos = 0; - std::iter::from_fn(move || { - if pos > haystack.len() { - return None; - } - let mat = self.find_at(haystack, pos)?; - if mat.end() == pos { - // If the automaton can match the empty string and if we - // found an empty match, then we need to forcefully move the - // position. - pos += 1; - } else { - pos = mat.end(); - } - - Some(mat) - }) - } - } -} diff --git a/src/compile/search/ahocorasick/state.rs b/src/compile/search/ahocorasick/state.rs deleted file mode 100644 index d2f0806..0000000 --- a/src/compile/search/ahocorasick/state.rs +++ /dev/null @@ -1,41 +0,0 @@ -use super::Pattern; - -/// A unique identifier for a state. 
-pub type S = usize; - -/// The identifier for an automaton's fail state. -pub const FAIL: S = 0; - -/// The identifier for an automaton's dead state. -pub const DEAD: S = 1; - -/// The identifier for an automaton's start state. -pub const START: S = 2; - -/// A state in an Aho-Corasick automaton. -#[cfg_attr(internal_debug, derive(Debug))] -pub struct State { - /// The transitions to the next state. - pub trans: [S; 256], - - /// The failure transition. - pub fail: S, - - /// The patterns that are matched at this state. - pub matches: Vec<Pattern>, - - /// The distance from the start state in the automaton. - pub depth: usize, -} - -impl State { - /// Returns the next state for the given input byte. - pub fn next_state(&self, byte: u8) -> S { - self.trans[byte as usize] - } - - /// Whether or not this state contains any matches. - pub fn is_match(&self) -> bool { - !self.matches.is_empty() - } -} diff --git a/src/compile/search/mod.rs b/src/compile/search/mod.rs index 9cb308b..ed8620d 100644 --- a/src/compile/search/mod.rs +++ b/src/compile/search/mod.rs @@ -1,38 +1,112 @@ -mod ahocorasick; +#[cfg(feature = "syntax")] +mod aho_corasick; -use crate::compile::search::ahocorasick::AhoCorasick; -use crate::types::syntax::{Kind, Syntax}; +#[cfg(feature = "syntax")] +use crate::compile::search::aho_corasick::AhoCorasickSearcher; +use crate::types::delimiter::Delimiter; +#[cfg(feature = "syntax")] +use crate::types::syntax::Syntax; #[cfg_attr(internal_debug, derive(Debug))] -pub struct Searcher { - imp: AhoCorasick, +pub enum Searcher { + Default(DefaultSearcher), + #[cfg(feature = "syntax")] + AhoCorasick(AhoCorasickSearcher), } impl Searcher { - pub fn new(syntax: Syntax) -> Self { - let imp = AhoCorasick::new(syntax.patterns); - Self { imp } + pub fn new() -> Self { + Self::Default(DefaultSearcher) } - pub fn find_at<T>(&self, haystack: T, at: usize) -> Option<(Kind, usize, usize)> - where - T: AsRef<[u8]>, - { - self.imp.find_at(haystack, at).map(|m| { - let kind =
Kind::from_usize(m.pattern_id()); - (kind, m.start(), m.end()) - }) + #[cfg(feature = "syntax")] + pub fn with_syntax(syntax: Syntax) -> Self { + Self::AhoCorasick(AhoCorasickSearcher::new(syntax)) } - pub fn starts_with<T>(&self, haystack: T, at: usize) -> Option<(Kind, usize)> - where - T: AsRef<[u8]>, - { - let (kind, i, j) = self.find_at(haystack, at)?; - if at == i { - Some((kind, j)) - } else { - None + #[inline] + pub fn find_at(&self, source: &str, at: usize) -> Option<(Delimiter, usize, usize)> { + match self { + Self::Default(searcher) => searcher.find_at(source, at), + #[cfg(feature = "syntax")] + Self::AhoCorasick(searcher) => searcher.find_at(source, at), + } + } + + #[inline] + pub fn starts_with(&self, source: &str, i: usize) -> Option<(Delimiter, usize)> { + match self { + Self::Default(searcher) => searcher.starts_with(source, i), + #[cfg(feature = "syntax")] + Self::AhoCorasick(searcher) => searcher.starts_with(source, i), + } + } +} + +#[cfg_attr(internal_debug, derive(Debug))] +pub struct DefaultSearcher; + +impl DefaultSearcher { + #[inline] + fn find_at(&self, source: &str, mut at: usize) -> Option<(Delimiter, usize, usize)> { + let sb = source.as_bytes(); + loop { + let mark = at + sb[at..].iter().position(|&b| b == b'{' || b == b'}')?; + if sb[mark] == b'{' { + let i = mark; + match &sb[i..] { + // expr + [b'{', b'{', b'-', ..] => return Some((Delimiter::BeginExprTrim, i, i + 3)), + [b'{', b'{', ..] => return Some((Delimiter::BeginExpr, i, i + 2)), + // block + [b'{', b'%', b'-', ..] => return Some((Delimiter::BeginBlockTrim, i, i + 3)), + [b'{', b'%', ..] => return Some((Delimiter::BeginBlock, i, i + 2)), + // comment + [b'{', b'#', b'-', ..] => return Some((Delimiter::BeginCommentTrim, i, i + 3)), + [b'{', b'#', ..] => return Some((Delimiter::BeginComment, i, i + 2)), + _ => at = i + 1, + } + } else { + let j = mark + 1; + let i = j.saturating_sub(3); + match &sb[i..] { + // expr + [b'-', b'}', b'}', ..]
=> return Some((Delimiter::EndExprTrim, i, i + 3)), + [_, b'}', b'}', ..] => return Some((Delimiter::EndExpr, i + 1, i + 3)), + [b'}', b'}', ..] => return Some((Delimiter::EndExpr, i, i + 2)), + // block + [b'-', b'%', b'}', ..] => return Some((Delimiter::EndBlockTrim, i, i + 3)), + [_, b'%', b'}', ..] => return Some((Delimiter::EndBlock, i + 1, i + 3)), + [b'%', b'}', ..] => return Some((Delimiter::EndBlock, i, i + 2)), + // comment + [b'-', b'#', b'}', ..] => return Some((Delimiter::EndCommentTrim, i, i + 3)), + [_, b'#', b'}', ..] => return Some((Delimiter::EndComment, i + 1, i + 3)), + [b'#', b'}', ..] => return Some((Delimiter::EndComment, i, i + 2)), + _ => at = j, + } + } + } + } + + #[inline] + fn starts_with(&self, source: &str, i: usize) -> Option<(Delimiter, usize)> { + let sb = source.as_bytes(); + match &sb[i..] { + // begin + [b'{', b'{', b'-', ..] => Some((Delimiter::BeginExprTrim, i + 3)), + [b'{', b'{', ..] => Some((Delimiter::BeginExpr, i + 2)), + [b'{', b'%', b'-', ..] => Some((Delimiter::BeginBlockTrim, i + 3)), + [b'{', b'%', ..] => Some((Delimiter::BeginBlock, i + 2)), + [b'{', b'#', b'-', ..] => Some((Delimiter::BeginCommentTrim, i + 3)), + [b'{', b'#', ..] => Some((Delimiter::BeginComment, i + 2)), + // end + [b'-', b'}', b'}', ..] => Some((Delimiter::EndExprTrim, i + 3)), + [b'}', b'}', ..] => Some((Delimiter::EndExpr, i + 2)), + [b'-', b'%', b'}', ..] => Some((Delimiter::EndBlockTrim, i + 3)), + [b'%', b'}', ..] => Some((Delimiter::EndBlock, i + 2)), + [b'-', b'#', b'}', ..] => Some((Delimiter::EndCommentTrim, i + 3)), + [b'#', b'}', ..] => Some((Delimiter::EndComment, i + 2)), + _ => None, + } } } diff --git a/src/lib.rs b/src/lib.rs index 11410f4..dc5aa5d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -117,6 +117,10 @@ //! [`render_from(..)`][TemplateRef::render_from] to render templates and //! construct the context using [`Value`]'s `From` impls. //! +//! - **`syntax`** _(disabled by default)_ — Enables support for configuring +//!
custom delimiters in templates (see [`Engine::with_syntax`]) and pulls in +//! the [`aho-corasick`][aho_corasick] crate. +//! //! - **`unicode`** _(enabled by default)_ — Enables unicode support and pulls //! in the [`unicode-ident`][unicode_ident] and //! [`unicode-width`][unicode_width] crates. If disabled then unicode @@ -239,6 +243,8 @@ use std::collections::BTreeMap; pub use crate::error::Error; pub use crate::render::Renderer; +#[cfg(feature = "syntax")] +#[cfg_attr(docsrs, doc(cfg(feature = "syntax")))] pub use crate::types::syntax::{Syntax, SyntaxBuilder}; #[cfg(feature = "serde")] #[cfg_attr(docsrs, doc(cfg(feature = "serde")))] @@ -344,7 +350,7 @@ impl<'engine> Engine<'engine> { /// Construct a new engine. #[inline] pub fn new() -> Self { - Self::with_syntax(Syntax::default()) + Self::with_searcher(Searcher::new()) } /// Construct a new engine with custom syntax. @@ -357,10 +363,24 @@ impl<'engine> Engine<'engine> { /// let syntax = Syntax::builder().expr("<{", "}>").block("<[", "]>").build(); /// let engine = Engine::with_syntax(syntax); /// ``` + /// + /// # Note + /// + /// Passing a custom syntax to this function always uses the `aho-corasick` + /// implementation for searching. This means that even if you pass the + /// default syntax to this function it is *not* equivalent to + /// [`Engine::new()`][Engine::new]. + #[cfg_attr(docsrs, doc(cfg(feature = "syntax")))] + #[cfg(feature = "syntax")] #[inline] pub fn with_syntax(syntax: Syntax<'engine>) -> Self { + Self::with_searcher(Searcher::with_syntax(syntax)) + } + + #[inline] + fn with_searcher(searcher: Searcher) -> Self { Self { - searcher: Searcher::new(syntax), + searcher, default_formatter: &fmt::default, functions: BTreeMap::new(), templates: BTreeMap::new(), diff --git a/src/syntax.rs b/src/syntax.rs index c74e1e2..0d321f7 100644 --- a/src/syntax.rs +++ b/src/syntax.rs @@ -5,8 +5,8 @@ //! contains [**expressions**](#expressions) for rendering values and //! 
[**blocks**](#blocks) for controlling logic. These require you to use //! specific syntax delimiters in the template. Because `upon` allows you to -//! configure these delimiters, this document will only refer to the -//! [**default**][crate::Syntax::default] configuration. +//! configure these delimiters, this document will only refer to the **default** +//! configuration. //! //! # Expressions //! diff --git a/src/types/delimiter.rs b/src/types/delimiter.rs new file mode 100644 index 0000000..10aad74 --- /dev/null +++ b/src/types/delimiter.rs @@ -0,0 +1,15 @@ +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Delimiter { + BeginExpr, + EndExpr, + BeginExprTrim, + EndExprTrim, + BeginBlock, + EndBlock, + BeginBlockTrim, + EndBlockTrim, + BeginComment, + EndComment, + BeginCommentTrim, + EndCommentTrim, +} diff --git a/src/types/mod.rs b/src/types/mod.rs index 98ca6de..3ab9a8c 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -1,4 +1,6 @@ pub mod ast; +pub mod delimiter; pub mod program; pub mod span; +#[cfg(feature = "syntax")] pub mod syntax; diff --git a/src/types/syntax.rs b/src/types/syntax.rs index 2f4c141..2254c82 100644 --- a/src/types/syntax.rs +++ b/src/types/syntax.rs @@ -1,18 +1,21 @@ use std::marker::PhantomData; +use crate::types::delimiter::Delimiter; + /// The template syntax configuration. /// /// Use [`Syntax::default()`] to get the default syntax configuration and /// [`Syntax::builder()`] to create a custom syntax configuration. #[derive(Debug, Clone, PartialEq, Eq)] pub struct Syntax<'a> { - pub(crate) patterns: Vec<(Kind, String)>, + pub(crate) marks: Vec<Delimiter>, + pub(crate) patterns: Vec<String>, _marker: PhantomData<&'a ()>, } /// A builder for the syntax configuration. /// -/// This struct is typically created using [`Syntax::builder()`]. +/// This struct is created using [`Syntax::builder()`].
#[derive(Debug, Clone)] pub struct SyntaxBuilder<'a> { expr: Option<(&'a str, &'a str)>, @@ -20,30 +23,6 @@ pub struct SyntaxBuilder<'a> { comment: Option<(&'a str, &'a str)>, } -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum Kind { - BeginExpr = 0, - EndExpr = 1, - BeginExprTrim = 2, - EndExprTrim = 3, - BeginBlock = 4, - EndBlock = 5, - BeginBlockTrim = 6, - EndBlockTrim = 7, - BeginComment = 8, - EndComment = 9, - BeginCommentTrim = 10, - EndCommentTrim = 11, -} - -#[test] -fn kind_usize() { - for p in 0..12 { - let k = Kind::from_usize(p); - assert_eq!(k as usize, p); - } -} - impl Default for Syntax<'_> { /// Returns the default syntax configuration. /// @@ -81,22 +60,15 @@ impl<'a> Syntax<'a> { /// ``` #[inline] pub fn builder() -> SyntaxBuilder<'a> { - SyntaxBuilder::new() - } -} - -impl<'a> SyntaxBuilder<'a> { - /// Creates a new syntax builder. - #[inline] - #[allow(clippy::new_without_default)] - pub fn new() -> Self { - Self { + SyntaxBuilder { expr: None, block: None, comment: None, } } +} +impl<'a> SyntaxBuilder<'a> { /// Set the block syntax. /// /// If not set then the expression syntax will not be available. @@ -141,54 +113,34 @@ impl<'a> SyntaxBuilder<'a> { /// Builds the syntax configuration. 
pub fn build(&self) -> Syntax<'a> { + let mut kinds = Vec::new(); let mut patterns = Vec::new(); + let mut push = |kind, pattern| { + kinds.push(kind); + patterns.push(pattern); + }; if let Some((begin, end)) = self.expr { - patterns.push((Kind::BeginExpr, begin.into())); - patterns.push((Kind::EndExpr, end.into())); - patterns.push((Kind::BeginExprTrim, format!("{begin}-"))); - patterns.push((Kind::EndExprTrim, format!("-{end}"))); + push(Delimiter::BeginExpr, begin.into()); + push(Delimiter::EndExpr, end.into()); + push(Delimiter::BeginExprTrim, format!("{begin}-")); + push(Delimiter::EndExprTrim, format!("-{end}")); }; if let Some((begin, end)) = self.block { - patterns.push((Kind::BeginBlock, begin.into())); - patterns.push((Kind::EndBlock, end.into())); - patterns.push((Kind::BeginBlockTrim, format!("{begin}-"))); - patterns.push((Kind::EndBlockTrim, format!("-{end}"))); + push(Delimiter::BeginBlock, begin.into()); + push(Delimiter::EndBlock, end.into()); + push(Delimiter::BeginBlockTrim, format!("{begin}-")); + push(Delimiter::EndBlockTrim, format!("-{end}")); } if let Some((begin, end)) = self.comment { - patterns.push((Kind::BeginComment, begin.into())); - patterns.push((Kind::EndComment, end.into())); - patterns.push((Kind::BeginCommentTrim, format!("{begin}-"))); - patterns.push((Kind::EndCommentTrim, format!("-{end}"))); + push(Delimiter::BeginComment, begin.into()); + push(Delimiter::EndComment, end.into()); + push(Delimiter::BeginCommentTrim, format!("{begin}-")); + push(Delimiter::EndCommentTrim, format!("-{end}")); } Syntax { + marks: kinds, patterns, _marker: PhantomData, } } } - -impl Kind { - pub fn from_usize(id: usize) -> Self { - match id { - 0 => Self::BeginExpr, - 1 => Self::EndExpr, - 2 => Self::BeginExprTrim, - 3 => Self::EndExprTrim, - 4 => Self::BeginBlock, - 5 => Self::EndBlock, - 6 => Self::BeginBlockTrim, - 7 => Self::EndBlockTrim, - 8 => Self::BeginComment, - 9 => Self::EndComment, - 10 => Self::BeginCommentTrim, - 11 => 
Self::EndCommentTrim, - _ => unreachable!(), - } - } -} - -impl From<Kind> for usize { - fn from(k: Kind) -> Self { - k as usize - } -} diff --git a/tests/lex.rs b/tests/lex.rs index 89fb1d1..dc77674 100644 --- a/tests/lex.rs +++ b/tests/lex.rs @@ -1,4 +1,4 @@ -use upon::{Engine, Error, Syntax}; +use upon::{Engine, Error}; #[test] fn lex_while_eof() { @@ -17,17 +17,25 @@ fn lex_while_eof() { ); } +#[cfg(feature = "syntax")] #[test] fn lex_syntax_overlapping() { - let syntax = Syntax::builder().expr("{", "}").block("{{", "}}").build(); + let syntax = upon::Syntax::builder() + .expr("{", "}") + .block("{{", "}}") + .build(); Engine::with_syntax(syntax) .compile("lorem { ipsum } {{ if dolor }} {{ endif }} sit amet") .unwrap(); } +#[cfg(feature = "syntax")] #[test] fn lex_syntax_overlapping_flipped() { - let syntax = Syntax::builder().expr("{{", "}}").block("{", "}").build(); + let syntax = upon::Syntax::builder() + .expr("{{", "}}") + .block("{", "}") + .build(); Engine::with_syntax(syntax) .compile("lorem {{ ipsum }} { if dolor } { endif } sit amet") .unwrap(); @@ -40,9 +48,13 @@ fn lex_syntax_whitespace_trimming() { .unwrap(); } +#[cfg(feature = "syntax")] #[test] fn lex_syntax_precedence() { - let syntax = Syntax::builder().expr("{|", "|}").block("{", "}").build(); + let syntax = upon::Syntax::builder() + .expr("{|", "|}") + .block("{", "}") + .build(); Engine::with_syntax(syntax) .compile("lorem {| ipsum | dolor |} sit") .unwrap();