Skip to content

Commit

Permalink
fix: avoid collecting iterator during token resolving (#112)
Browse files Browse the repository at this point in the history
* fix: avoid collecting iterator in `TokenResolver`

* fix: add test cases for bold ambiguous start and end

* fix: remove unnecessary checks on symbol flattening

* fix: remove redundant check for interruption of tokens

* fix: use global substitutor to prevent repeated allocs

* fix: remove dead code

* fix: optimize resolving interrupted tokens as plain

* fix: use `fxhash` for faster hashing in substitutor

* fix: re-enable optimized debug assertions in Symbol::flatten

* fix: use alias for TokenMap keys

* chore: remove unused dependency

* fix: don't clone iterator if there's no need to
  • Loading branch information
nfejzic authored Oct 23, 2023
1 parent 6ff4562 commit 5c9db5a
Show file tree
Hide file tree
Showing 13 changed files with 443 additions and 159 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,4 @@ clap = { version = "4.2.7", features = ["derive", "cargo", "env"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
serde_yaml = "0.8.23"
ribbon = "0.7.0"
42 changes: 23 additions & 19 deletions commons/src/scanner/symbol/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,31 +151,35 @@ impl Symbol<'_> {
}

/// Flattens the input of consecutive symbols. Returns the slice of input starting from start
/// position of first symbol until the end of last symbol.
/// position of first symbol until the end of last symbol. Returns [`None`] if slice is empty.
///
/// # Panics
///
/// It's assumed that all [`Symbol`]s in slice reference the same input. If not, the function
/// might panic (guaranteed in debug) if inputs are not the same and last [`Symbol`] in slice
/// references input that is longer than the one referenced in the first [`Symbol`].
///
/// # Examples
///
/// Returns `None` if the referenced input in the given symbols is not the same.
/// ```
/// use unimarkup_commons::scanner::{scan_str, Symbol};
///
/// let input = "This is a string";
/// let symbols: Vec<_> = scan_str(input);
///
/// assert_eq!(input, Symbol::flatten(&symbols).unwrap());
/// ```
pub fn flatten(symbols: &[Self]) -> Option<&str> {
debug_assert!(symbols
.windows(2)
.all(|window| window[0].input == window[1].input));

if symbols.is_empty() {
return Some("");
}
let (first, last) = (symbols.first()?, symbols.last()?);

let first = symbols.first()?;
let last = symbols.last()?;
debug_assert_eq!(first.input, last.input);

if first.input == last.input {
let input = first.input;
let input = first.input;

let start = first.offset.start;
let end = last.offset.end;
let start = first.offset.start;
let end = last.offset.end;

Some(&input[start..end])
} else {
None
}
Some(&input[start..end])
}

/// Flattens the iterator of consecutive symbols. Returns the slice of input starting from start
Expand Down
2 changes: 2 additions & 0 deletions inline/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ harness=false

[dependencies]
logid.workspace = true
ribbon.workspace = true
unimarkup-commons = { path = "../commons/", version = "0" }
fxhash = "0.2.1"

[dev-dependencies]
unimarkup-commons = { path ="../commons/", version = "0", features = ["test_runner"] }
Expand Down
23 changes: 16 additions & 7 deletions inline/src/inlines/substitute.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
use std::collections::{HashMap, HashSet};
//! Substitutor and constants that can be substituted in Unimarkup content.
use fxhash::{FxHashMap, FxHashSet};

use logid::evident::once_cell::sync::Lazy;
use unimarkup_commons::scanner::{self, span::Span};

/// ASCII Emojis that can be replaced with their Unicode versions in a Unimarkup text.
Expand Down Expand Up @@ -75,27 +78,33 @@ pub const ALIASES: [(&str, &str); 20] = [

#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct Substitutor<'a> {
direct: HashMap<&'a str, &'a str>,
aliased: HashMap<&'a str, &'a str>,
direct: FxHashMap<&'a str, &'a str>,
aliased: FxHashMap<&'a str, &'a str>,
max_len: usize,
first_grapheme: HashSet<&'a str>,
first_grapheme: FxHashSet<&'a str>,
}

static GLOBAL: Lazy<Substitutor> = Lazy::new(Substitutor::create_global);

impl<'sub> Substitutor<'sub> {
pub(crate) fn new() -> Self {
let direct: HashMap<_, _> = EMOJIS.into_iter().chain(ARROWS).collect();
fn create_global() -> Substitutor<'static> {
let direct: FxHashMap<_, _> = EMOJIS.into_iter().chain(ARROWS).collect();
let aliased = ALIASES.into_iter().collect();
let max_len = direct.keys().map(|key| key.len()).max().unwrap_or(0);
let first_grapheme = direct.keys().map(|key| &key[0..1]).collect();

Self {
Substitutor {
direct,
aliased,
max_len,
first_grapheme,
}
}

/// Returns a reference to the lazily-initialized, process-wide
/// [`Substitutor`], avoiding repeated construction of its substitution
/// tables (backed by the `GLOBAL` `Lazy` static).
pub fn global() -> &'static Substitutor<'static> {
&GLOBAL
}

pub(crate) fn get_subst(&self, slice: &'sub str, span: Span) -> Option<Substitute<'sub>> {
let content = *self.direct.get(slice)?;

Expand Down
6 changes: 3 additions & 3 deletions inline/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ impl<'token> Lexer<'token> {
TokenIterator {
symbols,
index: 0,
substitutor: Substitutor::new(),
substitutor: Substitutor::global(),
}
}

Expand Down Expand Up @@ -246,7 +246,7 @@ pub struct TokenIterator<'input> {
/// [`Substitutor`] used for resolving inline substitutions. Right now, substitutor uses only
/// built-in substitutions and has 'static lifetime per default, and can be shortened to any
/// other lifetime.
substitutor: Substitutor<'input>,
substitutor: &'static Substitutor<'static>,
}

impl<'input> TokenIterator<'input> {
Expand Down Expand Up @@ -558,7 +558,7 @@ impl<'input> Iterator for TokenIterator<'input> {
// 3. next grapheme is not a keyword -> it is plain text

match self.get_symbol(self.index) {
Some(symbol) if symbol.is_keyword() || symbol.is_start_of_subst(&self.substitutor) => {
Some(symbol) if symbol.is_keyword() || symbol.is_start_of_subst(self.substitutor) => {
self.lex_keyword()
}
Some(symbol) if symbol.is_esc() => {
Expand Down
Loading

0 comments on commit 5c9db5a

Please sign in to comment.