From 2be4f865672416cfe1f6b302ab5f49b676b60425 Mon Sep 17 00:00:00 2001
From: vallentin
Date: Wed, 12 Jul 2023 20:12:36 +0200
Subject: [PATCH] Implemented plain text lexer

---
 colorblast/src/lexers/mod.rs  | 13 +++++---
 colorblast/src/lexers/text.rs | 57 +++++++++++++++++++++++++++++++++++
 colorblast/src/token.rs       |  1 +
 3 files changed, 66 insertions(+), 5 deletions(-)
 create mode 100644 colorblast/src/lexers/text.rs

diff --git a/colorblast/src/lexers/mod.rs b/colorblast/src/lexers/mod.rs
index b4d7a42..0f22a39 100644
--- a/colorblast/src/lexers/mod.rs
+++ b/colorblast/src/lexers/mod.rs
@@ -16,10 +16,12 @@ pub mod prelude {
 mod json;
 mod jsonc;
 mod rust;
+mod text;
 
 pub use self::json::*;
 pub use self::jsonc::*;
 pub use self::rust::*;
+pub use self::text::*;
 
 use crate::{IntoSimpleToken, SimpleTokenIter, Token, TokenSpan};
 
@@ -27,21 +29,21 @@ macro_rules! impl_enum_lexer {
     (
         $(
             $(#[$attr:meta])*
-            $name:ident => $lexer:ident,
-        )+
+            $name:ident => $lexer:ident
+        ),+ $(,)?
     ) => {
         #[derive(PartialEq, Eq, Clone, Copy, Debug)]
         #[non_exhaustive]
         pub enum Lexer {
             $(
                 $(#[$attr])*
-                $name,
-            )+
+                $name
+            ),+
         }
 
         impl Lexer {
             pub const VARIANTS: &[Self] = &[
-                $(Self::$name,)+
+                $(Self::$name),+
             ];
 
             pub fn into_lexer<'text>(
@@ -67,6 +69,7 @@ impl_enum_lexer!(
     /// [JSON with Comments]: https://code.visualstudio.com/docs/languages/json#_json-with-comments
     JsonC => JsonCLexer,
     Rust => RustLexer,
+    PlainText => PlainTextLexer,
 );
 
 macro_rules! impl_iter {
diff --git a/colorblast/src/lexers/text.rs b/colorblast/src/lexers/text.rs
new file mode 100644
index 0000000..4fba15c
--- /dev/null
+++ b/colorblast/src/lexers/text.rs
@@ -0,0 +1,57 @@
+use super::{impl_iter, Token, TokenSpan};
+
+/// Plain text lexer is a simple dummy passthrough tokenizer,
+/// which produces at most a single [Token]::[Text].
+///
+/// **Note:** Cloning `PlainTextLexer` is essentially a copy, as it
+/// just contains a `&str`. However, `Copy` is not implemented,
+/// to avoid accidentally copying immutable `PlainTextLexer`s.
+///
+/// # Warning
+///
+/// If you are about to use `PlainTextLexer` for anything outside the scope of the
+/// [`colorblast` crate], then please see the warning in the [`lexers` module].
+///
+/// [`colorblast` crate]: crate
+/// [`lexers` module]: super#warning
+/// [Text]: Token::Text
+#[derive(Clone, Debug)]
+pub struct PlainTextLexer<'text> {
+    text: Option<&'text str>,
+}
+
+impl<'text> PlainTextLexer<'text> {
+    #[inline]
+    pub fn new(text: &'text str) -> Self {
+        Self { text: Some(text) }
+    }
+
+    #[inline]
+    fn next_token(&mut self) -> Option<(Token, TokenSpan<'text>)> {
+        let text = self.text.take()?;
+        if text.is_empty() {
+            return None;
+        }
+        Some((Token::Text, TokenSpan::new(text, 0..text.len())))
+    }
+}
+
+impl_iter!('text, PlainTextLexer<'text>);
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_plain_text_lexer_spans() {
+        let input = include_str!("../../../text-scanner/src/ext/rust.rs");
+        let mut output = String::new();
+
+        let lexer = PlainTextLexer::new(input);
+        for (_tok, span) in lexer {
+            output.push_str(span.as_str());
+        }
+
+        assert_eq!(input, output);
+    }
+}
diff --git a/colorblast/src/token.rs b/colorblast/src/token.rs
index f44a5ac..d522e11 100644
--- a/colorblast/src/token.rs
+++ b/colorblast/src/token.rs
@@ -65,6 +65,7 @@ macro_rules! impl_enum_token {
 impl_enum_token!(
     Space,
     Comment,
+    /// Token representing text, which might contain whitespace.
     Text,
     Var,
     Var2,