Skip to content

Commit

Permalink
feat: define more types of tokens.
Browse files: browse the repository at this point in the history
  • Loading branch information
plusvic committed Jul 3, 2024
1 parent 58fbfb5 commit 4faae2d
Show file tree
Hide file tree
Showing 3 changed files with 404 additions and 32 deletions.
112 changes: 109 additions & 3 deletions parser-ng/src/parser/cst/syntax_kind.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,77 @@ use crate::tokenizer::Token;
#[repr(u16)]
#[allow(non_camel_case_types)]
pub enum SyntaxKind {
ALL_KW,
AND_KW,
ANY_KW,
ASCII_KW,
AT_KW,
BASE64_KW,
BASE64WIDE_KW,
CONDITION_KW,
CONTAINS_KW,
DEFINED_KW,
ENDSWITH_KW,
ENTRYPOINT_KW,
FALSE_KW,
FILESIZE_KW,
FOR_KW,
FULLWORD_KW,
GLOBAL_KW,
ICONTAINS_KW,
IENDSWITH_KW,
IEQUALS_KW,
IMPORT_KW,
IN_KW,
ISTARTSWITH_KW,
MATCHES_KW,
META_KW,
NOCASE_KW,
NONE_KW,
NOT_KW,
OF_KW,
OR_KW,
PRIVATE_KW,
RULE_KW,
STARTSWITH_KW,
STRINGS_KW,
THEM_KW,
TRUE_KW,
WIDE_KW,
XOR_KW,

DIV,
// Arithmetic operators

// Bitwise operators
SHL,
SHR,

// Comparison operators.
EQ,
NE,
LT,
LE,
GT,
GE,

// Punctuation
AMPERSAND,
ASTERISK,
COLON,
COMMA,
BACKSLASH,
DOT,
EQUAL,
MINUS,
PERCENT,
PIPE,
PLUS,
TILDE,

L_BRACE,
R_BRACE,
L_BRACKET,
R_BRACKET,
L_PAREN,
R_PAREN,

Expand All @@ -44,6 +96,7 @@ pub enum SyntaxKind {
PATTERN_DEF,
PATTERN_IDENT,
PATTERNS_BLK,
PATTERN_MODS,
REGEXP,
META_DEF,
META_BLK,
Expand All @@ -68,31 +121,84 @@ impl From<&Token> for SyntaxKind {
fn from(token: &Token) -> Self {
match token {
// Keywords
Token::ALL_KW(_) => SyntaxKind::ALL_KW,
Token::AND_KW(_) => SyntaxKind::AND_KW,
Token::ANY_KW(_) => SyntaxKind::ANY_KW,
Token::ASCII_KW(_) => SyntaxKind::ASCII_KW,
Token::AT_KW(_) => SyntaxKind::AT_KW,
Token::BASE64_KW(_) => SyntaxKind::BASE64_KW,
Token::BASE64WIDE_KW(_) => SyntaxKind::BASE64WIDE_KW,
Token::CONDITION_KW(_) => SyntaxKind::CONDITION_KW,
Token::CONTAINS_KW(_) => SyntaxKind::CONTAINS_KW,
Token::DEFINED_KW(_) => SyntaxKind::DEFINED_KW,
Token::ENDSWITH_KW(_) => SyntaxKind::ENDSWITH_KW,
Token::ENTRYPOINT_KW(_) => SyntaxKind::ENTRYPOINT_KW,
Token::FALSE_KW(_) => SyntaxKind::FALSE_KW,
Token::FILESIZE_KW(_) => SyntaxKind::FILESIZE_KW,
Token::FOR_KW(_) => SyntaxKind::FOR_KW,
Token::FULLWORD_KW(_) => SyntaxKind::FULLWORD_KW,
Token::GLOBAL_KW(_) => SyntaxKind::GLOBAL_KW,
Token::DIV(_) => SyntaxKind::DIV,
Token::ICONTAINS_KW(_) => SyntaxKind::ICONTAINS_KW,
Token::IENDSWITH_KW(_) => SyntaxKind::IENDSWITH_KW,
Token::IEQUALS_KW(_) => SyntaxKind::IEQUALS_KW,
Token::IMPORT_KW(_) => SyntaxKind::IMPORT_KW,
Token::IN_KW(_) => SyntaxKind::IN_KW,
Token::ISTARTSWITH_KW(_) => SyntaxKind::ISTARTSWITH_KW,
Token::MATCHES_KW(_) => SyntaxKind::MATCHES_KW,
Token::META_KW(_) => SyntaxKind::META_KW,
Token::NOCASE_KW(_) => SyntaxKind::NOCASE_KW,
Token::NONE_KW(_) => SyntaxKind::NONE_KW,
Token::NOT_KW(_) => SyntaxKind::NOT_KW,
Token::OF_KW(_) => SyntaxKind::OF_KW,
Token::OR_KW(_) => SyntaxKind::OR_KW,
Token::PRIVATE_KW(_) => SyntaxKind::PRIVATE_KW,
Token::REGEXP(_) => SyntaxKind::REGEXP,
Token::RULE_KW(_) => SyntaxKind::RULE_KW,
Token::STARTSWITH_KW(_) => SyntaxKind::STARTSWITH_KW,
Token::STRINGS_KW(_) => SyntaxKind::STRINGS_KW,
Token::THEM_KW(_) => SyntaxKind::THEM_KW,
Token::TRUE_KW(_) => SyntaxKind::TRUE_KW,
Token::WIDE_KW(_) => SyntaxKind::WIDE_KW,
Token::XOR_KW(_) => SyntaxKind::XOR_KW,

// Bitwise operators
Token::SHL(_) => SyntaxKind::SHL,
Token::SHR(_) => SyntaxKind::SHR,

// Comparison operators.
Token::EQ(_) => SyntaxKind::EQ,
Token::NE(_) => SyntaxKind::NE,
Token::LT(_) => SyntaxKind::LT,
Token::LE(_) => SyntaxKind::LE,
Token::GT(_) => SyntaxKind::GT,
Token::GE(_) => SyntaxKind::GE,

// Literals
Token::REGEXP(_) => SyntaxKind::REGEXP,
Token::FLOAT_LIT(_) => SyntaxKind::FLOAT_LIT,
Token::INTEGER_LIT(_) => SyntaxKind::INTEGER_LIT,
Token::STRING_LIT(_) => SyntaxKind::STRING_LIT,

// Punctuation
Token::AMPERSAND(_) => SyntaxKind::AMPERSAND,
Token::ASTERISK(_) => SyntaxKind::ASTERISK,
Token::BACKSLASH(_) => SyntaxKind::BACKSLASH,
Token::COLON(_) => SyntaxKind::COLON,
Token::COMMA(_) => SyntaxKind::COMMA,
Token::DOT(_) => SyntaxKind::DOT,
Token::EQUAL(_) => SyntaxKind::EQUAL,
Token::MINUS(_) => SyntaxKind::MINUS,
Token::PERCENT(_) => SyntaxKind::PERCENT,
Token::PIPE(_) => SyntaxKind::PIPE,
Token::PLUS(_) => SyntaxKind::PLUS,
Token::TILDE(_) => SyntaxKind::TILDE,

Token::L_BRACE(_) => SyntaxKind::L_BRACE,
Token::R_BRACE(_) => SyntaxKind::R_BRACE,
Token::L_PAREN(_) => SyntaxKind::L_PAREN,
Token::R_PAREN(_) => SyntaxKind::R_PAREN,
Token::L_BRACKET(_) => SyntaxKind::L_BRACKET,
Token::R_BRACKET(_) => SyntaxKind::R_BRACKET,

// Hex patterns
Token::HEX_BYTE(_) => SyntaxKind::HEX_BYTE,
// Identifiers
Expand Down
119 changes: 114 additions & 5 deletions parser-ng/src/tokenizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,36 +143,105 @@ enum Mode<'src> {
#[logos(source = [u8])]
enum NormalToken<'src> {
// Keywords
#[token("all")]
All,
#[token("and")]
And,
#[token("any")]
Any,
#[token("ascii")]
Ascii,
#[token("at")]
At,
#[token("base64")]
Base64,
#[token("base64wide")]
Base64Wide,
#[token("condition")]
Condition,
#[token("contains")]
Contains,
#[token("defined")]
Defined,
#[token("endswith")]
EndsWith,
#[token("entrypoint")]
Entrypoint,
#[token("false")]
False,
#[token("filesize")]
Filesize,
#[token("for")]
For,
#[token("fullword")]
Fullword,
#[token("global")]
Global,
#[token("icontains")]
IContains,
#[token("iendswith")]
IEndsWith,
#[token("iequals")]
IEquals,
#[token("import")]
Import,
#[token("in")]
In,
#[token("istartswith")]
IStarsWith,
#[token("matches")]
Matches,
#[token("meta")]
Meta,
#[token("nocase")]
Nocase,
#[token("none")]
None,
#[token("not")]
Not,
#[token("of")]
Of,
#[token("or")]
Or,
#[token("private")]
Private,
#[token("rule")]
Rule,
#[token("startswith")]
StartsWith,
#[token("strings")]
Strings,
#[token("them")]
Them,
#[token("true")]
True,
#[token("wide")]
Wide,
#[token("xor")]
Xor,

// Punctuation
#[token("&")]
Ampersand,
#[token("*")]
Asterisk,
#[token("\\")]
Backslash,
#[token(":")]
Colon,
#[token(".")]
Dot,
#[token("=")]
Equal,
#[token("-")]
Minus,
#[token("%")]
Percent,
#[token("|")]
Pipe,
#[token("~")]
Tilde,

#[token("{")]
LBrace,
#[token("}")]
Expand All @@ -181,10 +250,10 @@ enum NormalToken<'src> {
LParen,
#[token(")")]
RParen,

// Arithmetic operations
#[token("\\")]
Div,
#[token("[")]
LBracket,
#[token("]")]
RBracket,

// Pattern identifiers.
#[regex(
Expand Down Expand Up @@ -330,25 +399,65 @@ where

fn convert_normal_token(token: NormalToken, span: Span) -> Token {
match token {
// Keywords.
NormalToken::All => Token::ALL_KW(span),
NormalToken::And => Token::AND_KW(span),
NormalToken::Any => Token::ANY_KW(span),
NormalToken::Ascii => Token::ASCII_KW(span),
NormalToken::At => Token::AT_KW(span),
NormalToken::Base64 => Token::BASE64_KW(span),
NormalToken::Base64Wide => Token::BASE64WIDE_KW(span),
NormalToken::Condition => Token::CONDITION_KW(span),
NormalToken::Div => Token::DIV(span),
NormalToken::Contains => Token::CONTAINS_KW(span),
NormalToken::Defined => Token::DEFINED_KW(span),
NormalToken::EndsWith => Token::ENDSWITH_KW(span),
NormalToken::Entrypoint => Token::ENTRYPOINT_KW(span),
NormalToken::False => Token::FALSE_KW(span),
NormalToken::Filesize => Token::FILESIZE_KW(span),
NormalToken::For => Token::FOR_KW(span),
NormalToken::Fullword => Token::FULLWORD_KW(span),
NormalToken::Global => Token::GLOBAL_KW(span),
NormalToken::IContains => Token::ICONTAINS_KW(span),
NormalToken::IEndsWith => Token::IENDSWITH_KW(span),
NormalToken::IEquals => Token::IEQUALS_KW(span),
NormalToken::Import => Token::IMPORT_KW(span),
NormalToken::In => Token::IN_KW(span),
NormalToken::IStarsWith => Token::ISTARTSWITH_KW(span),
NormalToken::Matches => Token::MATCHES_KW(span),
NormalToken::Meta => Token::META_KW(span),
NormalToken::Nocase => Token::NOCASE_KW(span),
NormalToken::None => Token::NONE_KW(span),
NormalToken::Not => Token::NOT_KW(span),
NormalToken::Of => Token::OF_KW(span),
NormalToken::Or => Token::OR_KW(span),
NormalToken::Private => Token::PRIVATE_KW(span),
NormalToken::Rule => Token::RULE_KW(span),
NormalToken::StartsWith => Token::STARTSWITH_KW(span),
NormalToken::Strings => Token::STRINGS_KW(span),
NormalToken::Them => Token::THEM_KW(span),
NormalToken::True => Token::TRUE_KW(span),
NormalToken::Wide => Token::WIDE_KW(span),
NormalToken::Xor => Token::XOR_KW(span),

// Punctuation.
NormalToken::Ampersand => Token::AMPERSAND(span),
NormalToken::Asterisk => Token::ASTERISK(span),
NormalToken::Backslash => Token::BACKSLASH(span),
NormalToken::Colon => Token::COLON(span),
NormalToken::Dot => Token::DOT(span),
NormalToken::Equal => Token::EQUAL(span),
NormalToken::Minus => Token::MINUS(span),
NormalToken::Percent => Token::PERCENT(span),
NormalToken::Pipe => Token::PIPE(span),
NormalToken::Tilde => Token::TILDE(span),

NormalToken::LBrace => Token::L_BRACE(span),
NormalToken::RBrace => Token::R_BRACE(span),
NormalToken::LParen => Token::L_PAREN(span),
NormalToken::RParen => Token::R_PAREN(span),
NormalToken::LBracket => Token::L_BRACKET(span),
NormalToken::RBracket => Token::R_BRACKET(span),

NormalToken::Whitespace => Token::WHITESPACE(span),
NormalToken::Newline => Token::NEWLINE(span),
NormalToken::Ident(ident) => {
Expand Down
Loading

0 comments on commit 4faae2d

Please sign in to comment.