From 1ed14303b603572bc8165e08b4c6e7e7d2f04ead Mon Sep 17 00:00:00 2001
From: Ivan Ivanov
Date: Fri, 3 May 2024 18:40:39 +1200
Subject: [PATCH] Rust highlighting

---
 FOCUS-CHANGELOG.txt               |   1 +
 config/default.focus-config       |   2 +-
 config/default_macos.focus-config |   2 +-
 src/buffer.jai                    |   3 +
 src/editors.jai                   |   1 +
 src/langs/common.jai              |   1 +
 src/langs/rust.jai                | 743 ++++++++++++++++++++++++++++++
 src/main.jai                      |   1 +
 8 files changed, 752 insertions(+), 2 deletions(-)
 create mode 100644 src/langs/rust.jai

diff --git a/FOCUS-CHANGELOG.txt b/FOCUS-CHANGELOG.txt
index c4ea37e04..fafd83a7a 100644
--- a/FOCUS-CHANGELOG.txt
+++ b/FOCUS-CHANGELOG.txt
@@ -22,6 +22,7 @@
 + New command: `copy_current_line_info`. Copies a string `<file>:<line>` to clipboard. Useful for setting breakpoints.
 + Basic HLSL highlighting (thanks @Roman-Skabin for many improvements)
 + Basic JSON highlighting (thanks @simonvallz)
++ Rust highlighting
 + Bug fixes:
   + Fixed the Jai code samples highlighting in build output in some cases
   + Fixed build output highlighting glitch when using the clear build output option
diff --git a/config/default.focus-config b/config/default.focus-config
index 9c8c40cb2..9d4db2e35 100644
--- a/config/default.focus-config
+++ b/config/default.focus-config
@@ -106,7 +106,7 @@
 save_current_buffer_on_build: false
 
 # Example error regexes:
-# For jai:  ^(?P<file>.*):(?P<line>\d+),(?P<col>\d+): (?P<type>Error|Warning|Info): (?P<msg>.*)|^(?P<msg>.*error LNK.*)
+# For jai:  ^(?P<file>.*):(?P<line>\d+),(?P<col>\d+): (?P<type>Error|Warning|Info|...):* (?P<msg>.*)|^(?P<msg>.*error LNK.*)
 # For msvc: ^(?P<file>.*)\((?P<line>\d+),(?P<col>\d+)\): (?P<type>error|warning) (?P<msg>.*)$
 # ... let us know what regex works for you and we'll add it here
diff --git a/config/default_macos.focus-config b/config/default_macos.focus-config
index 0f0e0e70a..477d59810 100644
--- a/config/default_macos.focus-config
+++ b/config/default_macos.focus-config
@@ -98,7 +98,7 @@
 build_panel_height_percent: 50
 
 # Example error regexes:
-# For jai: ^(?P<file>.*):(?P<line>\d+),(?P<col>\d+): (?P<type>Error|Warning|Info): (?P<msg>.*)|^(?P<msg>.*error LNK.*)
+# For jai: ^(?P<file>.*):(?P<line>\d+),(?P<col>\d+): (?P<type>Error|Warning|Info|...):* (?P<msg>.*)|^(?P<msg>.*error LNK.*)
 # ... let us know what regex works for you and we'll add it here
 # NOTE:
diff --git a/src/buffer.jai b/src/buffer.jai
index 3278c6c54..507526a2b 100644
--- a/src/buffer.jai
+++ b/src/buffer.jai
@@ -477,6 +477,7 @@ tokenize_for_indentation :: (buffer: *Buffer) -> [] Indentation_Token /* temp */
         case .Xml;   return tokenize_xml_for_indentation(buffer);
         case .Lua;   return tokenize_lua_for_indentation(buffer);
         case .Odin;  return tokenize_odin_for_indentation(buffer);
+        case .Rust;  return tokenize_rust_for_indentation(buffer);
     }
 
     return .[];
@@ -1190,6 +1191,7 @@ get_tokenize_function :: (lang: Buffer.Lang) -> Tokenize_Function {
         case .Odin;     return highlight_odin_syntax;
         case .Python;   return highlight_python_syntax;
         case .RenPy;    return highlight_renpy_syntax;
+        case .Rust;     return highlight_rust_syntax;
         case .Html;     return highlight_xml_syntax;
         case .Xml;      return highlight_xml_syntax;
         case .Worklog;  return highlight_worklog;
@@ -1378,6 +1380,7 @@ Buffer :: struct {
         Odin;
         Python;
         RenPy;
+        Rust;
         Xml;
         Html;
         Worklog;
diff --git a/src/editors.jai b/src/editors.jai
index d2d61993d..1cbf0955e 100644
--- a/src/editors.jai
+++ b/src/editors.jai
@@ -1155,6 +1155,7 @@ get_lang_from_path :: (path: string) -> Buffer.Lang {
         case "odin";  lang = .Odin;
         case "py";    lang = .Python;
         case "rpy";   lang = .RenPy;
+        case "rs";    lang = .Rust;
 
         case "vert";  #through;
         case "frag";  #through;
diff --git a/src/langs/common.jai b/src/langs/common.jai
index cc4985cf9..a29671d13 100644
--- a/src/langs/common.jai
+++ b/src/langs/common.jai
@@ -174,6 +174,7 @@ get_lang_from_name :: (lang_name: string) -> Buffer.Lang {
     if ends_with_nocase(lang_name, "odin")    return .Odin;
     if ends_with_nocase(lang_name, "python")  return .Python;
     if ends_with_nocase(lang_name, "renpy")   return .RenPy;
+    if ends_with_nocase(lang_name, "rust")    return .Rust;
     if ends_with_nocase(lang_name, "html")    return .Html;
     if ends_with_nocase(lang_name, "xml")     return .Xml;
     if ends_with_nocase(lang_name, "worklog") return .Worklog;
diff --git a/src/langs/rust.jai b/src/langs/rust.jai
new file mode 100644
index 000000000..70ffd5ef7
--- /dev/null
+++ b/src/langs/rust.jai
@@ -0,0 +1,743 @@
+highlight_rust_syntax :: (using buffer: *Buffer, start_offset := -1, count := -1) -> [] Buffer_Region {
+    tokenizer := get_rust_tokenizer(buffer, start_offset, count);
+
+    start_scope(*tokenizer, tokenizer.t - tokenizer.buf.data, .scope_export);
+
+    while true {
+        token := get_next_token(*tokenizer);
+        if token.type == .eof  break;
+
+        // Maybe retroactively highlight a function
+        before_prev, prev := tokenizer.last_tokens[0], tokenizer.last_tokens[1];
+        if token.type == .punctuation && token.punctuation == .l_paren {
+            if prev.type == .identifier {
+                // Handle "func("
+                memset(colors.data + prev.start, xx Color.CODE_FUNCTION, prev.len);
+            }
+        }
+
+        // Remember last 2 tokens
+        tokenizer.last_tokens[0] = tokenizer.last_tokens[1];
+        tokenizer.last_tokens[1] = token;
+
+        color := COLOR_MAP[token.type];
+        memset(colors.data + token.start, xx color, token.len);
+    }
+
+    end_scope(*tokenizer, tokenizer.t - tokenizer.buf.data);
+
+    return tokenizer.regions;
+}
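+
+// Illustration of the retroactive pass above (a comment, not executed): given `foo(bar)`,
+// `foo` is first emitted as a plain identifier; once the following `(` token arrives, the
+// colors already written for `foo` are overwritten with Color.CODE_FUNCTION.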
+
+tokenize_rust_for_indentation :: (using buffer: *Buffer) -> [] Indentation_Token /* temp */ {
+    tokens: [..] Indentation_Token;
+    tokens.allocator = temp;
+
+    tokenizer := get_rust_tokenizer(buffer);
+
+    while true {
+        src := get_next_token(*tokenizer);
+
+        // Remember the last 2 tokens, otherwise some tokens (e.g. number literals) won't be detected properly
+        tokenizer.last_tokens[0] = tokenizer.last_tokens[1];
+        tokenizer.last_tokens[1] = src;
+
+        token: Indentation_Token = ---;
+        token.start = src.start;
+        token.len   = src.len;
+
+        if src.type == {
+            case .punctuation;
+                if src.punctuation == {
+                    case .l_paren;    token.type = .open;  token.kind = .paren;
+                    case .l_bracket;  token.type = .open;  token.kind = .bracket;
+                    case .l_brace;    token.type = .open;  token.kind = .brace;
+
+                    case .r_paren;    token.type = .close; token.kind = .paren;
+                    case .r_bracket;  token.type = .close; token.kind = .bracket;
+                    case .r_brace;    token.type = .close; token.kind = .brace;
+
+                    case;             continue;
+                }
+
+            case .multiline_comment;  token.type = .maybe_multiline;
+            case .multiline_string;   token.type = .maybe_multiline;
+            case .eof;                token.type = .eof;  // to guarantee we always have indentation tokens
+            case;                     token.type = .unimportant;
+        }
+
+        array_add(*tokens, token);
+
+        if src.type == .eof  break;
+    }
+
+    return tokens;
+}
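+
+// Illustration (a comment, not executed): for `fn f() { v[0] = (1); }` the braces, brackets
+// and parens become open/close tokens with kinds .brace, .bracket and .paren; block comments
+// become .maybe_multiline; every other token is emitted as .unimportant, which is all the
+// indentation logic needs.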
+
+#scope_file
+
+// We're using a separate tokenizer here because we have to keep track of the last 2 tokens
+// in many places, and we can't use a global variable for that because of threading
+get_rust_tokenizer :: (using buffer: *Buffer, start_offset := -1, count := -1) -> Rust_Tokenizer {
+    tokenizer: Rust_Tokenizer;
+
+    tokenizer.buf   = cast(string) bytes;
+    tokenizer.max_t = bytes.data + bytes.count;
+    tokenizer.t     = bytes.data;
+
+    if start_offset >= 0 {
+        start_offset = clamp(start_offset, 0, bytes.count - 1);
+        count        = clamp(count,        0, bytes.count - 1);
+        tokenizer.t += start_offset;
+        tokenizer.max_t = tokenizer.t + count;
+    }
+
+    return tokenizer;
+}
+
+eat_white_space :: (using tokenizer: *Rust_Tokenizer) {
+    while t < max_t && is_white_space(t.*) {
+        t += 1;
+    }
+}
+
+get_next_token :: (using tokenizer: *Rust_Tokenizer) -> Token {
+    eat_white_space(tokenizer);
+
+    token: Token;
+    token.start = cast(s32) (t - buf.data);
+    token.type  = .eof;
+    if t >= max_t  return token;
+
+    start_t = t;
+
+    // Assume ASCII, unless we're in the middle of a string.
+    // UTF-8 characters elsewhere are a syntax error.
+    char := t.*;
+
+    if is_alpha(char) || char == #char "_" {
+        parse_identifier_or_raw_string(tokenizer, *token);
+    } else if is_digit(char) {
+        // Handle number literals that aren't terminated properly.
+        // For example, in 0b10002 the tokenizer sees 0b1000 as one number and 2 as another,
+        // but when highlighted in the editor it looks like a single number literal rather than two.
+        // The solution here is to not start a new number token right after another number token.
+        last_char := (t - 1).*;
+        if tokenizer.last_tokens[1].type != .number || (!is_digit(last_char) && last_char != #char "_") {
+            parse_number(tokenizer, *token);
+        } else {
+            parse_identifier_or_raw_string(tokenizer, *token);
+        }
+    } else if char == {
+        case #char ":";   parse_colon            (tokenizer, *token);
+        case #char "=";   parse_equal            (tokenizer, *token);
+        case #char "-";   parse_minus            (tokenizer, *token);
+        case #char "+";   parse_plus             (tokenizer, *token);
+        case #char "*";   parse_asterisk         (tokenizer, *token);
+        case #char "<";   parse_less_than        (tokenizer, *token);
+        case #char ">";   parse_greater_than     (tokenizer, *token);
+        case #char "!";   parse_bang             (tokenizer, *token);
+        case #char "\"";  parse_string_literal   (tokenizer, *token);
+        case #char "\t";  parse_tab              (tokenizer, *token);
+        case #char "/";   parse_slash_or_comment (tokenizer, *token);
+        case #char "&";   parse_ampersand        (tokenizer, *token);
+        case #char "|";   parse_pipe             (tokenizer, *token);
+        case #char "%";   parse_percent          (tokenizer, *token);
+        case #char "^";   parse_caret            (tokenizer, *token);
+        case #char ".";   parse_period           (tokenizer, *token);
+        case #char "'";   parse_single_quote     (tokenizer, *token);
+
+        case #char ";";  token.type = .punctuation; token.punctuation = .semicolon;  t += 1;
+        case #char ",";  token.type = .punctuation; token.punctuation = .comma;      t += 1;
+        case #char "{";  token.type = .punctuation; token.punctuation = .l_brace;    t += 1;
+        case #char "}";  token.type = .punctuation; token.punctuation = .r_brace;    t += 1;
+        case #char "(";  token.type = .punctuation; token.punctuation = .l_paren;    t += 1;
+        case #char ")";  token.type = .punctuation; token.punctuation = .r_paren;    t += 1;
+        case #char "[";  token.type = .punctuation; token.punctuation = .l_bracket;  t += 1;
+        case #char "]";  token.type = .punctuation; token.punctuation = .r_bracket;  t += 1;
+        case #char "$";  token.type = .punctuation; token.punctuation = .dollar;     t += 1;
+        case #char "#";  token.type = .punctuation; token.punctuation = .hash;       t += 1;
+
+        case #char "~";  token.type = .operation;   token.operation = .tilde;         t += 1;
+        case #char "`";  token.type = .operation;   token.operation = .backtick;      t += 1;
+        case #char "?";  token.type = .operation;   token.operation = .question_mark; t += 1;
+
+        case;            token.type = .invalid; t += 1;
+    }
+
+    if t >= max_t then t = max_t;
+    token.len = cast(s32) (t - start_t);
+    return token;
+}
+
+parse_identifier_or_raw_string :: (using tokenizer: *Rust_Tokenizer, token: *Token) {
+    token.type = .identifier;
+
+    // Check for raw strings r#"..."# and br#"..."#
+    if (t + 2 < max_t && t.* == #char "r" && (t+1).* == #char "#") ||
+       (t + 3 < max_t && t.* == #char "b" && (t+1).* == #char "r" && (t+2).* == #char "#") {
+        token.type = .string_literal;
+
+        if t.* == #char "b" then t += 1;
+        t += 2;  // skip the first r#
+        num_hashes := 1;
+        while t < max_t && t.* == #char "#" { t += 1; num_hashes += 1; }  // skip any number of hashes
+        if t >= max_t || t.* != #char "\"" { token.type = .invalid; return; }  // expect a well-formed raw string
+
+        // Construct the expected end sequence: a quote followed by num_hashes hashes, e.g. "###
+        end_sequence := talloc_string(num_hashes + 1);
+        end_sequence[0] = cast(u8) #char "\"";
+        memset(end_sequence.data + 1, cast(u8) #char "#", num_hashes);
+
+        remaining_buffer := string.{max_t - t, t};
+        contents_length  := find_index_from_left(remaining_buffer, end_sequence);
+        if contents_length >= 0 {
+            t += contents_length + end_sequence.count;
+        } else {
+            t = max_t;
+        }
+
+        return;
+    }
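+
+    // Illustration (a comment, not executed): for r##"contains "# inside"## we skip r##,
+    // count num_hashes == 2, and the end sequence we search for is "## , so the embedded "#
+    // does not terminate the string.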
+
+    identifier_str := read_utf8_identifier_string(tokenizer);
+
+    // Maybe it's a keyword
+    if identifier_str.count <= MAX_KEYWORD_LENGTH {
+        kw_token, ok := table_find(*KEYWORD_MAP, identifier_str);
+        if ok { token.type = kw_token.type; token.keyword = kw_token.keyword; return; }
+    }
+
+    // Maybe it's a macro
+    if t < max_t && t.* == #char "!" {
+        token.type = .macro;
+        t += 1;
+        return;
+    }
+}
+
+parse_number :: (using tokenizer: *Rust_Tokenizer, token: *Token) {
+    token.type = .number;
+
+    t += 1;
+    if t >= max_t  return;
+
+    is_decimal_variant :: inline (c: u8) -> bool {
+        return is_digit(c) || c == #char "." || c == #char "-" || c == #char "e" || c == #char "E";
+    }
+
+    if is_decimal_variant(t.*) || t.* == #char "_" {
+        // Decimal
+        seen_decimal_point  := false;
+        scientific_notation := false;
+        while t < max_t && (is_decimal_variant(t.*) || t.* == #char "_") {
+            if t.* == #char "." {
+                // Handle 0..1 (otherwise it gets interpreted as float-period-int rather than int-rangeop-int)
+                if (t + 1) < max_t && (t + 1).* == #char "." {
+                    break;
+                }
+
+                // else handle the decimal point of a float
+                if seen_decimal_point then break;
+                seen_decimal_point = true;
+            }
+            else if t.* == #char "e" || t.* == #char "E" {
+                // Scientific notation (3.5e2, 1.0e-34)
+                // Only works if there is a decimal point
+                if scientific_notation || !seen_decimal_point then break;
+                scientific_notation = true;
+            }
+            else if t.* == #char "-" {
+                // Handle a negative exponent in scientific notation (1.0e-34)
+                if !scientific_notation then break;
+                if (t - 1).* != #char "e" && (t - 1).* != #char "E" then break;
+            }
+
+            t += 1;
+        }
+    } else if t.* == #char "x" {
+        // Hex
+        t += 1;
+        while t < max_t && (is_hex(t.*) || t.* == #char "_")  t += 1;
+    } else if t.* == #char "o" {
+        // Octal (Rust uses the 0o prefix; there is no 0h prefix in Rust)
+        t += 1;
+        while t < max_t && ((#char "0" <= t.* && t.* <= #char "7") || t.* == #char "_")  t += 1;
+    } else if t.* == #char "b" {
+        // Binary
+        t += 1;
+        while t < max_t && (t.* == #char "1" || t.* == #char "0" || t.* == #char "_")  t += 1;
+    }
+}
+
+parse_colon :: (using tokenizer: *Rust_Tokenizer, token: *Token) {
+    token.type = .operation;
+    token.operation = .colon;
+
+    t += 1;
+    if t >= max_t  return;
+
+    if t.* == {
+        case #char ":";  token.operation = .double_colon;  t += 1;
+        case #char "=";  token.operation = .colon_equal;   t += 1;
+    }
+}
+
+parse_equal :: (using tokenizer: *Rust_Tokenizer, token: *Token) {
+    token.type = .operation;
+    token.operation = .equal;
+
+    t += 1;
+    if t >= max_t  return;
+
+    if t.* == {
+        case #char "=";  token.operation = .equal_equal;  t += 1;
+    }
+}
+
+parse_minus :: (using tokenizer: *Rust_Tokenizer, token: *Token) {
+    token.type = .operation;
+    token.operation = .minus;
+
+    t += 1;
+    if t >= max_t  return;
+
+    if t.* == {
+        case #char "=";
+            token.operation = .minus_equal;
+            t += 1;
+        case #char ">";
+            token.operation = .arrow;
+            t += 1;
+        case #char "-";
+            t += 1;
+            if t < max_t && t.* == #char "-" {
+                token.operation = .triple_dash;
+                t += 1;
+            } else {
+                token.operation = .unknown;  // -- is not a valid token
+            }
+        case;
+            if tokenizer.last_tokens[1].type != .number && is_digit(t.*) {
+                parse_number(tokenizer, token);
+            }
+    }
+}
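+
+// Illustration (a comment, not executed): in `let x = -1;` the `-` is directly followed by a
+// digit and the previous token is not a number, so `-1` is parsed as a single number literal;
+// in `x - 1` the space after `-` keeps it a plain minus operation.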
+
+parse_plus :: (using tokenizer: *Rust_Tokenizer, token: *Token) {
+    token.type = .operation;
+    token.operation = .plus;
+
+    t += 1;
+    if t >= max_t  return;
+
+    if t.* == {
+        case #char "=";
+            token.operation = .plus_equal;
+            t += 1;
+    }
+}
+
+parse_asterisk :: (using tokenizer: *Rust_Tokenizer, token: *Token) {
+    token.type = .operation;
+    token.operation = .asterisk;
+
+    t += 1;
+    if t >= max_t  return;
+
+    if t.* == {
+        case #char "=";
+            token.operation = .asterisk_equal;
+            t += 1;
+    }
+}
+
+parse_period :: (using tokenizer: *Rust_Tokenizer, token: *Token) {
+    token.type = .punctuation;
+    token.punctuation = .period;
+
+    t += 1;
+    if t >= max_t  return;
+
+    if t.* == {
+        case #char ".";
+            token.type = .operation;
+            token.operation = .double_period;
+            t += 1;
+
+        case #char "*";
+            token.type = .operation;
+            token.operation = .period_asterisk;
+            t += 1;
+    }
+}
+
+parse_bang :: (using tokenizer: *Rust_Tokenizer, token: *Token) {
+    token.type = .operation;
+    token.operation = .bang;
+
+    t += 1;
+    if t >= max_t  return;
+
+    if t.* == {
+        case #char "=";
+            token.operation = .bang_equal;
+            t += 1;
+    }
+}
+
+parse_percent :: (using tokenizer: *Rust_Tokenizer, token: *Token) {
+    token.type = .operation;
+    token.operation = .percent;
+
+    t += 1;
+    if t >= max_t  return;
+
+    if t.* == {
+        case #char "=";
+            token.operation = .percent_equal;
+            t += 1;
+    }
+}
+
+parse_caret :: (using tokenizer: *Rust_Tokenizer, token: *Token) {
+    token.type = .operation;
+    token.operation = .caret;
+
+    t += 1;
+    if t >= max_t  return;
+
+    if t.* == {
+        case #char "=";
+            token.operation = .caret_equal;
+            t += 1;
+    }
+}
+
+parse_single_quote :: (using tokenizer: *Rust_Tokenizer, token: *Token) {
+    token.type = .lifetime;  // assume it's a lifetime by default
+
+    t += 1;
+    if t >= max_t  return;
+
+    // Maybe it's a char literal like '\n' or '\u{...}'
+    if t.* == #char "\\" {
+        token.type = .string_literal;
+        while t < max_t && t.* != #char "'" && t.* != #char "\n"  t += 1;  // just allow however many characters they want. We're not meant to check syntax.
+        if t < max_t && t.* == #char "'" then t += 1;  // consume the closing quote
+        return;
+    }
+
+    // Maybe it's a char literal like 'a'
+    next_char := unicode_next_character(t);
+    if next_char < max_t && next_char.* == #char "'" {
+        token.type = .string_literal;
+        t = next_char;
+        t += 1;
+        return;
+    }
+
+    // Treat it as a lifetime
+    identifier_str := read_utf8_identifier_string(tokenizer);
+    if identifier_str == "static" {
+        token.type = .keyword;
+        return;
+    }
+}
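+
+// Illustration of the cases above (a comment, not executed): 'a' and '\n' are char literals
+// and get the string color, 'a in fn f<'a>() is a lifetime, and 'static is highlighted as a
+// keyword.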
+
+parse_ampersand :: (using tokenizer: *Rust_Tokenizer, token: *Token) {
+    token.type = .operation;
+    token.operation = .ampersand;
+
+    t += 1;
+    if t >= max_t  return;
+
+    if t.* == {
+        case #char "=";
+            token.operation = .ampersand_equal;
+            t += 1;
+        case #char "&";
+            token.operation = .double_ampersand;
+            t += 1;
+    }
+}
+
+parse_pipe :: (using tokenizer: *Rust_Tokenizer, token: *Token) {
+    token.type = .operation;
+    token.operation = .pipe;
+
+    t += 1;
+    if t >= max_t  return;
+
+    if t.* == {
+        case #char "=";
+            token.operation = .pipe_equal;
+            t += 1;
+        case #char "|";
+            token.operation = .double_pipe;
+            t += 1;
+    }
+}
+
+parse_less_than :: (using tokenizer: *Rust_Tokenizer, token: *Token) {
+    token.type = .operation;
+    token.operation = .less_than;
+
+    t += 1;
+    if t >= max_t  return;
+
+    if t.* == {
+        case #char "=";
+            token.operation = .less_than_equal;
+            t += 1;
+        case #char "<";
+            token.operation = .double_less_than;
+            t += 1;
+    }
+}
+
+parse_greater_than :: (using tokenizer: *Rust_Tokenizer, token: *Token) {
+    token.type = .operation;
+    token.operation = .greater_than;
+
+    t += 1;
+    if t >= max_t  return;
+
+    if t.* == {
+        case #char "=";
+            token.operation = .greater_than_equal;
+            t += 1;
+    }
+}
+
+parse_tab :: (using tokenizer: *Rust_Tokenizer, token: *Token) {
+    token.type = .comment;
+    t += 1;
+    while t < max_t && t.* == #char "\t"  t += 1;
+}
+
+parse_slash_or_comment :: (using tokenizer: *Rust_Tokenizer, token: *Token) {
+    token.type = .operation;
+    token.operation = .slash;
+
+    t += 1;
+    if t >= max_t  return;
+
+    if t.* == {
+        case #char "=";
+            token.operation = .slash_equal;
+            t += 1;
+        case #char "/";
+            token.type = .comment;
+            t += 1;
+            while t < max_t && t.* != #char "\n"  t += 1;
+        case #char "*";
+            token.type = .multiline_comment;
+            t += 1;
+            num_open_comments := 0;  // Rust block comments can nest, so track the nesting depth
+            while t + 1 < max_t {
+                if t.* == #char "*" && (t + 1).* == #char "/" {
+                    if num_open_comments == 0 {
+                        t += 2;
+                        break;
+                    } else {
+                        num_open_comments -= 1;
+                    }
+                } else if t.* == #char "/" && (t + 1).* == #char "*" {
+                    num_open_comments += 1;
+                    t += 1;
+                }
+                t += 1;
+            }
+    }
+}
+
+parse_string_literal :: (using tokenizer: *Rust_Tokenizer, token: *Token) {
+    token.type = .string_literal;
+
+    escape_seen := false;
+
+    t += 1;
+    while t < max_t {
+        if t.* == #char "\"" && !escape_seen  break;
+        escape_seen = !escape_seen && t.* == #char "\\";
+        t += 1;
+    }
+    if t >= max_t  return;
+
+    t += 1;
+}
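+
+// Illustration (a comment, not executed): in "a\"b" the escaped quote does not terminate the
+// string, and in "a\\" the second backslash does not escape the closing quote, because
+// escape_seen toggles off after an escaped character.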
+
+read_identifier_string_tmp :: (using tokenizer: *Rust_Tokenizer) -> string /* temp */ {
+    identifier: [..] u8;
+    identifier.allocator = temp;
+
+    array_add(*identifier, t.*);
+
+    t += 1;
+    slash_mode := false;
+
+    while t < max_t {
+        c := t.*;
+        if is_alnum(c)      { t += 1; slash_mode = false; array_add(*identifier, c); continue; }
+        if c == #char "\\"  { t += 1; slash_mode = true;  continue; }
+        if slash_mode && is_white_space(c) { t += 1; continue; }
+        break;
+    }
+    if t >= max_t then t = max_t;
+
+    return cast(string) identifier;
+}
+
+start_scope :: (using tokenizer: *Rust_Tokenizer, offset: s64, kind: Buffer_Region.Kind) {
+    if current_scope_id >= 0 then end_scope(tokenizer, offset);
+    current_scope_id = regions.count;
+
+    region := Buffer_Region.{
+        start = xx offset,
+        end   = -1,
+        kind  = kind,
+    };
+    array_add(*regions, region);
+}
+
+end_scope :: inline (using tokenizer: *Rust_Tokenizer, offset: s64) {
+    regions[current_scope_id].end = xx offset;
+}
+
+Rust_Tokenizer :: struct {
+    using #as base: Tokenizer;
+
+    regions: [..] Buffer_Region;
+    regions.allocator = temp;
+    current_scope_id := -1;
+
+    last_tokens: [2] Token;
+}
+
+Token :: struct {
+    start, len: s32;
+    type: Type = .invalid;
+
+    // Additional info to distinguish between keywords/punctuation
+    union {
+        keyword:     Keyword;
+        punctuation: Punctuation;
+        operation:   Operation;
+    }
+
+    Type :: enum u16 {
+        eof;
+
+        identifier;
+        string_literal;
+        multiline_string;
+        number;
+        comment;
+        multiline_comment;
+        operation;
+        punctuation;
+        keyword;
+        type_keyword;
+        value_keyword;
+        lifetime;
+        macro;
+        invalid;
+    }
+}
+
+// Must match the order of the types in the enum
+COLOR_MAP :: Color.[
+    .CODE_COMMENT,      // eof - obviously not used
+    .CODE_DEFAULT,      // identifier
+    .CODE_STRING,       // string_literal
+    .CODE_STRING,       // multiline_string
+    .CODE_VALUE,        // number
+    .CODE_COMMENT,      // comment
+    .CODE_COMMENT,      // multiline_comment
+    .CODE_OPERATION,    // operation
+    .CODE_PUNCTUATION,  // punctuation
+    .CODE_KEYWORD,      // keyword
+    .CODE_TYPE,         // type_keyword
+    .CODE_VALUE,        // value_keyword
+    .CODE_VALUE,        // lifetime
+    .CODE_OPERATION,    // macro
+    .CODE_ERROR,        // invalid
+];
+
+PUNCTUATION :: string.[
+    "dollar", "semicolon", "l_paren", "r_paren", "l_brace", "r_brace", "l_bracket", "r_bracket",
+    "period", "comma", "hash",
+];
+
+OPERATIONS :: string.[
+    "arrow", "bang", "backtick", "pipe", "double_pipe", "pipe_equal", "equal", "equal_equal", "bang_equal",
+    "percent", "percent_equal", "less_than", "double_less_than", "less_than_equal", "greater_than", "greater_than_equal",
+    "minus", "minus_equal", "triple_dash", "asterisk", "asterisk_equal", "colon", "colon_equal", "double_colon", "slash",
+    "plus", "plus_equal", "slash_equal", "ampersand", "double_ampersand", "ampersand_equal", "tilde", "unknown",
+    "caret", "caret_equal", "double_period", "period_asterisk", "question_mark",
+];
+
+KEYWORDS :: string.[
+    "as", "break", "const", "continue", "crate", "else", "enum", "extern", "fn", "for", "if", "impl", "in", "let",
+    "loop", "match", "mod", "move", "mut", "pub", "ref", "return", "static", "struct", "super", "trait",
+    "type", "unsafe", "use", "where", "while",
+    "async", "await", "dyn", "try",
+    "abstract", "become", "box", "do", "final", "macro", "override", "priv", "typeof", "unsized", "virtual", "yield",
+];
+
+TYPE_KEYWORDS :: string.[
+    "bool", "f32", "f64",
+    "i8", "i16", "i32", "i64", "i128", "isize",
+    "u8", "u16", "u32", "u64", "u128", "usize",
+    "str", "char",
+    "Self",
+];
+
+VALUE_KEYWORDS :: string.[
+    "true", "false", "self",
+    "Some", "None", "Ok", "Err",
+];
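+
+// The #insert below generates the Punctuation, Operation and Keyword enums from the string
+// arrays above. For example (illustrative), the Keyword enum comes out as:
+//     Keyword :: enum u16 {
+//         kw_as;
+//         kw_break;
+//         ...
+//     }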
+
+#insert -> string {
+    b: String_Builder;
+    init_string_builder(*b);
+
+    define_enum :: (b: *String_Builder, enum_name: string, prefix: string, value_lists: [][] string) {
+        print_to_builder(b, "% :: enum u16 {\n", enum_name);
+        for values : value_lists {
+            for v : values  print_to_builder(b, "    %0%;\n", prefix, v);
+        }
+        print_to_builder(b, "}\n");
+    }
+
+    define_enum(*b, "Punctuation", "",    .[PUNCTUATION]);
+    define_enum(*b, "Operation",   "",    .[OPERATIONS]);
+    define_enum(*b, "Keyword",     "kw_", .[KEYWORDS, TYPE_KEYWORDS, VALUE_KEYWORDS]);
+
+    return builder_to_string(*b);
+}
+
+Keyword_Token :: struct {
+    type:    Token.Type;
+    keyword: Keyword;
+}
+
+KEYWORD_MAP :: #run -> Table(string, Keyword_Token) {
+    table: Table(string, Keyword_Token);
+    size := 10 * (KEYWORDS.count + TYPE_KEYWORDS.count + VALUE_KEYWORDS.count);
+    init(*table, size);
+
+    #insert -> string {
+        b: String_Builder;
+        for KEYWORDS        append(*b, sprint("table_add(*table, \"%1\", Keyword_Token.{ type = .keyword,       keyword = .kw_%1 });\n", it));
+        for TYPE_KEYWORDS   append(*b, sprint("table_add(*table, \"%1\", Keyword_Token.{ type = .type_keyword,  keyword = .kw_%1 });\n", it));
+        for VALUE_KEYWORDS  append(*b, sprint("table_add(*table, \"%1\", Keyword_Token.{ type = .value_keyword, keyword = .kw_%1 });\n", it));
+        return builder_to_string(*b);
+    }
+
+    return table;
+}
+
+MAX_KEYWORD_LENGTH :: #run -> s32 {
+    result: s64;
+    for KEYWORDS       { if it.count > result then result = it.count; }
+    for TYPE_KEYWORDS  { if it.count > result then result = it.count; }
+    for VALUE_KEYWORDS { if it.count > result then result = it.count; }
+    return xx result;
+}
diff --git a/src/main.jai b/src/main.jai
index 61daa09b5..aff86ca4c 100644
--- a/src/main.jai
+++ b/src/main.jai
@@ -908,6 +908,7 @@ dont_ignore_next_window_resize := false;
 #load "langs/yang.jai";
 #load "langs/zig.jai";
 #load "langs/uxntal.jai";
+#load "langs/rust.jai";
 
 #if OS == .WINDOWS {
     #load "platform/windows.jai";