From 311dc317fa8e8bf9f3c0faf515a1f5f13108e2cd Mon Sep 17 00:00:00 2001 From: Daniel Thaler Date: Fri, 21 Jun 2024 20:27:10 +0200 Subject: [PATCH] a2ml tokenizer refactoring Like the a2l tokenizer it now runs on bytes instead of chars: all relevant elements are ASCII; only comments could contain UTF-8. For readability the code was split into several functions. --- Cargo.lock | 159 +++++++++- a2lfile/Cargo.toml | 2 + a2lfile/src/a2ml.rs | 557 +++++++++++++++++------------------ a2lfile/src/ifdata.rs | 2 +- a2lfile/src/lib.rs | 10 +- a2lfile/src/loader.rs | 2 +- a2lfile/src/specification.rs | 6 +- a2lfile/tests/test.rs | 8 +- 8 files changed, 441 insertions(+), 305 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 936c6c0..4921aa1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7,6 +7,7 @@ name = "a2lfile" version = "2.0.0" dependencies = [ "a2lmacros", + "tempfile", "thiserror", ] @@ -18,49 +19,114 @@ dependencies = [ "quote", ] +[[package]] +name = "bitflags" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "errno" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fastrand" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" + +[[package]] +name = "libc" +version = "0.2.155" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" + +[[package]] +name = 
"linux-raw-sys" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" + [[package]] name = "proc-macro2" -version = "1.0.79" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.35" +version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] +[[package]] +name = "rustix" +version = "0.38.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + [[package]] name = "syn" -version = "2.0.58" +version = "2.0.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44cfb93f38070beee36b3fef7d4f5a16f27751d94b187b666a5cc5e9b0d30687" +checksum = "ff8655ed1d86f3af4ee3fd3263786bc14245ad17c4c7e85ba7187fb3ae028c90" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] +[[package]] +name = "tempfile" +version = "3.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" +dependencies = [ + "cfg-if", + "fastrand", + "rustix", + "windows-sys", +] + [[package]] name = "thiserror" -version = "1.0.58" +version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297" +checksum = 
"c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.58" +version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" +checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" dependencies = [ "proc-macro2", "quote", @@ -72,3 +138,76 @@ name = "unicode-ident" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" + +[[package]] +name = 
"windows_i686_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" diff --git a/a2lfile/Cargo.toml b/a2lfile/Cargo.toml index 50dfdef..a3733df 100644 --- a/a2lfile/Cargo.toml +++ b/a2lfile/Cargo.toml @@ -16,3 +16,5 @@ version = "2.0.0" [dependencies] thiserror = "1.0.50" +[dev-dependencies] +tempfile = "3.8" diff --git a/a2lfile/src/a2ml.rs b/a2lfile/src/a2ml.rs index 2f7a22d..0065dae 100644 --- a/a2lfile/src/a2ml.rs +++ b/a2lfile/src/a2ml.rs @@ -1,5 +1,6 @@ -use super::{tokenizer, loader}; use super::writer::{TaggedItemInfo, Writer}; +use super::{loader, tokenizer}; +use std::borrow::Cow; use std::collections::HashMap; use std::path::Path; @@ -120,298 +121,276 @@ pub enum GenericIfData { // tokenize() // Tokenize the text of the a2ml section -fn tokenize_a2ml(filename: String, input: &str, complete_string: &mut String) -> Result, String> { +fn tokenize_a2ml(filename: &str, input: &str) -> Result<(Vec, String), String> { let mut amltokens = Vec::::new(); - let mut remaining = input; + let input_bytes = input.as_bytes(); + 
let datalen = input_bytes.len(); + let mut bytepos = 0; + let mut complete_string = String::with_capacity(datalen); + let mut copypos = 0; - while !remaining.is_empty() { - let mut chars = remaining.char_indices(); - let (mut idx, mut c) = chars.next().unwrap(); - let mut append_to_complete = true; + while bytepos < datalen { + let startpos = bytepos; + let c = input_bytes[bytepos]; - if c.is_ascii_whitespace() { + if input_bytes[bytepos].is_ascii_whitespace() { /* skip whitespace */ - while c.is_ascii_whitespace() { - let pair = chars.next().unwrap_or((idx + 1, '\0')); - idx = pair.0; - c = pair.1; - } - } else if remaining.starts_with("/*") { - /* get a block comment */ - chars.next(); /* skip over the '*' char of the opening sequence */ - let mut done = false; - let mut star = false; - while !done { - if let Some(pair) = chars.next() { - idx = pair.0; - c = pair.1; - if c == '*' { - star = true; - } else if c == '/' && star { - done = true; - } else { - star = false; - } - } else { - let displaylen = if remaining.len() > 16 { - 16 - } else { - remaining.len() - }; - // slicing remaining in arbitrary ways is not safe, the end might be in the middle of a utf-8 sequence, so from_utf8_lossy is needed - let errtxt = String::from_utf8_lossy(&remaining.as_bytes()[..displaylen]); - return Err(format!("unclosed block quote starting with \"{errtxt}\"")); - } - } - // chomp the last / - idx += 1; - } else if remaining.starts_with("//") { - /* get a line comment */ - loop { - if let Some(pair) = chars.next() { - idx = pair.0; - c = pair.1; - if c == '\n' { - break; - } - } else { - idx = remaining.len() - 1; // results in an empty remaining - break; - } - } - // add the initial extra / in // - idx += 1; - } else if remaining.starts_with("/include") { - // skip the first elements (include = 0..6) - chars.nth(6); - let mut state = 0; - let mut fname_idx_start = 0; - let fname_idx_end; - - // skip the whitespaces - loop { - let pair = chars.next().unwrap_or((idx + 1, '\0')); - 
idx = pair.0; - c = pair.1; - if state == 0 && c.is_ascii_whitespace() { - // just skip whitespaces - } else if state == 0 && tokenizer::is_pathchar(c as u8) { - // start a non quoted filename - state = 1; - fname_idx_start = idx; - } else if state == 1 && tokenizer::is_pathchar(c as u8) { - // in non quoted filename - } else if state == 1 && (c.is_ascii_whitespace() || c == '\0') { - // end of non quoted filename - fname_idx_end = idx; - break; - } else if state == 0 && c == '"' { - // start a quoted filename - state = 2; - } else if state == 2 && tokenizer::is_pathchar(c as u8) { - // first byte of a quoted filename - state = 3; - fname_idx_start = idx; - } else if state == 3 && tokenizer::is_pathchar(c as u8) { - // in a quoted filename - } else if state == 3 && c == '"' { - // end of non quoted filename - fname_idx_end = idx; - // chomp the '"' - idx += 1; - break; - } - else { - let displaylen = if remaining.len() > 16 { - 16 - } else { - remaining.len() - }; - // slicing localremaining in arbitrary ways is not safe, the end might be in the middle of a utf-8 sequence, so from_utf8_lossy is needed - let errtxt = String::from_utf8_lossy(&remaining.as_bytes()[..displaylen]); - return Err(format!("failed parsing a2ml include filename in {errtxt}")); - } - } - // if the current filename was not provided (unit tests..), do not try to parse the include file - if !filename.is_empty() { - let incfilename = loader::make_include_filename(&remaining[fname_idx_start..fname_idx_end], &filename); - - // check if incname is an accessible file - let incpathref = Path::new(&incfilename); - let loadresult = loader::load(incpathref); - if let Ok(incfiledata) = loadresult { - let mut tokresult = tokenize_a2ml(incpathref.display().to_string(), &incfiledata, complete_string)?; - // append the tokens from the included file(s) - amltokens.append(&mut tokresult); - } else { - return Err(format!("failed reading {}", incpathref.display())); - } - } - append_to_complete = false; - } else 
if c == '"' { - /* tag - it is enclosed in double quotes, but contains neither spaces nor escape characters */ - loop { - let pair = chars.next().unwrap_or((idx + 1, '\0')); - idx = pair.0; - c = pair.1; - if c == '"' || c == '\0' { - break; - } - } - if c == '"' { - let tag = &remaining[1..idx]; - amltokens.push(TokenType::Tag(tag.to_string())); - idx += 1; - } else { - let displaylen = if remaining.len() > 16 { - 16 - } else { - remaining.len() - }; - // slicing remaining in arbitrary ways is not safe, the end might be in the middle of a utf-8 sequence, so from_utf8_lossy is needed - let errtxt = String::from_utf8_lossy(&remaining.as_bytes()[..displaylen]); - return Err(format!("unclosed tag string starting with {errtxt}")); - } - } else if c == ';' { + while bytepos < datalen && input_bytes[bytepos].is_ascii_whitespace() { + bytepos += 1; + } + } else if input_bytes[bytepos..].starts_with(b"/*") { + /* skip a block comment */ + bytepos += 2; // just past the initial "/*" + while bytepos < datalen && !input_bytes[bytepos..].starts_with(b"*/") { + bytepos += 1; + } + + if bytepos >= datalen { + let errtxt = make_errtxt(startpos, input_bytes); + return Err(format!("unclosed block quote starting with \"{errtxt}\"")); + } + + // chomp the closing "*/" + bytepos += 2; + } else if input_bytes[bytepos..].starts_with(b"//") { + /* skip a line comment */ + while bytepos < datalen && input_bytes[bytepos] != b'\n' { + bytepos += 1; + } + if bytepos < datalen { + // skip the final '\n' + bytepos += 1; + } + } else if input_bytes[bytepos..].starts_with(b"/include") { + // copy any uncopied text before the include token + complete_string.push_str(&input[copypos..startpos]); + let (mut tokresult, incfile_text) = tokenize_include(filename, input, &mut bytepos)?; + complete_string.push_str(&incfile_text); + copypos = bytepos; + + // append the tokens from the included file(s) + amltokens.append(&mut tokresult); + } else if c == b'"' { + let token = tokenize_tag(input, &mut 
bytepos)?; + amltokens.push(token); + } else if c == b';' { amltokens.push(TokenType::Semicolon); - idx = 1; - } else if c == ',' { + bytepos += 1; + } else if c == b',' { amltokens.push(TokenType::Comma); - idx = 1; - } else if c == '{' { + bytepos += 1; + } else if c == b'{' { amltokens.push(TokenType::OpenCurlyBracket); - idx = 1; - } else if c == '}' { + bytepos += 1; + } else if c == b'}' { amltokens.push(TokenType::ClosedCurlyBracket); - idx = 1; - } else if c == '[' { + bytepos += 1; + } else if c == b'[' { amltokens.push(TokenType::OpenSquareBracket); - idx = 1; - } else if c == ']' { + bytepos += 1; + } else if c == b']' { amltokens.push(TokenType::ClosedSquareBracket); - idx = 1; - } else if c == '(' { + bytepos += 1; + } else if c == b'(' { amltokens.push(TokenType::OpenRoundBracket); - idx = 1; - } else if c == ')' { + bytepos += 1; + } else if c == b')' { amltokens.push(TokenType::ClosedRoundBracket); - idx = 1; - } else if c == '*' { + bytepos += 1; + } else if c == b'*' { amltokens.push(TokenType::Repeat); - idx = 1; - } else if c == '=' { + bytepos += 1; + } else if c == b'=' { amltokens.push(TokenType::Equals); - idx = 1; + bytepos += 1; } else if c.is_ascii_digit() { - loop { - let pair = chars.next().unwrap_or((idx + 1, '\0')); - idx = pair.0; - c = pair.1; - if !c.is_ascii_alphanumeric() && c != '_' { - break; - } - } - let num_text = &remaining[0..idx]; - if let Some(hexval) = num_text.strip_prefix("0x") { - // hex constant - if let Ok(number) = i32::from_str_radix(hexval, 16) { - amltokens.push(TokenType::Constant(number)); - } else { - return Err(format!("Invalid sequence in AML: {num_text}")); - } - } else { - // not hex format -> must be decimal - if let Ok(number) = num_text.parse::() { - amltokens.push(TokenType::Constant(number)); - } else { - return Err(format!("Invalid sequence in AML: {num_text}")); - } - } - } else if c.is_ascii_alphabetic() || c == '_' { - loop { - let pair = chars.next().unwrap_or((idx + 1, '\0')); - idx = pair.0; 
- c = pair.1; - if !c.is_ascii_alphanumeric() && c != '_' { - break; - } - } - let kw_or_ident = &remaining[..idx]; - match kw_or_ident { - "char" => { - amltokens.push(TokenType::Char); - } - "int" => { - amltokens.push(TokenType::Int); - } - "long" => { - amltokens.push(TokenType::Long); - } - "int64" => { - amltokens.push(TokenType::Int64); - } - "uint" => { - amltokens.push(TokenType::Uint); - } - "uchar" => { - amltokens.push(TokenType::Uchar); - } - "ulong" => { - amltokens.push(TokenType::Ulong); - } - "uint64" => { - amltokens.push(TokenType::Uint64); - } - "double" => { - amltokens.push(TokenType::Double); - } - "float" => { - amltokens.push(TokenType::Float); - } - "block" => { - amltokens.push(TokenType::Block); - } - "enum" => { - amltokens.push(TokenType::Enum); - } - "struct" => { - amltokens.push(TokenType::Struct); - } - "taggedstruct" => { - amltokens.push(TokenType::Taggedstruct); - } - "taggedunion" => { - amltokens.push(TokenType::Taggedunion); - } - _ => { - amltokens.push(TokenType::Identifier(kw_or_ident.to_string())); - } - } + // tokenize a number, either decimal or hexadecimal + let token = tokenize_number(input, &mut bytepos)?; + amltokens.push(token); + } else if c.is_ascii_alphabetic() || c == b'_' { + // tokenize a keyword (int, long, etc.) 
or an identifier, both of which are non-quoted text + let token = tokenize_keyword_ident(input, &mut bytepos); + amltokens.push(token); } else { - let displaylen = if remaining.len() > 16 { - 16 - } else { - remaining.len() - }; - // slicing remaining in arbitrary ways is not safe, the end might be in the middle of a utf-8 sequence, so from_utf8_lossy is needed - let errtxt = String::from_utf8_lossy(&remaining.as_bytes()[..displaylen]); + let errtxt = make_errtxt(startpos, input_bytes); return Err(format!("Unable to tokenize: {errtxt}...")); } - if append_to_complete { - complete_string.push_str(&remaining[..idx]) + } + complete_string.push_str(&input[copypos..datalen]); + + Ok((amltokens, complete_string)) +} + +fn tokenize_tag(input: &str, bytepos: &mut usize) -> Result { + let input_bytes = input.as_bytes(); + let datalen = input_bytes.len(); + let startpos = *bytepos; + + *bytepos += 1; + let mut c = input_bytes[*bytepos]; + while *bytepos < datalen { + c = input_bytes[*bytepos]; + if c == b'"' { + break; + } + *bytepos += 1; + } + /* tag - it is enclosed in double quotes, but contains neither spaces nor escape characters */ + if c == b'"' { + let tag = &input[(startpos + 1)..*bytepos]; + *bytepos += 1; + Ok(TokenType::Tag(tag.to_string())) + } else { + let errtxt = make_errtxt(startpos, input_bytes); + Err(format!("unclosed tag string starting with {errtxt}")) + } +} + +fn tokenize_include( + filename: &str, + input: &str, + bytepos: &mut usize, +) -> Result<(Vec, String), String> { + let input_bytes = input.as_bytes(); + let datalen = input_bytes.len(); + let startpos = *bytepos; + + *bytepos += 8; + let mut state = 0; + let mut fname_idx_start = 0; + let fname_idx_end; + loop { + let c = if *bytepos < datalen { + input_bytes[*bytepos] + } else { + b'\0' + }; + + if state == 0 && c.is_ascii_whitespace() { + // just skip whitespaces + } else if state == 0 && tokenizer::is_pathchar(c) { + // start a non quoted filename + state = 1; + fname_idx_start = *bytepos; 
+ } else if state == 1 && tokenizer::is_pathchar(c) { + // in non quoted filename + } else if state == 1 && (c.is_ascii_whitespace() || c == b'\0') { + // end of non quoted filename + fname_idx_end = *bytepos; + break; + } else if state == 0 && c == b'"' { + // start a quoted filename + state = 2; + } else if state == 2 && tokenizer::is_pathchar(c) { + // first byte of a quoted filename + state = 3; + fname_idx_start = *bytepos; + } else if state == 3 && tokenizer::is_pathchar(c) { + // in a quoted filename + } else if state == 3 && c == b'"' { + // end of non quoted filename + fname_idx_end = *bytepos; + // chomp the '"' + *bytepos += 1; + break; + } else { + let errtxt = make_errtxt(startpos, input_bytes); + return Err(format!("failed parsing a2ml include filename in {errtxt}")); + } + *bytepos += 1; + } + + let incname = &input[fname_idx_start..fname_idx_end]; + let incfilename = loader::make_include_filename(incname, filename); + + // check if incname is an accessible file + let incpathref = Path::new(&incfilename); + let loadresult = loader::load(incpathref); + if let Ok(incfiledata) = loadresult { + tokenize_a2ml(incpathref.to_string_lossy().as_ref(), &incfiledata) + } else { + Err(format!("failed reading {}", incpathref.display())) + } +} + +fn tokenize_number(input: &str, bytepos: &mut usize) -> Result { + let input_bytes = input.as_bytes(); + let datalen = input_bytes.len(); + let startpos = *bytepos; + + while *bytepos < datalen { + let c = input_bytes[*bytepos]; + if !c.is_ascii_alphanumeric() && c != b'_' { + break; + } + *bytepos += 1; + } + let num_text = &input[startpos..*bytepos]; + if let Some(hexval) = num_text.strip_prefix("0x") { + // hex constant + if let Ok(number) = i32::from_str_radix(hexval, 16) { + Ok(TokenType::Constant(number)) + } else { + Err(format!("Invalid sequence in AML: {num_text}")) + } + } else { + // not hex format -> must be decimal + if let Ok(number) = num_text.parse::() { + Ok(TokenType::Constant(number)) + } else { + 
Err(format!("Invalid sequence in AML: {num_text}")) + } + } +} + +fn tokenize_keyword_ident(input: &str, bytepos: &mut usize) -> TokenType { + let input_bytes = input.as_bytes(); + let datalen = input_bytes.len(); + let startpos = *bytepos; + while *bytepos < datalen { + let c = input_bytes[*bytepos]; + if !c.is_ascii_alphanumeric() && c != b'_' { + break; } - remaining = &remaining[idx..]; + *bytepos += 1; + } + let kw_or_ident = &input[startpos..*bytepos]; + match kw_or_ident { + "char" => TokenType::Char, + "int" => TokenType::Int, + "long" => TokenType::Long, + "int64" => TokenType::Int64, + "uint" => TokenType::Uint, + "uchar" => TokenType::Uchar, + "ulong" => TokenType::Ulong, + "uint64" => TokenType::Uint64, + "double" => TokenType::Double, + "float" => TokenType::Float, + "block" => TokenType::Block, + "enum" => TokenType::Enum, + "struct" => TokenType::Struct, + "taggedstruct" => TokenType::Taggedstruct, + "taggedunion" => TokenType::Taggedunion, + _ => TokenType::Identifier(kw_or_ident.to_string()), } +} - Ok(amltokens) +fn make_errtxt(pos: usize, input_bytes: &[u8]) -> Cow { + let datalen = input_bytes.len(); + let endpos = if pos + 16 < datalen { + pos + 16 + } else { + datalen + }; + // slicing remaining in arbitrary ways is not safe, the end might be in the middle of a utf-8 sequence, so from_utf8_lossy is needed + String::from_utf8_lossy(&input_bytes[pos..endpos]) } // parse an a2ml fragment in an a2l file // The target data structure is the parsing definition used by the a2l parser, so that the // a2ml can control the parsing of IF_DATA blocks -pub(crate) fn parse_a2ml(filename: String, input: &str) -> Result<(A2mlTypeSpec, String), String> { - let mut complete_string = String::with_capacity(input.len()); - let tok_result = tokenize_a2ml(filename, input, &mut complete_string)?; +pub(crate) fn parse_a2ml(filename: &str, input: &str) -> Result<(A2mlTypeSpec, String), String> { + let (tok_result, complete_string) = tokenize_a2ml(filename, input)?; let 
mut tok_iter = tok_result.iter().peekable(); let mut ifdata_block: Option = None; @@ -1356,60 +1335,66 @@ impl PartialEq for GenericIfDataTaggedItem { #[cfg(test)] mod test { use super::*; + use tempfile::tempdir; #[test] fn tokenize() { - let mut complete_string = String::new(); - let tokenvec = tokenize_a2ml(String::new(), " ", &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", " ").unwrap(); assert!(tokenvec.is_empty()); - let tokenvec = tokenize_a2ml(String::new(), "/* // */", &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", "/* // */").unwrap(); assert!(tokenvec.is_empty()); - let tokenvec = tokenize_a2ml(String::new(), "/*/*/", &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", "/*/*/").unwrap(); assert!(tokenvec.is_empty()); - let tokenvec = tokenize_a2ml(String::new(), "/***/", &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", "/***/").unwrap(); assert!(tokenvec.is_empty()); - let tokenvec_err = tokenize_a2ml(String::new(), "/* ", &mut complete_string); + let tokenvec_err = tokenize_a2ml("test", "/* "); assert!(tokenvec_err.is_err()); - let tokenvec = tokenize_a2ml(String::new(), "//*/", &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", "//*/").unwrap(); assert!(tokenvec.is_empty()); - let tokenvec = tokenize_a2ml(String::new(), r#""TAG""#, &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", r#""TAG""#).unwrap(); assert_eq!(tokenvec.len(), 1); let _tag = TokenType::Tag("TAG".to_string()); assert!(matches!(&tokenvec[0], _tag)); - let tokenvec = tokenize_a2ml(String::new(), ";", &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", ";").unwrap(); assert_eq!(tokenvec.len(), 1); assert!(matches!(tokenvec[0], TokenType::Semicolon)); - let tokenvec = tokenize_a2ml(String::new(), "0", &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", "0").unwrap(); 
assert_eq!(tokenvec.len(), 1); assert!(matches!(tokenvec[0], TokenType::Constant(0))); - let tokenvec = tokenize_a2ml(String::new(), "0x03", &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", "0x03").unwrap(); assert_eq!(tokenvec.len(), 1); assert!(matches!(tokenvec[0], TokenType::Constant(3))); - let tokenvec = tokenize_a2ml(String::new(), "123456", &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", "123456").unwrap(); assert_eq!(tokenvec.len(), 1); assert!(matches!(tokenvec[0], TokenType::Constant(123456))); - let tokenvec = tokenize_a2ml(String::new(), r#"/include "testfile""#, &mut complete_string).unwrap(); + // set the current working directory to a temp dir + let dir = tempdir().unwrap(); + std::env::set_current_dir(&dir.path()).unwrap(); + + // create the empty "testfile" so that it can be included + std::fs::File::create_new("testfile").unwrap(); + + let (tokenvec, _) = tokenize_a2ml("test", r#"/include "testfile""#).unwrap(); assert_eq!(tokenvec.len(), 0); - let tokenvec = tokenize_a2ml(String::new(), r#"/include"testfile""#, &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", r#"/include"testfile""#).unwrap(); assert_eq!(tokenvec.len(), 0); - let tokenvec = tokenize_a2ml(String::new(), r#"/include testfile"#, &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", r#"/include testfile"#).unwrap(); assert_eq!(tokenvec.len(), 0); - let err_result = tokenize_a2ml(String::new(), r#"/include "testfile_unclosed_quote"#, &mut complete_string); + let err_result = tokenize_a2ml("test", r#"/include "testfile_unclosed_quote"#); assert!(err_result.is_err()); - let err_result = tokenize_a2ml(String::new(), r#" "unclosed "#, &mut complete_string); + let err_result = tokenize_a2ml("test", r#" "unclosed "#); assert!(err_result.is_err()); - } #[test] @@ -1527,7 +1512,7 @@ mod test { A2mlTypeSpec::TaggedStruct(taggedstruct_hashmap), ]); - let parse_result = 
parse_a2ml(String::new(), TEST_INPUT); + let parse_result = parse_a2ml("test", TEST_INPUT); assert!(parse_result.is_ok()); let (a2ml_spec, _complete_string) = parse_result.unwrap(); println!("{:?}", a2ml_spec); diff --git a/a2lfile/src/ifdata.rs b/a2lfile/src/ifdata.rs index 2e78a1f..3dd3258 100644 --- a/a2lfile/src/ifdata.rs +++ b/a2lfile/src/ifdata.rs @@ -753,7 +753,7 @@ mod ifdata_test { &mut log_msgs, false, ); - parser.builtin_a2mlspec = Some(a2lfile::a2ml::parse_a2ml(String::new(), A2MLTEST_TEXT).unwrap().0); + parser.builtin_a2mlspec = Some(a2lfile::a2ml::parse_a2ml("test", A2MLTEST_TEXT).unwrap().0); super::parse_ifdata( &mut parser, &a2lfile::ParseContext { diff --git a/a2lfile/src/lib.rs b/a2lfile/src/lib.rs index 172f2f5..279c416 100644 --- a/a2lfile/src/lib.rs +++ b/a2lfile/src/lib.rs @@ -206,8 +206,9 @@ fn load_impl( // if a built-in A2ml specification was passed as a string, then it is parsed here if let Some(spec) = a2ml_spec { parser.builtin_a2mlspec = Some( - a2ml::parse_a2ml(path.to_string_lossy().to_string(), &spec) - .map_err(|parse_err| A2lError::InvalidBuiltinA2mlSpec { parse_err })?.0, + a2ml::parse_a2ml(path.to_string_lossy().as_ref(), &spec) + .map_err(|parse_err| A2lError::InvalidBuiltinA2mlSpec { parse_err })? 
+ .0, ); } @@ -371,6 +372,7 @@ impl Module { #[cfg(test)] mod tests { use super::*; + use tempfile::tempdir; #[test] fn load_empty_file() { @@ -475,6 +477,10 @@ mod tests { #[test] fn write_with_banner() { + // set the current working directory to a temp dir + let dir = tempdir().unwrap(); + std::env::set_current_dir(&dir.path()).unwrap(); + let mut a2l = new(); a2l.asap2_version .as_mut() diff --git a/a2lfile/src/loader.rs b/a2lfile/src/loader.rs index 52cc612..007bca3 100644 --- a/a2lfile/src/loader.rs +++ b/a2lfile/src/loader.rs @@ -1,5 +1,5 @@ -use std::ffi::OsString; use crate::A2lError; +use std::ffi::OsString; use std::fs::File; use std::io::Read; use std::path::Path; diff --git a/a2lfile/src/specification.rs b/a2lfile/src/specification.rs index ee75f40..0496667 100644 --- a/a2lfile/src/specification.rs +++ b/a2lfile/src/specification.rs @@ -32117,12 +32117,12 @@ impl A2ml { let a2ml_text = parser.get_token_text(token).to_string(); let filename = &parser.filenames[context.fileid]; let merged_a2ml_text; - match a2ml::parse_a2ml(filename.to_string(), &a2ml_text) { + match a2ml::parse_a2ml(filename, &a2ml_text) { Ok((a2mlspec, computed_merged_a2ml_text)) => { parser.file_a2mlspec = Some(a2mlspec); merged_a2ml_text = computed_merged_a2ml_text; - }, - Err(errmsg) => { + } + Err(errmsg) => { parser.error_or_log(ParserError::A2mlError { filename: filename.to_string(), error_line: parser.last_token_position, diff --git a/a2lfile/tests/test.rs b/a2lfile/tests/test.rs index f261a4a..746ef96 100644 --- a/a2lfile/tests/test.rs +++ b/a2lfile/tests/test.rs @@ -1,8 +1,8 @@ #[cfg(test)] mod test { - use std::{collections::HashMap, vec}; - use a2lfile::*; + use std::{collections::HashMap, vec}; + use tempfile::tempdir; a2ml_specification! 
{ @@ -78,6 +78,10 @@ ASAP2_VERSION 1 61 #[test] fn full_test() { + // work in a tempdir + let dir = tempdir().unwrap(); + std::env::set_current_dir(&dir.path()).unwrap(); + let mut a2l_file = a2lfile::new(); a2l_file.a2ml_version = Some(A2mlVersion::new(1, 31));