From 311dc317fa8e8bf9f3c0faf515a1f5f13108e2cd Mon Sep 17 00:00:00 2001 From: Daniel Thaler Date: Fri, 21 Jun 2024 20:27:10 +0200 Subject: [PATCH] a2ml tokenizer refactoring Like the a2l tokenizer it now runs on bytes instead of chars: all relevant elements are ASCII; only comments could contain UTF-8. For readability the code was split into several functions. --- Cargo.lock | 159 +++++++++- a2lfile/Cargo.toml | 2 + a2lfile/src/a2ml.rs | 557 +++++++++++++++++------------------ a2lfile/src/ifdata.rs | 2 +- a2lfile/src/lib.rs | 10 +- a2lfile/src/loader.rs | 2 +- a2lfile/src/specification.rs | 6 +- a2lfile/tests/test.rs | 8 +- 8 files changed, 441 insertions(+), 305 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 936c6c0..4921aa1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7,6 +7,7 @@ name = "a2lfile" version = "2.0.0" dependencies = [ "a2lmacros", + "tempfile", "thiserror", ] @@ -18,49 +19,114 @@ dependencies = [ "quote", ] +[[package]] +name = "bitflags" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "errno" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fastrand" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" + +[[package]] +name = "libc" +version = "0.2.155" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" + +[[package]] +name = 
"linux-raw-sys" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" + [[package]] name = "proc-macro2" -version = "1.0.79" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.35" +version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] +[[package]] +name = "rustix" +version = "0.38.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + [[package]] name = "syn" -version = "2.0.58" +version = "2.0.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44cfb93f38070beee36b3fef7d4f5a16f27751d94b187b666a5cc5e9b0d30687" +checksum = "ff8655ed1d86f3af4ee3fd3263786bc14245ad17c4c7e85ba7187fb3ae028c90" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] +[[package]] +name = "tempfile" +version = "3.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" +dependencies = [ + "cfg-if", + "fastrand", + "rustix", + "windows-sys", +] + [[package]] name = "thiserror" -version = "1.0.58" +version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297" +checksum = 
"c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.58" +version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" +checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" dependencies = [ "proc-macro2", "quote", @@ -72,3 +138,76 @@ name = "unicode-ident" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" + +[[package]] +name = 
"windows_i686_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" diff --git a/a2lfile/Cargo.toml b/a2lfile/Cargo.toml index 50dfdef..a3733df 100644 --- a/a2lfile/Cargo.toml +++ b/a2lfile/Cargo.toml @@ -16,3 +16,5 @@ version = "2.0.0" [dependencies] thiserror = "1.0.50" +[dev-dependencies] +tempfile = "3.8" diff --git a/a2lfile/src/a2ml.rs b/a2lfile/src/a2ml.rs index 2f7a22d..0065dae 100644 --- a/a2lfile/src/a2ml.rs +++ b/a2lfile/src/a2ml.rs @@ -1,5 +1,6 @@ -use super::{tokenizer, loader}; use super::writer::{TaggedItemInfo, Writer}; +use super::{loader, tokenizer}; +use std::borrow::Cow; use std::collections::HashMap; use std::path::Path; @@ -120,298 +121,276 @@ pub enum GenericIfData { // tokenize() // Tokenize the text of the a2ml section -fn tokenize_a2ml(filename: String, input: &str, complete_string: &mut String) -> Result, String> { +fn tokenize_a2ml(filename: &str, input: &str) -> Result<(Vec, String), String> { let mut amltokens = Vec::::new(); - let mut remaining = input; + let input_bytes = input.as_bytes(); + 
let datalen = input_bytes.len(); + let mut bytepos = 0; + let mut complete_string = String::with_capacity(datalen); + let mut copypos = 0; - while !remaining.is_empty() { - let mut chars = remaining.char_indices(); - let (mut idx, mut c) = chars.next().unwrap(); - let mut append_to_complete = true; + while bytepos < datalen { + let startpos = bytepos; + let c = input_bytes[bytepos]; - if c.is_ascii_whitespace() { + if input_bytes[bytepos].is_ascii_whitespace() { /* skip whitespace */ - while c.is_ascii_whitespace() { - let pair = chars.next().unwrap_or((idx + 1, '\0')); - idx = pair.0; - c = pair.1; - } - } else if remaining.starts_with("/*") { - /* get a block comment */ - chars.next(); /* skip over the '*' char of the opening sequence */ - let mut done = false; - let mut star = false; - while !done { - if let Some(pair) = chars.next() { - idx = pair.0; - c = pair.1; - if c == '*' { - star = true; - } else if c == '/' && star { - done = true; - } else { - star = false; - } - } else { - let displaylen = if remaining.len() > 16 { - 16 - } else { - remaining.len() - }; - // slicing remaining in arbitrary ways is not safe, the end might be in the middle of a utf-8 sequence, so from_utf8_lossy is needed - let errtxt = String::from_utf8_lossy(&remaining.as_bytes()[..displaylen]); - return Err(format!("unclosed block quote starting with \"{errtxt}\"")); - } - } - // chomp the last / - idx += 1; - } else if remaining.starts_with("//") { - /* get a line comment */ - loop { - if let Some(pair) = chars.next() { - idx = pair.0; - c = pair.1; - if c == '\n' { - break; - } - } else { - idx = remaining.len() - 1; // results in an empty remaining - break; - } - } - // add the initial extra / in // - idx += 1; - } else if remaining.starts_with("/include") { - // skip the first elements (include = 0..6) - chars.nth(6); - let mut state = 0; - let mut fname_idx_start = 0; - let fname_idx_end; - - // skip the whitespaces - loop { - let pair = chars.next().unwrap_or((idx + 1, '\0')); - 
idx = pair.0; - c = pair.1; - if state == 0 && c.is_ascii_whitespace() { - // just skip whitespaces - } else if state == 0 && tokenizer::is_pathchar(c as u8) { - // start a non quoted filename - state = 1; - fname_idx_start = idx; - } else if state == 1 && tokenizer::is_pathchar(c as u8) { - // in non quoted filename - } else if state == 1 && (c.is_ascii_whitespace() || c == '\0') { - // end of non quoted filename - fname_idx_end = idx; - break; - } else if state == 0 && c == '"' { - // start a quoted filename - state = 2; - } else if state == 2 && tokenizer::is_pathchar(c as u8) { - // first byte of a quoted filename - state = 3; - fname_idx_start = idx; - } else if state == 3 && tokenizer::is_pathchar(c as u8) { - // in a quoted filename - } else if state == 3 && c == '"' { - // end of non quoted filename - fname_idx_end = idx; - // chomp the '"' - idx += 1; - break; - } - else { - let displaylen = if remaining.len() > 16 { - 16 - } else { - remaining.len() - }; - // slicing localremaining in arbitrary ways is not safe, the end might be in the middle of a utf-8 sequence, so from_utf8_lossy is needed - let errtxt = String::from_utf8_lossy(&remaining.as_bytes()[..displaylen]); - return Err(format!("failed parsing a2ml include filename in {errtxt}")); - } - } - // if the current filename was not provided (unit tests..), do not try to parse the include file - if !filename.is_empty() { - let incfilename = loader::make_include_filename(&remaining[fname_idx_start..fname_idx_end], &filename); - - // check if incname is an accessible file - let incpathref = Path::new(&incfilename); - let loadresult = loader::load(incpathref); - if let Ok(incfiledata) = loadresult { - let mut tokresult = tokenize_a2ml(incpathref.display().to_string(), &incfiledata, complete_string)?; - // append the tokens from the included file(s) - amltokens.append(&mut tokresult); - } else { - return Err(format!("failed reading {}", incpathref.display())); - } - } - append_to_complete = false; - } else 
if c == '"' { - /* tag - it is enclosed in double quotes, but contains neither spaces nor escape characters */ - loop { - let pair = chars.next().unwrap_or((idx + 1, '\0')); - idx = pair.0; - c = pair.1; - if c == '"' || c == '\0' { - break; - } - } - if c == '"' { - let tag = &remaining[1..idx]; - amltokens.push(TokenType::Tag(tag.to_string())); - idx += 1; - } else { - let displaylen = if remaining.len() > 16 { - 16 - } else { - remaining.len() - }; - // slicing remaining in arbitrary ways is not safe, the end might be in the middle of a utf-8 sequence, so from_utf8_lossy is needed - let errtxt = String::from_utf8_lossy(&remaining.as_bytes()[..displaylen]); - return Err(format!("unclosed tag string starting with {errtxt}")); - } - } else if c == ';' { + while bytepos < datalen && input_bytes[bytepos].is_ascii_whitespace() { + bytepos += 1; + } + } else if input_bytes[bytepos..].starts_with(b"/*") { + /* skip a block comment */ + bytepos += 2; // just past the initial "/*" + while bytepos < datalen && !input_bytes[bytepos..].starts_with(b"*/") { + bytepos += 1; + } + + if bytepos >= datalen { + let errtxt = make_errtxt(startpos, input_bytes); + return Err(format!("unclosed block quote starting with \"{errtxt}\"")); + } + + // chomp the closing "*/" + bytepos += 2; + } else if input_bytes[bytepos..].starts_with(b"//") { + /* skip a line comment */ + while bytepos < datalen && input_bytes[bytepos] != b'\n' { + bytepos += 1; + } + if bytepos < datalen { + // skip the final '\n' + bytepos += 1; + } + } else if input_bytes[bytepos..].starts_with(b"/include") { + // copy any uncopied text before the include token + complete_string.push_str(&input[copypos..startpos]); + let (mut tokresult, incfile_text) = tokenize_include(filename, input, &mut bytepos)?; + complete_string.push_str(&incfile_text); + copypos = bytepos; + + // append the tokens from the included file(s) + amltokens.append(&mut tokresult); + } else if c == b'"' { + let token = tokenize_tag(input, &mut 
bytepos)?; + amltokens.push(token); + } else if c == b';' { amltokens.push(TokenType::Semicolon); - idx = 1; - } else if c == ',' { + bytepos += 1; + } else if c == b',' { amltokens.push(TokenType::Comma); - idx = 1; - } else if c == '{' { + bytepos += 1; + } else if c == b'{' { amltokens.push(TokenType::OpenCurlyBracket); - idx = 1; - } else if c == '}' { + bytepos += 1; + } else if c == b'}' { amltokens.push(TokenType::ClosedCurlyBracket); - idx = 1; - } else if c == '[' { + bytepos += 1; + } else if c == b'[' { amltokens.push(TokenType::OpenSquareBracket); - idx = 1; - } else if c == ']' { + bytepos += 1; + } else if c == b']' { amltokens.push(TokenType::ClosedSquareBracket); - idx = 1; - } else if c == '(' { + bytepos += 1; + } else if c == b'(' { amltokens.push(TokenType::OpenRoundBracket); - idx = 1; - } else if c == ')' { + bytepos += 1; + } else if c == b')' { amltokens.push(TokenType::ClosedRoundBracket); - idx = 1; - } else if c == '*' { + bytepos += 1; + } else if c == b'*' { amltokens.push(TokenType::Repeat); - idx = 1; - } else if c == '=' { + bytepos += 1; + } else if c == b'=' { amltokens.push(TokenType::Equals); - idx = 1; + bytepos += 1; } else if c.is_ascii_digit() { - loop { - let pair = chars.next().unwrap_or((idx + 1, '\0')); - idx = pair.0; - c = pair.1; - if !c.is_ascii_alphanumeric() && c != '_' { - break; - } - } - let num_text = &remaining[0..idx]; - if let Some(hexval) = num_text.strip_prefix("0x") { - // hex constant - if let Ok(number) = i32::from_str_radix(hexval, 16) { - amltokens.push(TokenType::Constant(number)); - } else { - return Err(format!("Invalid sequence in AML: {num_text}")); - } - } else { - // not hex format -> must be decimal - if let Ok(number) = num_text.parse::() { - amltokens.push(TokenType::Constant(number)); - } else { - return Err(format!("Invalid sequence in AML: {num_text}")); - } - } - } else if c.is_ascii_alphabetic() || c == '_' { - loop { - let pair = chars.next().unwrap_or((idx + 1, '\0')); - idx = pair.0; 
- c = pair.1; - if !c.is_ascii_alphanumeric() && c != '_' { - break; - } - } - let kw_or_ident = &remaining[..idx]; - match kw_or_ident { - "char" => { - amltokens.push(TokenType::Char); - } - "int" => { - amltokens.push(TokenType::Int); - } - "long" => { - amltokens.push(TokenType::Long); - } - "int64" => { - amltokens.push(TokenType::Int64); - } - "uint" => { - amltokens.push(TokenType::Uint); - } - "uchar" => { - amltokens.push(TokenType::Uchar); - } - "ulong" => { - amltokens.push(TokenType::Ulong); - } - "uint64" => { - amltokens.push(TokenType::Uint64); - } - "double" => { - amltokens.push(TokenType::Double); - } - "float" => { - amltokens.push(TokenType::Float); - } - "block" => { - amltokens.push(TokenType::Block); - } - "enum" => { - amltokens.push(TokenType::Enum); - } - "struct" => { - amltokens.push(TokenType::Struct); - } - "taggedstruct" => { - amltokens.push(TokenType::Taggedstruct); - } - "taggedunion" => { - amltokens.push(TokenType::Taggedunion); - } - _ => { - amltokens.push(TokenType::Identifier(kw_or_ident.to_string())); - } - } + // tokenize a number, either decimal or hexadecimal + let token = tokenize_number(input, &mut bytepos)?; + amltokens.push(token); + } else if c.is_ascii_alphabetic() || c == b'_' { + // tokenize a keyword (int, long, etc.) 
or an identifier, both of which are non-quoted text + let token = tokenize_keyword_ident(input, &mut bytepos); + amltokens.push(token); } else { - let displaylen = if remaining.len() > 16 { - 16 - } else { - remaining.len() - }; - // slicing remaining in arbitrary ways is not safe, the end might be in the middle of a utf-8 sequence, so from_utf8_lossy is needed - let errtxt = String::from_utf8_lossy(&remaining.as_bytes()[..displaylen]); + let errtxt = make_errtxt(startpos, input_bytes); return Err(format!("Unable to tokenize: {errtxt}...")); } - if append_to_complete { - complete_string.push_str(&remaining[..idx]) + } + complete_string.push_str(&input[copypos..datalen]); + + Ok((amltokens, complete_string)) +} + +fn tokenize_tag(input: &str, bytepos: &mut usize) -> Result { + let input_bytes = input.as_bytes(); + let datalen = input_bytes.len(); + let startpos = *bytepos; + + *bytepos += 1; + let mut c = input_bytes[*bytepos]; + while *bytepos < datalen { + c = input_bytes[*bytepos]; + if c == b'"' { + break; + } + *bytepos += 1; + } + /* tag - it is enclosed in double quotes, but contains neither spaces nor escape characters */ + if c == b'"' { + let tag = &input[(startpos + 1)..*bytepos]; + *bytepos += 1; + Ok(TokenType::Tag(tag.to_string())) + } else { + let errtxt = make_errtxt(startpos, input_bytes); + Err(format!("unclosed tag string starting with {errtxt}")) + } +} + +fn tokenize_include( + filename: &str, + input: &str, + bytepos: &mut usize, +) -> Result<(Vec, String), String> { + let input_bytes = input.as_bytes(); + let datalen = input_bytes.len(); + let startpos = *bytepos; + + *bytepos += 8; + let mut state = 0; + let mut fname_idx_start = 0; + let fname_idx_end; + loop { + let c = if *bytepos < datalen { + input_bytes[*bytepos] + } else { + b'\0' + }; + + if state == 0 && c.is_ascii_whitespace() { + // just skip whitespaces + } else if state == 0 && tokenizer::is_pathchar(c) { + // start a non quoted filename + state = 1; + fname_idx_start = *bytepos; 
+ } else if state == 1 && tokenizer::is_pathchar(c) { + // in non quoted filename + } else if state == 1 && (c.is_ascii_whitespace() || c == b'\0') { + // end of non quoted filename + fname_idx_end = *bytepos; + break; + } else if state == 0 && c == b'"' { + // start a quoted filename + state = 2; + } else if state == 2 && tokenizer::is_pathchar(c) { + // first byte of a quoted filename + state = 3; + fname_idx_start = *bytepos; + } else if state == 3 && tokenizer::is_pathchar(c) { + // in a quoted filename + } else if state == 3 && c == b'"' { + // end of non quoted filename + fname_idx_end = *bytepos; + // chomp the '"' + *bytepos += 1; + break; + } else { + let errtxt = make_errtxt(startpos, input_bytes); + return Err(format!("failed parsing a2ml include filename in {errtxt}")); + } + *bytepos += 1; + } + + let incname = &input[fname_idx_start..fname_idx_end]; + let incfilename = loader::make_include_filename(incname, filename); + + // check if incname is an accessible file + let incpathref = Path::new(&incfilename); + let loadresult = loader::load(incpathref); + if let Ok(incfiledata) = loadresult { + tokenize_a2ml(incpathref.to_string_lossy().as_ref(), &incfiledata) + } else { + Err(format!("failed reading {}", incpathref.display())) + } +} + +fn tokenize_number(input: &str, bytepos: &mut usize) -> Result { + let input_bytes = input.as_bytes(); + let datalen = input_bytes.len(); + let startpos = *bytepos; + + while *bytepos < datalen { + let c = input_bytes[*bytepos]; + if !c.is_ascii_alphanumeric() && c != b'_' { + break; + } + *bytepos += 1; + } + let num_text = &input[startpos..*bytepos]; + if let Some(hexval) = num_text.strip_prefix("0x") { + // hex constant + if let Ok(number) = i32::from_str_radix(hexval, 16) { + Ok(TokenType::Constant(number)) + } else { + Err(format!("Invalid sequence in AML: {num_text}")) + } + } else { + // not hex format -> must be decimal + if let Ok(number) = num_text.parse::() { + Ok(TokenType::Constant(number)) + } else { + 
Err(format!("Invalid sequence in AML: {num_text}")) + } + } +} + +fn tokenize_keyword_ident(input: &str, bytepos: &mut usize) -> TokenType { + let input_bytes = input.as_bytes(); + let datalen = input_bytes.len(); + let startpos = *bytepos; + while *bytepos < datalen { + let c = input_bytes[*bytepos]; + if !c.is_ascii_alphanumeric() && c != b'_' { + break; } - remaining = &remaining[idx..]; + *bytepos += 1; + } + let kw_or_ident = &input[startpos..*bytepos]; + match kw_or_ident { + "char" => TokenType::Char, + "int" => TokenType::Int, + "long" => TokenType::Long, + "int64" => TokenType::Int64, + "uint" => TokenType::Uint, + "uchar" => TokenType::Uchar, + "ulong" => TokenType::Ulong, + "uint64" => TokenType::Uint64, + "double" => TokenType::Double, + "float" => TokenType::Float, + "block" => TokenType::Block, + "enum" => TokenType::Enum, + "struct" => TokenType::Struct, + "taggedstruct" => TokenType::Taggedstruct, + "taggedunion" => TokenType::Taggedunion, + _ => TokenType::Identifier(kw_or_ident.to_string()), } +} - Ok(amltokens) +fn make_errtxt(pos: usize, input_bytes: &[u8]) -> Cow { + let datalen = input_bytes.len(); + let endpos = if pos + 16 < datalen { + pos + 16 + } else { + datalen + }; + // slicing remaining in arbitrary ways is not safe, the end might be in the middle of a utf-8 sequence, so from_utf8_lossy is needed + String::from_utf8_lossy(&input_bytes[pos..endpos]) } // parse an a2ml fragment in an a2l file // The target data structure is the parsing definition used by the a2l parser, so that the // a2ml can control the parsing of IF_DATA blocks -pub(crate) fn parse_a2ml(filename: String, input: &str) -> Result<(A2mlTypeSpec, String), String> { - let mut complete_string = String::with_capacity(input.len()); - let tok_result = tokenize_a2ml(filename, input, &mut complete_string)?; +pub(crate) fn parse_a2ml(filename: &str, input: &str) -> Result<(A2mlTypeSpec, String), String> { + let (tok_result, complete_string) = tokenize_a2ml(filename, input)?; let 
mut tok_iter = tok_result.iter().peekable(); let mut ifdata_block: Option = None; @@ -1356,60 +1335,66 @@ impl PartialEq for GenericIfDataTaggedItem { #[cfg(test)] mod test { use super::*; + use tempfile::tempdir; #[test] fn tokenize() { - let mut complete_string = String::new(); - let tokenvec = tokenize_a2ml(String::new(), " ", &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", " ").unwrap(); assert!(tokenvec.is_empty()); - let tokenvec = tokenize_a2ml(String::new(), "/* // */", &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", "/* // */").unwrap(); assert!(tokenvec.is_empty()); - let tokenvec = tokenize_a2ml(String::new(), "/*/*/", &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", "/*/*/").unwrap(); assert!(tokenvec.is_empty()); - let tokenvec = tokenize_a2ml(String::new(), "/***/", &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", "/***/").unwrap(); assert!(tokenvec.is_empty()); - let tokenvec_err = tokenize_a2ml(String::new(), "/* ", &mut complete_string); + let tokenvec_err = tokenize_a2ml("test", "/* "); assert!(tokenvec_err.is_err()); - let tokenvec = tokenize_a2ml(String::new(), "//*/", &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", "//*/").unwrap(); assert!(tokenvec.is_empty()); - let tokenvec = tokenize_a2ml(String::new(), r#""TAG""#, &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", r#""TAG""#).unwrap(); assert_eq!(tokenvec.len(), 1); let _tag = TokenType::Tag("TAG".to_string()); assert!(matches!(&tokenvec[0], _tag)); - let tokenvec = tokenize_a2ml(String::new(), ";", &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", ";").unwrap(); assert_eq!(tokenvec.len(), 1); assert!(matches!(tokenvec[0], TokenType::Semicolon)); - let tokenvec = tokenize_a2ml(String::new(), "0", &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", "0").unwrap(); 
assert_eq!(tokenvec.len(), 1); assert!(matches!(tokenvec[0], TokenType::Constant(0))); - let tokenvec = tokenize_a2ml(String::new(), "0x03", &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", "0x03").unwrap(); assert_eq!(tokenvec.len(), 1); assert!(matches!(tokenvec[0], TokenType::Constant(3))); - let tokenvec = tokenize_a2ml(String::new(), "123456", &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", "123456").unwrap(); assert_eq!(tokenvec.len(), 1); assert!(matches!(tokenvec[0], TokenType::Constant(123456))); - let tokenvec = tokenize_a2ml(String::new(), r#"/include "testfile""#, &mut complete_string).unwrap(); + // set the current working directory to a temp dir + let dir = tempdir().unwrap(); + std::env::set_current_dir(&dir.path()).unwrap(); + + // create the empty "testfile" so that it can be included + std::fs::File::create_new("testfile").unwrap(); + + let (tokenvec, _) = tokenize_a2ml("test", r#"/include "testfile""#).unwrap(); assert_eq!(tokenvec.len(), 0); - let tokenvec = tokenize_a2ml(String::new(), r#"/include"testfile""#, &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", r#"/include"testfile""#).unwrap(); assert_eq!(tokenvec.len(), 0); - let tokenvec = tokenize_a2ml(String::new(), r#"/include testfile"#, &mut complete_string).unwrap(); + let (tokenvec, _) = tokenize_a2ml("test", r#"/include testfile"#).unwrap(); assert_eq!(tokenvec.len(), 0); - let err_result = tokenize_a2ml(String::new(), r#"/include "testfile_unclosed_quote"#, &mut complete_string); + let err_result = tokenize_a2ml("test", r#"/include "testfile_unclosed_quote"#); assert!(err_result.is_err()); - let err_result = tokenize_a2ml(String::new(), r#" "unclosed "#, &mut complete_string); + let err_result = tokenize_a2ml("test", r#" "unclosed "#); assert!(err_result.is_err()); - } #[test] @@ -1527,7 +1512,7 @@ mod test { A2mlTypeSpec::TaggedStruct(taggedstruct_hashmap), ]); - let parse_result = 
parse_a2ml(String::new(), TEST_INPUT); + let parse_result = parse_a2ml("test", TEST_INPUT); assert!(parse_result.is_ok()); let (a2ml_spec, _complete_string) = parse_result.unwrap(); println!("{:?}", a2ml_spec); diff --git a/a2lfile/src/ifdata.rs b/a2lfile/src/ifdata.rs index 2e78a1f..3dd3258 100644 --- a/a2lfile/src/ifdata.rs +++ b/a2lfile/src/ifdata.rs @@ -753,7 +753,7 @@ mod ifdata_test { &mut log_msgs, false, ); - parser.builtin_a2mlspec = Some(a2lfile::a2ml::parse_a2ml(String::new(), A2MLTEST_TEXT).unwrap().0); + parser.builtin_a2mlspec = Some(a2lfile::a2ml::parse_a2ml("test", A2MLTEST_TEXT).unwrap().0); super::parse_ifdata( &mut parser, &a2lfile::ParseContext { diff --git a/a2lfile/src/lib.rs b/a2lfile/src/lib.rs index 172f2f5..279c416 100644 --- a/a2lfile/src/lib.rs +++ b/a2lfile/src/lib.rs @@ -206,8 +206,9 @@ fn load_impl( // if a built-in A2ml specification was passed as a string, then it is parsed here if let Some(spec) = a2ml_spec { parser.builtin_a2mlspec = Some( - a2ml::parse_a2ml(path.to_string_lossy().to_string(), &spec) - .map_err(|parse_err| A2lError::InvalidBuiltinA2mlSpec { parse_err })?.0, + a2ml::parse_a2ml(path.to_string_lossy().as_ref(), &spec) + .map_err(|parse_err| A2lError::InvalidBuiltinA2mlSpec { parse_err })? 
+ .0, ); } @@ -371,6 +372,7 @@ impl Module { #[cfg(test)] mod tests { use super::*; + use tempfile::tempdir; #[test] fn load_empty_file() { @@ -475,6 +477,10 @@ mod tests { #[test] fn write_with_banner() { + // set the current working directory to a temp dir + let dir = tempdir().unwrap(); + std::env::set_current_dir(&dir.path()).unwrap(); + let mut a2l = new(); a2l.asap2_version .as_mut() diff --git a/a2lfile/src/loader.rs b/a2lfile/src/loader.rs index 52cc612..007bca3 100644 --- a/a2lfile/src/loader.rs +++ b/a2lfile/src/loader.rs @@ -1,5 +1,5 @@ -use std::ffi::OsString; use crate::A2lError; +use std::ffi::OsString; use std::fs::File; use std::io::Read; use std::path::Path; diff --git a/a2lfile/src/specification.rs b/a2lfile/src/specification.rs index ee75f40..0496667 100644 --- a/a2lfile/src/specification.rs +++ b/a2lfile/src/specification.rs @@ -32117,12 +32117,12 @@ impl A2ml { let a2ml_text = parser.get_token_text(token).to_string(); let filename = &parser.filenames[context.fileid]; let merged_a2ml_text; - match a2ml::parse_a2ml(filename.to_string(), &a2ml_text) { + match a2ml::parse_a2ml(filename, &a2ml_text) { Ok((a2mlspec, computed_merged_a2ml_text)) => { parser.file_a2mlspec = Some(a2mlspec); merged_a2ml_text = computed_merged_a2ml_text; - }, - Err(errmsg) => { + } + Err(errmsg) => { parser.error_or_log(ParserError::A2mlError { filename: filename.to_string(), error_line: parser.last_token_position, diff --git a/a2lfile/tests/test.rs b/a2lfile/tests/test.rs index f261a4a..746ef96 100644 --- a/a2lfile/tests/test.rs +++ b/a2lfile/tests/test.rs @@ -1,8 +1,8 @@ #[cfg(test)] mod test { - use std::{collections::HashMap, vec}; - use a2lfile::*; + use std::{collections::HashMap, vec}; + use tempfile::tempdir; a2ml_specification! 
{ @@ -78,6 +78,10 @@ ASAP2_VERSION 1 61 #[test] fn full_test() { + // work in a tempdir + let dir = tempdir().unwrap(); + std::env::set_current_dir(&dir.path()).unwrap(); + let mut a2l_file = a2lfile::new(); a2l_file.a2ml_version = Some(A2mlVersion::new(1, 31));