From c1746bcbcdc578428cdb7fd2a44da4355acc1607 Mon Sep 17 00:00:00 2001 From: Zach Laine Date: Thu, 7 Nov 2024 23:36:00 -0600 Subject: [PATCH] Address the remaining non-documentation TODOs in the lexer header and tests. See #202. --- include/boost/parser/lexer.hpp | 6 +++ test/lexer.cpp | 69 ++++++++++++++++++++++++++++++---- 2 files changed, 67 insertions(+), 8 deletions(-) diff --git a/include/boost/parser/lexer.hpp b/include/boost/parser/lexer.hpp index c6865e36..f85ccfc6 100644 --- a/include/boost/parser/lexer.hpp +++ b/include/boost/parser/lexer.hpp @@ -140,9 +140,15 @@ namespace boost { namespace parser { }; } + /** TODO */ inline constexpr int ws_id = -1000000; + + /** TODO */ inline constexpr int character_id = -2000000; + /** TODO */ + inline constexpr ctll::fixed_string no_ws = ""; + /** TODO */ template struct token diff --git a/test/lexer.cpp b/test/lexer.cpp index 927716b3..3b7d4376 100644 --- a/test/lexer.cpp +++ b/test/lexer.cpp @@ -20,7 +20,7 @@ namespace bp = boost::parser; -enum class my_tokens { foo, bar, baz }; +enum class my_tokens { ws, foo, bar, baz }; int main() { @@ -399,6 +399,9 @@ int main() position = 0; for (auto tok : s | bp::to_tokens(lexer)) { BOOST_TEST(tok == expected[position]); + static_assert( + std:: + same_as); ++position; } BOOST_TEST(position == (int)std::size(expected)); @@ -406,6 +409,9 @@ int main() position = 0; for (auto tok : u8s | bp::to_tokens(lexer8)) { BOOST_TEST(tok == expected8[position]); + static_assert(std::same_as< + decltype(tok.get_string_view()), + std::u8string_view>); ++position; } BOOST_TEST(position == (int)std::size(expected)); @@ -413,6 +419,9 @@ int main() position = 0; for (auto tok : u16s | bp::to_tokens(lexer16)) { BOOST_TEST(tok == expected16[position]); + static_assert(std::same_as< + decltype(tok.get_string_view()), + std::u16string_view>); ++position; } BOOST_TEST(position == (int)std::size(expected)); @@ -420,21 +429,65 @@ int main() position = 0; for (auto tok : u32s | 
bp::to_tokens(lexer32)) { BOOST_TEST(tok == expected32[position]); + static_assert(std::same_as< + decltype(tok.get_string_view()), + std::u32string_view>); ++position; } BOOST_TEST(position == (int)std::size(expected)); } - // TODO: Note the limitation of CTRE that the input must be a - // continguous_range, so that string_views can be formed. + // no-ws lexer + { + auto const lexer = bp::lexer | + bp::token_spec<"foo", my_tokens::foo> | + bp::token_spec<"bar", my_tokens::bar> | + bp::token_spec<"baz", my_tokens::baz> | + bp::token_chars<'='>; + + std::string s = "foo=bar"; + using tok_t = bp::token; + tok_t const expected[] = { + tok_t((int)my_tokens::foo, "foo"), + tok_t(bp::character_id, (long long)'='), + tok_t((int)my_tokens::bar, "bar")}; + + int position = 0; + for (auto tok : s | bp::to_tokens(lexer)) { + BOOST_TEST(tok == expected[position]); + ++position; + } + BOOST_TEST(position == (int)std::size(expected)); + } - // TODO: Need to check that string_views in tokens are the ones expected, - // based on the lexer. + // ws-as-token lexers + { + auto const lexer = bp::lexer | + bp::token_spec<"\\s+", my_tokens::ws> | + bp::token_spec<"foo", my_tokens::foo> | + bp::token_spec<"bar", my_tokens::bar> | + bp::token_spec<"baz", my_tokens::baz> | + bp::token_chars<'='>; + + std::string s = "foo = bar"; + using tok_t = bp::token; + tok_t const expected[] = { + tok_t((int)my_tokens::foo, "foo"), + tok_t((int)my_tokens::ws, " "), + tok_t(bp::character_id, (long long)'='), + tok_t((int)my_tokens::ws, " "), + tok_t((int)my_tokens::bar, "bar")}; - // TODO: Add a compile-time check to tokens_view that the CharType of the - // Lexer is char or char32_t, and that it matches range_value_t. + int position = 0; + for (auto tok : s | bp::to_tokens(lexer)) { + BOOST_TEST(tok == expected[position]); + ++position; + } + BOOST_TEST(position == (int)std::size(expected)); + } - // TODO: Add a lexing test for a lexer with no whitespace. 
+ // TODO: Document the limitation of CTRE that the input must be a + // contiguous_range, so that string_views can be formed. // TODO: Document that every spec's chars are assumed to be in UTF when // CTRE_STRING_IS_UTF8 is defined, and no encoding otherwise. Also document