diff --git a/include/boost/parser/lexer.hpp b/include/boost/parser/lexer.hpp index dd213f2c..c6865e36 100644 --- a/include/boost/parser/lexer.hpp +++ b/include/boost/parser/lexer.hpp @@ -16,9 +16,8 @@ #include #include #include -#include +#include // TODO: Now moot. #include -#include #include @@ -337,7 +336,7 @@ namespace boost { namespace parser { } } - template + template struct token_chars_spec { static_assert( @@ -457,6 +456,9 @@ namespace boost { namespace parser { int Base = 10> constexpr auto token_spec = token_spec_t{}; + // TODO: Document that this takes a pack of char -- and nothing else. Also + // note that for anything more complicated, including a short UTF-8 sequence + // that encodes a code point, you must use the token_spec form. /** TODO */ template constexpr auto token_chars = detail::token_chars_spec{}; @@ -510,7 +512,7 @@ namespace boost { namespace parser { new_specs>{}; } - template + template auto operator|(detail::token_chars_spec rhs) const { constexpr auto new_regex = @@ -533,12 +535,11 @@ namespace boost { namespace parser { template static constexpr auto regex_range(V & base) { - auto r = detail::make_input_subrange(base); if constexpr (has_ws) { return ctre::multiline_tokenize< - detail::wrap_escape_concat()>(r); + detail::wrap_escape_concat()>(base); } else { - return ctre::multiline_tokenize(r); + return ctre::multiline_tokenize(base); } } }; @@ -584,9 +585,10 @@ namespace boost { namespace parser { case token_parsed_type::string_view: return {id, ctre_token}; case token_parsed_type::bool_: - if (ctre_token == "true") { + using namespace std::literals; + if (std::ranges::equal(ctre_token, "true"sv)) { return {id, 1ll}; - } else if (ctre_token == "false") { + } else if (std::ranges::equal(ctre_token, "false"sv)) { return {id, 0ll}; } else { // TODO: report error. 
diff --git a/test/lexer.cpp b/test/lexer.cpp index 0172887c..6ebc0ece 100644 --- a/test/lexer.cpp +++ b/test/lexer.cpp @@ -287,18 +287,159 @@ int main() } #endif - // TODO: Need tests with the various supported kinds of input sequence. + { + // Mixed UTFs. + auto const lexer = + bp::lexer | bp::token_spec<"foo", my_tokens::foo> | + bp::token_spec | + bp::token_spec | bp::token_chars<'='>; + + // mutable vs. const token_views + mutable vs. const input views + std::string input = "foo = bar"; + auto mr_mi = input | bp::to_tokens(lexer); + auto const cr_mi = input | bp::to_tokens(lexer); + + auto const const_input = input; + auto mr_ci = const_input | bp::to_tokens(lexer); + auto const cr_ci = const_input | bp::to_tokens(lexer); + + using tok_t = bp::token; + tok_t const expected[] = { + tok_t((int)my_tokens::foo, "foo"), + tok_t(bp::character_id, (long long)'='), + tok_t((int)my_tokens::bar, "bar")}; + + int position = 0; + + position = 0; + for (auto tok : mr_mi) { + BOOST_TEST(tok == expected[position]); + ++position; + } + BOOST_TEST(position == (int)std::size(expected)); + + position = 0; + for (auto tok : cr_mi) { + BOOST_TEST(tok == expected[position]); + ++position; + } + BOOST_TEST(position == (int)std::size(expected)); + + position = 0; + for (auto tok : mr_ci) { + BOOST_TEST(tok == expected[position]); + ++position; + } + BOOST_TEST(position == (int)std::size(expected)); + + position = 0; + for (auto tok : cr_ci) { + BOOST_TEST(tok == expected[position]); + ++position; + } + BOOST_TEST(position == (int)std::size(expected)); + } + + // Check basic plumbing of connecting UTF views to CTRE. + { + auto const lexer = + bp::lexer | bp::token_spec<"foo", my_tokens::foo> | + bp::token_spec<"bar", my_tokens::bar> | + bp::token_spec<"baz", my_tokens::baz> | bp::token_chars<'='>; - // TODO: Test different UTF combinations (no envoding + no encoding), and - // all combinations of (UTF-N token specs + UTF-M input).
+ std::string s = "foo = bar"; + using tok_t = bp::token; + tok_t const expected[] = { + tok_t((int)my_tokens::foo, "foo"), + tok_t(bp::character_id, (long long)'='), + tok_t((int)my_tokens::bar, "bar")}; + + auto const lexer8 = bp::lexer | + bp::token_spec<"foo", my_tokens::foo> | + bp::token_spec<"bar", my_tokens::bar> | + bp::token_spec<"baz", my_tokens::baz> | + bp::token_chars<'='>; + + std::u8string u8s = u8"foo = bar"; + using tok8_t = bp::token; + tok8_t const expected8[] = { + tok8_t((int)my_tokens::foo, u8"foo"), + tok8_t(bp::character_id, (long long)'='), + tok8_t((int)my_tokens::bar, u8"bar")}; + + auto const lexer16 = bp::lexer | + bp::token_spec<"foo", my_tokens::foo> | + bp::token_spec<"bar", my_tokens::bar> | + bp::token_spec<"baz", my_tokens::baz> | + bp::token_chars<'='>; + + std::u16string u16s = u"foo = bar"; + using tok16_t = bp::token; + tok16_t const expected16[] = { + tok16_t((int)my_tokens::foo, u"foo"), + tok16_t(bp::character_id, (long long)'='), + tok16_t((int)my_tokens::bar, u"bar")}; + + auto const lexer32 = bp::lexer | + bp::token_spec<"foo", my_tokens::foo> | + bp::token_spec<"bar", my_tokens::bar> | + bp::token_spec<"baz", my_tokens::baz> | + bp::token_chars<'='>; + + std::u32string u32s = U"foo = bar"; + using tok32_t = bp::token; + tok32_t const expected32[] = { + tok32_t((int)my_tokens::foo, U"foo"), + tok32_t(bp::character_id, (long long)'='), + tok32_t((int)my_tokens::bar, U"bar")}; + + + int position = 0; + + position = 0; + for (auto tok : s | bp::to_tokens(lexer)) { + BOOST_TEST(tok == expected[position]); + ++position; + } + BOOST_TEST(position == (int)std::size(expected)); + + position = 0; + for (auto tok : u8s | bp::to_tokens(lexer8)) { + BOOST_TEST(tok == expected8[position]); + ++position; + } + BOOST_TEST(position == (int)std::size(expected8)); + + position = 0; + for (auto tok : u16s | bp::to_tokens(lexer16)) { + BOOST_TEST(tok == expected16[position]); + ++position; + } + BOOST_TEST(position == 
(int)std::size(expected16)); + + position = 0; + for (auto tok : u32s | bp::to_tokens(lexer32)) { + BOOST_TEST(tok == expected32[position]); + ++position; + } + BOOST_TEST(position == (int)std::size(expected32)); + } - // TODO: Test const and mutable versions of tokens_view. + // TODO: Note the limitation of CTRE that the input must be a + // contiguous_range, so that string_views can be formed. - // TODO: Add a lexing test for a lexer with no whitespace. + // TODO: Need to check that string_views in tokens are the ones expected, + // based on the lexer. + + // TODO: Add a compile-time check to tokens_view that the CharType of the + // Lexer is char or char32_t, and that it matches range_value_t. - // TODO: Document that every spec's chars need to be in the same UTF (or - // none). Wait -- is this actually true? Tests needed.... + // TODO: Add a lexing test for a lexer with no whitespace. + // TODO: Document that every spec's chars are assumed to be in UTF when + // CTRE_STRING_IS_UTF8 is defined, and no encoding otherwise. Also document + // that char16_t is treated as UTF-16, but wchar_t and char32_t are *both* + // treated as UTF-32, even on Windows. #endif return boost::report_errors();