Extend the lexer tests, addressing some of the testing TODOs; fix errors. See #202.
boostorg · Nov 8, 2024 · 6fdab1d · 6fdab1d
1 parent 4ca0766
commit 6fdab1d
Show file tree

Hide file tree

Showing 2 changed files with 159 additions and 16 deletions.
diff --git a/include/boost/parser/lexer.hpp b/include/boost/parser/lexer.hpp
@@ -16,9 +16,8 @@
 #include <boost/parser/concepts.hpp>
 #include <boost/parser/detail/debug_assert.hpp>
 #include <boost/parser/detail/hl.hpp>
-#include <boost/parser/detail/make_input_subrange.hpp>
+#include <boost/parser/detail/make_input_subrange.hpp> // TODO: Now moot.
 #include <boost/parser/detail/numeric.hpp>
-#include <boost/parser/detail/text/transcode_view.hpp>
 
 #include <ctre-unicode.hpp>
 
@@ -337,7 +336,7 @@ namespace boost { namespace parser {
             }
         }
 
-        template<auto Ch, auto... Chs>
+        template<char Ch, auto... Chs>
         struct token_chars_spec
         {
             static_assert(
@@ -457,6 +456,9 @@ namespace boost { namespace parser {
         int Base = 10>
     constexpr auto token_spec = token_spec_t<Regex, ID, ValueType, Base>{};
 
+    // TODO: Document that this takes a pack of char -- and nothing else.  Also
+    // note that for anything more complicated, including a short UTF-8 sequence
+    // that encodes a code point, you must use the token_spec form.
     /** A variable template that produces a
         `detail::token_chars_spec<Ch, Chs...>` object, suitable for adding
         the characters `Ch, Chs...` to a lexer via `operator|`.  The pack is
         expected to consist of `char` values only; anything more
         complicated, including a short UTF-8 sequence that encodes a single
         code point, must use the `token_spec` form instead. */
     template<char Ch, auto... Chs>
     constexpr auto token_chars = detail::token_chars_spec<Ch, Chs...>{};
@@ -510,7 +512,7 @@ namespace boost { namespace parser {
                 new_specs>{};
         }
 
-        template<CharType Ch, auto... Chs>
+        template<auto Ch, auto... Chs>
         auto operator|(detail::token_chars_spec<Ch, Chs...> rhs) const
         {
             constexpr auto new_regex =
@@ -533,12 +535,11 @@ namespace boost { namespace parser {
         // Returns the result of ctre::multiline_tokenize over `base`.  When
         // the lexer has a whitespace pattern (`has_ws`), the regex handed to
         // CTRE is the one produced by
         // detail::wrap_escape_concat<regex_str, WsStr>(); otherwise
         // `regex_str` is used as-is.
         template<parsable_range V>
         static constexpr auto regex_range(V & base)
         {
-            auto r = detail::make_input_subrange(base);
             if constexpr (has_ws) {
                 return ctre::multiline_tokenize<
-                    detail::wrap_escape_concat<regex_str, WsStr>()>(r);
+                    detail::wrap_escape_concat<regex_str, WsStr>()>(base);
             } else {
-                return ctre::multiline_tokenize<regex_str>(r);
+                return ctre::multiline_tokenize<regex_str>(base);
             }
         }
     };
@@ -584,9 +585,10 @@ namespace boost { namespace parser {
             case token_parsed_type::string_view: return {id, ctre_token};
 
             case token_parsed_type::bool_:
-                if (ctre_token == "true") {
+                using namespace std::literals;
+                if (std::ranges::equal(ctre_token, "true"sv)) {
                     return {id, 1ll};
-                } else if (ctre_token == "false") {
+                } else if (std::ranges::equal(ctre_token, "false"sv)) {
                     return {id, 0ll};
                 } else {
                     // TODO: report error.

diff --git a/test/lexer.cpp b/test/lexer.cpp
@@ -287,18 +287,159 @@ int main()
     }
 #endif
 
-    // TODO: Need tests with the various supported kinds of input sequence.
+    {
+        // Mixed UTFs: the token specs are written with char, char16_t, and
+        // char32_t string literals, all within a single char-based lexer.
+        auto const lexer =
+            bp::lexer<char, my_tokens> | bp::token_spec<"foo", my_tokens::foo> |
+            bp::token_spec<u"bar", my_tokens::bar> |
+            bp::token_spec<U"baz", my_tokens::baz> | bp::token_chars<'='>;
+
+        // mutable vs. const token_views + mutable vs. const input views.
+        // Naming: mr/cr = mutable/const tokens view; mi/ci = mutable/const
+        // input.
+        std::string input = "foo = bar";
+        auto mr_mi = input | bp::to_tokens(lexer);
+        auto const cr_mi = input | bp::to_tokens(lexer);
+
+        // The *_ci views must be built from the const copy of the input;
+        // building them from `input` would just repeat the two cases above.
+        auto const const_input = input;
+        auto mr_ci = const_input | bp::to_tokens(lexer);
+        auto const cr_ci = const_input | bp::to_tokens(lexer);
+
+        // All four views lex the same text, so they share one expectation.
+        using tok_t = bp::token<char>;
+        tok_t const expected[] = {
+            tok_t((int)my_tokens::foo, "foo"),
+            tok_t(bp::character_id, (long long)'='),
+            tok_t((int)my_tokens::bar, "bar")};
+
+        int position = 0;
+
+        // Each loop checks every produced token, then verifies that exactly
+        // std::size(expected) tokens were produced.
+        position = 0;
+        for (auto tok : mr_mi) {
+            BOOST_TEST(tok == expected[position]);
+            ++position;
+        }
+        BOOST_TEST(position == (int)std::size(expected));
+
+        position = 0;
+        for (auto tok : cr_mi) {
+            BOOST_TEST(tok == expected[position]);
+            ++position;
+        }
+        BOOST_TEST(position == (int)std::size(expected));
+
+        position = 0;
+        for (auto tok : mr_ci) {
+            BOOST_TEST(tok == expected[position]);
+            ++position;
+        }
+        BOOST_TEST(position == (int)std::size(expected));
+
+        position = 0;
+        for (auto tok : cr_ci) {
+            BOOST_TEST(tok == expected[position]);
+            ++position;
+        }
+        BOOST_TEST(position == (int)std::size(expected));
+    }
+
+    // Check basic plumbing of connecting UTF views to CTRE.
+    {
+        auto const lexer =
+            bp::lexer<char, my_tokens> | bp::token_spec<"foo", my_tokens::foo> |
+            bp::token_spec<"bar", my_tokens::bar> |
+            bp::token_spec<"baz", my_tokens::baz> | bp::token_chars<'='>;
 
-    // TODO: Test different UTF combinations (no envoding + no encoding), and
-    // all combinations of (UTF-N token specs + UTF-M input).
+        std::string s = "foo = bar";
+        using tok_t = bp::token<char>;
+        tok_t const expected[] = {
+            tok_t((int)my_tokens::foo, "foo"),
+            tok_t(bp::character_id, (long long)'='),
+            tok_t((int)my_tokens::bar, "bar")};
+
+        auto const lexer8 = bp::lexer<char8_t, my_tokens> |
+                            bp::token_spec<"foo", my_tokens::foo> |
+                            bp::token_spec<"bar", my_tokens::bar> |
+                            bp::token_spec<"baz", my_tokens::baz> |
+                            bp::token_chars<'='>;
+
+        std::u8string u8s = u8"foo = bar";
+        using tok8_t = bp::token<char8_t>;
+        tok8_t const expected8[] = {
+            tok8_t((int)my_tokens::foo, u8"foo"),
+            tok8_t(bp::character_id, (long long)'='),
+            tok8_t((int)my_tokens::bar, u8"bar")};
+
+        auto const lexer16 = bp::lexer<char16_t, my_tokens> |
+                             bp::token_spec<"foo", my_tokens::foo> |
+                             bp::token_spec<"bar", my_tokens::bar> |
+                             bp::token_spec<"baz", my_tokens::baz> |
+                             bp::token_chars<'='>;
+
+        std::u16string u16s = u"foo = bar";
+        using tok16_t = bp::token<char16_t>;
+        tok16_t const expected16[] = {
+            tok16_t((int)my_tokens::foo, u"foo"),
+            tok16_t(bp::character_id, (long long)'='),
+            tok16_t((int)my_tokens::bar, u"bar")};
+
+        auto const lexer32 = bp::lexer<char32_t, my_tokens> |
+                             bp::token_spec<"foo", my_tokens::foo> |
+                             bp::token_spec<"bar", my_tokens::bar> |
+                             bp::token_spec<"baz", my_tokens::baz> |
+                             bp::token_chars<'='>;
+
+        std::u32string u32s = U"foo = bar";
+        using tok32_t = bp::token<char32_t>;
+        tok32_t const expected32[] = {
+            tok32_t((int)my_tokens::foo, U"foo"),
+            tok32_t(bp::character_id, (long long)'='),
+            tok32_t((int)my_tokens::bar, U"bar")};
+
+
+        int position = 0;
+
+        position = 0;
+        for (auto tok : s | bp::to_tokens(lexer)) {
+            BOOST_TEST(tok == expected[position]);
+            ++position;
+        }
+        BOOST_TEST(position == (int)std::size(expected));
+
+        position = 0;
+        for (auto tok : u8s | bp::to_tokens(lexer8)) {
+            BOOST_TEST(tok == expected8[position]);
+            ++position;
+        }
+        BOOST_TEST(position == (int)std::size(expected));
+
+        position = 0;
+        for (auto tok : u16s | bp::to_tokens(lexer16)) {
+            BOOST_TEST(tok == expected16[position]);
+            ++position;
+        }
+        BOOST_TEST(position == (int)std::size(expected));
+
+        position = 0;
+        for (auto tok : u32s | bp::to_tokens(lexer32)) {
+            BOOST_TEST(tok == expected32[position]);
+            ++position;
+        }
+        BOOST_TEST(position == (int)std::size(expected));
+    }
 
-    // TODO: Test const and mutable versions of tokens_view.
+    // TODO: Note the limitation of CTRE that the input must be a
+    // contiguous_range, so that string_views can be formed.
 
-    // TODO: Add a lexing test for a lexer with no whitespace.
+    // TODO: Need to check that string_views in tokens are the ones expected,
+    // based on the lexer.
+
+    // TODO: Add a compile-time check to tokens_view that the CharType of the
+    // Lexer is char or char32_t, and that it matches range_value_t<V>.
 
-    // TODO: Document that every spec's chars need to be in the same UTF (or
-    // none).  Wait -- is this actually true?  Tests needed....
+    // TODO: Add a lexing test for a lexer with no whitespace.
 
+    // TODO: Document that every spec's chars are assumed to be in UTF when
+    // CTRE_STRING_IS_UTF8 is defined, and no encoding otherwise.  Also document
+    // that char16_t is treated as UTF-16, but wchar_t and char32_t are *both*
+    // treated as UTF-32, even on Windows.
 #endif
 
     return boost::report_errors();