diff --git a/doc/parser.qbk b/doc/parser.qbk
index d137b785..e7fbb76f 100644
--- a/doc/parser.qbk
+++ b/doc/parser.qbk
@@ -42,6 +42,7 @@
 [import ../test/parser.cpp]
 [import ../test/parser_rule.cpp]
 [import ../test/parser_quoted_string.cpp]
+[import ../test/lexer_and_parser.cpp]
 [import ../include/boost/parser/concepts.hpp]
 [import ../include/boost/parser/error_handling_fwd.hpp]
@@ -109,6 +110,16 @@
 [def _trans_replace_vs_ [classref boost::parser::transform_replace_view `boost::parser::transform_replace_view`s]]
 
+[def _lex_ [classref boost::parser::lexer_t `boost::parser::lexer_t`]]
+[def _tok_ [classref boost::parser::token `boost::parser::token`]]
+[def _toks_ [classref boost::parser::token `boost::parser::token`s]]
+[def _tok_spec_ [classref boost::parser::token_spec_t `boost::parser::token_spec_t`]]
+[def _tok_specs_ [classref boost::parser::token_spec_t `boost::parser::token_spec_t`s]]
+[def _tok_chs_ [globalref boost::parser::token_chars `boost::parser::token_chars`]]
+[def _to_tok_ [globalref boost::parser::to_tokens `boost::parser::to_tokens`]]
+[def _tok_v_ [classref boost::parser::tokens_view `boost::parser::tokens_view`]]
+[def _ch_id_ [globalref boost::parser::character_id `boost::parser::character_id`]]
+
 [def _std_str_ `std::string`]
 [def _std_vec_char_ `std::vector<char>`]
 [def _std_vec_char32_ `std::vector<char32_t>`]
@@ -253,6 +264,12 @@
 [def _udls_ [@https://en.cppreference.com/w/cpp/language/user_literal UDLs]]
 [def _yaml_ [@https://yaml.org/spec/1.2/spec.html YAML 1.2]]
 
+[def _nttp_ [@https://en.cppreference.com/w/cpp/language/template_parameters NTTP]]
+[def _nttps_ [@https://en.cppreference.com/w/cpp/language/template_parameters NTTPs]]
+
+[def _ctre_ [@https://github.com/hanickadot/compile-time-regular-expressions CTRE]]
+[def _pcre_ [@https://www.pcre.org PCRE]]
+
 [def _Spirit_ [@https://www.boost.org/doc/libs/release/libs/spirit Boost.Spirit]]
 [def _spirit_reals_ [@https://www.boost.org/doc/libs/release/libs/spirit/doc/html/spirit/qi/reference/numeric/real.html real number parsers]]
diff --git a/doc/tables.qbk b/doc/tables.qbk
index 0f1c3432..ac3ed07c 100644
--- a/doc/tables.qbk
+++ b/doc/tables.qbk
@@ -595,3 +595,194 @@ same attribute generation rules.
 [[`p1 | p2[a] | p3`] [`std::optional<std::variant<_ATTR_np_(p1), _ATTR_np_(p3)>>`]]
 ]
 ]
+
+[template table_token_parsers_and_their_semantics
+This table lists all the _Parser_ parsers usable during token parsing. For
+the callable parsers, a separate entry exists for each possible arity of
+arguments. For a parser `p`, if there is no entry for `p` without arguments,
+`p` is a function, and cannot itself be used as a parser; it must be called.
+In the table below:
+
+* each entry is a global object usable directly in your parsers, unless
+  otherwise noted;
+
+* "code point" is used to refer to the elements of the input range, which
+  assumes that the parse is being done in the Unicode-aware code path (if the
+  parse is being done in the non-Unicode code path, read "code point" as
+  "`char`");
+
+* _RES_ is a notional macro that expands to the resolution of parse argument
+  or evaluation of a parse predicate (see _parsers_uses_);
+
+* "`_RES_np_(pred) == true`" is a shorthand notation for "`_RES_np_(pred)` is
+  contextually convertible to `bool` and `true`"; likewise for `false`;
+
+* `c` is a character of some character type;
+
+* `str` is a string literal of type `CharType const[]`, for some character
+  type `CharType`;
+
+* `pred` is a parse predicate;
+
+* `arg0`, `arg1`, `arg2`, ... are parse arguments;
+
+* `a` is a semantic action;
+
+* `r` is an object whose type models `parsable_range` and
+  `std::ranges::contiguous_range`; and
+
+* `p`, `p1`, `p2`, ... are parsers.
+
+[note The definition of `parsable_range` is:
+
+[parsable_range_concept]
+
+]
+
+[note Some of the parsers in this table consume no input. All parsers consume
+the input they match unless otherwise stated in the table below.]
+
+[table Token Parsers and Their Semantics
+  [[Parser] [Semantics] [Attribute Type] [Notes]]
+
+  [[ _e_ ]
+    [ Matches /epsilon/, the empty string. Always matches, and consumes no input. ]
+    [ None. ]
+    [ Matching _e_ an unlimited number of times creates an infinite loop, which is undefined behavior in C++. _Parser_ will assert in debug mode when it encounters `*_e_`, `+_e_`, etc. (this applies to unconditional _e_ only). ]]
+
+  [[ `_e_(pred)` ]
+    [ Fails to match the input if `_RES_np_(pred) == false`. Otherwise, the semantics are those of _e_. ]
+    [ None. ]
+    []]
+
+  [[ _ws_ ]
+    [ Matches a single whitespace code point (see note), according to the Unicode White_Space property. ]
+    [ None. ]
+    [ For more info, see the [@https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt Unicode properties]. _ws_ may consume one code point or two. It only consumes two code points when it matches `"\r\n"`. ]]
+
+  [[ _eol_ ]
+    [ Matches a single newline (see note), following the "hard" line breaks in the Unicode line breaking algorithm. ]
+    [ None. ]
+    [ For more info, see the [@https://unicode.org/reports/tr14 Unicode Line Breaking Algorithm]. _eol_ may consume one code point or two. It only consumes two code points when it matches `"\r\n"`. ]]
+
+  [[ _eoi_ ]
+    [ Matches only at the end of input, and consumes no input. ]
+    [ None. ]
+    []]
+
+  [[ _attr_np_`(arg0)` ]
+    [ Always matches, and consumes no input. Generates the attribute `_RES_np_(arg0)`. ]
+    [ `decltype(_RES_np_(arg0))`. ]
+    [ An important use case for `_attr_` is to provide a default attribute value as a trailing alternative. For instance, an *optional* comma-delimited list is: `int_ % ',' | attr(std::vector<int>)`. Without the "`| attr(...)`", at least one `int_` match would be required. ]]
+
+  [[ _ch_ ]
+    [ Matches any single code point. ]
+    [ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See _attr_gen_. ]
+    []]
+
+  [[ `_ch_(arg0)` ]
+    [ Matches exactly the code point `_RES_np_(arg0)`. ]
+    [ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See _attr_gen_. ]
+    []]
+
+  [[ `_ch_(arg0, arg1)` ]
+    [ Matches the next code point `n` in the input, if `_RES_np_(arg0) <= n && n <= _RES_np_(arg1)`. ]
+    [ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See _attr_gen_. ]
+    []]
+
+  [[ `_ch_(r)` ]
+    [ Matches the next code point `n` in the input, if `n` is one of the code points in `r`. ]
+    [ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See _attr_gen_. ]
+    [ `r` is taken to be in a UTF encoding. The exact UTF used depends on `r`'s element type. If you do not pass UTF encoded ranges for `r`, the behavior of _ch_ is undefined. Note that ASCII is a subset of UTF-8, so ASCII is fine. EBCDIC is not. `r` is not copied; a reference to it is taken. The lifetime of `_ch_(r)` must be within the lifetime of `r`. This overload of _ch_ does *not* take parse arguments. ]]
+
+  [[ _cp_ ]
+    [ Matches a single code point. ]
+    [ `char32_t` ]
+    [ Similar to _ch_, but with a fixed `char32_t` attribute type; _cp_ has all the same call operator overloads as _ch_, though they are not repeated here, for brevity. ]]
+
+  [[ _cu_ ]
+    [ Matches a single code point. ]
+    [ `char` ]
+    [ Similar to _ch_, but with a fixed `char` attribute type; _cu_ has all the same call operator overloads as _ch_, though they are not repeated here, for brevity. Even though the name "`cu`" suggests that this parser matches at the code unit level, it does not. The name refers to the attribute type generated, much like the names _i_ versus _ui_. ]]
+
+  [[ `_blank_` ]
+    [ Equivalent to `_ws_ - _eol_`. ]
+    [ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
+    []]
+
+  [[ `_control_` ]
+    [ Matches a single control-character code point. ]
+    [ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
+    []]
+
+  [[ `_digit_` ]
+    [ Matches a single decimal digit code point. ]
+    [ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
+    []]
+
+  [[ `_punct_` ]
+    [ Matches a single punctuation code point. ]
+    [ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
+    []]
+
+  [[ `_hex_digit_` ]
+    [ Matches a single hexadecimal digit code point. ]
+    [ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
+    []]
+
+  [[ `_lower_` ]
+    [ Matches a single lower-case code point. ]
+    [ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
+    []]
+
+  [[ `_upper_` ]
+    [ Matches a single upper-case code point. ]
+    [ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
+    []]
+
+  [[ _lit_np_`(c)` ]
+    [ Matches exactly the given code point `c`. ]
+    [ None. ]
+    [ _lit_ does *not* take parse arguments. ]]
+
+  [[ `c_l` ]
+    [ Matches exactly the given code point `c`. ]
+    [ None. ]
+    [ This is a _udl_ that represents `_lit_np_(c)`, for example `'F'_l`. ]]
+
+  [[ _lit_np_`(r)` ]
+    [ Matches exactly the given string `r`. ]
+    [ None. ]
+    [ _lit_ does *not* take parse arguments. ]]
+
+  [[ `str_l` ]
+    [ Matches exactly the given string `str`. ]
+    [ None. ]
+    [ This is a _udl_ that represents `_lit_np_(str)`, for example `"a string"_l`. ]]
+
+  [[ `_rpt_np_(arg0)[p]` ]
+    [ Matches iff `p` matches exactly `_RES_np_(arg0)` times. ]
+    [ `std::string` if `_ATTR_np_(p)` is `char` or `char32_t`, otherwise `std::vector<_ATTR_np_(p)>` ]
+    [ The special value _inf_ may be used; it indicates unlimited repetition. `decltype(_RES_np_(arg0))` must be implicitly convertible to `int64_t`. Matching _e_ an unlimited number of times creates an infinite loop, which is undefined behavior in C++. _Parser_ will assert in debug mode when it encounters `_rpt_np_(_inf_)[_e_]` (this applies to unconditional _e_ only). ]]
+
+  [[ `_rpt_np_(arg0, arg1)[p]` ]
+    [ Matches iff `p` matches between `_RES_np_(arg0)` and `_RES_np_(arg1)` times, inclusively. ]
+    [ `std::string` if `_ATTR_np_(p)` is `char` or `char32_t`, otherwise `std::vector<_ATTR_np_(p)>` ]
+    [ The special value _inf_ may be used for the upper bound; it indicates unlimited repetition. `decltype(_RES_np_(arg0))` and `decltype(_RES_np_(arg1))` each must be implicitly convertible to `int64_t`. Matching _e_ an unlimited number of times creates an infinite loop, which is undefined behavior in C++. _Parser_ will assert in debug mode when it encounters `_rpt_np_(n, _inf_)[_e_]` (this applies to unconditional _e_ only). ]]
+
+  [[ `_if_np_(pred)[p]` ]
+    [ Equivalent to `_e_(pred) >> p`. ]
+    [ `std::optional<_ATTR_np_(p)>` ]
+    [ It is an error to write `_if_np_(pred)`. That is, it is an error to omit the conditionally matched parser `p`. ]]
+
+  [[ `_sw_np_(arg0)(arg1, p1)(arg2, p2) ...` ]
+    [ Equivalent to `p1` when `_RES_np_(arg0) == _RES_np_(arg1)`, `p2` when `_RES_np_(arg0) == _RES_np_(arg2)`, etc. If there is no such `argN`, the behavior of _sw_ is undefined. ]
+    [ `std::variant<_ATTR_np_(p1), _ATTR_np_(p2), ...>` ]
+    [ It is an error to write `_sw_np_(arg0)`. That is, it is an error to omit the conditionally matched parsers `p1`, `p2`, .... ]]
+
+  [[ _symbols_t_ ]
+    [ _symbols_ is an associative container of key/value pairs. Each key is a _std_str_ and each value has type `T`. In the Unicode parsing path, the strings are considered to be UTF-8 encoded; in the non-Unicode path, no encoding is assumed. _symbols_ matches the longest prefix `pre` of the input that is equal to one of the keys `k`. If the length `len` of `pre` is zero, and there is no zero-length key, it does not match the input. If `len` is positive, the generated attribute is the value associated with `k`. ]
+    [ `T` ]
+    [ Unlike the other entries in this table, _symbols_ is a type, not an object. ]]
+]
+]
diff --git a/doc/tutorial.qbk b/doc/tutorial.qbk
index 3cf08ce2..cc9a3e66 100644
--- a/doc/tutorial.qbk
+++ b/doc/tutorial.qbk
@@ -75,6 +75,10 @@ matches the input.
 _ATTR_ is a notional macro that expands to the attribute type of the parser
 passed to it; `_ATTR_np_(_d_)` is `double`. This is similar to the _attr_
 type trait.
 
+/Token parsing/ is parsing using _Parser_'s optional support for
+lexing/tokenizing first, and parsing the resulting tokens, as opposed to the
+normal operation of _Parser_, in which input characters are parsed.
+
 Next, we'll look at some simple programs that parse using _Parser_. We'll
 start small and build up from there.
@@ -1163,7 +1167,7 @@ without the context is for use outside of any parse.]
 _Parser_ comes with all the parsers most parsing tasks will ever need. Each
 one is a `constexpr` object, or a `constexpr` function. Some of the
 non-functions are also callable, such as _ch_, which may be used directly, or
-with arguments, as in _ch_`('a', 'z')`. Any parser that can be called,
+with arguments, as in `_ch_('a', 'z')`. Any parser that can be called,
 whether a function or callable object, will be called a /callable parser/
 from now on. Note that there are no nullary callable parsers; they each take
 one or more arguments.
@@ -3661,6 +3665,393 @@ Some things to be aware of when looking at _Parser_ trace output:
 
 [endsect]
 
+[section Using a Lexer / Token Parsing]
+
+_Parser_ has optional support for lexing before parsing. This support is
+based on an external dependency, _ctre_. _ctre_ produces a sequence of
+tokens by matching a set of regexes that you provide. Each regex is used to
+match against the input to produce one token, with an ID associated with
+that regex. When you call _p_, you pass it a lazy range of tokens that
+adapts the input, and _p_ parses the tokens, not the underlying characters.
+When you backtrack, you just move back to an earlier token, not an earlier
+place in the underlying sequence of characters.
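+
+In outline, every token parse has the same three-step shape: define a lexer,
+adapt the input with it, and parse the resulting tokens. The following is
+just a schematic; the names are placeholders, and a real version of each
+step appears in the example below.
+
+    auto const lexer = bp::lexer<char, int> | some_tok_spec | other_tok_spec;
+    auto tokens = input | bp::to_tokens(lexer);
+    auto result = bp::parse(tokens, parser_written_in_terms_of_tok_specs);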
+
+[heading A basic example]
+
+Let's look at an example of how to do token parsing. First, you must
+include the lexer header before the parser header.
+
+[tokens_basics_headers]
+
+The inclusion of this optional header is what enables token parsing.
+Character parsing ("normal" parsing) is unaffected by this header inclusion
+_emdash_ you can always do character parsing.
+
+[important _ctre_ is a header-only library, and it can be included as a
+single header. It requires C++20 or later, as does _Parser_'s support for
+token parsing. _Parser_ uses the single-header version with Unicode support,
+`ctre-unicode.hpp`.]
+
+Then, you define a lexer and its tokens.
+
+[tokens_basics_lexer]
+
+Here, we first see three _tok_specs_. Each one consists of an _nttp_ regex
+string literal and an _nttp_ token ID; the first one matches `"foo"`, and
+has an ID of `0`, etc. _lex_ takes two template parameters. The first
+parameter indicates that the value type of the parsed input sequence is
+`char`. The second one indicates that the ID-type of all subsequent
+_tok_specs_ will be `int`. We create a full lexer by starting with the
+`lexer<char, int>` expression, followed by a piped-together sequence of
+_tok_specs_.
+
+The final lexer has a combined regex string, `"(foo)|(b.*r)|(b.+z)"`. This
+string is built up at compile time, and is represented by an _nttp_. It is
+the single regex given to _ctre_, which _ctre_ uses to produce a sequence of
+matches from it.
+
+`lexer` and `token_spec` are variable templates; they make variables from
+the templates _lex_ and _tok_spec_, respectively. They are provided as a
+notational convenience, just so you don't have to put `{}` after every lexer
+and token spec you write. _lex_ and _tok_spec_ are empty classes. Their
+configuration is stored in _nttps_.
+
+Next, you create a range of _toks_ from your input. This range of tokens is
+what _p_ will parse.
+
+[tokens_basics_input_range]
+
+The input must model `std::ranges::contiguous_range`. This is due to the
+way _ctre_ works; it produces a sequence of matches that are convertible to
+`std::basic_string_view<CharType>`. In our case, since we are lexing a
+sequence of `char`, _ctre_ will produce a sequence of `std::string_view`
+matches. Note that the value type/character type we specified for _lex_
+above must match the input sequence's value type/character type, or the
+program is ill-formed. Also note that because we are lexing a contiguous
+range of characters, you cannot use any of the `boost::parser::as_utf*`
+range adaptors when doing token parsing.
+
+Next, you define a parser.
+
+[tokens_basics_parser]
+
+This has the same semantics as the character parsers you've seen in the rest
+of the documentation. Each _tok_spec_ has the same interface as a parser,
+so it can be used with all the parser combining operations, like
+`operator>>`. However, when token parsing, the terminal parsers are
+restricted to a subset of the terminal parsers available in character
+parsing (see the full list in the table below). This is because most of the
+parsers in _Parser_ parse sequences of characters. For example, if you used
+`_str_np_("foo")` above instead of `foo`, the _str_ parser would try to
+match three consecutive values from the input sequence, and would expect
+them to equal `'f'`, `'o'`, and `'o'`, respectively. It would instead see
+three tokens, and the comparison would not even compile. Similarly, any
+character-based directive will not work with a token parser _emdash_
+_no_case_, for example.
+
+Finally, you can put everything together in a call to _p_.
+
+[tokens_basics_parse]
+
+As you can see, the parse succeeded, and we got three attributes out of it.
+Each attribute has the type `std::string_view`.
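+
+By the way, if the tokens do not match the parser, the parse fails just as a
+character parse does _emdash_ the returned `std::optional` is empty. Here
+is a minimal sketch of a failing token parse, using the lexer and parser
+from above (the input is hypothetical; it lexes as two `foo` tokens):
+
+    auto bad = "foofoo" | bp::to_tokens(lexer);
+    auto failed = bp::parse(bad, parser);
+    assert(!failed);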
+
+[heading Capture groups]
+
+Capture groups are valid regex syntax, but you cannot use them in your
+_tok_spec_ regexes. For instance, `bp::token_spec<"(foo)+", 0>` (to match
+one or more consecutive `"foo"`s) will compile and run, and you will get
+garbage results. _Parser_ relies on the exact number and order of capture
+groups to do its token generation. If you want to group a part of your
+regex, use a non-capturing group, like `"(?:foo)+"`.
+
+[heading Whitespace in token parsing]
+
+Using the parser above, what if we tried to parse the token range
+`"foo baz bar" | bp::to_tokens(lexer)` instead? It turns out that we get
+the same answer. You cannot use a skipper when parsing tokens. However,
+parsers are much simpler to write when you have a notion of a skipper,
+especially for whitespace. So, _lex_ has one built in; it uses `"\\s+"` by
+default. Whitespace is matched, but produces no tokens. If you want to
+change the whitespace/skipper regex, you can provide it when specifying the
+lexer. For example, here is how you would specify the whitespace/skipped
+tokens to be any sequence of whitespace characters, or any C++-style
+trailing comment (`// ...`).
+
+    bp::lexer<char, int, "\\s+|//[^\\n]*">
+
+If whitespace information is important in your parse, simply provide `""` or
+the more readable convenience constant `bp::no_ws` to `lexer<>` as the
+whitespace regex, and make a regular token that matches whitespace. That
+way, you'll see all the whitespace in the sequence of tokens that you parse.
+
+[heading Token attribute types]
+
+The parser we looked at in the initial simple example produced three
+`std::string_view`s, one for each token we parsed. However, we may know
+that a particular token is meant to match numbers. If this is the case, we
+can let _Parser_ know that we expect the token to be interpretable as a
+particular type of numeric value. I'm using "numeric" for brevity, but this
+includes `bool` as well. For example:
+
+[tokens_attrs]
+
+The attribute types for these tokens are `bool`, `std::string_view`, and
+`double`, respectively. `identifier` has attribute type `std::string_view`
+because that is the default if you do not specify a type.
+
+A _tok_ is essentially a variant of `std::basic_string_view<CharType>`,
+`long long`, and `long double`. The latter two types were selected because
+they can fit any value of an integral or floating-point type, respectively.
+Even though _tok_ effectively erases the exact type when it is integral or
+floating-point, the token parser retains the information of what the exact
+type is. This is why `true_false` above has an attribute type of `bool` and
+not `long long`.
+
+_ctre_ produces a sequence of substrings. Each token produced by _Parser_
+gets its numeric value (if it should have one) by parsing the substring from
+_ctre_ with _emdash_ you guessed it _emdash_ a _Parser_ parser. The parser
+for `bool` is just _b_; the one for `int` is _i_, etc. The integral-type
+parsers all support a radix/base. If you specify an integral value type for
+one of your tokens, you can also specify a base, like
+`bp::token_spec<"[0-9a-fA-F]+", 0, int, 16>`, to parse hex-encoded `int`s.
+
+Part of the advantage of doing lexing before parsing is that you don't have
+to reparse everything over and over again. If the subsequence
+`"1.23456789"` is found in the input, you only lex it once. After that,
+it's already in the right form as a floating-point number; backtracking will
+not provoke reparsing of those ten characters.
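+
+Here is a minimal sketch of the above in action (the token ID and the input
+are hypothetical):
+
+    constexpr auto hex_int = bp::token_spec<"[0-9a-fA-F]+", 0, int, 16>;
+    constexpr auto lexer = bp::lexer<char, int> | hex_int;
+
+    auto result = bp::parse("ff" | bp::to_tokens(lexer), hex_int);
+    assert(result && *result == 255); // "ff", read as a base-16 int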
+
+[heading Single-character tokens]
+
+Just about any parser above a certain size will have punctuation of some
+sort _emdash_ elements of the input, usually a single character, that
+delimit other parts of the input, like commas and braces. To make it easier
+to specify such tokens, _Parser_ provides _tok_chs_. You can give _tok_chs_
+a list of single characters, and it will create a separate, single-character
+regex for each one, and add it to your lexer. Each such token will have the
+special ID _ch_id_.
+
+Note that the single character you provide must be a `char` in the ASCII
+range (that is, less than `128`). If you want to use a single character
+that is outside the ASCII range, just make a normal _tok_spec_ for it. Here
+is an example using _tok_chs_.
+
+[tokens_token_char]
+
+Just like in a character parser, we can use character literals to match the
+single-character tokens (`'='` and `';'` in the example above). The
+character literals are turned into _ch_ parsers. _ch_ parsers that you
+explicitly write may be used as well. They will only match single-character
+tokens, though (that is, tokens with the ID _ch_id_).
+
+[heading Parsing tokens with a specific value]
+
+So far, we've only seen examples of parsing for a particular token.
+Sometimes we want to match only occurrences of a given token with a
+particular value, just like when we write something like `_ch_('a', 'z')`
+in a character parser.
+
+Just as with _ch_ and most other _Parser_ parsers, you can just add the
+value to match in parens after the token, like `true_false(true)` or
+`identifier("exact string")`.
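+
+For example, here is a minimal sketch reusing `true_false`, `identifier`,
+and the lexer from the example above:
+
+    auto parser = identifier("foo") >> '=' >> true_false >> ';';
+    auto result = bp::parse("foo = false;" | bp::to_tokens(lexer), parser);
+    assert(result);                        // "bar = false;" would not match
+    assert(std::get<0>(*result) == "foo");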
+
+[heading Token IDs and diagnostics]
+
+So far, we've only seen `int` used as the token ID type. Any integral type
+or enum can be used, though. There are limitations on the values you can
+provide for IDs. First, the values must all be positive; negative values
+are reserved for use by _Parser_. Second, the values must not exceed
+`2^23-1`; no one is likely to have very many unique IDs, and token storage
+can be reduced a bit by using 3 bytes for the ID instead of 4.
+
+Using an enum has the advantage of making the code a lot clearer. For
+instance:
+
+    enum class token_names { foo, bar };
+    auto const foo = bp::token_spec<"fo+o", token_names::foo>;
+    auto const bar = bp::token_spec<"b.*r", token_names::bar>;
+
+... reads a lot better than just using IDs like `0` and `1`.
+
+There is another important advantage related to diagnostic messages.
+Consider this parse (using the original `foo` from the basic example, whose
+ID is `0`).
+
+    constexpr auto lexer = bp::lexer<char, int> | foo;
+    bp::parse("bar" | bp::to_tokens(lexer), bp::eps > foo);
+
+Here is what the diagnostic looks like.
+
+[pre
+1:0: error: Expected tok<0> here:
+bar
+^
+]
+
+If we added a specific string value we expect, that would be included.
+
+    bp::parse("bar" | bp::to_tokens(lexer), bp::eps > foo("foo"));
+
+[pre
+1:0: error: Expected tok<0>("foo") here:
+bar
+^
+]
+
+Instead of `tok<0>`, it would be nice to give the failed expectation a
+user-friendly name. In character parsers we usually do this by giving _rs_
+user-facing diagnostic text. This makes your parse failures much easier to
+understand and correct. However, many _tok_specs_ may already have a nice
+name, so why not use it? If you use enumerators for your token IDs, and
+make their enumeration streamable, _Parser_ will detect this, and use the
+streamed enumerator instead of `"tok"`. Here is what we could have written
+instead.
+
+    enum class printable_tokens { foo, bar };
+    std::ostream & operator<<(std::ostream & os, printable_tokens tok)
+    {
+        switch (tok) {
+        case printable_tokens::foo: os << "foo"; break;
+        case printable_tokens::bar: os << "bar"; break;
+        }
+        return os;
+    }
+
+    auto const foo = bp::token_spec<"foo", printable_tokens::foo>;
+    auto const bar = bp::token_spec<"b.*r", printable_tokens::bar>;
+
+    constexpr auto lexer = bp::lexer<char, printable_tokens> | foo;
+    bp::parse("bar" | bp::to_tokens(lexer), bp::eps > foo);
+
+That results in the enumerator being printed instead.
+
+[pre
+1:0: error: Expected foo here:
+bar
+^
+]
+
+[important If you provide a streamable enumeration as the token ID type,
+this enables the alternate printing behavior described above. If you
+specify a particular value for the token parser, that value is printed as
+the expected value. So the diagnostic name for `bp::token_spec<"\\d+", 3>(42)`
+is `tok<3>(42)`, but the name for
+`bp::token_spec<"\\d+", printable_tokens::foo>(42)` is just `42` (not
+`foo`).]
+
+The takeaway here is that you should use a streamable enumeration for your
+ID type. It makes your code easier to read, and produces better
+diagnostics.
+
+[heading Token caching]
+
+Given that I told you earlier that we will make a sequence of tokens and
+backtrack within those tokens, you may be wondering where the tokens are
+stored. The _tok_v_ (the type created by the range adaptor _to_tok_) uses
+internal storage or user-provided external storage to store the tokens as
+they are generated. Here is an example of using external storage.
+
+[tokens_caching_simple]
+
+The cache could have been a `boost::container::small_vector`, or even a
+`static_vector` of appropriate size, to reduce or eliminate memory
+allocations.
+
+Note the size of the cache after the parse; it still contains some tokens.
+This is a special case of a more general phenomenon: the token cache grows
+without bound when there are no expectation points. This is because,
+without expectation points, backtracking is unbounded (refer to the
+_expect_pts_ section to see why). If you can go back arbitrarily far in
+order to backtrack, you need to be sure that there will be a token at the
+place you backtrack to.
+
+However, if you use expectation points, the cache is trimmed. The prefix of
+tokens before the expectation point is erased from the token cache.
+
+[tokens_caching_expectation_point]
+
+Note the use of `std::ref()` to pass a reference to `cache`. This is
+necessary because _to_tok_ uses `std::bind_back()` (or a workalike in C++17
+mode). As with the other binders in `std`, it does not gracefully handle
+binding lvalue references, so you have to use `std::ref()`.
+
+[heading Lexing failures]
+
+Parse failures that fail the top-level parse happen only at expectation
+points. Lexing failures that fail the top-level parse can happen at any
+point in the input. If there is no token regex that matches the current
+point of the input, we cannot continue to lex. Lexing failures are usually
+caused by bad input, or by failure to specify the correct set of _tok_specs_
+to cover all valid input. However, it may also be that you have written an
+impossible _tok_spec_. Consider this one.
+
+    constexpr auto bad_token = bp::token_spec<"foo", 0, int>;
+
+This _tok_spec_ can never generate a valid token. It will match `"foo"` in
+the input, but then it will try to parse `"foo"` as an `int`, which is
+guaranteed to fail.
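+
+To see this concretely, here is a minimal sketch (the exact way the failure
+surfaces _emdash_ an exception versus a failed parse _emdash_ is not shown;
+the sketch only asserts that the parse cannot succeed):
+
+    constexpr auto bad_token = bp::token_spec<"foo", 0, int>;
+    constexpr auto lexer = bp::lexer<char, int> | bad_token;
+
+    bool ok = false;
+    try {
+        auto result = bp::parse("foo" | bp::to_tokens(lexer), bad_token);
+        ok = result.has_value();
+    } catch (std::exception const &) {
+        // A lexing failure may surface as an exception.
+    }
+    assert(!ok);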
+
+The takeaway here is that a lexing failure might be due to bad input, but it
+can also be the sign of a bug in one or more of your _tok_specs_.
+
+[heading Tradeoffs of token- vs. character-parsing]
+
+TODO
+
+[heading The token parsers]
+
+[table_token_parsers_and_their_semantics]
+
+[heading Directives and token parsing]
+
+Some directives that work in character parsing do not work in token parsing.
+They are: _no_case_, _lexeme_, and _skip_. These directives all change the
+way characters are interpreted in the middle of the parse. This is not
+possible when using a lexer, because a token parser parses tokens, not
+characters. The interpretation of the underlying characters is done in the
+lexer, and by the time the parser is looking at tokens, it's too late.
+
+[heading The token parsing API]
+
+Not all the _p_ and _cbp_ overloads can do token parsing, because some of
+them cannot accept a _tok_v_ as input.
+
+Obviously, the iterator/sentinel overloads cannot accept one directly, but
+they also cannot accept the `.begin()` and `.end()` of a _tok_v_. This is
+because the iterator/sentinel interface does not provide access to the
+_tok_v_ itself, which contains the token cache. The _tok_v_ must be
+accessible somehow in order to trim tokens from the cache at expectation
+points.
+
+The overloads that take a skipper are also precluded, since the skipper must
+be built into the lexer itself (see the section above about whitespace
+handling for details).
+
+[heading _ctre_ particulars]
+
+There are a few details you might want to know about how _ctre_ works.
+
+_ctre_ uses _pcre_ as its regex grammar.
+
+"Maximum munch" appears not to be the way _ctre_ tokenizes input. For
+instance, if you have _tok_spec_ A that matches `"<=="` and _tok_spec_ B
+that matches `"<|>|<=|>=|==|!="`, the input characters `"<=="` will be
+tokenized as `"<=="` if the lexer includes `A | B`, but will be tokenized as
+`"<"` followed by `"=="` if the lexer includes `B | A`.
+
+_ctre_ uses `char32_t` for all its compile time strings. If you give it a
+regex string literal like `bp::token_spec<"foo", 0>` (that is, an array of
+`char`), it will be interpreted in one of two ways. By default, the `char`s
+are copied into an array of `char32_t`, unmodified. This is fine if you
+provide an ASCII regex, or a regex in a non-Unicode encoding. However, if
+you define `CTRE_STRING_IS_UTF8` before including
+`<boost/parser/lexer.hpp>`, the array of `char` will be interpreted as
+UTF-8, and will be transcoded to UTF-32 before being stored in the array of
+`char32_t`. All the `charN_t` character types will be interpreted as UTF-N
+encoded, and will be transcoded to UTF-32 if needed. `wchar_t` is taken to
+mean UTF-32 *even on Windows*. Again, all of this transcoding happens at
+compile time.
+
+[heading Error handling details]
+
+TODO: Describe how it mostly just works, but that if you use the error
+reporting API you need to know which functions require token iterators and
+which do not, and how to get from token iterators down to the underlying
+input iterators.
+
+[endsect]
+
+
 [section Memory Allocation]
 
 _Parser_ seldom allocates memory.
 The exceptions to this are:
diff --git a/include/boost/parser/error_handling.hpp b/include/boost/parser/error_handling.hpp
index a303a5bd..23a4a6cd 100644
--- a/include/boost/parser/error_handling.hpp
+++ b/include/boost/parser/error_handling.hpp
@@ -156,7 +156,7 @@ namespace boost { namespace parser {
     {
         std::string message = "error: Expected ";
         message += e.what();
-        // TODO: Document that this gracelfully handles token iterators, and
+        // TODO: Document that this gracefully handles token iterators, and
         // document the other parts of the API that do or do not.
         auto [first, it, last] = parser::normalize_iterators(first_, e, last_);
         return parser::write_formatted_message(
@@ -271,8 +271,6 @@ namespace boost { namespace parser {
         return error_handler_result::fail;
     }
 
-    // TODO: Add term 'token parsing' to glossary at start of docs.
-
     // TODO: Add a test that exercises this function when doing token
     // parsing.
     template
diff --git a/include/boost/parser/lexer.hpp b/include/boost/parser/lexer.hpp
index 7af309e0..42e886f4 100644
--- a/include/boost/parser/lexer.hpp
+++ b/include/boost/parser/lexer.hpp
@@ -145,10 +145,11 @@ namespace boost { namespace parser {
         };
     }
 
-    /** TODO */
+    /** A convenience constant for specifying the empty string as the
+        whitespace template parameter to `boost::parser::lexer`. */
     inline constexpr ctll::fixed_string no_ws = "";
 
-    /** TODO */
+    /** A token produced by the lexer during token parsing. */
    template<typename CharType>
    struct token
    {
@@ -251,8 +252,6 @@ namespace boost { namespace parser {
             string_view sv_;
         } value_;
         position_type underlying_position_ = 0;
-        // TODO: Document the 22-bit size limitation on id_ (values must be
-        // positive).
        int id_ : 24;
        detail::token_kind kind_ : 8;
    };
@@ -351,13 +350,8 @@ namespace boost { namespace parser {
     struct token_chars_spec
     {
         static_assert(
-            (std::same_as && ... && true),
-            "All non-type template parameters given to token_chars_spec "
-            "must be chars.");
-
-        static_assert(
-            (unsigned char)Ch < 128 &&
-                ((unsigned char)(Chs < 128) && ... && true),
+            (unsigned char)Ch < 128u &&
+                (((unsigned char)Chs < 128u) && ... && true),
             "All non-type template parameters given to token_chars_spec "
             "must be <= 127.");
     };
@@ -435,7 +429,10 @@ namespace boost { namespace parser {
         }
     }
 
-    /** TODO */
+    /** Represents the compile time parameters for matching a single token
+        during token parsing, and for producing a `std::basic_string_view`
+        or number from the matched characters. Don't use this directly; use
+        `boost::parser::token_spec` instead. */
     template
     struct token_spec_t
     {
@@ -445,26 +442,23 @@ namespace boost { namespace parser {
         static_assert(
             0 <= (int)ID, "Token IDs must be integral values or enums >=0.");
 
-        // TODO: Document that capture groups are not allowed within a
-        // token_spec regex, and to Use '(?:' followed by ')' to create a
-        // non-capturing group.
-
         static constexpr ctll::fixed_string regex = Regex;
         static constexpr id_type id = ID;
         static constexpr int base = Base < 0 ? 10 : Base;
         static constexpr bool is_character_token = Base < 0;
     };
 
-    // TODO: Document that this takes a pack of char -- and nothing else. Also
-    // note that for anything more complicated, including a short UTF-8 sequence
-    // that encodes a code point, you must use the token_spec form.
-    /** TODO */
+    /** Specifies one or more single-character tokens. Each character must
+        be in the ASCII range (< 128), and must be of type `char`. If you
+        want to specify tokens longer than one character, use
+        `boost::parser::token_spec`.
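+
+        For example, `boost::parser::token_chars<'=', ';'>` adds two
+        single-character tokens to a lexer, one for `'='` and one for
+        `';'`.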
+        */
    template<auto... Chs>
+        requires(std::same_as<decltype(Chs), char> && ... && true)
    constexpr auto token_chars = detail::token_chars_spec<Chs...>{};
 
-    // TODO: Document that the ID type given to the inital lexer<>() is the one
-    // that must be used for all non-character token specs.
-    /** TODO */
+    /** The type used to represent the lexer used to tokenize input during
+        token parsing. Do not use this directly; use `boost::parser::lexer`
+        instead. */
     template<
         typename CharType,
         typename ID,
@@ -534,7 +528,10 @@ namespace boost { namespace parser {
     // implicit 0-group that we ignore, but we still need initial elements to
     // make all the indices line up later.
 
-    /** TODO */
+    /** A variable template used to generate a lexer for use in token
+        parsing. The resulting lexer has no associated tokens. Associate
+        tokens with it by piping `boost::parser::token_spec`s and/or
+        `boost::parser::token_chars`s after it. */
     template<
         typename CharType,
         typename ID,
@@ -752,7 +749,6 @@ namespace boost { namespace parser {
         using maybe_const = std::conditional_t<Const, T const, T>;
     }
 
-    /** TODO */
     template<
         std::ranges::contiguous_range V,
         typename Lexer,
@@ -835,10 +831,8 @@ namespace boost { namespace parser {
         friend struct seq_parser;
 
         // TODO: Document that the token cache will grow without bound if the
-        // parser contains no sequence points.
-
-        // TODO: Document the point above in the doc section that talks about
-        // the importance of sequence points.
+        // parser contains no sequence points. Document this in the doc
+        // section that talks about the importance of sequence points.
 
         V base_ = V();
         Lexer lexer_;
@@ -892,10 +886,6 @@ namespace boost { namespace parser {
             auto const parse_results = *ctre_first;
 
             if constexpr (Lexer::has_ws) {
-                // TODO: Document that ws is implicitly and unalterably
-                // filtered out; to get ws tokens, you must explicitly
-                // provide "" as the ws str for lexer<>, and then add a
-                // token spec for ws separately.
                 if (auto sv = parse_results.template get()) {
                     continue;
                 }
@@ -1048,7 +1038,12 @@ namespace boost { namespace parser {
         };
     }
 
-    /** TODO */
+    /** A range adaptor that produces `boost::parser::tokens_view`s. Takes
+        a range (possibly using pipe syntax) as the first argument. The
+        second argument is the lexer to use. The third argument is a
+        `std::reference_wrapper<TokenCache>`, where `TokenCache` is a
+        random-access container used to cache tokens during token parsing;
+        this argument is optional. */
     inline constexpr detail::stl_interfaces::adaptor<detail::to_tokens_impl>
         to_tokens = detail::to_tokens_impl{};
diff --git a/include/boost/parser/lexer_fwd.hpp b/include/boost/parser/lexer_fwd.hpp
index fc0ea59b..74492566 100644
--- a/include/boost/parser/lexer_fwd.hpp
+++ b/include/boost/parser/lexer_fwd.hpp
@@ -11,7 +11,10 @@
 
 namespace boost { namespace parser {
 
-    /** TODO */
+    /** A `std::views`-compatible view that provides the tokens from the
+        given contiguous range, using the given lexer and optional token
+        cache. You should typically not need to use this type directly; use
+        `boost::parser::to_tokens` instead. */
     template<
         std::ranges::contiguous_range V,
         typename Lexer,
diff --git a/include/boost/parser/parser.hpp b/include/boost/parser/parser.hpp
index ab2137ad..40fc1f68 100644
--- a/include/boost/parser/parser.hpp
+++ b/include/boost/parser/parser.hpp
@@ -325,7 +325,8 @@ namespace boost { namespace parser {
 
 #ifdef BOOST_PARSER_DOXYGEN
 
-    /** TODO */
+    /** A type trait that evaluates to `true` iff `T` is a specialization
+        of `boost::parser::token`.
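+
+        For example, `boost::parser::is_token_v<boost::parser::token<char>>`
+        is `true`, while `boost::parser::is_token_v<int>` is `false`.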
+        */
    template<typename T>
    constexpr bool is_token_v = detail::foo;
 
@@ -6658,7 +6659,7 @@ namespace boost { namespace parser {
         matches one of the values in `r`. If the character being matched
         during the parse is a `char32_t` value, the elements of `r` are
         transcoded from their presumed encoding to UTF-32 during the
-        comparison. Otherwise, the character begin matched is directly
+        comparison. Otherwise, the character being matched is directly
         compared to the elements of `r`. */
 #if BOOST_PARSER_USE_CONCEPTS
     template
@@ -6688,7 +6689,7 @@ namespace boost { namespace parser {
 
     /** Returns a `parser_interface` containing a `char_parser` that
         matches one of the values in `r`. `r` must be a sorted,
-        random-access sequence of `char32_t`. The character begin matched
+        random-access sequence of `char32_t`. The character being matched
         is directly compared to the elements of `r`. The match is found
         via binary search. No case folding is performed.
@@ -7373,7 +7374,7 @@ namespace boost { namespace parser {
         the input being matched during the parse is a a sequence of
         `char32_t`, the elements of `r` are transcoded from their presumed
         encoding to UTF-32 during the comparison. Otherwise, the
-        character begin matched is directly compared to the elements of
+        character being matched is directly compared to the elements of
         `r`. */
 #if BOOST_PARSER_USE_CONCEPTS
     template
@@ -7443,7 +7444,7 @@ namespace boost { namespace parser {
         the input being matched during the parse is a a sequence of
         `char32_t`, the elements of `r` are transcoded from their presumed
         encoding to UTF-32 during the comparison. Otherwise, the
-        character begin matched is directly compared to the elements of
+        character being matched is directly compared to the elements of
         `r`. `symbols` provides a list of strings that may appear after a
         backslash to form an escape sequence, and what character(s) each
         escape sequence represents. Note that `"\\"` and `"\ch"` are
diff --git a/include/boost/parser/parser_fwd.hpp b/include/boost/parser/parser_fwd.hpp
index d3ad0f59..ace78e0d 100644
--- a/include/boost/parser/parser_fwd.hpp
+++ b/include/boost/parser/parser_fwd.hpp
@@ -449,7 +449,11 @@ namespace boost { namespace parser {
 
     struct string_view_tag {};
 
-    /** TODO */
+    /** Matches a token from the input with ID `TokenSpec::id`. Fails on
+        any other input. The parse will also fail if `Expected` is anything
+        but `detail::nope` (which it is by default), and
+        `expected_.matches(attr)` is not `true` for the produced attribute
+        `attr`. Used in token parsing only. */
     template
     struct token_parser;
diff --git a/include/boost/parser/token_parser.hpp b/include/boost/parser/token_parser.hpp
index 3da2bd47..b9748038 100644
--- a/include/boost/parser/token_parser.hpp
+++ b/include/boost/parser/token_parser.hpp
@@ -39,6 +39,7 @@ namespace boost { namespace parser {
         }
 
        template<typename T>
+        // TODO: requires std::integral<T> || std::floating_point<T>
         struct token_with_value
         {
             explicit token_with_value(T value) : value_(value) {}
@@ -54,12 +55,10 @@ namespace boost { namespace parser {
            template<typename CharType>
            bool matches(std::basic_string_view<CharType> value) const
             {
-                // TODO: this is wrong, maybe. Can we transcode both sides to
-                // UTF-32? We have a problem that the usual opt-in is not
-                // available; you cannot specify the input in terms of
-                // utfN_view. Maybe if CharType is not char, we do the
-                // transcoding? Try it that way, write some tests, and
-                // consider whether this is a good idea.
+                // TODO: this is wrong. We need to transcode both sides to
+                // UTF-32, when !same_as<CharType, char>. (Need to write
+                // some tests, and evaluate whether this is a good idea. If
+                // not, go change the docs on token_parser.)
                 return std::ranges::equal(value, value_);
             }
 
@@ -69,8 +68,6 @@ namespace boost { namespace parser {
 
 #ifndef BOOST_PARSER_DOXYGEN
 
-    // TODO: Constrain the AttributeType to something that detail::token_as()
-    // can handle.
     template
     struct token_parser
     {
@@ -155,7 +152,8 @@ namespace boost { namespace parser {
             ++first;
         }
 
-        /** TODO */
+        /** Returns a `parser_interface` containing a `token_parser` that
+            matches `value`. */
        template<typename T>
            requires std::is_integral_v<T> || std::is_floating_point_v<T>
        constexpr auto operator()(T value) const noexcept
@@ -170,6 +168,12 @@ namespace boost { namespace parser {
                     detail::token_with_value(std::move(value))));
         }
 
+        /** Returns a `parser_interface` containing a `token_parser` that
+            matches the range `r`. If the token being matched during the
+            parse has a `char_type` of `char8_t`, `char16_t`, or `char32_t`,
+            the elements of `r` are transcoded from their presumed encoding
+            to UTF-32 during the comparison. Otherwise, the character being
+            matched is directly compared to the elements of `r`. */
        template<typename R>
        constexpr auto operator()(R && r) const noexcept
         {
@@ -202,7 +206,10 @@ namespace boost { namespace parser {
 
 #endif
 
-    /** TODO */
+    /** A variable template that defines a token parser associated with
+        `boost::parser::token_spec_t`. This token parser can be used to
+        specify a lexer, and may also be used in parsers. */
     template<
         ctll::fixed_string Regex,
         auto ID,
diff --git a/test/lexer.cpp b/test/lexer.cpp
index 7da83e8c..9b4fee61 100644
--- a/test/lexer.cpp
+++ b/test/lexer.cpp
@@ -498,8 +498,6 @@ int main()
 
     // lexing errors
     {
-        // TODO: Document that a lexing error is a programming error, not an
-        // input error.
        using namespace std::literals;
 
        auto const lexer = bp::lexer |
@@ -567,13 +565,5 @@ int main()
         BOOST_TEST(caught_exception);
     }
 
-    // TODO: Document the limitation of CTRE that the input must be a
-    // continguous_range, so that string_views can be formed.
-
-    // TODO: Document that every spec's chars are assumed to be in UTF when
-    // CTRE_STRING_IS_UTF8 is defined, and no encoding otherwise. Also document
-    // that char16_t is treated as UTF-16, but wchar_t and char32_t are *both*
-    // treated as UTF-32, even on windows.
-
     return boost::report_errors();
 }
diff --git a/test/lexer_adobe_files.cpp b/test/lexer_adobe_files.cpp
index 5ac17b16..e0b6ce6a 100644
--- a/test/lexer_adobe_files.cpp
+++ b/test/lexer_adobe_files.cpp
@@ -24,10 +24,6 @@ namespace bp = boost::parser;
 int main()
 {
     {
-        // TODO: Document that maximum munch does not appear to apply in CTRE
-        // regexes -- putting "<==" after "<|>|<=|>=" causes input "<==" to be
-        // tokenized as "<", "==".
-
         static_assert(decltype(adobe_lexer)::size() == 29 + 1);
         static_assert(
             std::same_as);
diff --git a/test/lexer_and_parser.cpp b/test/lexer_and_parser.cpp
index 717eb379..6dd2c93d 100644
--- a/test/lexer_and_parser.cpp
+++ b/test/lexer_and_parser.cpp
@@ -7,8 +7,10 @@
  */
 
 #define BOOST_PARSER_TESTING
+//[ tokens_basics_headers
 #include <boost/parser/lexer.hpp>
 #include <boost/parser/parser.hpp>
+//]
 
 #include <boost/core/lightweight_test.hpp>
@@ -85,10 +87,6 @@ int main()
             bp::detail::nope>);
     }
     {
-        // TODO: Document the idiom that you should use enumerations for the
-        // ID type, and that the enumeration should be stream-insertable, with
-        // a user-friendly name for each enumerator. This is to make error
-        // messages better than "expected tok<0> here:".
         auto parser = identifier >> '=' >> true_false >> ';';
         auto r = "foo = false;" | bp::to_tokens(adobe_lexer);
         auto result = bp::parse(r, parser);
@@ -115,5 +113,111 @@ int main()
         BOOST_TEST(cache.size() == 2u);
     }
 
+    // TODO: Add API tests exercising all the *parse() overloads that accept
+    // tokens_views.
+
+    // TODO: Add tests exercising all terminal parsers (esp. character
+    // parsers) that are compat. w/token parsing.
+
+    // TODO: Adapt symbols for use with token parsing; test here.
+
+    // TODO: Adapt the string literal parser to match against
+    // string_view-providing tokens.
+
+    // doc examples
+    // clang-format off
+    {
+        //[ tokens_basics_lexer
+        auto const foo = bp::token_spec<"foo", 0>;
+        auto const bar = bp::token_spec<"b.*r", 1>;
+        auto const baz = bp::token_spec<"b.+z", 2>;
+
+        auto const lexer = bp::lexer<char, int> | foo | bar | baz;
+        //]
+
+        //[ tokens_basics_input_range
+        auto r = "foobazbar" | bp::to_tokens(lexer);
+        //]
+
+        //[ tokens_basics_parser
+        auto parser = foo >> baz >> bar;
+        //]
+
+        //[ tokens_basics_parse
+        auto result = bp::parse(r, parser);
+        assert(result);
+        assert(std::get<0>(*result) == "foo");
+        assert(std::get<1>(*result) == "baz");
+        assert(std::get<2>(*result) == "bar");
+        //]
+    }
+
+    {
+        //[ tokens_attrs
+        constexpr auto true_false = bp::token_spec<"true|false", 0, bool>;
+        constexpr auto identifier = bp::token_spec<"[a-zA-Z]\\w*", 1>;
+        constexpr auto number = bp::token_spec<"\\d+(?:\\.\\d*)?", 2, double>;
+        //]
+        (void)true_false;
+        (void)identifier;
+        (void)number;
+    }
+
+    {
+        //[ tokens_token_char
+        constexpr auto true_false = bp::token_spec<"true|false", 0, bool>;
+        constexpr auto identifier = bp::token_spec<"[a-zA-Z]\\w*", 1>;
+
+        constexpr auto lexer =
+            bp::lexer<char, int> | true_false | identifier | bp::token_chars<'=', ';'>;
+
+        auto parser = identifier >> '=' >> true_false >> ';';
+        auto r = "foo = false;" | bp::to_tokens(lexer);
+        auto result = bp::parse(r, parser);
+        assert(result);
+        assert(std::get<0>(*result) == "foo");
+        assert(std::get<1>(*result) == false);
+        //]
+    }
+
+    {
+        //[ tokens_caching_simple
+        constexpr auto true_false = bp::token_spec<"true|false", 0, bool>;
+        constexpr auto identifier = bp::token_spec<"[a-zA-Z]\\w*", 1>;
+
+        constexpr auto lexer =
+            bp::lexer<char, int> | true_false | identifier | bp::token_chars<'=', ';'>;
+
+        auto parser = identifier >> '=' >> true_false >> ';';
+        std::vector<bp::token<char>> cache;
+        auto r = "foo = false;" | bp::to_tokens(lexer, std::ref(cache));
+        auto result = bp::parse(r, parser);
+        assert(result);
+        assert(std::get<0>(*result) == "foo");
+        assert(std::get<1>(*result) == false);
+        assert(cache.size() == 4u);
+        //]
+    }
+
+    {
+        constexpr auto true_false = bp::token_spec<"true|false", 0, bool>;
+        constexpr auto identifier = bp::token_spec<"[a-zA-Z]\\w*", 1>;
+
+        constexpr auto lexer =
+            bp::lexer<char, int> | true_false | identifier | bp::token_chars<'=', ';'>;
+
+        //[ tokens_caching_expectation_point
+        auto parser = identifier >> '=' > true_false >> ';';
+        std::vector<bp::token<char>> cache;
+        auto r = "foo = false;" | bp::to_tokens(lexer, std::ref(cache));
+        auto result = bp::parse(r, parser);
+        assert(result);
+        assert(std::get<0>(*result) == "foo");
+        assert(std::get<1>(*result) == false);
+        assert(cache.size() == 2u);
+        //]
+    }
+    // clang-format on
+
     return boost::report_errors();
 }