From 8a644b459e019d1f2a20f5dc436ea7d461d3cac8 Mon Sep 17 00:00:00 2001 From: Arthur Cohen Date: Thu, 9 Nov 2023 18:32:52 +0100 Subject: [PATCH 01/13] fmt: Start working on format_args!() parser This commit adds a base class for parsing the various constructs of a Rust format string, according to the grammar in the reference: https://doc.rust-lang.org/std/fmt/index.html#syntax gcc/rust/ChangeLog: * Make-lang.in: Compile rust-fmt object * ast/rust-fmt.cc: New file. * ast/rust-fmt.h: New file. --- gcc/rust/Make-lang.in | 1 + gcc/rust/ast/rust-fmt.cc | 96 ++++++++++++++++++++++++++++ gcc/rust/ast/rust-fmt.h | 133 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 230 insertions(+) create mode 100644 gcc/rust/ast/rust-fmt.cc create mode 100644 gcc/rust/ast/rust-fmt.h diff --git a/gcc/rust/Make-lang.in b/gcc/rust/Make-lang.in index 4d6460187924..6696b471d476 100644 --- a/gcc/rust/Make-lang.in +++ b/gcc/rust/Make-lang.in @@ -100,6 +100,7 @@ GRS_OBJS = \ rust/rust-proc-macro-invoc-lexer.o \ rust/rust-macro-substitute-ctx.o \ rust/rust-macro-builtins.o \ + rust/rust-fmt.o \ rust/rust-hir.o \ rust/rust-hir-map.o \ rust/rust-attributes.o \ diff --git a/gcc/rust/ast/rust-fmt.cc b/gcc/rust/ast/rust-fmt.cc new file mode 100644 index 000000000000..9f9ba48f0c3a --- /dev/null +++ b/gcc/rust/ast/rust-fmt.cc @@ -0,0 +1,96 @@ +// Copyright (C) 2020-2023 Free Software Foundation, Inc. + +// This file is part of GCC. + +// GCC is free software; you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 3, or (at your option) any later +// version. + +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. 
+ +// You should have received a copy of the GNU General Public License +// along with GCC; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + +#include "rust-fmt.h" + +namespace Rust { +tl::expected +Fmt::parse_fmt_string (Fmt::Input input) +{ + return Fmt (); +} + +tl::expected>, Fmt::Error> +Fmt::maybe_format (Fmt::Input input) +{ + tl::optional none = tl::nullopt; + + return Fmt::Result (input, none); +} + +tl::expected, Fmt::Error> +Fmt::format (Input input) +{ + return Fmt::Result (input, Format ()); +} + +tl::expected, Fmt::Error> +Fmt::argument (Input input) +{ + return Fmt::Result (input, Argument ()); +} + +tl::expected, Fmt::Error> +Fmt::format_spec (Input input) +{ + return Fmt::Result (input, FormatSpec ()); +} + +tl::expected, Fmt::Error> +Fmt::fill (Input input) +{ + return Fmt::Result (input, Fill ()); +} + +tl::expected, Fmt::Error> +Fmt::align (Input input) +{ + switch (input[0]) + { + case '<': + return Fmt::Result (input.substr (1), Align::Left); + case '^': + return Fmt::Result (input.substr (1), Align::Top); + case '>': + return Fmt::Result (input.substr (1), Align::Right); + default: + // TODO: Store the character here + // TODO: Can we have proper error locations? + // TODO: Maybe we should use a Rust::Literal string instead of a string + return tl::make_unexpected (Error::Align); + } +} + +tl::expected, Fmt::Error> +Fmt::sign (Input input) +{ + switch (input[0]) + { + case '+': + return Fmt::Result (input.substr (1), Sign::Plus); + case '-': + return Fmt::Result (input.substr (1), Sign::Minus); + default: + // TODO: Store the character here + // TODO: Can we have proper error locations? 
+ // TODO: Maybe we should use a Rust::Literal string instead of a string + return tl::make_unexpected (Error::Sign); + } +} + +} // namespace Rust diff --git a/gcc/rust/ast/rust-fmt.h b/gcc/rust/ast/rust-fmt.h new file mode 100644 index 000000000000..f3dd53da9791 --- /dev/null +++ b/gcc/rust/ast/rust-fmt.h @@ -0,0 +1,133 @@ +// Copyright (C) 2020-2023 Free Software Foundation, Inc. + +// This file is part of GCC. + +// GCC is free software; you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 3, or (at your option) any later +// version. + +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. + +// You should have received a copy of the GNU General Public License +// along with GCC; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + +#ifndef RUST_FMT_H +#define RUST_FMT_H + +#include "expected.h" +#include "optional.h" +#include "rust-ast.h" +#include "rust-system.h" + +namespace Rust { + +/** + * This class implements the parsing of Rust format strings according to the + * grammar here: https://doc.rust-lang.org/std/fmt/index.html#syntax + */ +// TODO: Are there features that are only present in specific Rust editions? 
+class Fmt +{ +public: + // TODO: Keep location information + // TODO: Switch to a Rust::AST::Literal here + using Input = std::string; + + enum class Error + { + Align, + Sign, + }; + + template class Result + { + public: + explicit Result (Input remaining_input, T result) + : remaining_input (remaining_input), result (result) + {} + + private: + Input remaining_input; + T result; + }; + + // FIXME: Do not use an owned string here + static tl::expected parse_fmt_string (Input input); + +private: + // the parse functions should return the remaining input as well as the + // expected node let's look at nom + // TODO: no string view :( use an owned string for now? + + template struct ParseResult + { + tl::expected, Error> inner; + + ParseResult (tl::expected, Error> inner) : inner (inner) {} + ParseResult operator= (tl::expected, Error> inner) + { + return ParseResult (inner); + } + + Input remaining_input () { return inner->remaining_input; } + T value () { return inner->value; } + }; + + struct Format + { + }; + + struct Argument + { + enum struct Kind + { + Integer, + Identifier, + } kind; + + int integer; + Identifier identifier; + }; + + struct FormatSpec + { + }; + + struct Fill + { + char to_fill; + }; + + enum class Align + { + Left, + Top, + Right + }; + + enum class Sign + { + Plus, + Minus + }; + + // let's do one function per rule in the BNF + static tl::expected, Error> text (Input input); + static tl::expected>, Error> + maybe_format (Input input); + static tl::expected, Error> format (Input input); + static tl::expected, Error> argument (Input input); + static tl::expected, Error> format_spec (Input input); + static tl::expected, Error> fill (Input input); + static tl::expected, Error> align (Input input); + static tl::expected, Error> sign (Input input); +}; + +} // namespace Rust + +#endif // ! 
RUST_FMT_H From 7f587c753caf189a16b8c3ad8fdef496e925badf Mon Sep 17 00:00:00 2001 From: Arthur Cohen Date: Thu, 30 Nov 2023 14:11:41 +0100 Subject: [PATCH 02/13] libgrust: Add format_parser library Compile libformat_parser and link to it. gcc/rust/ChangeLog: * Make-lang.in: Compile libformat_parser. * ast/rust-fmt.cc: New FFI definitions. * ast/rust-fmt.h: Likewise. * expand/rust-macro-builtins.cc (MacroBuiltin::format_args_handler): Call into libformat_parser. * expand/rust-macro-builtins.h: Define format_args!() handler proper. libgrust/ChangeLog: * libformat_parser/Cargo.lock: New file. * libformat_parser/Cargo.toml: New file. * libformat_parser/generic_format_parser/Cargo.toml: New file. * libformat_parser/generic_format_parser/src/lib.rs: New file. * libformat_parser/src/bin.rs: New file. * libformat_parser/src/lib.rs: New file. --- gcc/rust/Make-lang.in | 15 +- gcc/rust/ast/rust-fmt.cc | 77 +- gcc/rust/ast/rust-fmt.h | 189 +-- gcc/rust/expand/rust-macro-builtins.cc | 12 +- gcc/rust/expand/rust-macro-builtins.h | 3 + libgrust/libformat_parser/Cargo.lock | 30 + libgrust/libformat_parser/Cargo.toml | 21 + .../generic_format_parser/Cargo.toml | 9 + .../generic_format_parser/src/lib.rs | 1102 +++++++++++++++++ libgrust/libformat_parser/src/bin.rs | 7 + libgrust/libformat_parser/src/lib.rs | 41 + 11 files changed, 1351 insertions(+), 155 deletions(-) create mode 100644 libgrust/libformat_parser/Cargo.lock create mode 100644 libgrust/libformat_parser/Cargo.toml create mode 100644 libgrust/libformat_parser/generic_format_parser/Cargo.toml create mode 100644 libgrust/libformat_parser/generic_format_parser/src/lib.rs create mode 100644 libgrust/libformat_parser/src/bin.rs create mode 100644 libgrust/libformat_parser/src/lib.rs diff --git a/gcc/rust/Make-lang.in b/gcc/rust/Make-lang.in index 6696b471d476..7c8ab6e78464 100644 --- a/gcc/rust/Make-lang.in +++ b/gcc/rust/Make-lang.in @@ -54,6 +54,8 @@ GCCRS_D_OBJS = \ rust/rustspec.o \ $(END) +LIBS += -ldl -lpthread + 
gccrs$(exeext): $(GCCRS_D_OBJS) $(EXTRA_GCC_OBJS) libcommon-target.a $(LIBDEPS) +$(LINKER) $(ALL_LINKERFLAGS) $(LDFLAGS) -o $@ \ $(GCCRS_D_OBJS) $(EXTRA_GCC_OBJS) libcommon-target.a \ @@ -209,14 +211,14 @@ RUST_ALL_OBJS = $(GRS_OBJS) $(RUST_TARGET_OBJS) rust_OBJS = $(RUST_ALL_OBJS) rust/rustspec.o -RUST_LDFLAGS = $(LDFLAGS) -L./../libgrust/libproc_macro_internal -RUST_LIBDEPS = $(LIBDEPS) ../libgrust/libproc_macro_internal/libproc_macro_internal.a +RUST_LDFLAGS = $(LDFLAGS) -L./../libgrust/libproc_macro_internal -L./../libgrust/librustc_format_parser/ +RUST_LIBDEPS = $(LIBDEPS) ../libgrust/libproc_macro_internal/libproc_macro_internal.a rust/libformat_parser.a # The compiler itself is called crab1 crab1$(exeext): $(RUST_ALL_OBJS) attribs.o $(BACKEND) $(RUST_LIBDEPS) $(rust.prev) @$(call LINK_PROGRESS,$(INDEX.rust),start) +$(LLINKER) $(ALL_LINKERFLAGS) $(RUST_LDFLAGS) -o $@ \ - $(RUST_ALL_OBJS) attribs.o $(BACKEND) $(LIBS) ../libgrust/libproc_macro_internal/libproc_macro_internal.a $(BACKENDLIBS) + $(RUST_ALL_OBJS) attribs.o $(BACKEND) $(LIBS) ../libgrust/libproc_macro_internal/libproc_macro_internal.a rust/libformat_parser.a $(BACKENDLIBS) @$(call LINK_PROGRESS,$(INDEX.rust),end) # Build hooks. @@ -402,6 +404,13 @@ rust/%.o: rust/lex/%.cc $(COMPILE) $(RUST_CXXFLAGS) $(RUST_INCLUDES) $< $(POSTCOMPILE) +%.toml: + echo $@ + +rust/libformat_parser.a: $(srcdir)/../libgrust/libformat_parser/Cargo.toml $(wildcard $(srcdir)/../libgrust/libformat_parser/src/*.rs) + cargo build --manifest-path $(srcdir)/../libgrust/libformat_parser/Cargo.toml --release # FIXME: Not always release, right? 
+ cp $(srcdir)/../libgrust/libformat_parser/target/release/liblibformat_parser.a $@ + # build all rust/parse files in rust folder, add cross-folder includes rust/%.o: rust/parse/%.cc $(COMPILE) $(RUST_CXXFLAGS) $(RUST_INCLUDES) $< diff --git a/gcc/rust/ast/rust-fmt.cc b/gcc/rust/ast/rust-fmt.cc index 9f9ba48f0c3a..559b1c8b5795 100644 --- a/gcc/rust/ast/rust-fmt.cc +++ b/gcc/rust/ast/rust-fmt.cc @@ -19,78 +19,23 @@ #include "rust-fmt.h" namespace Rust { -tl::expected -Fmt::parse_fmt_string (Fmt::Input input) -{ - return Fmt (); -} +namespace Fmt { -tl::expected>, Fmt::Error> -Fmt::maybe_format (Fmt::Input input) +Pieces +Pieces::collect (const std::string &to_parse) { - tl::optional none = tl::nullopt; + auto piece_slice = collect_pieces (to_parse.c_str ()); - return Fmt::Result (input, none); -} + rust_debug ("[ARTHUR] %p, %lu", (void *) piece_slice.ptr, piece_slice.len); -tl::expected, Fmt::Error> -Fmt::format (Input input) -{ - return Fmt::Result (input, Format ()); -} + // this performs multiple copies, can we avoid them maybe? + auto pieces + = std::vector (piece_slice.ptr, piece_slice.ptr + piece_slice.len); -tl::expected, Fmt::Error> -Fmt::argument (Input input) -{ - return Fmt::Result (input, Argument ()); -} + rust_debug ("[ARTHUR] %p, %lu", (void *) pieces.data (), pieces.size ()); -tl::expected, Fmt::Error> -Fmt::format_spec (Input input) -{ - return Fmt::Result (input, FormatSpec ()); -} - -tl::expected, Fmt::Error> -Fmt::fill (Input input) -{ - return Fmt::Result (input, Fill ()); -} - -tl::expected, Fmt::Error> -Fmt::align (Input input) -{ - switch (input[0]) - { - case '<': - return Fmt::Result (input.substr (1), Align::Left); - case '^': - return Fmt::Result (input.substr (1), Align::Top); - case '>': - return Fmt::Result (input.substr (1), Align::Right); - default: - // TODO: Store the character here - // TODO: Can we have proper error locations? 
- // TODO: Maybe we should use a Rust::Literal string instead of a string - return tl::make_unexpected (Error::Align); - } -} - -tl::expected, Fmt::Error> -Fmt::sign (Input input) -{ - switch (input[0]) - { - case '+': - return Fmt::Result (input.substr (1), Sign::Plus); - case '-': - return Fmt::Result (input.substr (1), Sign::Minus); - default: - // TODO: Store the character here - // TODO: Can we have proper error locations? - // TODO: Maybe we should use a Rust::Literal string instead of a string - return tl::make_unexpected (Error::Sign); - } + return Pieces{}; } +} // namespace Fmt } // namespace Rust diff --git a/gcc/rust/ast/rust-fmt.h b/gcc/rust/ast/rust-fmt.h index f3dd53da9791..0050977358f1 100644 --- a/gcc/rust/ast/rust-fmt.h +++ b/gcc/rust/ast/rust-fmt.h @@ -19,115 +19,134 @@ #ifndef RUST_FMT_H #define RUST_FMT_H -#include "expected.h" -#include "optional.h" -#include "rust-ast.h" +#include "rust-diagnostics.h" #include "rust-system.h" namespace Rust { +namespace Fmt { -/** - * This class implements the parsing of Rust format strings according to the - * grammar here: https://doc.rust-lang.org/std/fmt/index.html#syntax - */ -// TODO: Are there features that are only present in specific Rust editions? 
-class Fmt +struct RustHamster { -public: - // TODO: Keep location information - // TODO: Switch to a Rust::AST::Literal here - using Input = std::string; + // hehe +}; - enum class Error - { - Align, - Sign, - }; +struct InnerSpan +{ +}; - template class Result +struct Count +{ + enum class Kind + { + Is, + IsName, + IsParam, + IsStar, + Implied + } kind; + + union { - public: - explicit Result (Input remaining_input, T result) - : remaining_input (remaining_input), result (result) - {} + size_t is; + std::pair is_name; + size_t is_param; + size_t is_star; + } data; +}; - private: - Input remaining_input; - T result; - }; +struct DebugHex +{ +}; - // FIXME: Do not use an owned string here - static tl::expected parse_fmt_string (Input input); +struct Sign +{ +}; -private: - // the parse functions should return the remaining input as well as the - // expected node let's look at nom - // TODO: no string view :( use an owned string for now? +struct Alignment +{ +}; - template struct ParseResult - { - tl::expected, Error> inner; +struct RustString +{ + // hehe +}; - ParseResult (tl::expected, Error> inner) : inner (inner) {} - ParseResult operator= (tl::expected, Error> inner) - { - return ParseResult (inner); - } +struct Position +{ +}; - Input remaining_input () { return inner->remaining_input; } - T value () { return inner->value; } - }; +struct FormatSpec +{ + /// Optionally specified character to fill alignment with. + tl::optional fill; + /// Span of the optionally specified fill character. + tl::optional fill_span; + /// Optionally specified alignment. + Alignment align; + /// The `+` or `-` flag. + tl::optional sign; + /// The `#` flag. + bool alternate; + /// The `0` flag. + bool zero_pad; + /// The `x` or `X` flag. (Only for `Debug`.) + tl::optional debug_hex; + /// The integer precision to use. + // Count <'a> precision; + /// The span of the precision formatting flag (for diagnostics). 
+ tl::optional precision_span; + /// The string width requested for the resulting format. + // Count <'a> width; + /// The span of the width formatting flag (for diagnostics). + tl::optional width_span; + /// The descriptor string representing the name of the format desired for + /// this argument, this can be empty or any number of characters, although + /// it is required to be one word. + RustHamster ty; + // &'a str ty; + /// The span of the descriptor string (for diagnostics). + tl::optional ty_span; +}; - struct Format - { - }; +struct Argument +{ + Position position; + InnerSpan inner_span; + FormatSpec format; +}; - struct Argument +struct Piece +{ + enum class Kind { - enum struct Kind - { - Integer, - Identifier, - } kind; + String, + NextArgument + } kind; - int integer; - Identifier identifier; - }; - - struct FormatSpec + union { - }; + RustString string; + Argument *next_argument; + } data; +}; - struct Fill - { - char to_fill; - }; +struct PieceSlice +{ + Piece *ptr; + size_t len; +}; - enum class Align - { - Left, - Top, - Right - }; +extern "C" { +PieceSlice +collect_pieces (const char *); +} - enum class Sign - { - Plus, - Minus - }; - - // let's do one function per rule in the BNF - static tl::expected, Error> text (Input input); - static tl::expected>, Error> - maybe_format (Input input); - static tl::expected, Error> format (Input input); - static tl::expected, Error> argument (Input input); - static tl::expected, Error> format_spec (Input input); - static tl::expected, Error> fill (Input input); - static tl::expected, Error> align (Input input); - static tl::expected, Error> sign (Input input); +struct Pieces +{ + static Pieces collect (const std::string &to_parse); }; +} // namespace Fmt } // namespace Rust #endif // ! 
RUST_FMT_H diff --git a/gcc/rust/expand/rust-macro-builtins.cc b/gcc/rust/expand/rust-macro-builtins.cc index 71da575563db..0e57406f10f8 100644 --- a/gcc/rust/expand/rust-macro-builtins.cc +++ b/gcc/rust/expand/rust-macro-builtins.cc @@ -30,6 +30,7 @@ #include "rust-parse.h" #include "rust-session-manager.h" #include "rust-attribute-values.h" +#include "rust-fmt.h" namespace Rust { @@ -89,8 +90,8 @@ std::unordered_map {"env", MacroBuiltin::env_handler}, {"cfg", MacroBuiltin::cfg_handler}, {"include", MacroBuiltin::include_handler}, + {"format_args", MacroBuiltin::format_args_handler}, /* Unimplemented macro builtins */ - {"format_args", MacroBuiltin::sorry}, {"option_env", MacroBuiltin::sorry}, {"format_args_nl", MacroBuiltin::sorry}, {"concat_idents", MacroBuiltin::sorry}, @@ -942,6 +943,15 @@ MacroBuiltin::stringify_handler (location_t invoc_locus, return AST::Fragment ({node}, std::move (token)); } +tl::optional +MacroBuiltin::format_args_handler (location_t invoc_locus, + AST::MacroInvocData &invoc) +{ + Fmt::Pieces::collect ("heyo this {is} what I {} want to {3}, {parse}"); + + return AST::Fragment::create_empty (); +} + tl::optional MacroBuiltin::sorry (location_t invoc_locus, AST::MacroInvocData &invoc) { diff --git a/gcc/rust/expand/rust-macro-builtins.h b/gcc/rust/expand/rust-macro-builtins.h index 6a84a8b86f68..f9ab3fc3698e 100644 --- a/gcc/rust/expand/rust-macro-builtins.h +++ b/gcc/rust/expand/rust-macro-builtins.h @@ -157,6 +157,9 @@ class MacroBuiltin static tl::optional line_handler (location_t invoc_locus, AST::MacroInvocData &invoc); + static tl::optional + format_args_handler (location_t invoc_locus, AST::MacroInvocData &invoc); + static tl::optional sorry (location_t invoc_locus, AST::MacroInvocData &invoc); diff --git a/libgrust/libformat_parser/Cargo.lock b/libgrust/libformat_parser/Cargo.lock new file mode 100644 index 000000000000..65e48263c71a --- /dev/null +++ b/libgrust/libformat_parser/Cargo.lock @@ -0,0 +1,30 @@ +# This file is 
automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "generic_format_parser" +version = "0.1.0" +dependencies = [ + "unicode-xid", +] + +[[package]] +name = "libc" +version = "0.2.152" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7" + +[[package]] +name = "libformat_parser" +version = "0.1.0" +dependencies = [ + "generic_format_parser", + "libc", +] + +[[package]] +name = "unicode-xid" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" diff --git a/libgrust/libformat_parser/Cargo.toml b/libgrust/libformat_parser/Cargo.toml new file mode 100644 index 000000000000..0fcfa3e89a4c --- /dev/null +++ b/libgrust/libformat_parser/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "libformat_parser" +version = "0.1.0" +edition = "2021" + +[workspace] + +members = [ + "generic_format_parser", +] + +[dependencies] +libc = "0.2" +generic_format_parser = { path = "generic_format_parser" } + +[lib] +crate_type = ["staticlib", "rlib"] + +[[bin]] +name = "format_parser_test" +path = "src/bin.rs" diff --git a/libgrust/libformat_parser/generic_format_parser/Cargo.toml b/libgrust/libformat_parser/generic_format_parser/Cargo.toml new file mode 100644 index 000000000000..34577038cbed --- /dev/null +++ b/libgrust/libformat_parser/generic_format_parser/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "generic_format_parser" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +unicode-xid = "0.2.0" diff --git a/libgrust/libformat_parser/generic_format_parser/src/lib.rs b/libgrust/libformat_parser/generic_format_parser/src/lib.rs new file mode 100644 index 000000000000..f42c9d8dffbb --- /dev/null +++ 
b/libgrust/libformat_parser/generic_format_parser/src/lib.rs @@ -0,0 +1,1102 @@ +//! Macro support for format strings +//! +//! These structures are used when parsing format strings for the compiler. +//! Parsing does not happen at runtime: structures of `std::fmt::rt` are +//! generated instead. + +#![doc( + html_root_url = "https://doc.rust-lang.org/nightly/nightly-rustc/", + html_playground_url = "https://play.rust-lang.org/", + test(attr(deny(warnings))) +)] +#![deny(rustc::untranslatable_diagnostic)] +#![deny(rustc::diagnostic_outside_of_impl)] +// WARNING: We want to be able to build this crate with a stable compiler, +// so no `#![feature]` attributes should be added! + +#[deprecated(note = "Use a proper lexer function for this")] +fn is_id_start(c: char) -> bool { + c == '_' || unicode_xid::UnicodeXID::is_xid_start(c) +} + +#[deprecated(note = "Use a proper lexer function for this")] +fn is_id_continue(c: char) -> bool { + unicode_xid::UnicodeXID::is_xid_continue(c) +} + +// use rustc_lexer::unescape; +pub use Alignment::*; +pub use Count::*; +pub use Piece::*; +pub use Position::*; + +use std::iter; +use std::str; +use std::string; + +// Note: copied from rustc_span +/// Range inside of a `Span` used for diagnostics when we only have access to relative positions. 
+#[derive(Copy, Clone, PartialEq, Eq, Debug)] +pub struct InnerSpan { + pub start: usize, + pub end: usize, +} + +impl InnerSpan { + pub fn new(start: usize, end: usize) -> InnerSpan { + InnerSpan { start, end } + } +} + +/// The location and before/after width of a character whose width has changed from its source code +/// representation +#[derive(Copy, Clone, PartialEq, Eq)] +pub struct InnerWidthMapping { + /// Index of the character in the source + pub position: usize, + /// The inner width in characters + pub before: usize, + /// The transformed width in characters + pub after: usize, +} + +impl InnerWidthMapping { + pub fn new(position: usize, before: usize, after: usize) -> InnerWidthMapping { + InnerWidthMapping { + position, + before, + after, + } + } +} + +/// Whether the input string is a literal. If yes, it contains the inner width mappings. +#[derive(Clone, PartialEq, Eq)] +enum InputStringKind { + NotALiteral, + Literal { + width_mappings: Vec, + }, +} + +/// The type of format string that we are parsing. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum ParseMode { + /// A normal format string as per `format_args!`. + Format, + /// An inline assembly template string for `asm!`. + InlineAsm, +} + +#[derive(Copy, Clone)] +struct InnerOffset(usize); + +impl InnerOffset { + fn to(self, end: InnerOffset) -> InnerSpan { + InnerSpan::new(self.0, end.0) + } +} + +/// A piece is a portion of the format string which represents the next part +/// to emit. These are emitted as a stream by the `Parser` class. +#[derive(Clone, Debug, PartialEq)] +pub enum Piece<'a> { + /// A literal string which should directly be emitted + String(&'a str), + /// This describes that formatting should process the next argument (as + /// specified inside) for emission. + NextArgument(Box>), +} + +/// Representation of an argument specification. 
+#[derive(Copy, Clone, Debug, PartialEq)] +pub struct Argument<'a> { + /// Where to find this argument + pub position: Position<'a>, + /// The span of the position indicator. Includes any whitespace in implicit + /// positions (`{ }`). + pub position_span: InnerSpan, + /// How to format the argument + pub format: FormatSpec<'a>, +} + +/// Specification for the formatting of an argument in the format string. +#[derive(Copy, Clone, Debug, PartialEq)] +pub struct FormatSpec<'a> { + /// Optionally specified character to fill alignment with. + pub fill: Option, + /// Span of the optionally specified fill character. + pub fill_span: Option, + /// Optionally specified alignment. + pub align: Alignment, + /// The `+` or `-` flag. + pub sign: Option, + /// The `#` flag. + pub alternate: bool, + /// The `0` flag. + pub zero_pad: bool, + /// The `x` or `X` flag. (Only for `Debug`.) + pub debug_hex: Option, + /// The integer precision to use. + pub precision: Count<'a>, + /// The span of the precision formatting flag (for diagnostics). + pub precision_span: Option, + /// The string width requested for the resulting format. + pub width: Count<'a>, + /// The span of the width formatting flag (for diagnostics). + pub width_span: Option, + /// The descriptor string representing the name of the format desired for + /// this argument, this can be empty or any number of characters, although + /// it is required to be one word. + pub ty: &'a str, + /// The span of the descriptor string (for diagnostics). + pub ty_span: Option, +} + +/// Enum describing where an argument for a format can be located. +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum Position<'a> { + /// The argument is implied to be located at an index + ArgumentImplicitlyIs(usize), + /// The argument is located at a specific index given in the format, + ArgumentIs(usize), + /// The argument has a name. 
+ ArgumentNamed(&'a str), +} + +impl Position<'_> { + pub fn index(&self) -> Option { + match self { + ArgumentIs(i, ..) | ArgumentImplicitlyIs(i) => Some(*i), + _ => None, + } + } +} + +/// Enum of alignments which are supported. +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum Alignment { + /// The value will be aligned to the left. + AlignLeft, + /// The value will be aligned to the right. + AlignRight, + /// The value will be aligned in the center. + AlignCenter, + /// The value will take on a default alignment. + AlignUnknown, +} + +/// Enum for the sign flags. +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum Sign { + /// The `+` flag. + Plus, + /// The `-` flag. + Minus, +} + +/// Enum for the debug hex flags. +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum DebugHex { + /// The `x` flag in `{:x?}`. + Lower, + /// The `X` flag in `{:X?}`. + Upper, +} + +/// A count is used for the precision and width parameters of an integer, and +/// can reference either an argument or a literal integer. +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum Count<'a> { + /// The count is specified explicitly. + CountIs(usize), + /// The count is specified by the argument with the given name. + CountIsName(&'a str, InnerSpan), + /// The count is specified by the argument at the given index. + CountIsParam(usize), + /// The count is specified by a star (like in `{:.*}`) that refers to the argument at the given index. + CountIsStar(usize), + /// The count is implied and cannot be explicitly specified. 
+ CountImplied, +} + +pub struct ParseError { + pub description: string::String, + pub note: Option, + pub label: string::String, + pub span: InnerSpan, + pub secondary_label: Option<(string::String, InnerSpan)>, + pub suggestion: Suggestion, +} + +pub enum Suggestion { + None, + /// Replace inline argument with positional argument: + /// `format!("{foo.bar}")` -> `format!("{}", foo.bar)` + UsePositional, + /// Remove `r#` from identifier: + /// `format!("{r#foo}")` -> `format!("{foo}")` + RemoveRawIdent(InnerSpan), +} + +/// The parser structure for interpreting the input format string. This is +/// modeled as an iterator over `Piece` structures to form a stream of tokens +/// being output. +/// +/// This is a recursive-descent parser for the sake of simplicity, and if +/// necessary there's probably lots of room for improvement performance-wise. +pub struct Parser<'a> { + mode: ParseMode, + input: &'a str, + cur: iter::Peekable>, + /// Error messages accumulated during parsing + pub errors: Vec, + /// Current position of implicit positional argument pointer + pub curarg: usize, + /// `Some(raw count)` when the string is "raw", used to position spans correctly + style: Option, + /// Start and end byte offset of every successfully parsed argument + pub arg_places: Vec, + /// Characters whose length has been changed from their in-code representation + width_map: Vec, + /// Span of the last opening brace seen, used for error reporting + last_opening_brace: Option, + /// Whether the source string comes from `println!` as opposed to `format!` or `print!` + append_newline: bool, + /// Whether this formatting string was written directly in the source. This controls whether we + /// can use spans to refer into it and give better error messages. + /// N.B: This does _not_ control whether implicit argument captures can be used. + pub is_source_literal: bool, + /// Start position of the current line. 
+ cur_line_start: usize, + /// Start and end byte offset of every line of the format string. Excludes + /// newline characters and leading whitespace. + pub line_spans: Vec, +} + +impl<'a> Iterator for Parser<'a> { + type Item = Piece<'a>; + + fn next(&mut self) -> Option> { + if let Some(&(pos, c)) = self.cur.peek() { + match c { + '{' => { + let curr_last_brace = self.last_opening_brace; + let byte_pos = self.to_span_index(pos); + let lbrace_end = InnerOffset(byte_pos.0 + self.to_span_width(pos)); + self.last_opening_brace = Some(byte_pos.to(lbrace_end)); + self.cur.next(); + if self.consume('{') { + self.last_opening_brace = curr_last_brace; + + Some(String(self.string(pos + 1))) + } else { + let arg = self.argument(lbrace_end); + if let Some(rbrace_pos) = self.consume_closing_brace(&arg) { + if self.is_source_literal { + let lbrace_byte_pos = self.to_span_index(pos); + let rbrace_byte_pos = self.to_span_index(rbrace_pos); + + let width = self.to_span_width(rbrace_pos); + + self.arg_places.push( + lbrace_byte_pos.to(InnerOffset(rbrace_byte_pos.0 + width)), + ); + } + } else { + if let Some(&(_, maybe)) = self.cur.peek() { + if maybe == '?' 
{ + self.suggest_format(); + } else { + self.suggest_positional_arg_instead_of_captured_arg(arg); + } + } + } + Some(NextArgument(Box::new(arg))) + } + } + '}' => { + self.cur.next(); + if self.consume('}') { + Some(String(self.string(pos + 1))) + } else { + let err_pos = self.to_span_index(pos); + self.err_with_note( + "unmatched `}` found", + "unmatched `}`", + "if you intended to print `}`, you can escape it using `}}`", + err_pos.to(err_pos), + ); + None + } + } + _ => Some(String(self.string(pos))), + } + } else { + if self.is_source_literal { + let span = self.span(self.cur_line_start, self.input.len()); + if self.line_spans.last() != Some(&span) { + self.line_spans.push(span); + } + } + None + } + } +} + +impl<'a> Parser<'a> { + /// Creates a new parser for the given format string + pub fn new( + s: &'a str, + style: Option, + snippet: Option, + append_newline: bool, + mode: ParseMode, + ) -> Parser<'a> { + let input_string_kind = find_width_map_from_snippet(s, snippet, style); + let (width_map, is_source_literal) = match input_string_kind { + InputStringKind::Literal { width_mappings } => (width_mappings, true), + InputStringKind::NotALiteral => (Vec::new(), false), + }; + + Parser { + mode, + input: s, + cur: s.char_indices().peekable(), + errors: vec![], + curarg: 0, + style, + arg_places: vec![], + width_map, + last_opening_brace: None, + append_newline, + is_source_literal, + cur_line_start: 0, + line_spans: vec![], + } + } + + /// Notifies of an error. The message doesn't actually need to be of type + /// String, but I think it does when this eventually uses conditions so it + /// might as well start using it now. + fn err, S2: Into>( + &mut self, + description: S1, + label: S2, + span: InnerSpan, + ) { + self.errors.push(ParseError { + description: description.into(), + note: None, + label: label.into(), + span, + secondary_label: None, + suggestion: Suggestion::None, + }); + } + + /// Notifies of an error. 
The message doesn't actually need to be of type + /// String, but I think it does when this eventually uses conditions so it + /// might as well start using it now. + fn err_with_note< + S1: Into, + S2: Into, + S3: Into, + >( + &mut self, + description: S1, + label: S2, + note: S3, + span: InnerSpan, + ) { + self.errors.push(ParseError { + description: description.into(), + note: Some(note.into()), + label: label.into(), + span, + secondary_label: None, + suggestion: Suggestion::None, + }); + } + + /// Optionally consumes the specified character. If the character is not at + /// the current position, then the current iterator isn't moved and `false` is + /// returned, otherwise the character is consumed and `true` is returned. + fn consume(&mut self, c: char) -> bool { + self.consume_pos(c).is_some() + } + + /// Optionally consumes the specified character. If the character is not at + /// the current position, then the current iterator isn't moved and `None` is + /// returned, otherwise the character is consumed and the current position is + /// returned. + fn consume_pos(&mut self, c: char) -> Option { + if let Some(&(pos, maybe)) = self.cur.peek() { + if c == maybe { + self.cur.next(); + return Some(pos); + } + } + None + } + + fn remap_pos(&self, mut pos: usize) -> InnerOffset { + for width in &self.width_map { + if pos > width.position { + pos += width.before - width.after; + } else if pos == width.position && width.after == 0 { + pos += width.before; + } else { + break; + } + } + + InnerOffset(pos) + } + + fn to_span_index(&self, pos: usize) -> InnerOffset { + // This handles the raw string case, the raw argument is the number of # + // in r###"..."### (we need to add one because of the `r`). 
+ let raw = self.style.map_or(0, |raw| raw + 1); + let pos = self.remap_pos(pos); + InnerOffset(raw + pos.0 + 1) + } + + fn to_span_width(&self, pos: usize) -> usize { + let pos = self.remap_pos(pos); + match self.width_map.iter().find(|w| w.position == pos.0) { + Some(w) => w.before, + None => 1, + } + } + + fn span(&self, start_pos: usize, end_pos: usize) -> InnerSpan { + let start = self.to_span_index(start_pos); + let end = self.to_span_index(end_pos); + start.to(end) + } + + /// Forces consumption of the specified character. If the character is not + /// found, an error is emitted. + fn consume_closing_brace(&mut self, arg: &Argument<'_>) -> Option { + self.ws(); + + let pos; + let description; + + if let Some(&(peek_pos, maybe)) = self.cur.peek() { + if maybe == '}' { + self.cur.next(); + return Some(peek_pos); + } + + pos = peek_pos; + description = format!("expected `'}}'`, found `{maybe:?}`"); + } else { + description = "expected `'}'` but string was terminated".to_owned(); + // point at closing `"` + pos = self.input.len() - if self.append_newline { 1 } else { 0 }; + } + + let pos = self.to_span_index(pos); + + let label = "expected `'}'`".to_owned(); + let (note, secondary_label) = if arg.format.fill == Some('}') { + ( + Some("the character `'}'` is interpreted as a fill character because of the `:` that precedes it".to_owned()), + arg.format.fill_span.map(|sp| ("this is not interpreted as a formatting closing brace".to_owned(), sp)), + ) + } else { + ( + Some("if you intended to print `{`, you can escape it using `{{`".to_owned()), + self.last_opening_brace + .map(|sp| ("because of this opening brace".to_owned(), sp)), + ) + }; + + self.errors.push(ParseError { + description, + note, + label, + span: pos.to(pos), + secondary_label, + suggestion: Suggestion::None, + }); + + None + } + + /// Consumes all whitespace characters until the first non-whitespace character + fn ws(&mut self) { + while let Some(&(_, c)) = self.cur.peek() { + if c.is_whitespace() 
{ + self.cur.next(); + } else { + break; + } + } + } + + /// Parses all of a string which is to be considered a "raw literal" in a + /// format string. This is everything outside of the braces. + fn string(&mut self, start: usize) -> &'a str { + // we may not consume the character, peek the iterator + while let Some(&(pos, c)) = self.cur.peek() { + match c { + '{' | '}' => { + return &self.input[start..pos]; + } + '\n' if self.is_source_literal => { + self.line_spans.push(self.span(self.cur_line_start, pos)); + self.cur_line_start = pos + 1; + self.cur.next(); + } + _ => { + if self.is_source_literal && pos == self.cur_line_start && c.is_whitespace() { + self.cur_line_start = pos + c.len_utf8(); + } + self.cur.next(); + } + } + } + &self.input[start..self.input.len()] + } + + /// Parses an `Argument` structure, or what's contained within braces inside the format string. + fn argument(&mut self, start: InnerOffset) -> Argument<'a> { + let pos = self.position(); + + let end = self + .cur + .clone() + .find(|(_, ch)| !ch.is_whitespace()) + .map_or(start, |(end, _)| self.to_span_index(end)); + let position_span = start.to(end); + + let format = match self.mode { + ParseMode::Format => self.format(), + ParseMode::InlineAsm => self.inline_asm(), + }; + + // Resolve position after parsing format spec. + let pos = match pos { + Some(position) => position, + None => { + let i = self.curarg; + self.curarg += 1; + ArgumentImplicitlyIs(i) + } + }; + + Argument { + position: pos, + position_span, + format, + } + } + + /// Parses a positional argument for a format. This could either be an + /// integer index of an argument, a named argument, or a blank string. + /// Returns `Some(parsed_position)` if the position is not implicitly + /// consuming a macro argument, `None` if it's the case. 
+    fn position(&mut self) -> Option<Position<'a>> {
+        // An explicit numeric index (e.g. `{0}`) is tried first.
+        if let Some(i) = self.integer() {
+            Some(ArgumentIs(i))
+        } else {
+            match self.cur.peek() {
+                Some(&(lo, c)) if is_id_start(c) => {
+                    let word = self.word();
+
+                    // Recover from `r#ident` in format strings.
+                    // FIXME: use a let chain
+                    if word == "r" {
+                        if let Some((pos, '#')) = self.cur.peek() {
+                            // Only treat this as a raw identifier when an
+                            // identifier start actually follows the `#`.
+                            if self.input[pos + 1..]
+                                .chars()
+                                .next()
+                                .is_some_and(is_id_start)
+                            {
+                                // Skip the `#`, then parse the identifier itself.
+                                self.cur.next();
+                                let word = self.word();
+                                // `lo..lo + 2` covers the `r#` prefix; the full
+                                // span additionally covers the identifier.
+                                let prefix_span = self.span(lo, lo + 2);
+                                let full_span = self.span(lo, lo + 2 + word.len());
+                                // The error goes to the front of the error list,
+                                // and the bare name is still returned so that
+                                // parsing of the argument continues.
+                                self.errors.insert(0, ParseError {
+                                    description: "raw identifiers are not supported".to_owned(),
+                                    note: Some("identifiers in format strings can be keywords and don't need to be prefixed with `r#`".to_string()),
+                                    label: "raw identifier used here".to_owned(),
+                                    span: full_span,
+                                    secondary_label: None,
+                                    suggestion: Suggestion::RemoveRawIdent(prefix_span),
+                                });
+                                return Some(ArgumentNamed(word));
+                            }
+                        }
+                    }
+
+                    Some(ArgumentNamed(word))
+                }
+
+                // This is an `ArgumentNext`.
+                // Record the fact and do the resolution after parsing the
+                // format spec, to make things like `{:.*}` work.
+                _ => None,
+            }
+        }
+    }
+
+    /// Returns the byte offset of the next character to be read, or the
+    /// length of the input when the iterator is exhausted.
+    ///
+    /// Takes `&mut self` only because peeking the iterator requires it; no
+    /// character is consumed.
+    fn current_pos(&mut self) -> usize {
+        if let Some(&(pos, _)) = self.cur.peek() {
+            pos
+        } else {
+            self.input.len()
+        }
+    }
+
+    /// Parses a format specifier at the current position, returning all of the
+    /// relevant information in the `FormatSpec` struct.
+    fn format(&mut self) -> FormatSpec<'a> {
+        // Start from an all-defaults spec; `ty` is an empty slice of the input.
+        let mut spec = FormatSpec {
+            fill: None,
+            fill_span: None,
+            align: AlignUnknown,
+            sign: None,
+            alternate: false,
+            zero_pad: false,
+            debug_hex: None,
+            precision: CountImplied,
+            precision_span: None,
+            width: CountImplied,
+            width_span: None,
+            ty: &self.input[..0],
+            ty_span: None,
+        };
+        // Without a leading `:` there is no format spec; keep the defaults.
+        if !self.consume(':') {
+            return spec;
+        }
+
+        // fill character
+        // The next character is only taken as a fill when the character after
+        // it is an alignment marker. The look-ahead runs on a clone of the
+        // iterator, so nothing is consumed unless a fill is found.
+        if let Some(&(idx, c)) = self.cur.peek() {
+            if let Some((_, '>' | '<' | '^')) = self.cur.clone().nth(1) {
+                spec.fill = Some(c);
+                spec.fill_span = Some(self.span(idx, idx + 1));
+                self.cur.next();
+            }
+        }
+        // Alignment
+        if self.consume('<') {
+            spec.align = AlignLeft;
+        } else if self.consume('>') {
+            spec.align = AlignRight;
+        } else if self.consume('^') {
+            spec.align = AlignCenter;
+        }
+        // Sign flags
+        if self.consume('+') {
+            spec.sign = Some(Sign::Plus);
+        } else if self.consume('-') {
+            spec.sign = Some(Sign::Minus);
+        }
+        // Alternate marker
+        if self.consume('#') {
+            spec.alternate = true;
+        }
+        // Width and precision
+        let mut havewidth = false;
+
+        if self.consume('0') {
+            // small ambiguity with '0$' as a format string. In theory this is a
+            // '0' flag and then an ill-formatted format string with just a '$'
+            // and no count, but this is better if we instead interpret this as
+            // no '0' flag and '0$' as the width instead.
+            if let Some(end) = self.consume_pos('$') {
+                spec.width = CountIsParam(0);
+                // `end` is the offset of `$`; the span also covers the `0` before it.
+                spec.width_span = Some(self.span(end - 1, end + 1));
+                havewidth = true;
+            } else {
+                spec.zero_pad = true;
+            }
+        }
+
+        if !havewidth {
+            let start = self.current_pos();
+            spec.width = self.count(start);
+            // A span is only recorded when a width was actually parsed.
+            if spec.width != CountImplied {
+                let end = self.current_pos();
+                spec.width_span = Some(self.span(start, end));
+            }
+        }
+
+        if let Some(start) = self.consume_pos('.') {
+            if self.consume('*') {
+                // Resolve `CountIsNextParam`.
+                // We can do this immediately as `position` is resolved later.
+                let i = self.curarg;
+                self.curarg += 1;
+                spec.precision = CountIsStar(i);
+            } else {
+                // The count starts right after the `.`.
+                spec.precision = self.count(start + 1);
+            }
+            // The precision span starts at the `.` itself.
+            let end = self.current_pos();
+            spec.precision_span = Some(self.span(start, end));
+        }
+
+        let ty_span_start = self.current_pos();
+        // Optional radix followed by the actual format specifier
+        // `x?` / `X?` set `debug_hex` and use `?` as the type; a lone `x` or
+        // `X` is the type itself. `ty_span` is only recorded in the final
+        // branch, where the type is parsed as a word.
+        if self.consume('x') {
+            if self.consume('?') {
+                spec.debug_hex = Some(DebugHex::Lower);
+                spec.ty = "?";
+            } else {
+                spec.ty = "x";
+            }
+        } else if self.consume('X') {
+            if self.consume('?') {
+                spec.debug_hex = Some(DebugHex::Upper);
+                spec.ty = "?";
+            } else {
+                spec.ty = "X";
+            }
+        } else if self.consume('?') {
+            spec.ty = "?";
+        } else {
+            spec.ty = self.word();
+            if !spec.ty.is_empty() {
+                let ty_span_end = self.current_pos();
+                spec.ty_span = Some(self.span(ty_span_start, ty_span_end));
+            }
+        }
+        spec
+    }
+
+    /// Parses an inline assembly template modifier at the current position, returning the modifier
+    /// in the `ty` field of the `FormatSpec` struct.
+    fn inline_asm(&mut self) -> FormatSpec<'a> {
+        // Every field other than `ty` / `ty_span` keeps the default set here.
+        let mut spec = FormatSpec {
+            fill: None,
+            fill_span: None,
+            align: AlignUnknown,
+            sign: None,
+            alternate: false,
+            zero_pad: false,
+            debug_hex: None,
+            precision: CountImplied,
+            precision_span: None,
+            width: CountImplied,
+            width_span: None,
+            ty: &self.input[..0],
+            ty_span: None,
+        };
+        // Without a leading `:` there is no modifier to parse.
+        if !self.consume(':') {
+            return spec;
+        }
+
+        // The modifier is a single word; a span is recorded only when it is non-empty.
+        let ty_span_start = self.current_pos();
+        spec.ty = self.word();
+        if !spec.ty.is_empty() {
+            let ty_span_end = self.current_pos();
+            spec.ty_span = Some(self.span(ty_span_start, ty_span_end));
+        }
+
+        spec
+    }
+
+    /// Parses a `Count` parameter at the current position. This does not check
+    /// for 'CountIsNextParam' because that is only used in precision, not
+    /// width.
+ fn count(&mut self, start: usize) -> Count<'a> { + if let Some(i) = self.integer() { + if self.consume('$') { + CountIsParam(i) + } else { + CountIs(i) + } + } else { + let tmp = self.cur.clone(); + let word = self.word(); + if word.is_empty() { + self.cur = tmp; + CountImplied + } else if let Some(end) = self.consume_pos('$') { + let name_span = self.span(start, end); + CountIsName(word, name_span) + } else { + self.cur = tmp; + CountImplied + } + } + } + + /// Parses a word starting at the current position. A word is the same as + /// Rust identifier, except that it can't start with `_` character. + fn word(&mut self) -> &'a str { + let start = match self.cur.peek() { + Some(&(pos, c)) if is_id_start(c) => { + self.cur.next(); + pos + } + _ => { + return ""; + } + }; + let mut end = None; + while let Some(&(pos, c)) = self.cur.peek() { + if is_id_continue(c) { + self.cur.next(); + } else { + end = Some(pos); + break; + } + } + let end = end.unwrap_or(self.input.len()); + let word = &self.input[start..end]; + if word == "_" { + self.err_with_note( + "invalid argument name `_`", + "invalid argument name", + "argument name cannot be a single underscore", + self.span(start, end), + ); + } + word + } + + fn integer(&mut self) -> Option { + let mut cur: usize = 0; + let mut found = false; + let mut overflow = false; + let start = self.current_pos(); + while let Some(&(_, c)) = self.cur.peek() { + if let Some(i) = c.to_digit(10) { + let (tmp, mul_overflow) = cur.overflowing_mul(10); + let (tmp, add_overflow) = tmp.overflowing_add(i as usize); + if mul_overflow || add_overflow { + overflow = true; + } + cur = tmp; + found = true; + self.cur.next(); + } else { + break; + } + } + + if overflow { + let end = self.current_pos(); + let overflowed_int = &self.input[start..end]; + self.err( + format!( + "integer `{}` does not fit into the type `usize` whose range is `0..={}`", + overflowed_int, + usize::MAX + ), + "integer out of range for `usize`", + self.span(start, end), + 
); + } + + found.then_some(cur) + } + + fn suggest_format(&mut self) { + if let (Some(pos), Some(_)) = (self.consume_pos('?'), self.consume_pos(':')) { + let word = self.word(); + let _end = self.current_pos(); + let pos = self.to_span_index(pos); + self.errors.insert( + 0, + ParseError { + description: "expected format parameter to occur after `:`".to_owned(), + note: Some(format!( + "`?` comes after `:`, try `{}:{}` instead", + word, "?" + )), + label: "expected `?` to occur after `:`".to_owned(), + span: pos.to(pos), + secondary_label: None, + suggestion: Suggestion::None, + }, + ); + } + } + + fn suggest_positional_arg_instead_of_captured_arg(&mut self, arg: Argument<'a>) { + if let Some(end) = self.consume_pos('.') { + let byte_pos = self.to_span_index(end); + let start = InnerOffset(byte_pos.0 + 1); + let field = self.argument(start); + // We can only parse `foo.bar` field access, any deeper nesting, + // or another type of expression, like method calls, are not supported + if !self.consume('}') { + return; + } + if let ArgumentNamed(_) = arg.position { + if let ArgumentNamed(_) = field.position { + self.errors.insert( + 0, + ParseError { + description: "field access isn't supported".to_string(), + note: None, + label: "not supported".to_string(), + span: InnerSpan::new(arg.position_span.start, field.position_span.end), + secondary_label: None, + suggestion: Suggestion::UsePositional, + }, + ); + } + } + } + } +} + +/// Finds the indices of all characters that have been processed and differ between the actual +/// written code (code snippet) and the `InternedString` that gets processed in the `Parser` +/// in order to properly synthesise the intra-string `Span`s for error diagnostics. +// TODO: Can we give an escaped string here? 
probably yes - and a valid one too +fn find_width_map_from_snippet( + input: &str, + snippet: Option, + str_style: Option, +) -> InputStringKind { + let snippet = match snippet { + Some(ref s) if s.starts_with('"') || s.starts_with("r\"") || s.starts_with("r#") => s, + _ => return InputStringKind::NotALiteral, + }; + + if str_style.is_some() { + return InputStringKind::Literal { + width_mappings: Vec::new(), + }; + } + + // Strip quotes. + let snippet = &snippet[1..snippet.len() - 1]; + + // Macros like `println` add a newline at the end. That technically doesn't make them "literals" anymore, but it's fine + // since we will never need to point our spans there, so we lie about it here by ignoring it. + // Since there might actually be newlines in the source code, we need to normalize away all trailing newlines. + // If we only trimmed it off the input, `format!("\n")` would cause a mismatch as here we they actually match up. + // Alternatively, we could just count the trailing newlines and only trim one from the input if they don't match up. + let input_no_nl = input.trim_end_matches('\n'); + let Some(unescaped) = unescape_string(snippet) else { + return InputStringKind::NotALiteral; + }; + + let unescaped_no_nl = unescaped.trim_end_matches('\n'); + + if unescaped_no_nl != input_no_nl { + // The source string that we're pointing at isn't our input, so spans pointing at it will be incorrect. + // This can for example happen with proc macros that respan generated literals. 
+ return InputStringKind::NotALiteral; + } + + let mut s = snippet.char_indices(); + let mut width_mappings = vec![]; + while let Some((pos, c)) = s.next() { + match (c, s.clone().next()) { + // skip whitespace and empty lines ending in '\\' + ('\\', Some((_, '\n'))) => { + let _ = s.next(); + let mut width = 2; + + while let Some((_, c)) = s.clone().next() { + if matches!(c, ' ' | '\n' | '\t') { + width += 1; + let _ = s.next(); + } else { + break; + } + } + + width_mappings.push(InnerWidthMapping::new(pos, width, 0)); + } + ('\\', Some((_, 'n' | 't' | 'r' | '0' | '\\' | '\'' | '\"'))) => { + width_mappings.push(InnerWidthMapping::new(pos, 2, 1)); + let _ = s.next(); + } + ('\\', Some((_, 'x'))) => { + // consume `\xAB` literal + s.nth(2); + width_mappings.push(InnerWidthMapping::new(pos, 4, 1)); + } + ('\\', Some((_, 'u'))) => { + let mut width = 2; + let _ = s.next(); + + if let Some((_, next_c)) = s.next() { + if next_c == '{' { + // consume up to 6 hexanumeric chars + let digits_len = s + .clone() + .take(6) + .take_while(|(_, c)| c.is_digit(16)) + .count(); + + let len_utf8 = s + .as_str() + .get(..digits_len) + .and_then(|digits| u32::from_str_radix(digits, 16).ok()) + .and_then(char::from_u32) + .map_or(1, char::len_utf8); + + // Skip the digits, for chars that encode to more than 1 utf-8 byte + // exclude as many digits as it is greater than 1 byte + // + // So for a 3 byte character, exclude 2 digits + let required_skips = digits_len.saturating_sub(len_utf8.saturating_sub(1)); + + // skip '{' and '}' also + width += required_skips + 2; + + s.nth(digits_len); + } else if next_c.is_digit(16) { + width += 1; + + // We suggest adding `{` and `}` when appropriate, accept it here as if + // it were correct + let mut i = 0; // consume up to 6 hexanumeric chars + while let (Some((_, c)), _) = (s.next(), i < 6) { + if c.is_digit(16) { + width += 1; + } else { + break; + } + i += 1; + } + } + } + + width_mappings.push(InnerWidthMapping::new(pos, width, 1)); + } + _ 
=> {} + } + } + + InputStringKind::Literal { width_mappings } +} + +// TODO: I guess we can provide an `unescape_string` function to the parser... but how do we do that +// Store it in the parser struct? we need to make it FFI-aware +// SO this is not possible because we need `unescape_string` *before* we have a parser + +fn unescape_string(string: &str) -> Option { + // let mut buf = string::String::new(); + // let mut ok = true; + // unescape::unescape_literal(string, unescape::Mode::Str, &mut |_, unescaped_char| { + // match unescaped_char { + // Ok(c) => buf.push(c), + // Err(_) => ok = false, + // } + // }); + + let buf = string::String::from(string); + let ok = true; + + ok.then_some(buf) +} + +// Assert a reasonable size for `Piece` +// #[cfg(all(target_arch = "x86_64", target_pointer_width = "64"))] +// rustc_index::static_assert_size!(Piece<'_>, 16); + +// #[cfg(test)] +// mod tests; \ No newline at end of file diff --git a/libgrust/libformat_parser/src/bin.rs b/libgrust/libformat_parser/src/bin.rs new file mode 100644 index 000000000000..4b1f903ad5fa --- /dev/null +++ b/libgrust/libformat_parser/src/bin.rs @@ -0,0 +1,7 @@ +use libformat_parser::rust; + +fn main() { + dbg!(rust::collect_pieces( + std::env::args().nth(1).unwrap().as_str() + )); +} diff --git a/libgrust/libformat_parser/src/lib.rs b/libgrust/libformat_parser/src/lib.rs new file mode 100644 index 000000000000..e6dc16eeb498 --- /dev/null +++ b/libgrust/libformat_parser/src/lib.rs @@ -0,0 +1,41 @@ +//! FFI interface for `rustc_format_parser` + +// what's the plan? Have a function return something that can be constructed into a vector? +// or an iterator? + +use std::ffi::CStr; + +// TODO: Use rustc's version here #3 +use generic_format_parser::Piece; + +// FIXME: Rename? 
+pub mod rust {
+    use generic_format_parser::{ParseMode, Parser, Piece};
+
+    /// Parses `input` as a `format_args!`-style format string and returns
+    /// every piece the parser yields, in order.
+    ///
+    /// The parser is created with no string style, no source snippet and
+    /// `append_newline` set to `true`. Errors recorded by the parser are not
+    /// returned to the caller.
+    pub fn collect_pieces(input: &str) -> Vec<Piece<'_>> {
+        let parser = Parser::new(input, None, None, true, ParseMode::Format);
+
+        // `Parser` implements `Iterator`, so it can be collected directly.
+        parser.collect()
+    }
+}
+
+/// Pointer/length pair describing a run of `Piece`s handed across the FFI
+/// boundary.
+#[repr(C)]
+pub struct PieceSlice {
+    base_ptr: *const Piece<'static /* FIXME: That's wrong */>,
+    len: usize,
+}
+
+/// C entry point: parses the NUL-terminated format string `input` and returns
+/// the resulting pieces as a pointer/length pair.
+///
+/// Returns a null `base_ptr` with `len == 0` when `input` is null or is not
+/// valid UTF-8.
+///
+/// The returned buffer is deliberately leaked so that the pointer stays valid
+/// after this function returns; nothing in this file frees it yet.
+///
+/// # Safety
+///
+/// A non-null `input` must point to a NUL-terminated string. The pieces borrow
+/// from that string, so it must stay alive and unmodified for as long as the
+/// returned slice is used.
+#[no_mangle]
+pub extern "C" fn collect_pieces(input: *const libc::c_char) -> PieceSlice {
+    // Result used for inputs that cannot be parsed at all.
+    let empty = PieceSlice {
+        base_ptr: std::ptr::null(),
+        len: 0,
+    };
+
+    if input.is_null() {
+        return empty;
+    }
+
+    // SAFETY: `input` is non-null (checked above) and the caller guarantees
+    // that it points to a NUL-terminated string, as documented above.
+    let str = unsafe { CStr::from_ptr(input) };
+
+    // A panic must not unwind out of an `extern "C"` function, so invalid
+    // UTF-8 yields the empty result instead of being unwrapped.
+    let str = match str.to_str() {
+        Ok(str) => str,
+        Err(_) => return empty,
+    };
+
+    // Returning `Vec::as_ptr()` of a local `Vec` would hand out a dangling
+    // pointer, because the `Vec` is dropped when this function returns.
+    // Leaking a boxed slice keeps the allocation alive for the caller.
+    let pieces: &'static [Piece<'static>] =
+        Box::leak(rust::collect_pieces(str).into_boxed_slice());
+
+    PieceSlice {
+        base_ptr: pieces.as_ptr(),
+        len: pieces.len(),
+    }
+}
From af42047389d6958d84082d981febfba930d868d3 Mon Sep 17 00:00:00 2001
From: Arthur Cohen
Date: Mon, 29 Jan 2024 16:13:24 +0100
Subject: [PATCH 03/13] libformat_parser: Add FFI safe interface

libgrust/ChangeLog:

	* libformat_parser/generic_format_parser/src/lib.rs: Add generic library.
	* libformat_parser/src/lib.rs: Add base for FFI interface.
--- .../generic_format_parser/src/lib.rs | 2 +- libgrust/libformat_parser/src/lib.rs | 301 +++++++++++++++++- 2 files changed, 298 insertions(+), 5 deletions(-) diff --git a/libgrust/libformat_parser/generic_format_parser/src/lib.rs b/libgrust/libformat_parser/generic_format_parser/src/lib.rs index f42c9d8dffbb..87a20dc18c56 100644 --- a/libgrust/libformat_parser/generic_format_parser/src/lib.rs +++ b/libgrust/libformat_parser/generic_format_parser/src/lib.rs @@ -1099,4 +1099,4 @@ fn unescape_string(string: &str) -> Option { // rustc_index::static_assert_size!(Piece<'_>, 16); // #[cfg(test)] -// mod tests; \ No newline at end of file +// mod tests; diff --git a/libgrust/libformat_parser/src/lib.rs b/libgrust/libformat_parser/src/lib.rs index e6dc16eeb498..49821e7cd2f4 100644 --- a/libgrust/libformat_parser/src/lib.rs +++ b/libgrust/libformat_parser/src/lib.rs @@ -5,8 +5,298 @@ use std::ffi::CStr; -// TODO: Use rustc's version here #3 -use generic_format_parser::Piece; +mod ffi { + use std::ops::Deref; + + // Note: copied from rustc_span + /// Range inside of a `Span` used for diagnostics when we only have access to relative positions. 
+ #[derive(Copy, Clone, PartialEq, Eq, Debug)] + #[repr(C)] + pub struct InnerSpan { + pub start: usize, + pub end: usize, + } + + // impl InnerSpan { + // pub fn new(start: usize, end: usize) -> InnerSpan { + // InnerSpan { start, end } + // } + // } + + /// The location and before/after width of a character whose width has changed from its source code + /// representation + #[derive(Copy, Clone, PartialEq, Eq)] + #[repr(C)] + pub struct InnerWidthMapping { + /// Index of the character in the source + pub position: usize, + /// The inner width in characters + pub before: usize, + /// The transformed width in characters + pub after: usize, + } + + // impl InnerWidthMapping { + // pub fn new(position: usize, before: usize, after: usize) -> InnerWidthMapping { + // InnerWidthMapping { + // position, + // before, + // after, + // } + // } + // } + + /// Whether the input string is a literal. If yes, it contains the inner width mappings. + #[derive(Clone, PartialEq, Eq)] + #[repr(C)] + enum InputStringKind { + NotALiteral, + Literal { + width_mappings: Vec, + }, + } + + /// The type of format string that we are parsing. + #[derive(Copy, Clone, Debug, Eq, PartialEq)] + #[repr(C)] + pub enum ParseMode { + /// A normal format string as per `format_args!`. + Format, + /// An inline assembly template string for `asm!`. + InlineAsm, + } + + #[derive(Copy, Clone)] + #[repr(C)] + struct InnerOffset(usize); + + /// A piece is a portion of the format string which represents the next part + /// to emit. These are emitted as a stream by the `Parser` class. + #[derive(Clone, Debug, PartialEq)] + #[repr(C)] + pub enum Piece<'a> { + /// A literal string which should directly be emitted + String(&'a str), + /// This describes that formatting should process the next argument (as + /// specified inside) for emission. + NextArgument(Box>), + } + + /// Representation of an argument specification. 
+ #[derive(Copy, Clone, Debug, PartialEq)] + #[repr(C)] + pub struct Argument<'a> { + /// Where to find this argument + pub position: Position<'a>, + /// The span of the position indicator. Includes any whitespace in implicit + /// positions (`{ }`). + pub position_span: InnerSpan, + /// How to format the argument + pub format: FormatSpec<'a>, + } + + /// Specification for the formatting of an argument in the format string. + #[derive(Copy, Clone, Debug, PartialEq)] + #[repr(C)] + pub struct FormatSpec<'a> { + /// Optionally specified character to fill alignment with. + pub fill: Option, + /// Span of the optionally specified fill character. + pub fill_span: Option, + /// Optionally specified alignment. + pub align: Alignment, + /// The `+` or `-` flag. + pub sign: Option, + /// The `#` flag. + pub alternate: bool, + /// The `0` flag. + pub zero_pad: bool, + /// The `x` or `X` flag. (Only for `Debug`.) + pub debug_hex: Option, + /// The integer precision to use. + pub precision: Count<'a>, + /// The span of the precision formatting flag (for diagnostics). + pub precision_span: Option, + /// The string width requested for the resulting format. + pub width: Count<'a>, + /// The span of the width formatting flag (for diagnostics). + pub width_span: Option, + /// The descriptor string representing the name of the format desired for + /// this argument, this can be empty or any number of characters, although + /// it is required to be one word. + pub ty: &'a str, + /// The span of the descriptor string (for diagnostics). + pub ty_span: Option, + } + + /// Enum describing where an argument for a format can be located. + #[derive(Copy, Clone, Debug, PartialEq)] + #[repr(C)] + pub enum Position<'a> { + /// The argument is implied to be located at an index + ArgumentImplicitlyIs(usize), + /// The argument is located at a specific index given in the format, + ArgumentIs(usize), + /// The argument has a name. 
+ ArgumentNamed(&'a str), + } + + /// Enum of alignments which are supported. + #[derive(Copy, Clone, Debug, PartialEq)] + #[repr(C)] + pub enum Alignment { + /// The value will be aligned to the left. + AlignLeft, + /// The value will be aligned to the right. + AlignRight, + /// The value will be aligned in the center. + AlignCenter, + /// The value will take on a default alignment. + AlignUnknown, + } + + /// Enum for the sign flags. + #[derive(Copy, Clone, Debug, PartialEq)] + #[repr(C)] + pub enum Sign { + /// The `+` flag. + Plus, + /// The `-` flag. + Minus, + } + + /// Enum for the debug hex flags. + #[derive(Copy, Clone, Debug, PartialEq)] + #[repr(C)] + pub enum DebugHex { + /// The `x` flag in `{:x?}`. + Lower, + /// The `X` flag in `{:X?}`. + Upper, + } + + /// A count is used for the precision and width parameters of an integer, and + /// can reference either an argument or a literal integer. + #[derive(Copy, Clone, Debug, PartialEq)] + #[repr(C)] + pub enum Count<'a> { + /// The count is specified explicitly. + CountIs(usize), + /// The count is specified by the argument with the given name. + CountIsName(&'a str, InnerSpan), + /// The count is specified by the argument at the given index. + CountIsParam(usize), + /// The count is specified by a star (like in `{:.*}`) that refers to the argument at the given index. + CountIsStar(usize), + /// The count is implied and cannot be explicitly specified. 
+ CountImplied, + } + + impl<'a> From> for Piece<'a> { + fn from(old: generic_format_parser::Piece<'a>) -> Self { + match old { + generic_format_parser::Piece::String(x) => Piece::String(x), + generic_format_parser::Piece::NextArgument(x) => { + Piece::NextArgument(Box::new(Into::::into(*x))) + } + } + } + } + + impl<'a> From> for Argument<'a> { + fn from(old: generic_format_parser::Argument<'a>) -> Self { + Argument { + position: old.position.into(), + position_span: old.position_span.into(), + format: old.format.into(), + } + } + } + + impl<'a> From> for Position<'a> { + fn from(old: generic_format_parser::Position<'a>) -> Self { + match old { + generic_format_parser::Position::ArgumentImplicitlyIs(x) => { + Position::ArgumentImplicitlyIs(x.into()) + } + generic_format_parser::Position::ArgumentIs(x) => Position::ArgumentIs(x.into()), + generic_format_parser::Position::ArgumentNamed(x) => { + Position::ArgumentNamed(x.into()) + } + } + } + } + + impl From for InnerSpan { + fn from(old: generic_format_parser::InnerSpan) -> Self { + InnerSpan { + start: old.start, + end: old.end, + } + } + } + + impl<'a> From> for FormatSpec<'a> { + fn from(old: generic_format_parser::FormatSpec<'a>) -> Self { + FormatSpec { + fill: old.fill, + fill_span: old.fill_span.map(Into::into), + align: old.align.into(), + sign: old.sign.map(Into::into), + alternate: old.alternate, + zero_pad: old.zero_pad, + debug_hex: old.debug_hex.map(Into::into), + precision: old.precision.into(), + precision_span: old.precision_span.map(Into::into), + width: old.width.into(), + width_span: old.width_span.map(Into::into), + ty: old.ty, + ty_span: old.ty_span.map(Into::into), + } + } + } + + impl From for DebugHex { + fn from(old: generic_format_parser::DebugHex) -> Self { + match old { + generic_format_parser::DebugHex::Lower => DebugHex::Lower, + generic_format_parser::DebugHex::Upper => DebugHex::Upper, + } + } + } + + impl<'a> From> for Count<'a> { + fn from(old: generic_format_parser::Count<'a>) -> 
Self { + match old { + generic_format_parser::Count::CountIs(x) => Count::CountIs(x), + generic_format_parser::Count::CountIsName(x, y) => Count::CountIsName(x, y.into()), + generic_format_parser::Count::CountIsParam(x) => Count::CountIsParam(x), + generic_format_parser::Count::CountIsStar(x) => Count::CountIsStar(x), + generic_format_parser::Count::CountImplied => Count::CountImplied, + } + } + } + + impl From for Sign { + fn from(old: generic_format_parser::Sign) -> Self { + match old { + generic_format_parser::Sign::Plus => Sign::Plus, + generic_format_parser::Sign::Minus => Sign::Minus, + } + } + } + + impl From for Alignment { + fn from(old: generic_format_parser::Alignment) -> Self { + match old { + generic_format_parser::Alignment::AlignLeft => Alignment::AlignLeft, + generic_format_parser::Alignment::AlignRight => Alignment::AlignRight, + generic_format_parser::Alignment::AlignCenter => Alignment::AlignCenter, + generic_format_parser::Alignment::AlignUnknown => Alignment::AlignUnknown, + } + } + } +} // FIXME: Rename? pub mod rust { @@ -22,7 +312,7 @@ pub mod rust { #[repr(C)] pub struct PieceSlice { - base_ptr: *const Piece<'static /* FIXME: That's wrong */>, + base_ptr: *const ffi::Piece<'static /* FIXME: That's wrong */>, len: usize, } @@ -32,7 +322,10 @@ pub extern "C" fn collect_pieces(input: *const libc::c_char) -> PieceSlice { let str = unsafe { CStr::from_ptr(input) }; // FIXME: No unwrap - let pieces = rust::collect_pieces(str.to_str().unwrap()); + let pieces: Vec> = rust::collect_pieces(str.to_str().unwrap()) + .into_iter() + .map(Into::into) + .collect(); PieceSlice { base_ptr: pieces.as_ptr(), From 6f7a373c0727c61d06e591c33e969c7f5286b7db Mon Sep 17 00:00:00 2001 From: Arthur Cohen Date: Mon, 29 Jan 2024 16:14:13 +0100 Subject: [PATCH 04/13] libformat_parser: Start experimenting with cbindgen libgrust/ChangeLog: * libformat_parser/cbindgen.toml: New file. * libformat_parser/libformat-parser.h: New file. 
gcc/rust/ChangeLog: * ast/rust-fmt.h: Add remaining FFI types. --- gcc/rust/ast/rust-fmt.h | 4 +- libgrust/libformat_parser/cbindgen.toml | 0 libgrust/libformat_parser/libformat-parser.h | 224 +++++++++++++++++++ 3 files changed, 226 insertions(+), 2 deletions(-) create mode 100644 libgrust/libformat_parser/cbindgen.toml create mode 100644 libgrust/libformat_parser/libformat-parser.h diff --git a/gcc/rust/ast/rust-fmt.h b/gcc/rust/ast/rust-fmt.h index 0050977358f1..27c1c3625d3e 100644 --- a/gcc/rust/ast/rust-fmt.h +++ b/gcc/rust/ast/rust-fmt.h @@ -92,11 +92,11 @@ struct FormatSpec /// The `x` or `X` flag. (Only for `Debug`.) tl::optional debug_hex; /// The integer precision to use. - // Count <'a> precision; + Count precision; /// The span of the precision formatting flag (for diagnostics). tl::optional precision_span; /// The string width requested for the resulting format. - // Count <'a> width; + Count width; /// The span of the width formatting flag (for diagnostics). tl::optional width_span; /// The descriptor string representing the name of the format desired for diff --git a/libgrust/libformat_parser/cbindgen.toml b/libgrust/libformat_parser/cbindgen.toml new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/libgrust/libformat_parser/libformat-parser.h b/libgrust/libformat_parser/libformat-parser.h new file mode 100644 index 000000000000..a4bc8a754944 --- /dev/null +++ b/libgrust/libformat_parser/libformat-parser.h @@ -0,0 +1,224 @@ +#include +#include +#include +#include +#include + +/// Enum of alignments which are supported. +enum class Alignment +{ + /// The value will be aligned to the left. + AlignLeft, + /// The value will be aligned to the right. + AlignRight, + /// The value will be aligned in the center. + AlignCenter, + /// The value will take on a default alignment. + AlignUnknown, +}; + +/// Enum for the debug hex flags. +enum class DebugHex +{ + /// The `x` flag in `{:x?}`. + Lower, + /// The `X` flag in `{:X?}`. 
+ Upper, +}; + +/// Enum for the sign flags. +enum class Sign +{ + /// The `+` flag. + Plus, + /// The `-` flag. + Minus, +}; + +template struct Box; + +template struct Option; + +/// Enum describing where an argument for a format can be located. +struct Position +{ + enum class Tag + { + /// The argument is implied to be located at an index + ArgumentImplicitlyIs, + /// The argument is located at a specific index given in the format, + ArgumentIs, + /// The argument has a name. + ArgumentNamed, + }; + + struct ArgumentImplicitlyIs_Body + { + uintptr_t _0; + }; + + struct ArgumentIs_Body + { + uintptr_t _0; + }; + + struct ArgumentNamed_Body + { + const str *_0; + }; + + Tag tag; + union + { + ArgumentImplicitlyIs_Body argument_implicitly_is; + ArgumentIs_Body argument_is; + ArgumentNamed_Body argument_named; + }; +}; + +/// Range inside of a `Span` used for diagnostics when we only have access to +/// relative positions. +struct InnerSpan +{ + uintptr_t start; + uintptr_t end; +}; + +/// A count is used for the precision and width parameters of an integer, and +/// can reference either an argument or a literal integer. +struct Count +{ + enum class Tag + { + /// The count is specified explicitly. + CountIs, + /// The count is specified by the argument with the given name. + CountIsName, + /// The count is specified by the argument at the given index. + CountIsParam, + /// The count is specified by a star (like in `{:.*}`) that refers to the + /// argument at the given index. + CountIsStar, + /// The count is implied and cannot be explicitly specified. 
+ CountImplied, + }; + + struct CountIs_Body + { + uintptr_t _0; + }; + + struct CountIsName_Body + { + const str *_0; + InnerSpan _1; + }; + + struct CountIsParam_Body + { + uintptr_t _0; + }; + + struct CountIsStar_Body + { + uintptr_t _0; + }; + + Tag tag; + union + { + CountIs_Body count_is; + CountIsName_Body count_is_name; + CountIsParam_Body count_is_param; + CountIsStar_Body count_is_star; + }; +}; + +/// Specification for the formatting of an argument in the format string. +struct FormatSpec +{ + /// Optionally specified character to fill alignment with. + Option fill; + /// Span of the optionally specified fill character. + Option fill_span; + /// Optionally specified alignment. + Alignment align; + /// The `+` or `-` flag. + Option sign; + /// The `#` flag. + bool alternate; + /// The `0` flag. + bool zero_pad; + /// The `x` or `X` flag. (Only for `Debug`.) + Option debug_hex; + /// The integer precision to use. + Count precision; + /// The span of the precision formatting flag (for diagnostics). + Option precision_span; + /// The string width requested for the resulting format. + Count width; + /// The span of the width formatting flag (for diagnostics). + Option width_span; + /// The descriptor string representing the name of the format desired for + /// this argument, this can be empty or any number of characters, although + /// it is required to be one word. + const str *ty; + /// The span of the descriptor string (for diagnostics). + Option ty_span; +}; + +/// Representation of an argument specification. +struct Argument +{ + /// Where to find this argument + Position position; + /// The span of the position indicator. Includes any whitespace in implicit + /// positions (`{ }`). + InnerSpan position_span; + /// How to format the argument + FormatSpec format; +}; + +/// A piece is a portion of the format string which represents the next part +/// to emit. These are emitted as a stream by the `Parser` class. 
+struct Piece +{ + enum class Tag + { + /// A literal string which should directly be emitted + String, + /// This describes that formatting should process the next argument (as + /// specified inside) for emission. + NextArgument, + }; + + struct String_Body + { + const str *_0; + }; + + struct NextArgument_Body + { + Box _0; + }; + + Tag tag; + union + { + String_Body string; + NextArgument_Body next_argument; + }; +}; + +struct PieceSlice +{ + const Piece *base_ptr; + uintptr_t len; +}; + +extern "C" { + +PieceSlice +collect_pieces (const char *input); + +} // extern "C" From 66f80323943a3146ed4b994e339d4fb3fd5f8b40 Mon Sep 17 00:00:00 2001 From: Arthur Cohen Date: Mon, 29 Jan 2024 16:17:00 +0100 Subject: [PATCH 05/13] git: Ignore libgrust build folders ChangeLog: * .gitignore: Add libgrust target folders to the ignore list. --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 88b8aa27a882..b1c6625d645c 100644 --- a/.gitignore +++ b/.gitignore @@ -76,3 +76,4 @@ test.code-workspace gcc/rust/test3-tiny/* .clang-format.swap +libgrust/*/target/ From 9fb89f05bdac84977be9779567d5ce76868c4a8b Mon Sep 17 00:00:00 2001 From: Arthur Cohen Date: Mon, 29 Jan 2024 22:06:39 +0100 Subject: [PATCH 06/13] libformat_parser: Update header and remove old interface gcc/rust/ChangeLog: * ast/rust-fmt.cc (Pieces::collect): Use new Pieces API. * ast/rust-fmt.h: Update interface with new FFI bindings. libgrust/ChangeLog: * libformat_parser/src/lib.rs: Add IntoFFI trait. * libformat_parser/libformat-parser.h: Removed. 
--- gcc/rust/ast/rust-fmt.cc | 10 +- gcc/rust/ast/rust-fmt.h | 199 ++++++++++++---- libgrust/libformat_parser/libformat-parser.h | 224 ------------------- libgrust/libformat_parser/src/lib.rs | 56 +++-- 4 files changed, 200 insertions(+), 289 deletions(-) delete mode 100644 libgrust/libformat_parser/libformat-parser.h diff --git a/gcc/rust/ast/rust-fmt.cc b/gcc/rust/ast/rust-fmt.cc index 559b1c8b5795..a7c4341c52db 100644 --- a/gcc/rust/ast/rust-fmt.cc +++ b/gcc/rust/ast/rust-fmt.cc @@ -17,6 +17,7 @@ // . #include "rust-fmt.h" +#include "rust-diagnostics.h" namespace Rust { namespace Fmt { @@ -26,13 +27,12 @@ Pieces::collect (const std::string &to_parse) { auto piece_slice = collect_pieces (to_parse.c_str ()); - rust_debug ("[ARTHUR] %p, %lu", (void *) piece_slice.ptr, piece_slice.len); + rust_debug ("[ARTHUR] %p, %lu", (const void *) piece_slice.base_ptr, + piece_slice.len); // this performs multiple copies, can we avoid them maybe? - auto pieces - = std::vector (piece_slice.ptr, piece_slice.ptr + piece_slice.len); - - rust_debug ("[ARTHUR] %p, %lu", (void *) pieces.data (), pieces.size ()); + // auto pieces = std::vector (piece_slice.base_ptr, + // piece_slice.base_ptr + piece_slice.len); return Pieces{}; } diff --git a/gcc/rust/ast/rust-fmt.h b/gcc/rust/ast/rust-fmt.h index 27c1c3625d3e..7ec9a2a199dd 100644 --- a/gcc/rust/ast/rust-fmt.h +++ b/gcc/rust/ast/rust-fmt.h @@ -1,4 +1,4 @@ -// Copyright (C) 2020-2023 Free Software Foundation, Inc. +// Copyright (C) 2023-2024 Free Software Foundation, Inc. // This file is part of GCC. @@ -19,9 +19,10 @@ #ifndef RUST_FMT_H #define RUST_FMT_H -#include "rust-diagnostics.h" #include "rust-system.h" +// FIXME: How to encode Option? + namespace Rust { namespace Fmt { @@ -30,116 +31,220 @@ struct RustHamster // hehe }; -struct InnerSpan +/// Enum of alignments which are supported. +enum class Alignment { + /// The value will be aligned to the left. + AlignLeft, + /// The value will be aligned to the right. 
+ AlignRight, + /// The value will be aligned in the center. + AlignCenter, + /// The value will take on a default alignment. + AlignUnknown, }; -struct Count +/// Enum for the debug hex flags. +enum class DebugHex { - enum class Kind - { - Is, - IsName, - IsParam, - IsStar, - Implied - } kind; - - union - { - size_t is; - std::pair is_name; - size_t is_param; - size_t is_star; - } data; + /// The `x` flag in `{:x?}`. + Lower, + /// The `X` flag in `{:X?}`. + Upper, }; -struct DebugHex +/// Enum for the sign flags. +enum class Sign { + /// The `+` flag. + Plus, + /// The `-` flag. + Minus, }; -struct Sign +/// Enum describing where an argument for a format can be located. +struct Position { -}; + enum class Tag + { + /// The argument is implied to be located at an index + ArgumentImplicitlyIs, + /// The argument is located at a specific index given in the format, + ArgumentIs, + /// The argument has a name. + ArgumentNamed, + }; -struct Alignment -{ + struct ArgumentImplicitlyIs_Body + { + size_t _0; + }; + + struct ArgumentIs_Body + { + size_t _0; + }; + + struct ArgumentNamed_Body + { + RustHamster _0; + }; + + Tag tag; + union + { + ArgumentImplicitlyIs_Body argument_implicitly_is; + ArgumentIs_Body argument_is; + ArgumentNamed_Body argument_named; + }; }; -struct RustString +/// Range inside of a `Span` used for diagnostics when we only have access to +/// relative positions. +struct InnerSpan { - // hehe + size_t start; + size_t end; }; -struct Position +/// A count is used for the precision and width parameters of an integer, and +/// can reference either an argument or a literal integer. +struct Count { + enum class Tag + { + /// The count is specified explicitly. + CountIs, + /// The count is specified by the argument with the given name. + CountIsName, + /// The count is specified by the argument at the given index. + CountIsParam, + /// The count is specified by a star (like in `{:.*}`) that refers to the + /// argument at the given index. 
+ CountIsStar, + /// The count is implied and cannot be explicitly specified. + CountImplied, + }; + + struct CountIs_Body + { + size_t _0; + }; + + struct CountIsName_Body + { + RustHamster _0; + InnerSpan _1; + }; + + struct CountIsParam_Body + { + size_t _0; + }; + + struct CountIsStar_Body + { + size_t _0; + }; + + Tag tag; + union + { + CountIs_Body count_is; + CountIsName_Body count_is_name; + CountIsParam_Body count_is_param; + CountIsStar_Body count_is_star; + }; }; +/// Specification for the formatting of an argument in the format string. struct FormatSpec { /// Optionally specified character to fill alignment with. - tl::optional fill; + const uint32_t *fill; /// Span of the optionally specified fill character. - tl::optional fill_span; + const InnerSpan *fill_span; /// Optionally specified alignment. Alignment align; /// The `+` or `-` flag. - tl::optional sign; + const Sign *sign; /// The `#` flag. bool alternate; /// The `0` flag. bool zero_pad; /// The `x` or `X` flag. (Only for `Debug`.) - tl::optional debug_hex; + const DebugHex *debug_hex; /// The integer precision to use. Count precision; /// The span of the precision formatting flag (for diagnostics). - tl::optional precision_span; + const InnerSpan *precision_span; /// The string width requested for the resulting format. Count width; /// The span of the width formatting flag (for diagnostics). - tl::optional width_span; + const InnerSpan *width_span; /// The descriptor string representing the name of the format desired for /// this argument, this can be empty or any number of characters, although /// it is required to be one word. RustHamster ty; - // &'a str ty; /// The span of the descriptor string (for diagnostics). - tl::optional ty_span; + const InnerSpan *ty_span; }; +/// Representation of an argument specification. struct Argument { + /// Where to find this argument Position position; - InnerSpan inner_span; + /// The span of the position indicator. 
Includes any whitespace in implicit + /// positions (`{ }`). + InnerSpan position_span; + /// How to format the argument FormatSpec format; }; +/// A piece is a portion of the format string which represents the next part +/// to emit. These are emitted as a stream by the `Parser` class. struct Piece { - enum class Kind + enum class Tag { + /// A literal string which should directly be emitted String, - NextArgument - } kind; + /// This describes that formatting should process the next argument (as + /// specified inside) for emission. + NextArgument, + }; + + struct String_Body + { + RustHamster _0; + }; + + struct NextArgument_Body + { + const Argument *_0; + }; + Tag tag; union { - RustString string; - Argument *next_argument; - } data; + String_Body string; + NextArgument_Body next_argument; + }; }; struct PieceSlice { - Piece *ptr; + const Piece *base_ptr; size_t len; }; extern "C" { + PieceSlice -collect_pieces (const char *); -} +collect_pieces (const char *input); + +} // extern "C" struct Pieces { @@ -149,4 +254,4 @@ struct Pieces } // namespace Fmt } // namespace Rust -#endif // ! RUST_FMT_H +#endif // !RUST_FMT_H diff --git a/libgrust/libformat_parser/libformat-parser.h b/libgrust/libformat_parser/libformat-parser.h deleted file mode 100644 index a4bc8a754944..000000000000 --- a/libgrust/libformat_parser/libformat-parser.h +++ /dev/null @@ -1,224 +0,0 @@ -#include -#include -#include -#include -#include - -/// Enum of alignments which are supported. -enum class Alignment -{ - /// The value will be aligned to the left. - AlignLeft, - /// The value will be aligned to the right. - AlignRight, - /// The value will be aligned in the center. - AlignCenter, - /// The value will take on a default alignment. - AlignUnknown, -}; - -/// Enum for the debug hex flags. -enum class DebugHex -{ - /// The `x` flag in `{:x?}`. - Lower, - /// The `X` flag in `{:X?}`. - Upper, -}; - -/// Enum for the sign flags. -enum class Sign -{ - /// The `+` flag. 
- Plus, - /// The `-` flag. - Minus, -}; - -template struct Box; - -template struct Option; - -/// Enum describing where an argument for a format can be located. -struct Position -{ - enum class Tag - { - /// The argument is implied to be located at an index - ArgumentImplicitlyIs, - /// The argument is located at a specific index given in the format, - ArgumentIs, - /// The argument has a name. - ArgumentNamed, - }; - - struct ArgumentImplicitlyIs_Body - { - uintptr_t _0; - }; - - struct ArgumentIs_Body - { - uintptr_t _0; - }; - - struct ArgumentNamed_Body - { - const str *_0; - }; - - Tag tag; - union - { - ArgumentImplicitlyIs_Body argument_implicitly_is; - ArgumentIs_Body argument_is; - ArgumentNamed_Body argument_named; - }; -}; - -/// Range inside of a `Span` used for diagnostics when we only have access to -/// relative positions. -struct InnerSpan -{ - uintptr_t start; - uintptr_t end; -}; - -/// A count is used for the precision and width parameters of an integer, and -/// can reference either an argument or a literal integer. -struct Count -{ - enum class Tag - { - /// The count is specified explicitly. - CountIs, - /// The count is specified by the argument with the given name. - CountIsName, - /// The count is specified by the argument at the given index. - CountIsParam, - /// The count is specified by a star (like in `{:.*}`) that refers to the - /// argument at the given index. - CountIsStar, - /// The count is implied and cannot be explicitly specified. - CountImplied, - }; - - struct CountIs_Body - { - uintptr_t _0; - }; - - struct CountIsName_Body - { - const str *_0; - InnerSpan _1; - }; - - struct CountIsParam_Body - { - uintptr_t _0; - }; - - struct CountIsStar_Body - { - uintptr_t _0; - }; - - Tag tag; - union - { - CountIs_Body count_is; - CountIsName_Body count_is_name; - CountIsParam_Body count_is_param; - CountIsStar_Body count_is_star; - }; -}; - -/// Specification for the formatting of an argument in the format string. 
-struct FormatSpec -{ - /// Optionally specified character to fill alignment with. - Option fill; - /// Span of the optionally specified fill character. - Option fill_span; - /// Optionally specified alignment. - Alignment align; - /// The `+` or `-` flag. - Option sign; - /// The `#` flag. - bool alternate; - /// The `0` flag. - bool zero_pad; - /// The `x` or `X` flag. (Only for `Debug`.) - Option debug_hex; - /// The integer precision to use. - Count precision; - /// The span of the precision formatting flag (for diagnostics). - Option precision_span; - /// The string width requested for the resulting format. - Count width; - /// The span of the width formatting flag (for diagnostics). - Option width_span; - /// The descriptor string representing the name of the format desired for - /// this argument, this can be empty or any number of characters, although - /// it is required to be one word. - const str *ty; - /// The span of the descriptor string (for diagnostics). - Option ty_span; -}; - -/// Representation of an argument specification. -struct Argument -{ - /// Where to find this argument - Position position; - /// The span of the position indicator. Includes any whitespace in implicit - /// positions (`{ }`). - InnerSpan position_span; - /// How to format the argument - FormatSpec format; -}; - -/// A piece is a portion of the format string which represents the next part -/// to emit. These are emitted as a stream by the `Parser` class. -struct Piece -{ - enum class Tag - { - /// A literal string which should directly be emitted - String, - /// This describes that formatting should process the next argument (as - /// specified inside) for emission. 
- NextArgument, - }; - - struct String_Body - { - const str *_0; - }; - - struct NextArgument_Body - { - Box _0; - }; - - Tag tag; - union - { - String_Body string; - NextArgument_Body next_argument; - }; -}; - -struct PieceSlice -{ - const Piece *base_ptr; - uintptr_t len; -}; - -extern "C" { - -PieceSlice -collect_pieces (const char *input); - -} // extern "C" diff --git a/libgrust/libformat_parser/src/lib.rs b/libgrust/libformat_parser/src/lib.rs index 49821e7cd2f4..4bbc468c7557 100644 --- a/libgrust/libformat_parser/src/lib.rs +++ b/libgrust/libformat_parser/src/lib.rs @@ -5,8 +5,31 @@ use std::ffi::CStr; +trait IntoFFI { + type Output; + + fn into_ffi(&self) -> Self::Output; +} + +impl IntoFFI for Option +where + T: Sized, +{ + type Output = *const T; + + fn into_ffi(&self) -> Self::Output { + match self.as_ref() { + None => std::ptr::null(), + Some(r) => r as *const T, + } + } +} + +// FIXME: Make an ffi module in a separate file +// FIXME: Remember to leak the boxed type somehow +// FIXME: How to encode the Option type? As a pointer? Option -> Option<&T> -> *const T could work maybe? mod ffi { - use std::ops::Deref; + use super::IntoFFI; // Note: copied from rustc_span /// Range inside of a `Span` used for diagnostics when we only have access to relative positions. @@ -102,31 +125,31 @@ mod ffi { /// Optionally specified character to fill alignment with. pub fill: Option, /// Span of the optionally specified fill character. - pub fill_span: Option, + pub fill_span: *const InnerSpan, /// Optionally specified alignment. pub align: Alignment, /// The `+` or `-` flag. - pub sign: Option, + pub sign: *const Sign, /// The `#` flag. pub alternate: bool, /// The `0` flag. pub zero_pad: bool, /// The `x` or `X` flag. (Only for `Debug`.) - pub debug_hex: Option, + pub debug_hex: *const DebugHex, /// The integer precision to use. pub precision: Count<'a>, /// The span of the precision formatting flag (for diagnostics). 
- pub precision_span: Option, + pub precision_span: *const InnerSpan, /// The string width requested for the resulting format. pub width: Count<'a>, /// The span of the width formatting flag (for diagnostics). - pub width_span: Option, + pub width_span: *const InnerSpan, /// The descriptor string representing the name of the format desired for /// this argument, this can be empty or any number of characters, although /// it is required to be one word. pub ty: &'a str, /// The span of the descriptor string (for diagnostics). - pub ty_span: Option, + pub ty_span: *const InnerSpan, } /// Enum describing where an argument for a format can be located. @@ -197,6 +220,11 @@ mod ffi { match old { generic_format_parser::Piece::String(x) => Piece::String(x), generic_format_parser::Piece::NextArgument(x) => { + // FIXME: This is problematic - if we do this, then we probably run into the issue that the Box + // is freed at the end of the call to collect_pieces. if we just .leak() it, then we have + // a memory leak... should we resend the info back to the Rust lib afterwards to free it? 
+ // this is definitely the best way - store that pointer in the FFI piece and rebuild the box + // in a Rust destructor Piece::NextArgument(Box::new(Into::::into(*x))) } } @@ -240,18 +268,18 @@ mod ffi { fn from(old: generic_format_parser::FormatSpec<'a>) -> Self { FormatSpec { fill: old.fill, - fill_span: old.fill_span.map(Into::into), + fill_span: old.fill_span.map(Into::into).into_ffi(), align: old.align.into(), - sign: old.sign.map(Into::into), + sign: old.sign.map(Into::into).into_ffi(), alternate: old.alternate, zero_pad: old.zero_pad, - debug_hex: old.debug_hex.map(Into::into), + debug_hex: old.debug_hex.map(Into::into).into_ffi(), precision: old.precision.into(), - precision_span: old.precision_span.map(Into::into), + precision_span: old.precision_span.map(Into::into).into_ffi(), width: old.width.into(), - width_span: old.width_span.map(Into::into), + width_span: old.width_span.map(Into::into).into_ffi(), ty: old.ty, - ty_span: old.ty_span.map(Into::into), + ty_span: old.ty_span.map(Into::into).into_ffi(), } } } @@ -327,6 +355,8 @@ pub extern "C" fn collect_pieces(input: *const libc::c_char) -> PieceSlice { .map(Into::into) .collect(); + println!("debug: {:?}, {:?}", pieces.as_ptr(), pieces.len()); + PieceSlice { base_ptr: pieces.as_ptr(), len: pieces.len(), From c3006f03941ce13233c7bf37fdf71d98c76ef916 Mon Sep 17 00:00:00 2001 From: Arthur Cohen Date: Tue, 30 Jan 2024 01:48:13 +0100 Subject: [PATCH 07/13] libformat_parser: Send boxed values across FFI properly gcc/rust/ChangeLog: * ast/rust-fmt.cc (Pieces::~Pieces): Call libformat_parser's release function in destructor. * ast/rust-fmt.h (struct PieceSlice): Add capacity. (destroy_pieces): New. (struct Pieces): Add destructor. libgrust/ChangeLog: * libformat_parser/src/lib.rs: Leak Boxes properly for C++ to see them, add memory release function. 
--- gcc/rust/ast/rust-fmt.cc | 4 +- gcc/rust/ast/rust-fmt.h | 9 +++ libgrust/libformat_parser/src/lib.rs | 94 ++++++++++++++-------------- 3 files changed, 58 insertions(+), 49 deletions(-) diff --git a/gcc/rust/ast/rust-fmt.cc b/gcc/rust/ast/rust-fmt.cc index a7c4341c52db..f6ee8a209137 100644 --- a/gcc/rust/ast/rust-fmt.cc +++ b/gcc/rust/ast/rust-fmt.cc @@ -34,8 +34,10 @@ Pieces::collect (const std::string &to_parse) // auto pieces = std::vector (piece_slice.base_ptr, // piece_slice.base_ptr + piece_slice.len); - return Pieces{}; + return Pieces (piece_slice); } +Pieces::~Pieces () { destroy_pieces (slice); } + } // namespace Fmt } // namespace Rust diff --git a/gcc/rust/ast/rust-fmt.h b/gcc/rust/ast/rust-fmt.h index 7ec9a2a199dd..50aeff6433ee 100644 --- a/gcc/rust/ast/rust-fmt.h +++ b/gcc/rust/ast/rust-fmt.h @@ -237,6 +237,7 @@ struct PieceSlice { const Piece *base_ptr; size_t len; + size_t cap; }; extern "C" { @@ -244,11 +245,19 @@ extern "C" { PieceSlice collect_pieces (const char *input); +void destroy_pieces (PieceSlice); + } // extern "C" struct Pieces { static Pieces collect (const std::string &to_parse); + ~Pieces (); + +private: + Pieces (PieceSlice slice) : slice (slice) {} + + PieceSlice slice; }; } // namespace Fmt diff --git a/libgrust/libformat_parser/src/lib.rs b/libgrust/libformat_parser/src/lib.rs index 4bbc468c7557..9b2bffed05d4 100644 --- a/libgrust/libformat_parser/src/lib.rs +++ b/libgrust/libformat_parser/src/lib.rs @@ -3,21 +3,17 @@ // what's the plan? Have a function return something that can be constructed into a vector? // or an iterator? 
-use std::ffi::CStr; +use std::{ffi::CStr, mem}; -trait IntoFFI { - type Output; - - fn into_ffi(&self) -> Self::Output; +trait IntoFFI { + fn into_ffi(self) -> T; } -impl IntoFFI for Option +impl IntoFFI<*const T> for Option where T: Sized, { - type Output = *const T; - - fn into_ffi(&self) -> Self::Output { + fn into_ffi(self) -> *const T { match self.as_ref() { None => std::ptr::null(), Some(r) => r as *const T, @@ -40,12 +36,6 @@ mod ffi { pub end: usize, } - // impl InnerSpan { - // pub fn new(start: usize, end: usize) -> InnerSpan { - // InnerSpan { start, end } - // } - // } - /// The location and before/after width of a character whose width has changed from its source code /// representation #[derive(Copy, Clone, PartialEq, Eq)] @@ -59,35 +49,27 @@ mod ffi { pub after: usize, } - // impl InnerWidthMapping { - // pub fn new(position: usize, before: usize, after: usize) -> InnerWidthMapping { - // InnerWidthMapping { - // position, - // before, - // after, - // } - // } + // TODO: Not needed for now? + // /// Whether the input string is a literal. If yes, it contains the inner width mappings. + // #[derive(Clone, PartialEq, Eq)] + // #[repr(C)] + // enum InputStringKind { + // NotALiteral, + // Literal { + // width_mappings: Vec, + // }, // } - /// Whether the input string is a literal. If yes, it contains the inner width mappings. - #[derive(Clone, PartialEq, Eq)] - #[repr(C)] - enum InputStringKind { - NotALiteral, - Literal { - width_mappings: Vec, - }, - } - - /// The type of format string that we are parsing. - #[derive(Copy, Clone, Debug, Eq, PartialEq)] - #[repr(C)] - pub enum ParseMode { - /// A normal format string as per `format_args!`. - Format, - /// An inline assembly template string for `asm!`. - InlineAsm, - } + // TODO: Not needed for now? + // /// The type of format string that we are parsing. + // #[derive(Copy, Clone, Debug, Eq, PartialEq)] + // #[repr(C)] + // pub enum ParseMode { + // /// A normal format string as per `format_args!`. 
+ // Format, + // /// An inline assembly template string for `asm!`. + // InlineAsm, + // } #[derive(Copy, Clone)] #[repr(C)] @@ -102,7 +84,13 @@ mod ffi { String(&'a str), /// This describes that formatting should process the next argument (as /// specified inside) for emission. - NextArgument(Box>), + NextArgument(*const Argument<'a>), + } + + impl<'a> Drop for Piece<'a> { + fn drop(&mut self) { + println!("dropping Piece: {:?}", self) + } } /// Representation of an argument specification. @@ -225,7 +213,10 @@ mod ffi { // a memory leak... should we resend the info back to the Rust lib afterwards to free it? // this is definitely the best way - store that pointer in the FFI piece and rebuild the box // in a Rust destructor - Piece::NextArgument(Box::new(Into::::into(*x))) + let ptr = Box::leak(x); + let dst = Into::::into(*ptr); + + Piece::NextArgument(&dst as *const Argument) } } } @@ -331,17 +322,18 @@ pub mod rust { use generic_format_parser::{ParseMode, Parser, Piece}; pub fn collect_pieces(input: &str) -> Vec> { - // let parser = Parser::new(); let parser = Parser::new(input, None, None, true, ParseMode::Format); parser.into_iter().collect() } } +// TODO: Should we instead make an FFIVector struct? 
#[repr(C)] pub struct PieceSlice { - base_ptr: *const ffi::Piece<'static /* FIXME: That's wrong */>, + base_ptr: *mut ffi::Piece<'static /* FIXME: That's wrong */>, len: usize, + cap: usize, } #[no_mangle] @@ -355,10 +347,16 @@ pub extern "C" fn collect_pieces(input: *const libc::c_char) -> PieceSlice { .map(Into::into) .collect(); - println!("debug: {:?}, {:?}", pieces.as_ptr(), pieces.len()); + println!("[ARTHUR]: debug: {:?}, {:?}", pieces.as_ptr(), pieces.len()); PieceSlice { - base_ptr: pieces.as_ptr(), len: pieces.len(), + cap: pieces.capacity(), + base_ptr: pieces.leak().as_mut_ptr(), } } + +#[no_mangle] +pub extern "C" fn destroy_pieces(PieceSlice { base_ptr, len, cap }: PieceSlice) { + let _ = unsafe { Vec::from_raw_parts(base_ptr, len, cap) }; +} From dc76d45a8a4916c091f7afdd0cdb45681657e930 Mon Sep 17 00:00:00 2001 From: Arthur Cohen Date: Tue, 30 Jan 2024 16:16:36 +0100 Subject: [PATCH 08/13] format_args: Parse format string properly gcc/rust/ChangeLog: * expand/rust-macro-builtins.cc (MacroBuiltin::format_args_handler): Construct string to parse properly. 
--- gcc/rust/expand/rust-macro-builtins.cc | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/gcc/rust/expand/rust-macro-builtins.cc b/gcc/rust/expand/rust-macro-builtins.cc index 0e57406f10f8..19ea91094539 100644 --- a/gcc/rust/expand/rust-macro-builtins.cc +++ b/gcc/rust/expand/rust-macro-builtins.cc @@ -947,7 +947,24 @@ tl::optional MacroBuiltin::format_args_handler (location_t invoc_locus, AST::MacroInvocData &invoc) { - Fmt::Pieces::collect ("heyo this {is} what I {} want to {3}, {parse}"); + auto fmt_expr + = parse_single_string_literal (BuiltinMacro::FormatArgs, + invoc.get_delim_tok_tree (), invoc_locus, + invoc.get_expander ()); + + if (!fmt_expr) + return AST::Fragment::create_error (); + + // if it is not a literal, it's an eager macro invocation - return it + if (!fmt_expr->is_literal ()) + { + auto token_tree = invoc.get_delim_tok_tree (); + return AST::Fragment ({AST::SingleASTNode (std::move (fmt_expr))}, + token_tree.to_token_stream ()); + } + + auto format_string = fmt_expr->as_string (); + auto pieces = Fmt::Pieces::collect (format_string); return AST::Fragment::create_empty (); } From 7a556de4061e384dce6b847e3c8ab98c3b33dde7 Mon Sep 17 00:00:00 2001 From: Arthur Cohen Date: Wed, 7 Feb 2024 12:46:16 +0100 Subject: [PATCH 09/13] format_args: Parse entire token invocation gcc/rust/ChangeLog: * expand/rust-macro-builtins.cc (MacroBuiltin::format_args_handler): Transform entire invocation token stream into string for the parser. --- gcc/rust/expand/rust-macro-builtins.cc | 40 ++++++++++++++------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/gcc/rust/expand/rust-macro-builtins.cc b/gcc/rust/expand/rust-macro-builtins.cc index 19ea91094539..2af05a5e3777 100644 --- a/gcc/rust/expand/rust-macro-builtins.cc +++ b/gcc/rust/expand/rust-macro-builtins.cc @@ -16,6 +16,8 @@ // along with GCC; see the file COPYING3. If not see // . 
+#include "libproc_macro_internal/tokenstream.h" +#include "rust-token-converter.h" #include "rust-system.h" #include "rust-macro-builtins.h" #include "rust-ast-fragment.h" @@ -947,24 +949,26 @@ tl::optional MacroBuiltin::format_args_handler (location_t invoc_locus, AST::MacroInvocData &invoc) { - auto fmt_expr - = parse_single_string_literal (BuiltinMacro::FormatArgs, - invoc.get_delim_tok_tree (), invoc_locus, - invoc.get_expander ()); - - if (!fmt_expr) - return AST::Fragment::create_error (); - - // if it is not a literal, it's an eager macro invocation - return it - if (!fmt_expr->is_literal ()) - { - auto token_tree = invoc.get_delim_tok_tree (); - return AST::Fragment ({AST::SingleASTNode (std::move (fmt_expr))}, - token_tree.to_token_stream ()); - } - - auto format_string = fmt_expr->as_string (); - auto pieces = Fmt::Pieces::collect (format_string); + auto tokens = invoc.get_delim_tok_tree ().to_token_stream (); + tokens.erase (tokens.begin ()); + tokens.pop_back (); + + std::stringstream stream; + for (const auto &tok : tokens) + stream << tok->as_string () << ' '; + + rust_debug ("[ARTHU]: `%s`", stream.str ().c_str ()); + + // FIXME: We need to handle this + // // if it is not a literal, it's an eager macro invocation - return it + // if (!fmt_expr->is_literal ()) + // { + // auto token_tree = invoc.get_delim_tok_tree (); + // return AST::Fragment ({AST::SingleASTNode (std::move (fmt_expr))}, + // token_tree.to_token_stream ()); + // } + + auto pieces = Fmt::Pieces::collect (stream.str ()); return AST::Fragment::create_empty (); } From a829fc4acf82c62c1d1e3cf9aec871900712c3eb Mon Sep 17 00:00:00 2001 From: Arthur Cohen Date: Tue, 13 Feb 2024 16:31:25 +0100 Subject: [PATCH 10/13] rust-fmt: Store parsed string in Pieces struct gcc/rust/ChangeLog: * ast/rust-fmt.cc (Pieces::collect): Fix signature to take ownership of the given string. * ast/rust-fmt.h (struct Pieces): Store parsed string in the struct. 
libgrust/ChangeLog: * libformat_parser/src/lib.rs: Add debug prompt. --- gcc/rust/ast/rust-fmt.cc | 4 ++-- gcc/rust/ast/rust-fmt.h | 7 +++++-- libgrust/libformat_parser/src/lib.rs | 1 + 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/gcc/rust/ast/rust-fmt.cc b/gcc/rust/ast/rust-fmt.cc index f6ee8a209137..511e94740c5e 100644 --- a/gcc/rust/ast/rust-fmt.cc +++ b/gcc/rust/ast/rust-fmt.cc @@ -23,7 +23,7 @@ namespace Rust { namespace Fmt { Pieces -Pieces::collect (const std::string &to_parse) +Pieces::collect (std::string &&to_parse) { auto piece_slice = collect_pieces (to_parse.c_str ()); @@ -34,7 +34,7 @@ Pieces::collect (const std::string &to_parse) // auto pieces = std::vector (piece_slice.base_ptr, // piece_slice.base_ptr + piece_slice.len); - return Pieces (piece_slice); + return Pieces (piece_slice, std::move (to_parse)); } Pieces::~Pieces () { destroy_pieces (slice); } diff --git a/gcc/rust/ast/rust-fmt.h b/gcc/rust/ast/rust-fmt.h index 50aeff6433ee..0bf9695bb6d2 100644 --- a/gcc/rust/ast/rust-fmt.h +++ b/gcc/rust/ast/rust-fmt.h @@ -251,13 +251,16 @@ void destroy_pieces (PieceSlice); struct Pieces { - static Pieces collect (const std::string &to_parse); + static Pieces collect (std::string &&to_parse); ~Pieces (); private: - Pieces (PieceSlice slice) : slice (slice) {} + Pieces (PieceSlice slice, std::string &&to_parse) + : slice (slice), to_parse (std::move (to_parse)) + {} PieceSlice slice; + std::string to_parse; }; } // namespace Fmt diff --git a/libgrust/libformat_parser/src/lib.rs b/libgrust/libformat_parser/src/lib.rs index 9b2bffed05d4..eb3e1060e5d8 100644 --- a/libgrust/libformat_parser/src/lib.rs +++ b/libgrust/libformat_parser/src/lib.rs @@ -340,6 +340,7 @@ pub struct PieceSlice { pub extern "C" fn collect_pieces(input: *const libc::c_char) -> PieceSlice { // FIXME: Add comment let str = unsafe { CStr::from_ptr(input) }; + dbg!(str); // FIXME: No unwrap let pieces: Vec> = rust::collect_pieces(str.to_str().unwrap()) From 
3cd6cd76b570af74bafdd277368f0b717dda597b Mon Sep 17 00:00:00 2001 From: Arthur Cohen Date: Thu, 15 Feb 2024 13:11:26 +0100 Subject: [PATCH 11/13] libformat_parser: Fix Rust warnings. libgrust/ChangeLog: * libformat_parser/generic_format_parser/src/lib.rs: Remove unused deprecated attribute and unused import. * libformat_parser/src/lib.rs: Remove unused import. --- libgrust/libformat_parser/generic_format_parser/src/lib.rs | 2 -- libgrust/libformat_parser/src/lib.rs | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/libgrust/libformat_parser/generic_format_parser/src/lib.rs b/libgrust/libformat_parser/generic_format_parser/src/lib.rs index 87a20dc18c56..6a366177f252 100644 --- a/libgrust/libformat_parser/generic_format_parser/src/lib.rs +++ b/libgrust/libformat_parser/generic_format_parser/src/lib.rs @@ -14,12 +14,10 @@ // WARNING: We want to be able to build this crate with a stable compiler, // so no `#![feature]` attributes should be added! -#[deprecated(note = "Use a proper lexer function for this")] fn is_id_start(c: char) -> bool { c == '_' || unicode_xid::UnicodeXID::is_xid_start(c) } -#[deprecated(note = "Use a proper lexer function for this")] fn is_id_continue(c: char) -> bool { unicode_xid::UnicodeXID::is_xid_continue(c) } diff --git a/libgrust/libformat_parser/src/lib.rs b/libgrust/libformat_parser/src/lib.rs index eb3e1060e5d8..c164578a1039 100644 --- a/libgrust/libformat_parser/src/lib.rs +++ b/libgrust/libformat_parser/src/lib.rs @@ -3,7 +3,7 @@ // what's the plan? Have a function return something that can be constructed into a vector? // or an iterator? -use std::{ffi::CStr, mem}; +use std::ffi::CStr; trait IntoFFI { fn into_ffi(self) -> T; From a32eeae202f00488ccb60ea367aecc05f85b1e36 Mon Sep 17 00:00:00 2001 From: Arthur Cohen Date: Mon, 26 Feb 2024 11:55:47 +0100 Subject: [PATCH 12/13] format-parser: Add `is_some_and` method for Option Workaround for Ubuntu 18.04, since we still use it for the GCC 4.8 CI. 
The default Rust package is 1.65 (and unlikely to change I assume?), but the generic format parser library uses `is_some_and` which was introduced in 1.70. So this is a simple reimplementation, directly taken from the standard library sources. libgrust/ChangeLog: * libformat_parser/generic_format_parser/src/lib.rs: Add IsSomeAnd trait, impl it for Option. --- .../generic_format_parser/src/lib.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/libgrust/libformat_parser/generic_format_parser/src/lib.rs b/libgrust/libformat_parser/generic_format_parser/src/lib.rs index 6a366177f252..8062bf9e5cec 100644 --- a/libgrust/libformat_parser/generic_format_parser/src/lib.rs +++ b/libgrust/libformat_parser/generic_format_parser/src/lib.rs @@ -22,6 +22,22 @@ fn is_id_continue(c: char) -> bool { unicode_xid::UnicodeXID::is_xid_continue(c) } +// Workaround for Ubuntu 18.04. The default Rust package is 1.65 (and unlikely to change I assume?), but the +// generic format parser library uses `is_some_and` which was introduced in 1.70. So this is a reimplementation, +// directly taken from the standard library sources +trait IsSomeAnd<T> { + fn is_some_and(self, f: impl FnOnce(T) -> bool) -> bool; +} + +impl<T> IsSomeAnd<T> for Option<T> { + fn is_some_and(self, f: impl FnOnce(T) -> bool) -> bool { + match self { + None => false, + Some(x) => f(x), + } + } +} + // use rustc_lexer::unescape; pub use Alignment::*; pub use Count::*; From 7d2d63900d0cf9c605b968aca5f51c94ced20579 Mon Sep 17 00:00:00 2001 From: Arthur Cohen Date: Mon, 26 Feb 2024 11:57:54 +0100 Subject: [PATCH 13/13] ci: Install cargo on ubuntu 18.04 container. ChangeLog: * .github/workflows/ccpp.yml: Install cargo for GCC 4.8 job.
--- .github/workflows/ccpp.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ccpp.yml b/.github/workflows/ccpp.yml index 34908b6eec86..10a909019fbd 100644 --- a/.github/workflows/ccpp.yml +++ b/.github/workflows/ccpp.yml @@ -192,7 +192,8 @@ jobs: g++-4.8 \ gcc-4.8-multilib \ g++-4.8-multilib \ - dejagnu + dejagnu \ + cargo - name: Configure run: |