diff --git a/.github/workflows/ccpp.yml b/.github/workflows/ccpp.yml index 34908b6eec86..10a909019fbd 100644 --- a/.github/workflows/ccpp.yml +++ b/.github/workflows/ccpp.yml @@ -192,7 +192,8 @@ jobs: g++-4.8 \ gcc-4.8-multilib \ g++-4.8-multilib \ - dejagnu + dejagnu \ + cargo - name: Configure run: | diff --git a/.gitignore b/.gitignore index 88b8aa27a882..b1c6625d645c 100644 --- a/.gitignore +++ b/.gitignore @@ -76,3 +76,4 @@ test.code-workspace gcc/rust/test3-tiny/* .clang-format.swap +libgrust/*/target/ diff --git a/gcc/rust/Make-lang.in b/gcc/rust/Make-lang.in index 4d6460187924..7c8ab6e78464 100644 --- a/gcc/rust/Make-lang.in +++ b/gcc/rust/Make-lang.in @@ -54,6 +54,8 @@ GCCRS_D_OBJS = \ rust/rustspec.o \ $(END) +LIBS += -ldl -lpthread + gccrs$(exeext): $(GCCRS_D_OBJS) $(EXTRA_GCC_OBJS) libcommon-target.a $(LIBDEPS) +$(LINKER) $(ALL_LINKERFLAGS) $(LDFLAGS) -o $@ \ $(GCCRS_D_OBJS) $(EXTRA_GCC_OBJS) libcommon-target.a \ @@ -100,6 +102,7 @@ GRS_OBJS = \ rust/rust-proc-macro-invoc-lexer.o \ rust/rust-macro-substitute-ctx.o \ rust/rust-macro-builtins.o \ + rust/rust-fmt.o \ rust/rust-hir.o \ rust/rust-hir-map.o \ rust/rust-attributes.o \ @@ -208,14 +211,14 @@ RUST_ALL_OBJS = $(GRS_OBJS) $(RUST_TARGET_OBJS) rust_OBJS = $(RUST_ALL_OBJS) rust/rustspec.o -RUST_LDFLAGS = $(LDFLAGS) -L./../libgrust/libproc_macro_internal -RUST_LIBDEPS = $(LIBDEPS) ../libgrust/libproc_macro_internal/libproc_macro_internal.a +RUST_LDFLAGS = $(LDFLAGS) -L./../libgrust/libproc_macro_internal -L./../libgrust/librustc_format_parser/ +RUST_LIBDEPS = $(LIBDEPS) ../libgrust/libproc_macro_internal/libproc_macro_internal.a rust/libformat_parser.a # The compiler itself is called crab1 crab1$(exeext): $(RUST_ALL_OBJS) attribs.o $(BACKEND) $(RUST_LIBDEPS) $(rust.prev) @$(call LINK_PROGRESS,$(INDEX.rust),start) +$(LLINKER) $(ALL_LINKERFLAGS) $(RUST_LDFLAGS) -o $@ \ - $(RUST_ALL_OBJS) attribs.o $(BACKEND) $(LIBS) ../libgrust/libproc_macro_internal/libproc_macro_internal.a $(BACKENDLIBS) + 
$(RUST_ALL_OBJS) attribs.o $(BACKEND) $(LIBS) ../libgrust/libproc_macro_internal/libproc_macro_internal.a rust/libformat_parser.a $(BACKENDLIBS) @$(call LINK_PROGRESS,$(INDEX.rust),end) # Build hooks. @@ -401,6 +404,13 @@ rust/%.o: rust/lex/%.cc $(COMPILE) $(RUST_CXXFLAGS) $(RUST_INCLUDES) $< $(POSTCOMPILE) +%.toml: + echo $@ + +rust/libformat_parser.a: $(srcdir)/../libgrust/libformat_parser/Cargo.toml $(wildcard $(srcdir)/../libgrust/libformat_parser/src/*.rs) + cargo build --manifest-path $(srcdir)/../libgrust/libformat_parser/Cargo.toml --release # FIXME: Not always release, right? + cp $(srcdir)/../libgrust/libformat_parser/target/release/liblibformat_parser.a $@ + # build all rust/parse files in rust folder, add cross-folder includes rust/%.o: rust/parse/%.cc $(COMPILE) $(RUST_CXXFLAGS) $(RUST_INCLUDES) $< diff --git a/gcc/rust/ast/rust-fmt.cc b/gcc/rust/ast/rust-fmt.cc new file mode 100644 index 000000000000..511e94740c5e --- /dev/null +++ b/gcc/rust/ast/rust-fmt.cc @@ -0,0 +1,43 @@ +// Copyright (C) 2020-2023 Free Software Foundation, Inc. + +// This file is part of GCC. + +// GCC is free software; you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 3, or (at your option) any later +// version. + +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. + +// You should have received a copy of the GNU General Public License +// along with GCC; see the file COPYING3. If not see +// . 
+ +#include "rust-fmt.h" +#include "rust-diagnostics.h" + +namespace Rust { +namespace Fmt { + +Pieces +Pieces::collect (std::string &&to_parse) +{ + auto piece_slice = collect_pieces (to_parse.c_str ()); + + rust_debug ("[ARTHUR] %p, %lu", (const void *) piece_slice.base_ptr, + piece_slice.len); + + // this performs multiple copies, can we avoid them maybe? + // auto pieces = std::vector (piece_slice.base_ptr, + // piece_slice.base_ptr + piece_slice.len); + + return Pieces (piece_slice, std::move (to_parse)); +} + +Pieces::~Pieces () { destroy_pieces (slice); } + +} // namespace Fmt +} // namespace Rust diff --git a/gcc/rust/ast/rust-fmt.h b/gcc/rust/ast/rust-fmt.h new file mode 100644 index 000000000000..0bf9695bb6d2 --- /dev/null +++ b/gcc/rust/ast/rust-fmt.h @@ -0,0 +1,269 @@ +// Copyright (C) 2023-2024 Free Software Foundation, Inc. + +// This file is part of GCC. + +// GCC is free software; you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 3, or (at your option) any later +// version. + +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. + +// You should have received a copy of the GNU General Public License +// along with GCC; see the file COPYING3. If not see +// . + +#ifndef RUST_FMT_H +#define RUST_FMT_H + +#include "rust-system.h" + +// FIXME: How to encode Option? + +namespace Rust { +namespace Fmt { + +struct RustHamster +{ + // hehe +}; + +/// Enum of alignments which are supported. +enum class Alignment +{ + /// The value will be aligned to the left. + AlignLeft, + /// The value will be aligned to the right. + AlignRight, + /// The value will be aligned in the center. + AlignCenter, + /// The value will take on a default alignment. 
+ AlignUnknown, +}; + +/// Enum for the debug hex flags. +enum class DebugHex +{ + /// The `x` flag in `{:x?}`. + Lower, + /// The `X` flag in `{:X?}`. + Upper, +}; + +/// Enum for the sign flags. +enum class Sign +{ + /// The `+` flag. + Plus, + /// The `-` flag. + Minus, +}; + +/// Enum describing where an argument for a format can be located. +struct Position +{ + enum class Tag + { + /// The argument is implied to be located at an index + ArgumentImplicitlyIs, + /// The argument is located at a specific index given in the format, + ArgumentIs, + /// The argument has a name. + ArgumentNamed, + }; + + struct ArgumentImplicitlyIs_Body + { + size_t _0; + }; + + struct ArgumentIs_Body + { + size_t _0; + }; + + struct ArgumentNamed_Body + { + RustHamster _0; + }; + + Tag tag; + union + { + ArgumentImplicitlyIs_Body argument_implicitly_is; + ArgumentIs_Body argument_is; + ArgumentNamed_Body argument_named; + }; +}; + +/// Range inside of a `Span` used for diagnostics when we only have access to +/// relative positions. +struct InnerSpan +{ + size_t start; + size_t end; +}; + +/// A count is used for the precision and width parameters of an integer, and +/// can reference either an argument or a literal integer. +struct Count +{ + enum class Tag + { + /// The count is specified explicitly. + CountIs, + /// The count is specified by the argument with the given name. + CountIsName, + /// The count is specified by the argument at the given index. + CountIsParam, + /// The count is specified by a star (like in `{:.*}`) that refers to the + /// argument at the given index. + CountIsStar, + /// The count is implied and cannot be explicitly specified. 
+ CountImplied, + }; + + struct CountIs_Body + { + size_t _0; + }; + + struct CountIsName_Body + { + RustHamster _0; + InnerSpan _1; + }; + + struct CountIsParam_Body + { + size_t _0; + }; + + struct CountIsStar_Body + { + size_t _0; + }; + + Tag tag; + union + { + CountIs_Body count_is; + CountIsName_Body count_is_name; + CountIsParam_Body count_is_param; + CountIsStar_Body count_is_star; + }; +}; + +/// Specification for the formatting of an argument in the format string. +struct FormatSpec +{ + /// Optionally specified character to fill alignment with. + const uint32_t *fill; + /// Span of the optionally specified fill character. + const InnerSpan *fill_span; + /// Optionally specified alignment. + Alignment align; + /// The `+` or `-` flag. + const Sign *sign; + /// The `#` flag. + bool alternate; + /// The `0` flag. + bool zero_pad; + /// The `x` or `X` flag. (Only for `Debug`.) + const DebugHex *debug_hex; + /// The integer precision to use. + Count precision; + /// The span of the precision formatting flag (for diagnostics). + const InnerSpan *precision_span; + /// The string width requested for the resulting format. + Count width; + /// The span of the width formatting flag (for diagnostics). + const InnerSpan *width_span; + /// The descriptor string representing the name of the format desired for + /// this argument, this can be empty or any number of characters, although + /// it is required to be one word. + RustHamster ty; + /// The span of the descriptor string (for diagnostics). + const InnerSpan *ty_span; +}; + +/// Representation of an argument specification. +struct Argument +{ + /// Where to find this argument + Position position; + /// The span of the position indicator. Includes any whitespace in implicit + /// positions (`{ }`). + InnerSpan position_span; + /// How to format the argument + FormatSpec format; +}; + +/// A piece is a portion of the format string which represents the next part +/// to emit. 
+/// These are emitted as a stream by the `Parser` class.
+///
+/// C++ view of one parsed piece: `tag` selects which member of the anonymous
+/// union is meaningful.
+/// NOTE(review): presumably this has to match the layout of the Rust `Piece`
+/// enum exposed by libformat_parser - confirm against the Rust side.
+struct Piece
+{
+  enum class Tag
+  {
+    /// A literal string which should directly be emitted
+    String,
+    /// This describes that formatting should process the next argument (as
+    /// specified inside) for emission.
+    NextArgument,
+  };
+
+  /// Body presumably paired with `Tag::String` (matched by name).
+  struct String_Body
+  {
+    RustHamster _0;
+  };
+
+  /// Body presumably paired with `Tag::NextArgument` (matched by name).
+  struct NextArgument_Body
+  {
+    const Argument *_0;
+  };
+
+  Tag tag;
+  union
+  {
+    String_Body string;
+    NextArgument_Body next_argument;
+  };
+};
+
+/// Plain aggregate describing a run of `Piece`s: a base pointer plus `len`
+/// and `cap` counts.  It is what `collect_pieces` returns and what
+/// `destroy_pieces` takes.
+/// NOTE(review): looks like the raw parts of a Rust `Vec` - TODO confirm.
+struct PieceSlice
+{
+  const Piece *base_ptr;
+  size_t len;
+  size_t cap;
+};
+
+// Functions with C linkage; they are only declared here, not defined in this
+// header.
+extern "C" {
+
+/// Parse the NUL-terminated string `input` and return the resulting pieces.
+PieceSlice
+collect_pieces (const char *input);
+
+/// Counterpart of `collect_pieces`; called from `Pieces::~Pieces` in
+/// rust-fmt.cc with the slice obtained from `collect_pieces`.
+void destroy_pieces (PieceSlice);
+
+} // extern "C"
+
+/// Wrapper keeping a `PieceSlice` together with the string it was parsed
+/// from.  Instances are created through `collect`; the constructor is private.
+/// NOTE(review): the destructor is user-provided (rust-fmt.cc passes `slice`
+/// to `destroy_pieces`) while the copy operations are left implicit, so a copy
+/// would hand the same slice to `destroy_pieces` twice - consider deleting the
+/// copy constructor and copy assignment.
+struct Pieces
+{
+  /// Parse `to_parse` and take ownership of both the result and the string.
+  static Pieces collect (std::string &&to_parse);
+  ~Pieces ();
+
+private:
+  Pieces (PieceSlice slice, std::string &&to_parse)
+    : slice (slice), to_parse (std::move (to_parse))
+  {}
+
+  PieceSlice slice;
+  std::string to_parse;
+};
+
+} // namespace Fmt
+} // namespace Rust
+
+#endif // !RUST_FMT_H
diff --git a/gcc/rust/expand/rust-macro-builtins.cc b/gcc/rust/expand/rust-macro-builtins.cc
index 71da575563db..2af05a5e3777 100644
--- a/gcc/rust/expand/rust-macro-builtins.cc
+++ b/gcc/rust/expand/rust-macro-builtins.cc
@@ -16,6 +16,8 @@
 // along with GCC; see the file COPYING3. If not see
 // <http://www.gnu.org/licenses/>.
+#include "libproc_macro_internal/tokenstream.h" +#include "rust-token-converter.h" #include "rust-system.h" #include "rust-macro-builtins.h" #include "rust-ast-fragment.h" @@ -30,6 +32,7 @@ #include "rust-parse.h" #include "rust-session-manager.h" #include "rust-attribute-values.h" +#include "rust-fmt.h" namespace Rust { @@ -89,8 +92,8 @@ std::unordered_map {"env", MacroBuiltin::env_handler}, {"cfg", MacroBuiltin::cfg_handler}, {"include", MacroBuiltin::include_handler}, + {"format_args", MacroBuiltin::format_args_handler}, /* Unimplemented macro builtins */ - {"format_args", MacroBuiltin::sorry}, {"option_env", MacroBuiltin::sorry}, {"format_args_nl", MacroBuiltin::sorry}, {"concat_idents", MacroBuiltin::sorry}, @@ -942,6 +945,34 @@ MacroBuiltin::stringify_handler (location_t invoc_locus, return AST::Fragment ({node}, std::move (token)); } +tl::optional +MacroBuiltin::format_args_handler (location_t invoc_locus, + AST::MacroInvocData &invoc) +{ + auto tokens = invoc.get_delim_tok_tree ().to_token_stream (); + tokens.erase (tokens.begin ()); + tokens.pop_back (); + + std::stringstream stream; + for (const auto &tok : tokens) + stream << tok->as_string () << ' '; + + rust_debug ("[ARTHU]: `%s`", stream.str ().c_str ()); + + // FIXME: We need to handle this + // // if it is not a literal, it's an eager macro invocation - return it + // if (!fmt_expr->is_literal ()) + // { + // auto token_tree = invoc.get_delim_tok_tree (); + // return AST::Fragment ({AST::SingleASTNode (std::move (fmt_expr))}, + // token_tree.to_token_stream ()); + // } + + auto pieces = Fmt::Pieces::collect (stream.str ()); + + return AST::Fragment::create_empty (); +} + tl::optional MacroBuiltin::sorry (location_t invoc_locus, AST::MacroInvocData &invoc) { diff --git a/gcc/rust/expand/rust-macro-builtins.h b/gcc/rust/expand/rust-macro-builtins.h index 6a84a8b86f68..f9ab3fc3698e 100644 --- a/gcc/rust/expand/rust-macro-builtins.h +++ b/gcc/rust/expand/rust-macro-builtins.h @@ -157,6 +157,9 @@ class 
MacroBuiltin static tl::optional line_handler (location_t invoc_locus, AST::MacroInvocData &invoc); + static tl::optional + format_args_handler (location_t invoc_locus, AST::MacroInvocData &invoc); + static tl::optional sorry (location_t invoc_locus, AST::MacroInvocData &invoc); diff --git a/libgrust/libformat_parser/Cargo.lock b/libgrust/libformat_parser/Cargo.lock new file mode 100644 index 000000000000..65e48263c71a --- /dev/null +++ b/libgrust/libformat_parser/Cargo.lock @@ -0,0 +1,30 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "generic_format_parser" +version = "0.1.0" +dependencies = [ + "unicode-xid", +] + +[[package]] +name = "libc" +version = "0.2.152" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7" + +[[package]] +name = "libformat_parser" +version = "0.1.0" +dependencies = [ + "generic_format_parser", + "libc", +] + +[[package]] +name = "unicode-xid" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" diff --git a/libgrust/libformat_parser/Cargo.toml b/libgrust/libformat_parser/Cargo.toml new file mode 100644 index 000000000000..0fcfa3e89a4c --- /dev/null +++ b/libgrust/libformat_parser/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "libformat_parser" +version = "0.1.0" +edition = "2021" + +[workspace] + +members = [ + "generic_format_parser", +] + +[dependencies] +libc = "0.2" +generic_format_parser = { path = "generic_format_parser" } + +[lib] +crate_type = ["staticlib", "rlib"] + +[[bin]] +name = "format_parser_test" +path = "src/bin.rs" diff --git a/libgrust/libformat_parser/cbindgen.toml b/libgrust/libformat_parser/cbindgen.toml new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/libgrust/libformat_parser/generic_format_parser/Cargo.toml 
b/libgrust/libformat_parser/generic_format_parser/Cargo.toml
new file mode 100644
index 000000000000..34577038cbed
--- /dev/null
+++ b/libgrust/libformat_parser/generic_format_parser/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "generic_format_parser"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+unicode-xid = "0.2.0"
diff --git a/libgrust/libformat_parser/generic_format_parser/src/lib.rs b/libgrust/libformat_parser/generic_format_parser/src/lib.rs
new file mode 100644
index 000000000000..8062bf9e5cec
--- /dev/null
+++ b/libgrust/libformat_parser/generic_format_parser/src/lib.rs
@@ -0,0 +1,1116 @@
+//! Macro support for format strings
+//!
+//! These structures are used when parsing format strings for the compiler.
+//! Parsing does not happen at runtime: structures of `std::fmt::rt` are
+//! generated instead.
+
+#![doc(
+    html_root_url = "https://doc.rust-lang.org/nightly/nightly-rustc/",
+    html_playground_url = "https://play.rust-lang.org/",
+    test(attr(deny(warnings)))
+)]
+#![deny(rustc::untranslatable_diagnostic)]
+#![deny(rustc::diagnostic_outside_of_impl)]
+// WARNING: We want to be able to build this crate with a stable compiler,
+// so no `#![feature]` attributes should be added!
+
+/// Returns `true` when `c` may begin an identifier: either an underscore or a
+/// character accepted by `UnicodeXID::is_xid_start`.
+fn is_id_start(c: char) -> bool {
+    if c == '_' {
+        return true;
+    }
+
+    unicode_xid::UnicodeXID::is_xid_start(c)
+}
+
+/// Returns `true` when `c` is accepted by `UnicodeXID::is_xid_continue`.
+fn is_id_continue(c: char) -> bool {
+    use unicode_xid::UnicodeXID;
+
+    UnicodeXID::is_xid_continue(c)
+}
+
+// Workaround for Ubuntu 18.04. The default Rust package is 1.65 (and unlikely to change I assume?), but the
+// generic format parser library uses `is_some_and` which was introduced in 1.70.
So this is a reimplementation, +// directly taken from the standard library sources +trait IsSomeAnd { + fn is_some_and(self, f: impl FnOnce(T) -> bool) -> bool; +} + +impl IsSomeAnd for Option { + fn is_some_and(self, f: impl FnOnce(T) -> bool) -> bool { + match self { + None => false, + Some(x) => f(x), + } + } +} + +// use rustc_lexer::unescape; +pub use Alignment::*; +pub use Count::*; +pub use Piece::*; +pub use Position::*; + +use std::iter; +use std::str; +use std::string; + +// Note: copied from rustc_span +/// Range inside of a `Span` used for diagnostics when we only have access to relative positions. +#[derive(Copy, Clone, PartialEq, Eq, Debug)] +pub struct InnerSpan { + pub start: usize, + pub end: usize, +} + +impl InnerSpan { + pub fn new(start: usize, end: usize) -> InnerSpan { + InnerSpan { start, end } + } +} + +/// The location and before/after width of a character whose width has changed from its source code +/// representation +#[derive(Copy, Clone, PartialEq, Eq)] +pub struct InnerWidthMapping { + /// Index of the character in the source + pub position: usize, + /// The inner width in characters + pub before: usize, + /// The transformed width in characters + pub after: usize, +} + +impl InnerWidthMapping { + pub fn new(position: usize, before: usize, after: usize) -> InnerWidthMapping { + InnerWidthMapping { + position, + before, + after, + } + } +} + +/// Whether the input string is a literal. If yes, it contains the inner width mappings. +#[derive(Clone, PartialEq, Eq)] +enum InputStringKind { + NotALiteral, + Literal { + width_mappings: Vec, + }, +} + +/// The type of format string that we are parsing. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum ParseMode { + /// A normal format string as per `format_args!`. + Format, + /// An inline assembly template string for `asm!`. 
+ InlineAsm, +} + +#[derive(Copy, Clone)] +struct InnerOffset(usize); + +impl InnerOffset { + fn to(self, end: InnerOffset) -> InnerSpan { + InnerSpan::new(self.0, end.0) + } +} + +/// A piece is a portion of the format string which represents the next part +/// to emit. These are emitted as a stream by the `Parser` class. +#[derive(Clone, Debug, PartialEq)] +pub enum Piece<'a> { + /// A literal string which should directly be emitted + String(&'a str), + /// This describes that formatting should process the next argument (as + /// specified inside) for emission. + NextArgument(Box>), +} + +/// Representation of an argument specification. +#[derive(Copy, Clone, Debug, PartialEq)] +pub struct Argument<'a> { + /// Where to find this argument + pub position: Position<'a>, + /// The span of the position indicator. Includes any whitespace in implicit + /// positions (`{ }`). + pub position_span: InnerSpan, + /// How to format the argument + pub format: FormatSpec<'a>, +} + +/// Specification for the formatting of an argument in the format string. +#[derive(Copy, Clone, Debug, PartialEq)] +pub struct FormatSpec<'a> { + /// Optionally specified character to fill alignment with. + pub fill: Option, + /// Span of the optionally specified fill character. + pub fill_span: Option, + /// Optionally specified alignment. + pub align: Alignment, + /// The `+` or `-` flag. + pub sign: Option, + /// The `#` flag. + pub alternate: bool, + /// The `0` flag. + pub zero_pad: bool, + /// The `x` or `X` flag. (Only for `Debug`.) + pub debug_hex: Option, + /// The integer precision to use. + pub precision: Count<'a>, + /// The span of the precision formatting flag (for diagnostics). + pub precision_span: Option, + /// The string width requested for the resulting format. + pub width: Count<'a>, + /// The span of the width formatting flag (for diagnostics). 
+ pub width_span: Option, + /// The descriptor string representing the name of the format desired for + /// this argument, this can be empty or any number of characters, although + /// it is required to be one word. + pub ty: &'a str, + /// The span of the descriptor string (for diagnostics). + pub ty_span: Option, +} + +/// Enum describing where an argument for a format can be located. +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum Position<'a> { + /// The argument is implied to be located at an index + ArgumentImplicitlyIs(usize), + /// The argument is located at a specific index given in the format, + ArgumentIs(usize), + /// The argument has a name. + ArgumentNamed(&'a str), +} + +impl Position<'_> { + pub fn index(&self) -> Option { + match self { + ArgumentIs(i, ..) | ArgumentImplicitlyIs(i) => Some(*i), + _ => None, + } + } +} + +/// Enum of alignments which are supported. +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum Alignment { + /// The value will be aligned to the left. + AlignLeft, + /// The value will be aligned to the right. + AlignRight, + /// The value will be aligned in the center. + AlignCenter, + /// The value will take on a default alignment. + AlignUnknown, +} + +/// Enum for the sign flags. +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum Sign { + /// The `+` flag. + Plus, + /// The `-` flag. + Minus, +} + +/// Enum for the debug hex flags. +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum DebugHex { + /// The `x` flag in `{:x?}`. + Lower, + /// The `X` flag in `{:X?}`. + Upper, +} + +/// A count is used for the precision and width parameters of an integer, and +/// can reference either an argument or a literal integer. +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum Count<'a> { + /// The count is specified explicitly. + CountIs(usize), + /// The count is specified by the argument with the given name. + CountIsName(&'a str, InnerSpan), + /// The count is specified by the argument at the given index. 
+ CountIsParam(usize), + /// The count is specified by a star (like in `{:.*}`) that refers to the argument at the given index. + CountIsStar(usize), + /// The count is implied and cannot be explicitly specified. + CountImplied, +} + +pub struct ParseError { + pub description: string::String, + pub note: Option, + pub label: string::String, + pub span: InnerSpan, + pub secondary_label: Option<(string::String, InnerSpan)>, + pub suggestion: Suggestion, +} + +pub enum Suggestion { + None, + /// Replace inline argument with positional argument: + /// `format!("{foo.bar}")` -> `format!("{}", foo.bar)` + UsePositional, + /// Remove `r#` from identifier: + /// `format!("{r#foo}")` -> `format!("{foo}")` + RemoveRawIdent(InnerSpan), +} + +/// The parser structure for interpreting the input format string. This is +/// modeled as an iterator over `Piece` structures to form a stream of tokens +/// being output. +/// +/// This is a recursive-descent parser for the sake of simplicity, and if +/// necessary there's probably lots of room for improvement performance-wise. +pub struct Parser<'a> { + mode: ParseMode, + input: &'a str, + cur: iter::Peekable>, + /// Error messages accumulated during parsing + pub errors: Vec, + /// Current position of implicit positional argument pointer + pub curarg: usize, + /// `Some(raw count)` when the string is "raw", used to position spans correctly + style: Option, + /// Start and end byte offset of every successfully parsed argument + pub arg_places: Vec, + /// Characters whose length has been changed from their in-code representation + width_map: Vec, + /// Span of the last opening brace seen, used for error reporting + last_opening_brace: Option, + /// Whether the source string is comes from `println!` as opposed to `format!` or `print!` + append_newline: bool, + /// Whether this formatting string was written directly in the source. This controls whether we + /// can use spans to refer into it and give better error messages. 
+ /// N.B: This does _not_ control whether implicit argument captures can be used. + pub is_source_literal: bool, + /// Start position of the current line. + cur_line_start: usize, + /// Start and end byte offset of every line of the format string. Excludes + /// newline characters and leading whitespace. + pub line_spans: Vec, +} + +impl<'a> Iterator for Parser<'a> { + type Item = Piece<'a>; + + fn next(&mut self) -> Option> { + if let Some(&(pos, c)) = self.cur.peek() { + match c { + '{' => { + let curr_last_brace = self.last_opening_brace; + let byte_pos = self.to_span_index(pos); + let lbrace_end = InnerOffset(byte_pos.0 + self.to_span_width(pos)); + self.last_opening_brace = Some(byte_pos.to(lbrace_end)); + self.cur.next(); + if self.consume('{') { + self.last_opening_brace = curr_last_brace; + + Some(String(self.string(pos + 1))) + } else { + let arg = self.argument(lbrace_end); + if let Some(rbrace_pos) = self.consume_closing_brace(&arg) { + if self.is_source_literal { + let lbrace_byte_pos = self.to_span_index(pos); + let rbrace_byte_pos = self.to_span_index(rbrace_pos); + + let width = self.to_span_width(rbrace_pos); + + self.arg_places.push( + lbrace_byte_pos.to(InnerOffset(rbrace_byte_pos.0 + width)), + ); + } + } else { + if let Some(&(_, maybe)) = self.cur.peek() { + if maybe == '?' 
{ + self.suggest_format(); + } else { + self.suggest_positional_arg_instead_of_captured_arg(arg); + } + } + } + Some(NextArgument(Box::new(arg))) + } + } + '}' => { + self.cur.next(); + if self.consume('}') { + Some(String(self.string(pos + 1))) + } else { + let err_pos = self.to_span_index(pos); + self.err_with_note( + "unmatched `}` found", + "unmatched `}`", + "if you intended to print `}`, you can escape it using `}}`", + err_pos.to(err_pos), + ); + None + } + } + _ => Some(String(self.string(pos))), + } + } else { + if self.is_source_literal { + let span = self.span(self.cur_line_start, self.input.len()); + if self.line_spans.last() != Some(&span) { + self.line_spans.push(span); + } + } + None + } + } +} + +impl<'a> Parser<'a> { + /// Creates a new parser for the given format string + pub fn new( + s: &'a str, + style: Option, + snippet: Option, + append_newline: bool, + mode: ParseMode, + ) -> Parser<'a> { + let input_string_kind = find_width_map_from_snippet(s, snippet, style); + let (width_map, is_source_literal) = match input_string_kind { + InputStringKind::Literal { width_mappings } => (width_mappings, true), + InputStringKind::NotALiteral => (Vec::new(), false), + }; + + Parser { + mode, + input: s, + cur: s.char_indices().peekable(), + errors: vec![], + curarg: 0, + style, + arg_places: vec![], + width_map, + last_opening_brace: None, + append_newline, + is_source_literal, + cur_line_start: 0, + line_spans: vec![], + } + } + + /// Notifies of an error. The message doesn't actually need to be of type + /// String, but I think it does when this eventually uses conditions so it + /// might as well start using it now. + fn err, S2: Into>( + &mut self, + description: S1, + label: S2, + span: InnerSpan, + ) { + self.errors.push(ParseError { + description: description.into(), + note: None, + label: label.into(), + span, + secondary_label: None, + suggestion: Suggestion::None, + }); + } + + /// Notifies of an error. 
The message doesn't actually need to be of type + /// String, but I think it does when this eventually uses conditions so it + /// might as well start using it now. + fn err_with_note< + S1: Into, + S2: Into, + S3: Into, + >( + &mut self, + description: S1, + label: S2, + note: S3, + span: InnerSpan, + ) { + self.errors.push(ParseError { + description: description.into(), + note: Some(note.into()), + label: label.into(), + span, + secondary_label: None, + suggestion: Suggestion::None, + }); + } + + /// Optionally consumes the specified character. If the character is not at + /// the current position, then the current iterator isn't moved and `false` is + /// returned, otherwise the character is consumed and `true` is returned. + fn consume(&mut self, c: char) -> bool { + self.consume_pos(c).is_some() + } + + /// Optionally consumes the specified character. If the character is not at + /// the current position, then the current iterator isn't moved and `None` is + /// returned, otherwise the character is consumed and the current position is + /// returned. + fn consume_pos(&mut self, c: char) -> Option { + if let Some(&(pos, maybe)) = self.cur.peek() { + if c == maybe { + self.cur.next(); + return Some(pos); + } + } + None + } + + fn remap_pos(&self, mut pos: usize) -> InnerOffset { + for width in &self.width_map { + if pos > width.position { + pos += width.before - width.after; + } else if pos == width.position && width.after == 0 { + pos += width.before; + } else { + break; + } + } + + InnerOffset(pos) + } + + fn to_span_index(&self, pos: usize) -> InnerOffset { + // This handles the raw string case, the raw argument is the number of # + // in r###"..."### (we need to add one because of the `r`). 
+ let raw = self.style.map_or(0, |raw| raw + 1); + let pos = self.remap_pos(pos); + InnerOffset(raw + pos.0 + 1) + } + + fn to_span_width(&self, pos: usize) -> usize { + let pos = self.remap_pos(pos); + match self.width_map.iter().find(|w| w.position == pos.0) { + Some(w) => w.before, + None => 1, + } + } + + fn span(&self, start_pos: usize, end_pos: usize) -> InnerSpan { + let start = self.to_span_index(start_pos); + let end = self.to_span_index(end_pos); + start.to(end) + } + + /// Forces consumption of the specified character. If the character is not + /// found, an error is emitted. + fn consume_closing_brace(&mut self, arg: &Argument<'_>) -> Option { + self.ws(); + + let pos; + let description; + + if let Some(&(peek_pos, maybe)) = self.cur.peek() { + if maybe == '}' { + self.cur.next(); + return Some(peek_pos); + } + + pos = peek_pos; + description = format!("expected `'}}'`, found `{maybe:?}`"); + } else { + description = "expected `'}'` but string was terminated".to_owned(); + // point at closing `"` + pos = self.input.len() - if self.append_newline { 1 } else { 0 }; + } + + let pos = self.to_span_index(pos); + + let label = "expected `'}'`".to_owned(); + let (note, secondary_label) = if arg.format.fill == Some('}') { + ( + Some("the character `'}'` is interpreted as a fill character because of the `:` that precedes it".to_owned()), + arg.format.fill_span.map(|sp| ("this is not interpreted as a formatting closing brace".to_owned(), sp)), + ) + } else { + ( + Some("if you intended to print `{`, you can escape it using `{{`".to_owned()), + self.last_opening_brace + .map(|sp| ("because of this opening brace".to_owned(), sp)), + ) + }; + + self.errors.push(ParseError { + description, + note, + label, + span: pos.to(pos), + secondary_label, + suggestion: Suggestion::None, + }); + + None + } + + /// Consumes all whitespace characters until the first non-whitespace character + fn ws(&mut self) { + while let Some(&(_, c)) = self.cur.peek() { + if c.is_whitespace() 
{ + self.cur.next(); + } else { + break; + } + } + } + + /// Parses all of a string which is to be considered a "raw literal" in a + /// format string. This is everything outside of the braces. + fn string(&mut self, start: usize) -> &'a str { + // we may not consume the character, peek the iterator + while let Some(&(pos, c)) = self.cur.peek() { + match c { + '{' | '}' => { + return &self.input[start..pos]; + } + '\n' if self.is_source_literal => { + self.line_spans.push(self.span(self.cur_line_start, pos)); + self.cur_line_start = pos + 1; + self.cur.next(); + } + _ => { + if self.is_source_literal && pos == self.cur_line_start && c.is_whitespace() { + self.cur_line_start = pos + c.len_utf8(); + } + self.cur.next(); + } + } + } + &self.input[start..self.input.len()] + } + + /// Parses an `Argument` structure, or what's contained within braces inside the format string. + fn argument(&mut self, start: InnerOffset) -> Argument<'a> { + let pos = self.position(); + + let end = self + .cur + .clone() + .find(|(_, ch)| !ch.is_whitespace()) + .map_or(start, |(end, _)| self.to_span_index(end)); + let position_span = start.to(end); + + let format = match self.mode { + ParseMode::Format => self.format(), + ParseMode::InlineAsm => self.inline_asm(), + }; + + // Resolve position after parsing format spec. + let pos = match pos { + Some(position) => position, + None => { + let i = self.curarg; + self.curarg += 1; + ArgumentImplicitlyIs(i) + } + }; + + Argument { + position: pos, + position_span, + format, + } + } + + /// Parses a positional argument for a format. This could either be an + /// integer index of an argument, a named argument, or a blank string. + /// Returns `Some(parsed_position)` if the position is not implicitly + /// consuming a macro argument, `None` if it's the case. 
+    fn position(&mut self) -> Option<Position<'a>> {
+        // An explicit index (`{0}`) takes priority over a name.
+        if let Some(i) = self.integer() {
+            Some(ArgumentIs(i))
+        } else {
+            match self.cur.peek() {
+                Some(&(lo, c)) if is_id_start(c) => {
+                    let word = self.word();
+
+                    // Recover from `r#ident` in format strings.
+                    // FIXME: use a let chain
+                    if word == "r" {
+                        if let Some((pos, '#')) = self.cur.peek() {
+                            // Only treat this as a raw identifier when an identifier
+                            // actually follows the `#`.
+                            if self.input[pos + 1..]
+                                .chars()
+                                .next()
+                                .is_some_and(is_id_start)
+                            {
+                                // Skip the `#`, then parse the identifier proper.
+                                self.cur.next();
+                                let word = self.word();
+                                // `lo..lo + 2` covers the `r#` prefix itself.
+                                let prefix_span = self.span(lo, lo + 2);
+                                let full_span = self.span(lo, lo + 2 + word.len());
+                                // Inserted at the front so it is reported before any
+                                // error already recorded.
+                                self.errors.insert(0, ParseError {
+                                    description: "raw identifiers are not supported".to_owned(),
+                                    note: Some("identifiers in format strings can be keywords and don't need to be prefixed with `r#`".to_string()),
+                                    label: "raw identifier used here".to_owned(),
+                                    span: full_span,
+                                    secondary_label: None,
+                                    suggestion: Suggestion::RemoveRawIdent(prefix_span),
+                                });
+                                // Keep parsing as if the plain identifier had been written.
+                                return Some(ArgumentNamed(word));
+                            }
+                        }
+                    }
+
+                    Some(ArgumentNamed(word))
+                }
+
+                // This is an `ArgumentNext`.
+                // Record the fact and do the resolution after parsing the
+                // format spec, to make things like `{:.*}` work.
+                _ => None,
+            }
+        }
+    }
+
+    /// Returns the index of the next unconsumed character as reported by
+    /// `self.cur`, or `self.input.len()` once the cursor is exhausted.
+    fn current_pos(&mut self) -> usize {
+        if let Some(&(pos, _)) = self.cur.peek() {
+            pos
+        } else {
+            self.input.len()
+        }
+    }
+
+    /// Parses a format specifier at the current position, returning all of the
+    /// relevant information in the `FormatSpec` struct.
+    fn format(&mut self) -> FormatSpec<'a> {
+        // Start from a spec with nothing specified; `ty` is an empty slice of the input.
+        let mut spec = FormatSpec {
+            fill: None,
+            fill_span: None,
+            align: AlignUnknown,
+            sign: None,
+            alternate: false,
+            zero_pad: false,
+            debug_hex: None,
+            precision: CountImplied,
+            precision_span: None,
+            width: CountImplied,
+            width_span: None,
+            ty: &self.input[..0],
+            ty_span: None,
+        };
+        // Without a `:` there is no format spec: return the defaults untouched.
+        if !self.consume(':') {
+            return spec;
+        }
+
+        // fill character
+        // The next character is only taken as a fill when the character after it
+        // is an alignment marker (`<`, `>` or `^`).
+        if let Some(&(idx, c)) = self.cur.peek() {
+            if let Some((_, '>' | '<' | '^')) = self.cur.clone().nth(1) {
+                spec.fill = Some(c);
+                // NOTE(review): the span is `idx..idx + 1`, i.e. one position wide
+                // regardless of the fill character's encoded length.
+                spec.fill_span = Some(self.span(idx, idx + 1));
+                self.cur.next();
+            }
+        }
+        // Alignment
+        if self.consume('<') {
+            spec.align = AlignLeft;
+        } else if self.consume('>') {
+            spec.align = AlignRight;
+        } else if self.consume('^') {
+            spec.align = AlignCenter;
+        }
+        // Sign flags
+        if self.consume('+') {
+            spec.sign = Some(Sign::Plus);
+        } else if self.consume('-') {
+            spec.sign = Some(Sign::Minus);
+        }
+        // Alternate marker
+        if self.consume('#') {
+            spec.alternate = true;
+        }
+        // Width and precision
+        let mut havewidth = false;
+
+        if self.consume('0') {
+            // small ambiguity with '0$' as a format string. In theory this is a
+            // '0' flag and then an ill-formatted format string with just a '$'
+            // and no count, but this is better if we instead interpret this as
+            // no '0' flag and '0$' as the width instead.
+            if let Some(end) = self.consume_pos('$') {
+                spec.width = CountIsParam(0);
+                // `end - 1..end + 1` spans the `0` consumed above and the `$`.
+                spec.width_span = Some(self.span(end - 1, end + 1));
+                havewidth = true;
+            } else {
+                spec.zero_pad = true;
+            }
+        }
+
+        // Regular width; skipped when `0$` was already recognised above.
+        if !havewidth {
+            let start = self.current_pos();
+            spec.width = self.count(start);
+            // A span is only recorded when a width was actually parsed.
+            if spec.width != CountImplied {
+                let end = self.current_pos();
+                spec.width_span = Some(self.span(start, end));
+            }
+        }
+
+        if let Some(start) = self.consume_pos('.') {
+            if self.consume('*') {
+                // Resolve `CountIsNextParam`.
+                // We can do this immediately as `position` is resolved later.
+                let i = self.curarg;
+                self.curarg += 1;
+                spec.precision = CountIsStar(i);
+            } else {
+                // The count begins right after the `.`.
+                spec.precision = self.count(start + 1);
+            }
+            // The precision span starts at the `.` itself.
+            let end = self.current_pos();
+            spec.precision_span = Some(self.span(start, end));
+        }
+
+        let ty_span_start = self.current_pos();
+        // Optional radix followed by the actual format specifier
+        if self.consume('x') {
+            // `x?` is debug formatting with lowercase hex; a lone `x` is the type itself.
+            if self.consume('?') {
+                spec.debug_hex = Some(DebugHex::Lower);
+                spec.ty = "?";
+            } else {
+                spec.ty = "x";
+            }
+        } else if self.consume('X') {
+            // Same as above for uppercase hex.
+            if self.consume('?') {
+                spec.debug_hex = Some(DebugHex::Upper);
+                spec.ty = "?";
+            } else {
+                spec.ty = "X";
+            }
+        } else if self.consume('?') {
+            spec.ty = "?";
+        } else {
+            // Any other type is parsed as a word. Note that `ty_span` is only
+            // recorded in this branch, and only for a non-empty word.
+            spec.ty = self.word();
+            if !spec.ty.is_empty() {
+                let ty_span_end = self.current_pos();
+                spec.ty_span = Some(self.span(ty_span_start, ty_span_end));
+            }
+        }
+        spec
+    }
+
+    /// Parses an inline assembly template modifier at the current position, returning the modifier
+    /// in the `ty` field of the `FormatSpec` struct.
+    fn inline_asm(&mut self) -> FormatSpec<'a> {
+        // Same defaults as `format`; only `ty` and `ty_span` can be filled in below.
+        let mut spec = FormatSpec {
+            fill: None,
+            fill_span: None,
+            align: AlignUnknown,
+            sign: None,
+            alternate: false,
+            zero_pad: false,
+            debug_hex: None,
+            precision: CountImplied,
+            precision_span: None,
+            width: CountImplied,
+            width_span: None,
+            ty: &self.input[..0],
+            ty_span: None,
+        };
+        // No `:` means no modifier.
+        if !self.consume(':') {
+            return spec;
+        }
+
+        // The modifier is a single word; its span is recorded only when non-empty.
+        let ty_span_start = self.current_pos();
+        spec.ty = self.word();
+        if !spec.ty.is_empty() {
+            let ty_span_end = self.current_pos();
+            spec.ty_span = Some(self.span(ty_span_start, ty_span_end));
+        }
+
+        spec
+    }
+
+    /// Parses a `Count` parameter at the current position. This does not check
+    /// for 'CountIsNextParam' because that is only used in precision, not
+    /// width.
+    fn count(&mut self, start: usize) -> Count<'a> {
+        if let Some(i) = self.integer() {
+            // `N$` refers to argument N, a bare `N` is the literal count.
+            if self.consume('$') {
+                CountIsParam(i)
+            } else {
+                CountIs(i)
+            }
+        } else {
+            // Speculatively parse `name$`; the cursor is restored whenever that
+            // does not pan out, so nothing is consumed for an implied count.
+            let tmp = self.cur.clone();
+            let word = self.word();
+            if word.is_empty() {
+                self.cur = tmp;
+                CountImplied
+            } else if let Some(end) = self.consume_pos('$') {
+                let name_span = self.span(start, end);
+                CountIsName(word, name_span)
+            } else {
+                self.cur = tmp;
+                CountImplied
+            }
+        }
+    }
+
+    /// Parses a word starting at the current position. A word starts with a
+    /// character accepted by `is_id_start` and continues with characters
+    /// accepted by `is_id_continue`. Returns `""` without consuming anything
+    /// when no word starts here. A word that is exactly `_` is reported as an
+    /// invalid argument name, but the slice is still returned.
+    fn word(&mut self) -> &'a str {
+        let start = match self.cur.peek() {
+            Some(&(pos, c)) if is_id_start(c) => {
+                self.cur.next();
+                pos
+            }
+            _ => {
+                return "";
+            }
+        };
+        // `end` stays `None` when the word runs to the end of the input.
+        let mut end = None;
+        while let Some(&(pos, c)) = self.cur.peek() {
+            if is_id_continue(c) {
+                self.cur.next();
+            } else {
+                end = Some(pos);
+                break;
+            }
+        }
+        let end = end.unwrap_or(self.input.len());
+        let word = &self.input[start..end];
+        if word == "_" {
+            self.err_with_note(
+                "invalid argument name `_`",
+                "invalid argument name",
+                "argument name cannot be a single underscore",
+                self.span(start, end),
+            );
+        }
+        word
+    }
+
+    /// Parses a run of decimal digits at the current position.
+    ///
+    /// Returns `None` when no digit is present. On overflow an error is
+    /// recorded, all digits are still consumed and the wrapped value is
+    /// returned.
+    fn integer(&mut self) -> Option<usize> {
+        let mut cur: usize = 0;
+        let mut found = false;
+        let mut overflow = false;
+        let start = self.current_pos();
+        while let Some(&(_, c)) = self.cur.peek() {
+            if let Some(i) = c.to_digit(10) {
+                // Accumulate with explicit overflow tracking rather than panicking.
+                let (tmp, mul_overflow) = cur.overflowing_mul(10);
+                let (tmp, add_overflow) = tmp.overflowing_add(i as usize);
+                if mul_overflow || add_overflow {
+                    overflow = true;
+                }
+                cur = tmp;
+                found = true;
+                self.cur.next();
+            } else {
+                break;
+            }
+        }
+
+        if overflow {
+            // Report the full digit run exactly as written in the input.
+            let end = self.current_pos();
+            let overflowed_int = &self.input[start..end];
+            self.err(
+                format!(
+                    "integer `{}` does not fit into the type `usize` whose range is `0..={}`",
+                    overflowed_int,
+                    usize::MAX
+                ),
+                "integer out of range for `usize`",
+                self.span(start, end),
+            );
+        }
+
+        found.then_some(cur)
+    }
+
+    /// Detects `?:` at the current position and, if present, records an error
+    /// explaining that `?` has to come after `:`. Both calls in the tuple run
+    /// before the pattern is tested.
+    fn suggest_format(&mut self) {
+        if let (Some(pos), Some(_)) = (self.consume_pos('?'), self.consume_pos(':')) {
+            let word = self.word();
+            let pos = self.to_span_index(pos);
+            // Inserted at the front so it is reported before any error already recorded.
+            self.errors.insert(
+                0,
+                ParseError {
+                    description: "expected format parameter to occur after `:`".to_owned(),
+                    note: Some(format!(
+                        "`?` comes after `:`, try `{}:{}` instead",
+                        word, "?"
+                    )),
+                    label: "expected `?` to occur after `:`".to_owned(),
+                    span: pos.to(pos),
+                    secondary_label: None,
+                    suggestion: Suggestion::None,
+                },
+            );
+        }
+    }
+
+    /// Detects `{name.field}` style field access after `arg` and records an
+    /// error suggesting a positional argument instead.
+    fn suggest_positional_arg_instead_of_captured_arg(&mut self, arg: Argument<'a>) {
+        if let Some(end) = self.consume_pos('.') {
+            // The field is parsed as an argument starting right after the `.`.
+            let byte_pos = self.to_span_index(end);
+            let start = InnerOffset(byte_pos.0 + 1);
+            let field = self.argument(start);
+            // We can only parse `foo.bar` field access, any deeper nesting,
+            // or another type of expression, like method calls, are not supported
+            if !self.consume('}') {
+                return;
+            }
+            // Only `name.name` is diagnosed; indices on either side are ignored.
+            if let ArgumentNamed(_) = arg.position {
+                if let ArgumentNamed(_) = field.position {
+                    self.errors.insert(
+                        0,
+                        ParseError {
+                            description: "field access isn't supported".to_string(),
+                            note: None,
+                            label: "not supported".to_string(),
+                            span: InnerSpan::new(arg.position_span.start, field.position_span.end),
+                            secondary_label: None,
+                            suggestion: Suggestion::UsePositional,
+                        },
+                    );
+                }
+            }
+        }
+    }
+}
+
+/// Finds the indices of all characters that have been processed and differ between the actual
+/// written code (code snippet) and the `InternedString` that gets processed in the `Parser`
+/// in order to properly synthesise the intra-string `Span`s for error diagnostics.
+// TODO: Can we give an escaped string here?
+// probably yes - and a valid one too
+fn find_width_map_from_snippet(
+    input: &str,
+    snippet: Option<string::String>,
+    str_style: Option<usize>,
+) -> InputStringKind {
+    // Only a snippet that looks like a string literal can be mapped back to source.
+    let snippet = match snippet {
+        Some(ref s) if s.starts_with('"') || s.starts_with("r\"") || s.starts_with("r#") => s,
+        _ => return InputStringKind::NotALiteral,
+    };
+
+    // When a string style is given the literal is reported with no width mappings.
+    if str_style.is_some() {
+        return InputStringKind::Literal {
+            width_mappings: Vec::new(),
+        };
+    }
+
+    // Strip quotes.
+    let snippet = &snippet[1..snippet.len() - 1];
+
+    // Macros like `println` add a newline at the end. That technically doesn't make them "literals" anymore, but it's fine
+    // since we will never need to point our spans there, so we lie about it here by ignoring it.
+    // Since there might actually be newlines in the source code, we need to normalize away all trailing newlines.
+    // If we only trimmed it off the input, `format!("\n")` would cause a mismatch as here we they actually match up.
+    // Alternatively, we could just count the trailing newlines and only trim one from the input if they don't match up.
+    let input_no_nl = input.trim_end_matches('\n');
+    let Some(unescaped) = unescape_string(snippet) else {
+        return InputStringKind::NotALiteral;
+    };
+
+    let unescaped_no_nl = unescaped.trim_end_matches('\n');
+
+    if unescaped_no_nl != input_no_nl {
+        // The source string that we're pointing at isn't our input, so spans pointing at it will be incorrect.
+        // This can for example happen with proc macros that respan generated literals.
+        return InputStringKind::NotALiteral;
+    }
+
+    // Walk the snippet and record, for every escape sequence, its position, its
+    // width as written and the width it is mapped to.
+    let mut s = snippet.char_indices();
+    let mut width_mappings = vec![];
+    while let Some((pos, c)) = s.next() {
+        match (c, s.clone().next()) {
+            // skip whitespace and empty lines ending in '\\'
+            ('\\', Some((_, '\n'))) => {
+                let _ = s.next();
+                let mut width = 2;
+
+                while let Some((_, c)) = s.clone().next() {
+                    if matches!(c, ' ' | '\n' | '\t') {
+                        width += 1;
+                        let _ = s.next();
+                    } else {
+                        break;
+                    }
+                }
+
+                width_mappings.push(InnerWidthMapping::new(pos, width, 0));
+            }
+            ('\\', Some((_, 'n' | 't' | 'r' | '0' | '\\' | '\'' | '\"'))) => {
+                width_mappings.push(InnerWidthMapping::new(pos, 2, 1));
+                let _ = s.next();
+            }
+            ('\\', Some((_, 'x'))) => {
+                // consume `\xAB` literal
+                s.nth(2);
+                width_mappings.push(InnerWidthMapping::new(pos, 4, 1));
+            }
+            ('\\', Some((_, 'u'))) => {
+                let mut width = 2;
+                let _ = s.next();
+
+                if let Some((_, next_c)) = s.next() {
+                    if next_c == '{' {
+                        // consume up to 6 hexanumeric chars
+                        let digits_len = s
+                            .clone()
+                            .take(6)
+                            .take_while(|(_, c)| c.is_digit(16))
+                            .count();
+
+                        let len_utf8 = s
+                            .as_str()
+                            .get(..digits_len)
+                            .and_then(|digits| u32::from_str_radix(digits, 16).ok())
+                            .and_then(char::from_u32)
+                            .map_or(1, char::len_utf8);
+
+                        // Skip the digits, for chars that encode to more than 1 utf-8 byte
+                        // exclude as many digits as it is greater than 1 byte
+                        //
+                        // So for a 3 byte character, exclude 2 digits
+                        let required_skips = digits_len.saturating_sub(len_utf8.saturating_sub(1));
+
+                        // skip '{' and '}' also
+                        width += required_skips + 2;
+
+                        s.nth(digits_len);
+                    } else if next_c.is_digit(16) {
+                        width += 1;
+
+                        // We suggest adding `{` and `}` when appropriate, accept it here as if
+                        // it were correct
+                        // NOTE(review): the `i < 6` half of the tuple is matched by `_`, so it
+                        // never stops the loop; digits are consumed until a non-hex character
+                        // (which is consumed as well). Confirm whether the bound is intended.
+                        let mut i = 0; // consume up to 6 hexanumeric chars
+                        while let (Some((_, c)), _) = (s.next(), i < 6) {
+                            if c.is_digit(16) {
+                                width += 1;
+                            } else {
+                                break;
+                            }
+                            i += 1;
+                        }
+                    }
+                }
+
+                width_mappings.push(InnerWidthMapping::new(pos, width, 1));
+            }
+            _ => {}
+        }
+    }
+
+    InputStringKind::Literal { width_mappings }
+}
+
+// TODO: I guess we can provide an `unescape_string` function to the parser... but how do we do that
+// Store it in the parser struct? we need to make it FFI-aware
+// SO this is not possible because we need `unescape_string` *before* we have a parser
+
+/// Placeholder for literal unescaping: currently returns a copy of `string`
+/// unchanged and never fails. The real implementation is kept commented out
+/// below, so a snippet containing escape sequences is compared in its escaped
+/// form by `find_width_map_from_snippet`.
+fn unescape_string(string: &str) -> Option<string::String> {
+    // let mut buf = string::String::new();
+    // let mut ok = true;
+    // unescape::unescape_literal(string, unescape::Mode::Str, &mut |_, unescaped_char| {
+    //     match unescaped_char {
+    //         Ok(c) => buf.push(c),
+    //         Err(_) => ok = false,
+    //     }
+    // });
+
+    let buf = string::String::from(string);
+    let ok = true;
+
+    ok.then_some(buf)
+}
+
+// Assert a reasonable size for `Piece`
+// #[cfg(all(target_arch = "x86_64", target_pointer_width = "64"))]
+// rustc_index::static_assert_size!(Piece<'_>, 16);
+
+// #[cfg(test)]
+// mod tests;
diff --git a/libgrust/libformat_parser/src/bin.rs b/libgrust/libformat_parser/src/bin.rs
new file mode 100644
index 000000000000..4b1f903ad5fa
--- /dev/null
+++ b/libgrust/libformat_parser/src/bin.rs
@@ -0,0 +1,7 @@
+use libformat_parser::rust;
+
+fn main() {
+    dbg!(rust::collect_pieces(
+        std::env::args().nth(1).unwrap().as_str()
+    ));
+}
diff --git a/libgrust/libformat_parser/src/lib.rs b/libgrust/libformat_parser/src/lib.rs
new file mode 100644
index 000000000000..c164578a1039
--- /dev/null
+++ b/libgrust/libformat_parser/src/lib.rs
@@ -0,0 +1,363 @@
+//! FFI interface for `rustc_format_parser`
+
+// what's the plan? Have a function return something that can be constructed into a vector?
+// or an iterator?
+
+use std::ffi::CStr;
+
+/// Conversion of a Rust value into a representation that can cross the C FFI boundary.
+trait IntoFFI<T> {
+    fn into_ffi(self) -> T;
+}
+
+/// Encodes an `Option<T>` as a nullable pointer: `None` becomes a null pointer and
+/// `Some(value)` becomes a pointer to a heap allocation holding `value`.
+impl<T> IntoFFI<*const T> for Option<T>
+where
+    T: Sized,
+{
+    fn into_ffi(self) -> *const T {
+        match self {
+            None => std::ptr::null(),
+            // `self` is taken by value, so a pointer into it would dangle as soon as this
+            // function returns. Move the value to the heap instead.
+            // FIXME: nothing reclaims this allocation yet; it has to be turned back into a
+            // `Box` on the Rust side once the consumer is done with it.
+            Some(value) => Box::into_raw(Box::new(value)) as *const T,
+        }
+    }
+}
+
+// FIXME: Make an ffi module in a separate file
+// FIXME: Remember to leak the boxed type somehow
+// FIXME: How to encode the Option type? As a pointer? Option<T> -> Option<&T> -> *const T could work maybe?
+mod ffi {
+    use super::IntoFFI;
+
+    // Note: copied from rustc_span
+    /// Range inside of a `Span` used for diagnostics when we only have access to relative positions.
+    #[derive(Copy, Clone, PartialEq, Eq, Debug)]
+    #[repr(C)]
+    pub struct InnerSpan {
+        pub start: usize,
+        pub end: usize,
+    }
+
+    /// The location and before/after width of a character whose width has changed from its source code
+    /// representation
+    #[derive(Copy, Clone, PartialEq, Eq)]
+    #[repr(C)]
+    pub struct InnerWidthMapping {
+        /// Index of the character in the source
+        pub position: usize,
+        /// The inner width in characters
+        pub before: usize,
+        /// The transformed width in characters
+        pub after: usize,
+    }
+
+    // TODO: Not needed for now?
+    // /// Whether the input string is a literal. If yes, it contains the inner width mappings.
+    // #[derive(Clone, PartialEq, Eq)]
+    // #[repr(C)]
+    // enum InputStringKind {
+    //     NotALiteral,
+    //     Literal {
+    //         width_mappings: Vec<InnerWidthMapping>,
+    //     },
+    // }
+
+    // TODO: Not needed for now?
+    // /// The type of format string that we are parsing.
+    // #[derive(Copy, Clone, Debug, Eq, PartialEq)]
+    // #[repr(C)]
+    // pub enum ParseMode {
+    //     /// A normal format string as per `format_args!`.
+    //     Format,
+    //     /// An inline assembly template string for `asm!`.
+    //     InlineAsm,
+    // }
+
+    #[derive(Copy, Clone)]
+    #[repr(C)]
+    struct InnerOffset(usize);
+
+    /// A piece is a portion of the format string which represents the next part
+    /// to emit. These are emitted as a stream by the `Parser` class.
+    #[derive(Clone, Debug, PartialEq)]
+    #[repr(C)]
+    pub enum Piece<'a> {
+        /// A literal string which should directly be emitted
+        String(&'a str),
+        /// This describes that formatting should process the next argument (as
+        /// specified inside) for emission.
+        NextArgument(*const Argument<'a>),
+    }
+
+    // NOTE: `Piece` has no `Drop` impl. The previous one only printed the piece to stdout,
+    // which is debug output a library should not emit. The `Argument` behind `NextArgument`
+    // is not freed when a piece is dropped; see the FIXME in `From<generic_format_parser::Piece>`.
+
+    /// Representation of an argument specification.
+    #[derive(Copy, Clone, Debug, PartialEq)]
+    #[repr(C)]
+    pub struct Argument<'a> {
+        /// Where to find this argument
+        pub position: Position<'a>,
+        /// The span of the position indicator. Includes any whitespace in implicit
+        /// positions (`{  }`).
+        pub position_span: InnerSpan,
+        /// How to format the argument
+        pub format: FormatSpec<'a>,
+    }
+
+    /// Specification for the formatting of an argument in the format string.
+    ///
+    /// Optional values are encoded as nullable pointers, see `IntoFFI`.
+    #[derive(Copy, Clone, Debug, PartialEq)]
+    #[repr(C)]
+    pub struct FormatSpec<'a> {
+        /// Optionally specified character to fill alignment with.
+        pub fill: Option<char>,
+        /// Span of the optionally specified fill character.
+        pub fill_span: *const InnerSpan,
+        /// Optionally specified alignment.
+        pub align: Alignment,
+        /// The `+` or `-` flag.
+        pub sign: *const Sign,
+        /// The `#` flag.
+        pub alternate: bool,
+        /// The `0` flag.
+        pub zero_pad: bool,
+        /// The `x` or `X` flag. (Only for `Debug`.)
+        pub debug_hex: *const DebugHex,
+        /// The integer precision to use.
+        pub precision: Count<'a>,
+        /// The span of the precision formatting flag (for diagnostics).
+        pub precision_span: *const InnerSpan,
+        /// The string width requested for the resulting format.
+        pub width: Count<'a>,
+        /// The span of the width formatting flag (for diagnostics).
+        pub width_span: *const InnerSpan,
+        /// The descriptor string representing the name of the format desired for
+        /// this argument, this can be empty or any number of characters, although
+        /// it is required to be one word.
+        pub ty: &'a str,
+        /// The span of the descriptor string (for diagnostics).
+        pub ty_span: *const InnerSpan,
+    }
+
+    /// Enum describing where an argument for a format can be located.
+    #[derive(Copy, Clone, Debug, PartialEq)]
+    #[repr(C)]
+    pub enum Position<'a> {
+        /// The argument is implied to be located at an index
+        ArgumentImplicitlyIs(usize),
+        /// The argument is located at a specific index given in the format,
+        ArgumentIs(usize),
+        /// The argument has a name.
+        ArgumentNamed(&'a str),
+    }
+
+    /// Enum of alignments which are supported.
+    #[derive(Copy, Clone, Debug, PartialEq)]
+    #[repr(C)]
+    pub enum Alignment {
+        /// The value will be aligned to the left.
+        AlignLeft,
+        /// The value will be aligned to the right.
+        AlignRight,
+        /// The value will be aligned in the center.
+        AlignCenter,
+        /// The value will take on a default alignment.
+        AlignUnknown,
+    }
+
+    /// Enum for the sign flags.
+    #[derive(Copy, Clone, Debug, PartialEq)]
+    #[repr(C)]
+    pub enum Sign {
+        /// The `+` flag.
+        Plus,
+        /// The `-` flag.
+        Minus,
+    }
+
+    /// Enum for the debug hex flags.
+    #[derive(Copy, Clone, Debug, PartialEq)]
+    #[repr(C)]
+    pub enum DebugHex {
+        /// The `x` flag in `{:x?}`.
+        Lower,
+        /// The `X` flag in `{:X?}`.
+        Upper,
+    }
+
+    /// A count is used for the precision and width parameters of an integer, and
+    /// can reference either an argument or a literal integer.
+    #[derive(Copy, Clone, Debug, PartialEq)]
+    #[repr(C)]
+    pub enum Count<'a> {
+        /// The count is specified explicitly.
+        CountIs(usize),
+        /// The count is specified by the argument with the given name.
+        CountIsName(&'a str, InnerSpan),
+        /// The count is specified by the argument at the given index.
+        CountIsParam(usize),
+        /// The count is specified by a star (like in `{:.*}`) that refers to the argument at the given index.
+        CountIsStar(usize),
+        /// The count is implied and cannot be explicitly specified.
+        CountImplied,
+    }
+
+    impl<'a> From<generic_format_parser::Piece<'a>> for Piece<'a> {
+        fn from(old: generic_format_parser::Piece<'a>) -> Self {
+            match old {
+                generic_format_parser::Piece::String(x) => Piece::String(x),
+                generic_format_parser::Piece::NextArgument(x) => {
+                    // Convert the parser's boxed argument (this also releases the parser-side
+                    // box) and move the converted value to the heap, so that the pointer handed
+                    // over the FFI boundary stays valid after this call returns.
+                    // FIXME: this allocation is never freed. The pointer should be sent back to
+                    // the Rust lib and rebuilt into a `Box` in a Rust destructor.
+                    let dst: Argument<'a> = (*x).into();
+
+                    Piece::NextArgument(Box::into_raw(Box::new(dst)) as *const Argument<'a>)
+                }
+            }
+        }
+    }
+
+    impl<'a> From<generic_format_parser::Argument<'a>> for Argument<'a> {
+        fn from(old: generic_format_parser::Argument<'a>) -> Self {
+            Argument {
+                position: old.position.into(),
+                position_span: old.position_span.into(),
+                format: old.format.into(),
+            }
+        }
+    }
+
+    impl<'a> From<generic_format_parser::Position<'a>> for Position<'a> {
+        fn from(old: generic_format_parser::Position<'a>) -> Self {
+            match old {
+                generic_format_parser::Position::ArgumentImplicitlyIs(x) => {
+                    Position::ArgumentImplicitlyIs(x.into())
+                }
+                generic_format_parser::Position::ArgumentIs(x) => Position::ArgumentIs(x.into()),
+                generic_format_parser::Position::ArgumentNamed(x) => {
+                    Position::ArgumentNamed(x.into())
+                }
+            }
+        }
+    }
+
+    impl From<generic_format_parser::InnerSpan> for InnerSpan {
+        fn from(old: generic_format_parser::InnerSpan) -> Self {
+            InnerSpan {
+                start: old.start,
+                end: old.end,
+            }
+        }
+    }
+
+    impl<'a> From<generic_format_parser::FormatSpec<'a>> for FormatSpec<'a> {
+        fn from(old: generic_format_parser::FormatSpec<'a>) -> Self {
+            // Every optional field is converted first and then encoded as a nullable
+            // pointer through `IntoFFI`.
+            FormatSpec {
+                fill: old.fill,
+                fill_span: old.fill_span.map(Into::into).into_ffi(),
+                align: old.align.into(),
+                sign: old.sign.map(Into::into).into_ffi(),
+                alternate: old.alternate,
+                zero_pad: old.zero_pad,
+                debug_hex: old.debug_hex.map(Into::into).into_ffi(),
+                precision: old.precision.into(),
+                precision_span: old.precision_span.map(Into::into).into_ffi(),
+                width: old.width.into(),
+                width_span: old.width_span.map(Into::into).into_ffi(),
+                ty: old.ty,
+                ty_span: old.ty_span.map(Into::into).into_ffi(),
+            }
+        }
+    }
+
+    impl From<generic_format_parser::DebugHex> for DebugHex {
+        fn from(old: generic_format_parser::DebugHex) -> Self {
+            match old {
+                generic_format_parser::DebugHex::Lower => DebugHex::Lower,
+                generic_format_parser::DebugHex::Upper => DebugHex::Upper,
+            }
+        }
+    }
+
+    impl<'a> From<generic_format_parser::Count<'a>> for Count<'a> {
+        fn from(old: generic_format_parser::Count<'a>) -> Self {
+            match old {
+                generic_format_parser::Count::CountIs(x) => Count::CountIs(x),
+                generic_format_parser::Count::CountIsName(x, y) => Count::CountIsName(x, y.into()),
+                generic_format_parser::Count::CountIsParam(x) => Count::CountIsParam(x),
+                generic_format_parser::Count::CountIsStar(x) => Count::CountIsStar(x),
+                generic_format_parser::Count::CountImplied => Count::CountImplied,
+            }
+        }
+    }
+
+    impl From<generic_format_parser::Sign> for Sign {
+        fn from(old: generic_format_parser::Sign) -> Self {
+            match old {
+                generic_format_parser::Sign::Plus => Sign::Plus,
+                generic_format_parser::Sign::Minus => Sign::Minus,
+            }
+        }
+    }
+
+    impl From<generic_format_parser::Alignment> for Alignment {
+        fn from(old: generic_format_parser::Alignment) -> Self {
+            match old {
+                generic_format_parser::Alignment::AlignLeft => Alignment::AlignLeft,
+                generic_format_parser::Alignment::AlignRight => Alignment::AlignRight,
+                generic_format_parser::Alignment::AlignCenter => Alignment::AlignCenter,
+                generic_format_parser::Alignment::AlignUnknown => Alignment::AlignUnknown,
+            }
+        }
+    }
+}
+
+// FIXME: Rename?
+pub mod rust {
+    use generic_format_parser::{ParseMode, Parser, Piece};
+
+    /// Parses `input` as a `format_args!`-style format string and collects every piece
+    /// produced by the parser.
+    pub fn collect_pieces(input: &str) -> Vec<Piece<'_>> {
+        let parser = Parser::new(input, None, None, true, ParseMode::Format);
+
+        parser.into_iter().collect()
+    }
+}
+
+// TODO: Should we instead make an FFIVector struct?
+/// Raw parts of a `Vec<ffi::Piece>` handed over to C: pointer, length and capacity.
+///
+/// A value returned by `collect_pieces` must be given back to `destroy_pieces` exactly
+/// once so that the allocation is released by the Rust allocator.
+#[repr(C)]
+pub struct PieceSlice {
+    base_ptr: *mut ffi::Piece<'static /* FIXME: That's wrong */>,
+    len: usize,
+    cap: usize,
+}
+
+/// Parses the NUL-terminated format string `input` and returns its pieces.
+///
+/// An empty slice is returned when `input` is null or is not valid UTF-8, since there is
+/// no error channel on this interface and a panic must not unwind into C.
+///
+/// # Safety contract (caller side)
+///
+/// `input` must be null or point to a NUL-terminated string. The returned pieces borrow
+/// from that string, so it has to stay alive and unmodified until `destroy_pieces` is
+/// called on the result.
+#[no_mangle]
+pub extern "C" fn collect_pieces(input: *const libc::c_char) -> PieceSlice {
+    let pieces: Vec<ffi::Piece<'_>> = if input.is_null() {
+        Vec::new()
+    } else {
+        // SAFETY: `input` is non-null and, per the contract above, NUL-terminated and
+        // valid for as long as the returned pieces are in use.
+        let str = unsafe { CStr::from_ptr(input) };
+
+        match str.to_str() {
+            Ok(s) => rust::collect_pieces(s)
+                .into_iter()
+                .map(Into::into)
+                .collect(),
+            Err(_) => Vec::new(),
+        }
+    };
+
+    // Give up ownership without running the destructor; `destroy_pieces` rebuilds the
+    // vector from exactly these three values.
+    let mut pieces = std::mem::ManuallyDrop::new(pieces);
+
+    PieceSlice {
+        len: pieces.len(),
+        cap: pieces.capacity(),
+        base_ptr: pieces.as_mut_ptr(),
+    }
+}
+
+/// Releases a `PieceSlice` previously returned by `collect_pieces`.
+///
+/// A slice with a null base pointer is ignored.
+#[no_mangle]
+pub extern "C" fn destroy_pieces(PieceSlice { base_ptr, len, cap }: PieceSlice) {
+    if base_ptr.is_null() {
+        return;
+    }
+
+    // SAFETY: the caller passes back, unmodified and only once, the pointer, length and
+    // capacity produced by `collect_pieces`, which came from a `Vec<ffi::Piece>`.
+    drop(unsafe { Vec::from_raw_parts(base_ptr, len, cap) });
+}