Skip to content

Commit

Permalink
Format preprocessed token stream in multiple passes
Browse files Browse the repository at this point in the history
Currently, the formatter doesn't handle many scenarios involving preprocessor
`ifdef`s/`endif`s interleaved with `begin`s, module headers, etc.
This patch attempts to solve this problem by performing multiple passes of the
formatting on preprocessed variants of the source. Each of these variants has a
different set of preprocessor branches enabled. Together, they should cover the
entire source (though that doesn't work in all cases yet). After several
formatting passes for different variants of the AST, a correct and properly
formatted file is produced.

Signed-off-by: Krzysztof Bieganski <[email protected]>
  • Loading branch information
kbieganski committed Jul 7, 2023
1 parent 92dc626 commit c90b681
Show file tree
Hide file tree
Showing 27 changed files with 1,054 additions and 283 deletions.
18 changes: 15 additions & 3 deletions common/formatting/tree_unwrapper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,21 @@ TreeUnwrapper::TreeUnwrapper(
// array, and be able to 'extend' into the array of preformatted_tokens_.
}

const TokenPartitionTree* TreeUnwrapper::Unwrap() {
// Move constructor. Moves the unwrapped-lines tree and copies the remaining
// reference/iterator members from 'other'. A custom definition is required
// because active_unwrapped_lines_ may point at the moved member
// unwrapped_lines_, in which case it must be re-pointed at this object's
// instance instead of the moved-from source.
TreeUnwrapper::TreeUnwrapper(TreeUnwrapper&& other) noexcept
    : TreeContextVisitor(std::move(other)),
      text_structure_view_(other.text_structure_view_),
      preformatted_tokens_(other.preformatted_tokens_),
      next_unfiltered_token_(other.next_unfiltered_token_),
      current_indentation_spaces_(other.current_indentation_spaces_),
      unwrapped_lines_(std::move(other.unwrapped_lines_)) {
  // Fix up the active-partition pointer: if it referenced other's own root,
  // reference our root; otherwise copy it unchanged.
  // NOTE(review): if active_unwrapped_lines_ pointed at a non-root node
  // *inside* other.unwrapped_lines_, the copied pointer's validity depends on
  // node stability under the tree's move — confirm this invariant.
  if (other.active_unwrapped_lines_ == &other.unwrapped_lines_) {
    active_unwrapped_lines_ = &unwrapped_lines_;
  } else {
    active_unwrapped_lines_ = other.active_unwrapped_lines_;
  }
}

void TreeUnwrapper::Unwrap() {
// Collect tokens that appear before first syntax tree leaf, e.g. comments.
CollectLeadingFilteredTokens();

Expand Down Expand Up @@ -105,8 +119,6 @@ const TokenPartitionTree* TreeUnwrapper::Unwrap() {
}

VerifyFullTreeFormatTokenRanges();

return &unwrapped_lines_;
}

std::vector<UnwrappedLine> TreeUnwrapper::FullyPartitionedUnwrappedLines()
Expand Down
6 changes: 4 additions & 2 deletions common/formatting/tree_unwrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ class TreeUnwrapper : public TreeContextVisitor {
// Deleted standard interfaces:
TreeUnwrapper() = delete;
TreeUnwrapper(const TreeUnwrapper&) = delete;
TreeUnwrapper(TreeUnwrapper&&) = delete;
TreeUnwrapper(TreeUnwrapper&&) noexcept;
TreeUnwrapper& operator=(const TreeUnwrapper&) = delete;
TreeUnwrapper& operator=(TreeUnwrapper&&) = delete;

Expand All @@ -57,7 +57,7 @@ class TreeUnwrapper : public TreeContextVisitor {
// Partitions the token stream (in text_structure_view_) into
// unwrapped_lines_ by traversing the syntax tree representation.
// TODO(fangism): rename this Partition.
const TokenPartitionTree* Unwrap();
void Unwrap();

// Returns a flattened copy of all of the deepest nodes in the tree of
// unwrapped lines, which represents maximal partitioning into the smallest
Expand All @@ -81,6 +81,8 @@ class TreeUnwrapper : public TreeContextVisitor {
return active_unwrapped_lines_;
}

TokenPartitionTree* UnwrappedLines() { return &unwrapped_lines_; }

// Returns text spanned by the syntax tree being traversed.
absl::string_view FullText() const { return text_structure_view_.Contents(); }

Expand Down
25 changes: 25 additions & 0 deletions common/text/text_structure.cc
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,14 @@ TextStructure::~TextStructure() {
CHECK(status.ok()) << status.message() << " (in dtor)";
}

void TextStructure::RebaseTokens(const TextStructure& other) {
CHECK_EQ(contents_->AsStringView().length(),
other.contents_->AsStringView().length());
MutableData().RebaseTokensToSuperstring(other.contents_->AsStringView(),
contents_->AsStringView(), 0);
contents_ = other.contents_;
}

void TextStructureView::ExpandSubtrees(NodeExpansionMap* expansions) {
TokenSequence combined_tokens;
// Gather indices and reconstruct iterators after there are no more
Expand Down Expand Up @@ -560,6 +568,23 @@ void TextStructureView::ExpandSubtrees(NodeExpansionMap* expansions) {
CalculateFirstTokensPerLine();
}

// Assigns 'color' to every token in the underlying token stream that the
// current token stream view references, via a single linear merge-walk of
// the (ordered) view against the full stream.
// NOTE(review): parameter is size_t but TokenInfo::color is an int, so the
// assignment narrows — confirm the intended range of color values.
void TextStructureView::ColorStreamViewTokens(size_t color) {
  const auto view_end = GetTokenStreamView().end();
  const auto stream_end = MutableTokenStream().end();
  auto view_pos = GetTokenStreamView().begin();
  auto stream_pos = MutableTokenStream().begin();
  while (view_pos != view_end && stream_pos != stream_end) {
    if (*view_pos == stream_pos) {
      // The view references this stream token: color it, advance both.
      stream_pos->color = color;
      ++view_pos;
      ++stream_pos;
    } else if (*view_pos < stream_pos) {
      // View entry is behind the stream cursor; catch up on the view side.
      ++view_pos;
    } else {
      // Stream token not referenced by the view; leave it uncolored.
      ++stream_pos;
    }
  }
}

absl::Status TextStructure::StringViewConsistencyCheck() const {
const absl::string_view contents = data_.Contents();
if (!contents.empty() && !IsSubRange(contents, contents_->AsStringView())) {
Expand Down
9 changes: 9 additions & 0 deletions common/text/text_structure.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ class TextStructureView {

// Do not copy/assign. This contains pointers/iterators to internals.
TextStructureView(const TextStructureView&) = delete;
TextStructureView(TextStructureView&&) = default;
TextStructureView& operator=(const TextStructureView&) = delete;

absl::string_view Contents() const { return contents_; }
Expand Down Expand Up @@ -172,6 +173,9 @@ class TextStructureView {
// by this function.
void ExpandSubtrees(NodeExpansionMap* expansions);

// Sets the 'color' of all tokens in the token stream view.
void ColorStreamViewTokens(size_t color);

// All of this class's consistency checks combined.
absl::Status InternalConsistencyCheck() const;

Expand Down Expand Up @@ -283,6 +287,11 @@ class TextStructure {
// DeferredExpansion::subanalysis requires this destructor to be virtual.
virtual ~TextStructure();

// Update tokens to point their text into the contents of the given
// TextStructure. The contents_ pointer in this TextStructure will point to
// the other TextStructure's contents.
void RebaseTokens(const TextStructure& other);

const TextStructureView& Data() const { return data_; }

TextStructureView& MutableData() { return data_; }
Expand Down
5 changes: 5 additions & 0 deletions common/text/token_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,11 @@ class TokenInfo {

bool isEOF() const { return token_enum_ == TK_EOF; }

// The color is used for differentiating tokens from different preprocess
// variants. If color equals 0, the token is part of preprocessor-disabled
// code. Only used in the formatter.
int color = 0; // FIXME: do this differently

protected: // protected, as ExpectedTokenInfo accesses it.
int token_enum_;

Expand Down
2 changes: 1 addition & 1 deletion verilog/CST/expression_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
namespace verilog {
namespace {

static constexpr VerilogPreprocess::Config kDefaultPreprocess;
static VerilogPreprocess::Config kDefaultPreprocess;

using verible::SyntaxTreeSearchTestCase;
using verible::TextStructureView;
Expand Down
3 changes: 3 additions & 0 deletions verilog/analysis/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,13 @@ cc_library(
hdrs = ["flow_tree.h"],
deps = [
"//common/text:token-stream-view",
"//common/util:interval-set",
"//common/util:logging",
"//common/util:status-macros",
"//verilog/parser:verilog-token-enum",
"@com_google_absl//absl/container:flat_hash_set",
"@com_google_absl//absl/status",
"@com_google_absl//absl/status:statusor",
"@com_google_absl//absl/strings",
],
)
Expand Down
2 changes: 1 addition & 1 deletion verilog/analysis/extractors_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
namespace verilog {
namespace analysis {
namespace {
static constexpr VerilogPreprocess::Config kDefaultPreprocess;
static VerilogPreprocess::Config kDefaultPreprocess;

TEST(CollectInterfaceNamesTest, NonModuleTests) {
const std::pair<absl::string_view, std::set<std::string>> kTestCases[] = {
Expand Down
84 changes: 81 additions & 3 deletions verilog/analysis/flow_tree.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "absl/status/status.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "common/util/interval_set.h"
#include "common/util/logging.h"
#include "common/util/status_macros.h"
#include "verilog/parser/verilog_token_enum.h"
Expand Down Expand Up @@ -125,7 +126,7 @@ absl::Status FlowTree::MacroFollows(
conditional_iterator->token_enum() != PP_elsif) {
return absl::InvalidArgumentError("Error macro name can't be extracted.");
}
auto macro_iterator = conditional_iterator + 1;
auto macro_iterator = conditional_iterator + 2;
if (macro_iterator->token_enum() != PP_Identifier) {
return absl::InvalidArgumentError("Expected identifier for macro name.");
}
Expand All @@ -141,7 +142,7 @@ absl::Status FlowTree::AddMacroOfConditional(
return absl::InvalidArgumentError(
"Error no macro follows the conditional directive.");
}
auto macro_iterator = conditional_iterator + 1;
auto macro_iterator = conditional_iterator + 2;
auto macro_identifier = macro_iterator->text();
if (conditional_macro_id_.find(macro_identifier) ==
conditional_macro_id_.end()) {
Expand All @@ -161,7 +162,7 @@ int FlowTree::GetMacroIDOfConditional(
// TODO(karimtera): add a better error handling.
return -1;
}
auto macro_iterator = conditional_iterator + 1;
auto macro_iterator = conditional_iterator + 2;
auto macro_identifier = macro_iterator->text();
// It is always assumed that the macro already exists in the map.
return conditional_macro_id_[macro_identifier];
Expand All @@ -176,6 +177,83 @@ absl::Status FlowTree::GenerateVariants(const VariantReceiver &receiver) {
return DepthFirstSearch(receiver, source_sequence_.begin());
}

// Greedily computes a set of define variants (macro define/undef choices)
// whose corresponding preprocessed variants together cover every token of
// the source sequence. Each outer iteration produces one variant, steering
// each first-seen conditional toward its not-yet-covered branch. Returns an
// error status if control-flow-tree construction fails; may return early
// (with partial coverage) if an iteration makes no progress.
absl::StatusOr<FlowTree::DefineVariants> FlowTree::MinCoverDefineVariants() {
  auto status = GenerateControlFlowTree();
  if (!status.ok()) return status;
  verible::IntervalSet<int64_t> covered;  // Token indices covered so far by
                                          // the variants produced below.
  verible::IntervalSet<int64_t> last_covered;  // Coverage after the previous
                                               // iteration; used to detect
                                               // lack of progress.
  DefineVariants define_variants;  // The result — all define variants that
                                   // should cover the entire source
  DefineSet visited;  // Visited defines are ones that are assumed to be defined
                      // or undefined (decided in a previous iteration)
  const int64_t tok_count = static_cast<int64_t>(source_sequence_.size());
  while (!covered.Contains({0, tok_count})) {
    DefineSet defines;  // Define sets are moved into the define variants list,
                        // so we make a new one each iteration
    visited.clear();  // We keep the visited set to avoid unnecessary
                      // allocations, but clear it each iteration
    TokenSequenceConstIterator tok_it = source_sequence_.begin();
    while (tok_it < source_sequence_.end()) {
      covered.Add(std::distance(source_sequence_.begin(), tok_it));
      if (tok_it->token_enum() == PP_ifdef ||
          tok_it->token_enum() == PP_ifndef ||
          tok_it->token_enum() == PP_elsif) {
        // NOTE(review): assumes the macro identifier is exactly two tokens
        // after the directive — matches the `+ 2` offsets used in
        // MacroFollows/AddMacroOfConditional elsewhere in this file.
        const auto macro_id_it = tok_it + 2;
        auto macro_text = macro_id_it->text();
        bool negated = tok_it->token_enum() == PP_ifndef;
        // If this macro was already visited (either defined/undefined), we
        // need to stick to the same branch. TODO: handle `defines
        if (visited.contains(macro_text)) {
          bool assume_condition_is_true =
              (negated ^ defines.contains(macro_text));
          // Edge 0 is the taken branch, edge 1 the skipped alternative.
          tok_it = edges_[tok_it][assume_condition_is_true ? 0 : 1];
        } else {
          // First time we see this macro; mark as visited
          visited.insert(macro_text);
          const auto if_it = edges_[tok_it][0];
          const auto if_idx = std::distance(source_sequence_.begin(), if_it);
          const auto else_it = edges_[tok_it][1];
          const auto else_idx =
              std::distance(source_sequence_.begin(), else_it);
          if (!covered.Contains({if_idx, else_idx})) {
            // If the `ifdef is not covered, we assume the condition is true
            if (!negated) defines.insert(macro_text);
            tok_it = if_it;
          } else {
            // Else we assume the condition is false
            if (negated) defines.insert(macro_text);
            tok_it = else_it;
          }
        }
      } else {
        const auto it = edges_.find(tok_it);
        if (it == edges_.end() || it->second.empty()) {
          // If there's no outgoing edge, just move to the next token.
          tok_it++;
        } else {
          // Else jump
          tok_it = edges_[tok_it][0];
        }
      }
    }
    define_variants.push_back(std::move(defines));
    // To prevent an infinite loop, if nothing new was covered, break.
    if (last_covered == covered) {
      // TODO: If there are nested `ifdefs that contradict each other early in
      // the source, this will prevent us from traversing the rest of the flow
      // tree. It would be better to detect this case, assume that the
      // contradicting part is covered, and continue the analysis.
      VLOG(4) << "Giving up on finding all define variants";
      break;  // Perhaps we should error?
    }
    last_covered = covered;
  }
  VLOG(4) << "Done generating define variants. Coverage: " << covered;
  return define_variants;
}

// Constructs the control flow tree, which determines the edges from each node
// (token index) to its possible children, and saves them in edges_.
Expand Down
13 changes: 13 additions & 0 deletions verilog/analysis/flow_tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
#include <string>
#include <vector>

#include "absl/container/flat_hash_set.h"
#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "common/text/token_stream_view.h"

namespace verilog {
Expand Down Expand Up @@ -80,6 +82,17 @@ class FlowTree {
// Generates all possible variants.
absl::Status GenerateVariants(const VariantReceiver &receiver);

// Set of macro name defines.
using DefineSet = absl::flat_hash_set<absl::string_view>;

// A list of macro name sets; each set represents a variant of the source;
// together they should cover the entire source.
using DefineVariants = std::vector<DefineSet>;

// Returns the minimum set of defines needed to generate token stream variants
// that cover the entire source.
absl::StatusOr<DefineVariants> MinCoverDefineVariants();

// Returns all the used macros in conditionals, ordered with the same ID as
// used in BitSets.
const std::vector<TokenSequenceConstIterator> &GetUsedMacros() {
Expand Down
16 changes: 9 additions & 7 deletions verilog/analysis/verilog_analyzer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -248,17 +248,13 @@ absl::Status VerilogAnalyzer::Analyze() {
// Lex into tokens.
RETURN_IF_ERROR(Tokenize());

// Here would be one place to analyze the raw token stream.
FilterTokensForSyntaxTree();

// Disambiguate tokens using lexical context.
ContextualizeTokens();

// pseudo-preprocess token stream.
// Not all analyses will want to preprocess.
{
VerilogPreprocess preprocessor(preprocess_config_);
preprocessor_data_ = preprocessor.ScanStream(Data().GetTokenStreamView());
verible::TokenStreamView lexed_streamview;
InitTokenStreamView(Data().TokenStream(), &lexed_streamview);
preprocessor_data_ = preprocessor.ScanStream(lexed_streamview);
if (!preprocessor_data_.errors.empty()) {
for (const auto& error : preprocessor_data_.errors) {
rejected_tokens_.push_back(verible::RejectedToken{
Expand Down Expand Up @@ -288,6 +284,12 @@ absl::Status VerilogAnalyzer::Analyze() {
// TODO(fangism): could we just move, swap, or directly reference?
}

// Here would be one place to analyze the raw token stream.
FilterTokensForSyntaxTree();

// Disambiguate tokens using lexical context.
ContextualizeTokens();

auto generator = MakeTokenViewer(Data().GetTokenStreamView());
VerilogParser parser(&generator, filename_);
parse_status_ = FileAnalyzer::Parse(&parser);
Expand Down
6 changes: 3 additions & 3 deletions verilog/analysis/verilog_analyzer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ using verible::SyntaxTreeLeaf;
using verible::TokenInfo;
using verible::TokenInfoTestData;

static constexpr verilog::VerilogPreprocess::Config kDefaultPreprocess;
static verilog::VerilogPreprocess::Config kDefaultPreprocess;

bool TreeContainsToken(const ConcreteSyntaxTree& tree, const TokenInfo& token) {
const auto* matching_leaf =
Expand Down Expand Up @@ -509,10 +509,10 @@ TEST(AnalyzeVerilogAutomaticMode, InferredModuleBodyMode) {
}

TEST(AnalyzeVerilogAutomaticMode, AutomaticWithFallback) {
static constexpr verilog::VerilogPreprocess::Config kNoBranchFilter{
static verilog::VerilogPreprocess::Config kNoBranchFilter{
.filter_branches = false,
};
static constexpr verilog::VerilogPreprocess::Config kWithBranchFilter{
static verilog::VerilogPreprocess::Config kWithBranchFilter{
.filter_branches = true,
};

Expand Down
2 changes: 1 addition & 1 deletion verilog/analysis/verilog_project.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ namespace verilog {
// All files we process with the verilog project, essentially applications that
// build a symbol table (project-tool, kythe-indexer) only benefit from
// processing the same sequence of tokens a synthesis tool sees.
static constexpr verilog::VerilogPreprocess::Config kPreprocessConfig{
static verilog::VerilogPreprocess::Config kPreprocessConfig{
.filter_branches = true,
};

Expand Down
1 change: 1 addition & 0 deletions verilog/formatting/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ cc_library(
"//common/util:vector-tree-iterators",
"//verilog/CST:declaration",
"//verilog/CST:module",
"//verilog/analysis:flow-tree",
"//verilog/analysis:verilog-analyzer",
"//verilog/analysis:verilog-equivalence",
"//verilog/parser:verilog-token-enum",
Expand Down
Loading

0 comments on commit c90b681

Please sign in to comment.