Skip to content

Commit

Permalink
LL(1) parse an AST
Browse files Browse the repository at this point in the history
This "AST" is more accurately described as a parse tree. The basic idea
was to accumulate the attributes on a separate stack, but make note of
when we needed to merge the tree node into the parent node by adding a
sentinel token to the LL(1) parsing stack.
  • Loading branch information
Quincunx271 committed Oct 9, 2022
1 parent 84a2708 commit b6f3d1f
Show file tree
Hide file tree
Showing 3 changed files with 164 additions and 8 deletions.
12 changes: 4 additions & 8 deletions src/chef/cfg/ast.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,27 +6,23 @@
#include <variant>
#include <vector>

#include <chef/cfg/cfg.hpp>

// Defines a runtime AST for chef.
//
// It remains to be decided whether this will include concrete pieces such as
// punctuation, or whether it will only include the abstract information.

// Runtime tree types produced when parsing with chef grammars.
namespace chef::rt_ast {
// Forward declaration: the node types below refer to ast_node through
// std::unique_ptr before its definition is complete.
struct ast_node;

// An ordered list of owned child nodes.
// NOTE(review): nothing visible in this file uses sequence_node other than
// ast_node::value_type — confirm whether it is still needed.
struct sequence_node {
std::vector<std::unique_ptr<ast_node>> children;
};

// A single node of the tree: the rule it came from plus its contents.
struct ast_node {
// Either a single token or a sequence of owned child nodes.
using value_type = std::variant<cfg_token, sequence_node>;
// One child of a node: either a raw token or an owned sub-node.
using element_type = std::variant<cfg_token, std::unique_ptr<ast_node>>;

// The name of the rule that produced this node.
std::string name;
// For nodes which are not the full node of the rule, this identifies
// which piece of the larger rule the node comes from.
std::size_t id = 0;

// NOTE(review): `value` and `children` look like two representations of the
// node's contents (token-or-sequence vs. flat element list) — confirm which
// one consumers are expected to read.
value_type value;
// The node's children, each one a token or an owned sub-node.
std::vector<element_type> children;
};
}
71 changes: 71 additions & 0 deletions src/chef/cfg/ll1.hpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
#pragma once

#include <algorithm>
#include <cassert>
#include <iterator>
#include <map>
#include <memory>
#include <optional>
#include <ranges>
#include <utility>
#include <variant>
#include <vector>

#include <chef/cfg/ast.hpp>
#include <chef/cfg/cfg.hpp>
#include <chef/util/ranges.hpp>

Expand Down Expand Up @@ -110,5 +113,73 @@ namespace chef {

return stack.empty();
}

// Parses `tokens` with this LL(1) table, starting from the variable `start`,
// and builds the corresponding parse tree.
//
// Every variable that gets expanded becomes an ast_node named after that
// variable. Its children are, in the order they were completed, the tokens it
// matched directly and the nodes of the variables expanded beneath it.
//
// Returns the node for `start` on success, or std::nullopt when the input is
// rejected:
//   - a token differs from the terminal expected on top of the parsing stack,
//   - input remains although the parsing stack is already empty, or
//   - grammar symbols remain on the stack once the input is exhausted (this
//     includes empty input, where `start` itself is never expanded).
template <InputRangeOf<const cfg_token&> TokenStream>
std::optional<rt_ast::ast_node> parse_rt(cfg_var start, TokenStream&& tokens) const
{
    using std::ranges::begin;
    using std::ranges::end;

    // Pushed onto the parsing stack underneath a variable's expansion. Popping
    // it means every symbol of that expansion has been processed, so the node
    // under construction is complete and can be attached to its parent.
    struct rule_end_sentinel { };

    // Nodes currently under construction, innermost last. The bottom entry is
    // a wrapper whose only child will be the node for `start`.
    std::vector<rt_ast::ast_node> seq_stack;
    // Read `start.value` before `start` is moved into the parsing stack below.
    seq_stack.emplace_back().name = start.value;

    std::vector<std::variant<rule_end_sentinel, cfg_seq::value_type>> stack;
    stack.push_back(std::move(start));

    // Finishes the innermost node and appends it to its parent's children.
    const auto close_rule = [&seq_stack] {
        auto node = std::make_unique<rt_ast::ast_node>(std::move(seq_stack.back()));
        seq_stack.pop_back();
        seq_stack.back().children.push_back(std::move(node));
    };

    for (auto first = begin(tokens); first != end(tokens);) {
        if (stack.empty()) {
            // An empty stack only matches the empty string, but we have at least one more
            // token to process.
            return std::nullopt;
        }
        auto cur = std::move(stack.back());
        stack.pop_back();

        // Either forms a reference or performs temporary lifetime extension if the
        // operator*() actually returns an rvalue.
        const cfg_token& next = *first;

        if (std::holds_alternative<rule_end_sentinel>(cur)) {
            // No input is consumed when closing a rule.
            close_rule();
            continue;
        }

        auto symbol = std::get<cfg_seq::value_type>(std::move(cur));
        if (auto* expected = std::get_if<cfg_token>(&symbol)) {
            // Raw tokens must match exactly.
            if (*expected != next) return std::nullopt;
            seq_stack.back().children.push_back(std::move(*expected));
            ++first;
        } else {
            auto& var = std::get<cfg_var>(symbol);
            // The sentinel goes on first so that it is popped only after the
            // whole expansion pushed on top of it has been processed.
            stack.emplace_back(rule_end_sentinel{});
            seq_stack.emplace_back().name = var.value;
            // Variables are expanded according to the next token, and we do not pop
            // from the input sequence.
            // NOTE(review): presumably expand_variable yields the symbols in the
            // order they must be pushed (first symbol of the rule last) — confirm.
            std::vector<cfg_seq::value_type> expansion;
            expand_variable(expansion, var, next);
            for (cfg_seq::value_type& sym : expansion) {
                stack.emplace_back(std::move(sym));
            }
        }
    }

    // The input is exhausted: close every rule whose expansion is already
    // complete. Anything else left on the stack still expects input.
    while (!stack.empty() && std::holds_alternative<rule_end_sentinel>(stack.back())) {
        stack.pop_back();
        close_rule();
    }

    if (!stack.empty()) {
        return std::nullopt;
    }
    // Only the wrapper remains, holding the finished node for `start`.
    assert(seq_stack.size() == 1);
    assert(seq_stack.front().children.size() == 1);
    return std::move(
        *std::get<std::unique_ptr<rt_ast::ast_node>>(seq_stack.front().children[0]));
}
};
}
89 changes: 89 additions & 0 deletions src/chef/cfg/ll1.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ using chef::cfg_seq;
using chef::cfg_token;
using chef::cfg_var;
using chef::ll1_table;
namespace rt_ast = chef::rt_ast;

using Catch::Matchers::Contains;

Expand Down Expand Up @@ -157,3 +158,91 @@ TEST_CASE("ll1_table from CFG works")
}
}
}

// Exercises ll1_table::parse_rt on a small grammar: the argument types it
// accepts, the tree built for valid input, and the rejection of invalid input.
TEST_CASE("ll1_table::parse_rt works")
{
    // Example from https://en.wikipedia.org/wiki/LL_parser#Concrete_example.
    // S -> F | (S + F)
    // F -> a
    // 0: a
    // 1: (
    // 2: )
    // 3: +
    const ll1_table table({
        {
            "S"_var,
            {
                {0_tok, cfg_seq({"F"_var})},
                {1_tok, cfg_seq({1_tok, "S"_var, 3_tok, "F"_var, 2_tok})},
            },
        },
        {"F"_var, {{0_tok, cfg_seq({0_tok})}}},
    });

    SECTION("accepts the types it should")
    {
        SECTION("accepts input ranges")
        {
            // A single-pass std::input_range: istringstream + istream_view.
            std::istringstream iss;
            auto input
                = std::ranges::istream_view<int>(iss) | views::transform([] TL(cfg_token(_1)));

            STATIC_REQUIRE(requires { table.parse_rt("S"_var, input); });
        }
        SECTION("accepts std::vector<cfg_token>")
        {
            std::vector<cfg_token> input;

            STATIC_REQUIRE(requires { table.parse_rt("S"_var, input); });
            STATIC_REQUIRE(requires { table.parse_rt("S"_var, std::as_const(input)); });
        }
    }

    SECTION("parses valid input")
    {
        using node_ptr = std::unique_ptr<rt_ast::ast_node>;

        // Input: (a + a); from same example as the grammar.
        const std::vector<cfg_token> input{1_tok, 0_tok, 3_tok, 0_tok, 2_tok};

        std::optional<rt_ast::ast_node> result = table.parse_rt("S"_var, input);
        REQUIRE(result.has_value());
        CHECK(result->name == "S");
        REQUIRE(result->children.size() == 5);

        // Children 0, 2 and 4 are the literal tokens "(", "+" and ")".
        REQUIRE(std::holds_alternative<cfg_token>(result->children[0]));
        REQUIRE(std::holds_alternative<cfg_token>(result->children[2]));
        REQUIRE(std::holds_alternative<cfg_token>(result->children[4]));
        CHECK(std::get<cfg_token>(result->children[0]) == 1_tok);
        CHECK(std::get<cfg_token>(result->children[2]) == 3_tok);
        CHECK(std::get<cfg_token>(result->children[4]) == 2_tok);

        // Children 1 and 3 are the nested S and F nodes.
        REQUIRE(std::holds_alternative<node_ptr>(result->children[1]));
        REQUIRE(std::holds_alternative<node_ptr>(result->children[3]));

        // Nested S -> F -> a.
        const auto& s_node = std::get<node_ptr>(result->children[1]);
        CHECK(s_node->name == "S");
        REQUIRE(s_node->children.size() == 1);
        // REQUIRE rather than CHECK: the std::get below is only valid if this holds.
        REQUIRE(std::holds_alternative<node_ptr>(s_node->children[0]));
        const auto& s_f_node = std::get<node_ptr>(s_node->children[0]);
        CHECK(s_f_node->name == "F");
        REQUIRE(s_f_node->children.size() == 1);
        REQUIRE(std::holds_alternative<cfg_token>(s_f_node->children[0]));
        CHECK(std::get<cfg_token>(s_f_node->children[0]) == 0_tok);

        // Trailing F -> a.
        const auto& f_node = std::get<node_ptr>(result->children[3]);
        CHECK(f_node->name == "F");
        REQUIRE(f_node->children.size() == 1);
        REQUIRE(std::holds_alternative<cfg_token>(f_node->children[0]));
        CHECK(std::get<cfg_token>(f_node->children[0]) == 0_tok);
    }
    SECTION("rejects invalid input")
    {
        // Input: (a + a; same example as the grammar, but the closing
        // parenthesis is missing.
        const std::vector<cfg_token> input{1_tok, 0_tok, 3_tok, 0_tok};

        CHECK_FALSE(table.parse_rt("S"_var, input).has_value());
    }
}

0 comments on commit b6f3d1f

Please sign in to comment.