From 1dff4d792b6234f43108a519ec1cccb80b4864a2 Mon Sep 17 00:00:00 2001
From: Sam Weaver
Date: Mon, 7 Oct 2024 10:44:45 -0400
Subject: [PATCH] Support full reconstruction of HCL from parse tree (#169)

---
 .gitignore                    |   3 +
 hcl2/__init__.py              |   2 +-
 hcl2/api.py                   |  41 +++++++++
 hcl2/hcl2.lark                |  16 ++--
 hcl2/reconstructor.py         | 162 ++++++++++++++++++++++++++++++++++
 hcl2/transformer.py           |   7 +-
 test/unit/test_reconstruct.py | 112 +++++++++++++++++++++++
 7 files changed, 334 insertions(+), 9 deletions(-)
 create mode 100644 hcl2/reconstructor.py
 create mode 100644 test/unit/test_reconstruct.py

diff --git a/.gitignore b/.gitignore
index 5a2fc4a..75af5e4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -121,3 +121,6 @@ node_modules/
 # Don't commit the generated parser
 lark_parser.py
 .lark_cache.bin
+
+# ASDF tool-versions file
+.tool-versions
diff --git a/hcl2/__init__.py b/hcl2/__init__.py
index 8b41bd8..f56787b 100644
--- a/hcl2/__init__.py
+++ b/hcl2/__init__.py
@@ -5,4 +5,4 @@ except ImportError:
     __version__ = "unknown"
 
 
-from .api import load, loads
+from .api import load, loads, parse, parses, transform, writes, AST
diff --git a/hcl2/api.py b/hcl2/api.py
index 9079fce..4a3ec10 100644
--- a/hcl2/api.py
+++ b/hcl2/api.py
@@ -1,6 +1,7 @@
 """The API that will be exposed to users of this package"""
 from typing import TextIO
 
+from lark.tree import Tree as AST
 from hcl2.parser import hcl2
 from hcl2.transformer import DictTransformer
 
@@ -26,3 +27,43 @@ def loads(text: str, with_meta=False) -> dict:
     # Append a new line as a temporary fix
     tree = hcl2.parse(text + "\n")
     return DictTransformer(with_meta=with_meta).transform(tree)
+
+
+def parse(file: TextIO) -> AST:
+    """Load HCL2 syntax tree from a file.
+    :param file: File with hcl2 to be parsed into a syntax tree.
+    """
+    return parses(file.read())
+
+
+def parses(text: str) -> AST:
+    """Load HCL2 syntax tree from a string.
+    :param text: Text with hcl2 to be parsed into a syntax tree.
+    """
+    # defer this import until this method is called, due to the performance hit
+    # of rebuilding the grammar without cache
+    from hcl2.reconstructor import (  # pylint: disable=import-outside-toplevel
+        hcl2 as uncached_hcl2,
+    )
+
+    return uncached_hcl2.parse(text)
+
+
+def transform(ast: AST, with_meta=False) -> dict:
+    """Convert an HCL2 AST to a dictionary.
+    :param ast: HCL2 syntax tree, output from `parse` or `parses`
+    """
+    return DictTransformer(with_meta=with_meta).transform(ast)
+
+
+def writes(ast: AST) -> str:
+    """Convert an HCL2 syntax tree to a string.
+    :param ast: HCL2 syntax tree, output from `parse` or `parses`
+    """
+    # defer this import until this method is called, due to the performance hit
+    # of rebuilding the grammar without cache
+    from hcl2.reconstructor import (  # pylint: disable=import-outside-toplevel
+        hcl2_reconstructor,
+    )
+
+    return hcl2_reconstructor.reconstruct(ast)
diff --git a/hcl2/hcl2.lark b/hcl2/hcl2.lark
index 0d6b40b..d26acc1 100644
--- a/hcl2/hcl2.lark
+++ b/hcl2/hcl2.lark
@@ -1,11 +1,13 @@
 start : body
 body : (new_line_or_comment? (attribute | block))* new_line_or_comment?
-attribute : identifier "=" expression
+attribute : identifier EQ expression
 block : identifier (identifier | STRING_LIT)* new_line_or_comment? "{" body "}"
 new_line_and_or_comma: new_line_or_comment | "," | "," new_line_or_comment
-new_line_or_comment: ( /\n/ | /#.*\n/ | /\/\/.*\n/ )+
+new_line_or_comment: ( NL_OR_COMMENT )+
+NL_OR_COMMENT: /\n[ \t]*/ | /#.*\n/ | /\/\/.*\n/ | /\/\*(.|\n)*?(\*\/)/
 
-identifier : /[a-zA-Z_][a-zA-Z0-9_-]*/ | IN | FOR | IF | FOR_EACH
+identifier : NAME | IN | FOR | IF | FOR_EACH
+NAME : /[a-zA-Z_][a-zA-Z0-9_-]*/
 IF : "if"
 IN : "in"
 FOR : "for"
@@ -18,8 +20,9 @@ conditional : expression "?" new_line_or_comment? expression new_line_or_comment
 ?operation : unary_op | binary_op
 !unary_op : ("-" | "!") expr_term
 binary_op : expression binary_term new_line_or_comment?
-!binary_operator : "==" | "!=" | "<" | ">" | "<=" | ">=" | "-" | "*" | "/" | "%" | "&&" | "||" | "+"
+!binary_operator : BINARY_OP
 binary_term : binary_operator new_line_or_comment? expression
+BINARY_OP : "==" | "!=" | "<" | ">" | "<=" | ">=" | "-" | "*" | "/" | "%" | "&&" | "||" | "+"
 
 expr_term : "(" new_line_or_comment? expression new_line_or_comment? ")"
             | float_lit
@@ -50,10 +53,12 @@ int_lit : DECIMAL+
             | DECIMAL+ ("." DECIMAL+)? EXP_MARK DECIMAL+
 DECIMAL : "0".."9"
 EXP_MARK : ("e" | "E") ("+" | "-")?
+EQ : /[ \t]*=(?!=|>)/
 
 tuple : "[" (new_line_or_comment* expression new_line_or_comment* ",")* (new_line_or_comment* expression)? new_line_or_comment* "]"
 object : "{" new_line_or_comment? (object_elem (new_line_and_or_comma object_elem )* new_line_and_or_comma?)? "}"
-object_elem : (identifier | expression) ("=" | ":") expression
+object_elem : (identifier | expression) ( EQ | ":") expression
+
 
 heredoc_template : /<<(?P<heredoc>[a-zA-Z][a-zA-Z0-9._-]+)\n(?:.|\n)*?(?P=heredoc)/
 heredoc_template_trim : /<<-(?P<heredoc_trim>[a-zA-Z][a-zA-Z0-9._-]+)\n(?:.|\n)*?(?P=heredoc_trim)/
@@ -78,4 +83,3 @@ full_splat : "[*]" (get_attr | index)*
 !for_cond : "if" new_line_or_comment? expression
 
 %ignore /[ \t]+/
-%ignore /\/\*(.|\n)*?(\*\/)/
diff --git a/hcl2/reconstructor.py b/hcl2/reconstructor.py
new file mode 100644
index 0000000..f6bbb9b
--- /dev/null
+++ b/hcl2/reconstructor.py
@@ -0,0 +1,162 @@
+"""A reconstructor for HCL2 implemented using Lark's experimental reconstruction functionality"""
+
+from lark import Lark
+from lark.reconstruct import Reconstructor
+from lark.utils import is_id_continue
+
+# this is duplicated from `parser` because we need different options here for
+# the reconstructor. please make sure changes are kept in sync between the two
+# if necessary.
+hcl2 = Lark.open(
+    "hcl2.lark",
+    parser="lalr",
+    # Caching must be disabled to allow for reconstruction until lark-parser/lark#1472 is fixed:
+    #
+    #   https://github.com/lark-parser/lark/issues/1472
+    #
+    # cache=str(PARSER_FILE),  # Disable/Delete file to effect changes to the grammar
+    rel_to=__file__,
+    propagate_positions=True,
+    maybe_placeholders=False,  # Needed for reconstruction
+)
+
+CHAR_SPACE_AFTER = set(',~@<>="|?)]:')
+CHAR_SPACE_BEFORE = (CHAR_SPACE_AFTER - set(",=")) | set("'")
+KEYWORDS_SPACE_AFTER = [
+    "if",
+    "in",
+    "for",
+    "for_each",
+    "==",
+    "!=",
+    "<",
+    ">",
+    "<=",
+    ">=",
+    "-",
+    "*",
+    "/",
+    "%",
+    "&&",
+    "||",
+    "+",
+]
+KEYWORDS_SPACE_BEFORE = KEYWORDS_SPACE_AFTER
+DIGITS = set("0123456789")
+NEVER_SPACE_AFTER = set("[(")
+NEVER_SPACE_BEFORE = set("]),.")
+NEVER_COMMA_BEFORE = set("])}")
+# characters that are OK to come right after an identifier with no space between
+IDENT_NO_SPACE = set("()[]")
+
+
+def _add_extra_space(prev_item, item):
+    """Decide whether a space must be emitted between two adjacent tokens."""
+    # pylint: disable=too-many-boolean-expressions, too-many-return-statements
+    ##### the scenarios where we explicitly disallow spaces: #####
+
+    # if we already have a space, don't add another
+    if prev_item[-1].isspace() or item[0].isspace():
+        return False
+
+    # none of the following should be separated by spaces:
+    # - groups of digits
+    # - namespaced::function::calls
+    # - characters within an identifier like array[0]()
+    if (
+        (prev_item[-1] in DIGITS and item[0] in DIGITS)
+        or item == "::"
+        or prev_item == "::"
+        or (prev_item[-1] in IDENT_NO_SPACE and item[0] in IDENT_NO_SPACE)
+    ):
+        return False
+
+    # specific characters are also blocklisted from having spaces
+    if prev_item[-1] in NEVER_SPACE_AFTER or item[0] in NEVER_SPACE_BEFORE:
+        return False
+
+    ##### the scenarios where we add spaces: #####
+
+    # scenario 1, the prev token ended with an identifier character
+    # and the next character is not an "IDENT_NO_SPACE" character
+    if is_id_continue(prev_item[-1]) and item[0] not in IDENT_NO_SPACE:
+        return True
+
+    # scenario 2, the prev token or the next token should be followed by a space
+    if (
+        prev_item[-1] in CHAR_SPACE_AFTER
+        or prev_item in KEYWORDS_SPACE_AFTER
+        or item[0] in CHAR_SPACE_BEFORE
+        or item in KEYWORDS_SPACE_BEFORE
+    ):
+        return True
+
+    # scenario 3, the previous token was a block opening brace and
+    # the next token is not a closing brace (so the block is on one
+    # line and not empty)
+    if prev_item[-1] == "{" and item[0] != "}":
+        return True
+
+    ##### otherwise, we don't add a space #####
+    return False
+
+
+def _postprocess_reconstruct(items):
+    """
+    Postprocess the stream of tokens derived from the AST during reconstruction.
+
+    For HCL2, this is used exclusively for adding whitespace in the right locations.
+    """
+    prev_item = ""
+    for item in items:
+        # first, handle any deferred tokens
+        if isinstance(prev_item, tuple) and prev_item[0] == "_deferred":
+            prev_item = prev_item[1]
+
+            # if the deferred token was a comma, see if we're ending a block
+            if prev_item == ",":
+                if item[0] not in NEVER_COMMA_BEFORE:
+                    yield prev_item
+            else:
+                yield prev_item
+
+        # if we're between two tokens, determine if we need to add an extra space
+        # we need the previous item and the current item to exist to evaluate these rules
+        if prev_item and item and _add_extra_space(prev_item, item):
+            yield " "
+
+        # in some cases, we may want to defer printing the next token
+        defer_item = False
+
+        # prevent the inclusion of extra commas if they are not intended
+        if item[0] == ",":
+            item = ("_deferred", item)
+            defer_item = True
+
+        # print the actual token
+        if not defer_item:
+            yield item
+
+        # store the previous item for the next token
+        prev_item = item
+
+    # if the last token was deferred, print it before continuing
+    if isinstance(prev_item, tuple) and prev_item[0] == "_deferred":
+        yield prev_item[1]
+
+
+class HCLReconstructor:
+    """This class converts a Lark.Tree AST back into a string representing the underlying HCL code."""
+    def __init__(self, parser):
+        self._recons = Reconstructor(parser)
+
+    def reconstruct(self, tree):
+        """Convert a Lark.Tree AST back into a string representation of HCL."""
+        return self._recons.reconstruct(
+            tree,
+            _postprocess_reconstruct,
+            insert_spaces=False,
+        )
+
+
+hcl2_reconstructor = HCLReconstructor(hcl2)
diff --git a/hcl2/transformer.py b/hcl2/transformer.py
index ddc7269..866cef2 100644
--- a/hcl2/transformer.py
+++ b/hcl2/transformer.py
@@ -93,7 +93,10 @@ def object_elem(self, args: List) -> Dict:
         # This returns a dict with a single key/value pair to make it easier to merge these
         # into a bigger dict that is returned by the "object" function
         key = self.strip_quotes(args[0])
-        value = self.to_string_dollar(args[1])
+        if len(args) == 3:
+            value = self.to_string_dollar(args[2])
+        else:
+            value = self.to_string_dollar(args[1])
 
         return {key: value}
 
@@ -148,7 +151,7 @@ def attribute(self, args: List) -> Attribute:
         key = str(args[0])
         if key.startswith('"') and key.endswith('"'):
             key = key[1:-1]
-        value = self.to_string_dollar(args[1])
+        value = self.to_string_dollar(args[2])
         return Attribute(key, value)
 
     def conditional(self, args: List) -> str:
diff --git a/test/unit/test_reconstruct.py b/test/unit/test_reconstruct.py
new file mode 100644
index 0000000..b9545de
--- /dev/null
+++ b/test/unit/test_reconstruct.py
@@ -0,0 +1,112 @@
+""" Test reconstructing hcl files"""
+
+import json
+from pathlib import Path
+from unittest import TestCase
+
+import hcl2
+
+
+HELPERS_DIR = Path(__file__).absolute().parent.parent / "helpers"
+HCL2_DIR = HELPERS_DIR / "terraform-config"
+HCL2_FILES = [str(file.relative_to(HCL2_DIR)) for file in HCL2_DIR.iterdir()]
+JSON_DIR = HELPERS_DIR / "terraform-config-json"
+
+
+class TestReconstruct(TestCase):
+    """Test reconstructing a variety of hcl files"""
+
+    # print any differences fully to the console
+    maxDiff = None
+
+    def test_write_terraform(self):
+        """Test reconstructing a set of hcl2 files, to make sure they parse to the same structure"""
+        # run every file as its own subTest so one failure does not hide the others
+        for hcl_path in HCL2_FILES:
+            with self.subTest(hcl_path=hcl_path):
+                self.check_terraform(hcl_path)
+
+    def test_write_terraform_exact(self):
+        """
+        Test reconstructing a set of hcl2 files, to make sure they
+        reconstruct exactly the same, including whitespace.
+        """
+
+        # the reconstruction process is not precise, so some files do not
+        # reconstruct their whitespace exactly the same, but they are
+        # syntactically equivalent. This list is a target for further
+        # improvements to the whitespace handling of the reconstruction
+        # algorithm.
+        inexact_files = [
+            # the reconstructor loses commas on the last element in an array,
+            # even if they're in the input file
+            "iam.tf",
+            "variables.tf",
+            # the reconstructor doesn't preserve indentation within comments
+            # perfectly
+            "multiline_expressions.tf",
+            # the reconstructor doesn't preserve the line that a ternary is
+            # broken on.
+            "route_table.tf",
+        ]
+
+        # subTest reports each failing file instead of stopping at the first one
+        for hcl_path in HCL2_FILES:
+            if hcl_path not in inexact_files:
+                with self.subTest(hcl_path=hcl_path):
+                    self.check_whitespace(hcl_path)
+
+    def check_terraform(self, hcl_path_str: str):
+        """
+        Loads a single hcl2 file, parses it, reconstructs it,
+        parses the reconstructed file, and compares with the expected json
+        """
+        hcl_path = (HCL2_DIR / hcl_path_str).absolute()
+        json_path = JSON_DIR / hcl_path.relative_to(HCL2_DIR).with_suffix(".json")
+        with hcl_path.open("r") as hcl_file, json_path.open("r") as json_file:
+            hcl_file_content = hcl_file.read()
+            try:
+                hcl_ast = hcl2.parses(hcl_file_content)
+            except Exception as exc:
+                self.fail(f"failed to tokenize terraform in `{hcl_path_str}`: {exc}")
+
+            try:
+                hcl_reconstructed = hcl2.writes(hcl_ast)
+            except Exception as exc:
+                self.fail(f"failed to reconstruct terraform in `{hcl_path_str}`: {exc}")
+
+            try:
+                hcl2_dict = hcl2.loads(hcl_reconstructed)
+            except Exception as exc:
+                self.fail(
+                    f"failed to tokenize terraform in file reconstructed from `{hcl_path_str}`: {exc}"
+                )
+
+            json_dict = json.load(json_file)
+            self.assertDictEqual(
+                hcl2_dict,
+                json_dict,
+                f"failed comparing {hcl_path_str} with reconstructed version",
+            )
+
+    def check_whitespace(self, hcl_path_str: str):
+        """Tests that the reconstructed file matches the original file exactly."""
+        hcl_path = (HCL2_DIR / hcl_path_str).absolute()
+        with hcl_path.open("r") as hcl_file:
+            hcl_file_content = hcl_file.read()
+            try:
+                hcl_ast = hcl2.parses(hcl_file_content)
+            except Exception as exc:
+                self.fail(f"failed to tokenize terraform in `{hcl_path_str}`: {exc}")
+
+            try:
+                hcl_reconstructed = hcl2.writes(hcl_ast)
+            except Exception as exc:
+                self.fail(f"failed to reconstruct terraform in `{hcl_path_str}`: {exc}")
+
+            self.assertMultiLineEqual(
+                hcl_reconstructed,
+                hcl_file_content,
+                f"file {hcl_path_str} does not match its reconstructed version "
+                "exactly. this is usually whitespace related.",
+            )