From 1dff4d792b6234f43108a519ec1cccb80b4864a2 Mon Sep 17 00:00:00 2001
From: Sam Weaver <weaversam8@users.noreply.github.com>
Date: Mon, 7 Oct 2024 10:44:45 -0400
Subject: [PATCH] Support full reconstruction of HCL from parse tree (#169)

---
 .gitignore                    |   3 +
 hcl2/__init__.py              |   2 +-
 hcl2/api.py                   |  41 +++++++++
 hcl2/hcl2.lark                |  16 ++--
 hcl2/reconstructor.py         | 162 ++++++++++++++++++++++++++++++++++
 hcl2/transformer.py           |   7 +-
 test/unit/test_reconstruct.py | 112 +++++++++++++++++++++++
 7 files changed, 334 insertions(+), 9 deletions(-)
 create mode 100644 hcl2/reconstructor.py
 create mode 100644 test/unit/test_reconstruct.py

diff --git a/.gitignore b/.gitignore
index 5a2fc4a..75af5e4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -121,3 +121,6 @@ node_modules/
 # Don't commit the generated parser
 lark_parser.py
 .lark_cache.bin
+
+# ASDF tool-versions file
+.tool-versions
diff --git a/hcl2/__init__.py b/hcl2/__init__.py
index 8b41bd8..f56787b 100644
--- a/hcl2/__init__.py
+++ b/hcl2/__init__.py
@@ -5,4 +5,4 @@
 except ImportError:
     __version__ = "unknown"
 
-from .api import load, loads
+from .api import load, loads, parse, parses, transform, writes, AST
diff --git a/hcl2/api.py b/hcl2/api.py
index 9079fce..4a3ec10 100644
--- a/hcl2/api.py
+++ b/hcl2/api.py
@@ -1,6 +1,7 @@
 """The API that will be exposed to users of this package"""
 from typing import TextIO
 
+from lark.tree import Tree as AST
 from hcl2.parser import hcl2
 from hcl2.transformer import DictTransformer
 
@@ -26,3 +27,43 @@ def loads(text: str, with_meta=False) -> dict:
     # Append a new line as a temporary fix
     tree = hcl2.parse(text + "\n")
     return DictTransformer(with_meta=with_meta).transform(tree)
+
+
+def parse(file: TextIO) -> AST:
+    """Load HCL2 syntax tree from a file.
+    :param file: File with hcl2 to be parsed into a syntax tree.
+    """
+    return parses(file.read())
+
+
+def parses(text: str) -> AST:
+    """Load HCL2 syntax tree from a string.
+    :param text: Text with hcl2 to be parsed into a syntax tree.
+    """
+    # defer this import until this function is called, due to the performance hit
+    # of rebuilding the grammar without cache
+    from hcl2.reconstructor import (  # pylint: disable=import-outside-toplevel
+        hcl2 as uncached_hcl2,
+    )
+
+    return uncached_hcl2.parse(text)
+
+
+def transform(ast: AST, with_meta=False) -> dict:
+    """Convert an HCL2 AST to a dictionary.
+    :param ast: HCL2 syntax tree, output from `parse` or `parses`
+    """
+    return DictTransformer(with_meta=with_meta).transform(ast)
+
+
+def writes(ast: AST) -> str:
+    """Convert an HCL2 syntax tree to a string.
+    :param ast: HCL2 syntax tree, output from `parse` or `parses`
+    """
+    # defer this import until this function is called, due to the performance hit
+    # of rebuilding the grammar without cache
+    from hcl2.reconstructor import (  # pylint: disable=import-outside-toplevel
+        hcl2_reconstructor,
+    )
+
+    return hcl2_reconstructor.reconstruct(ast)
diff --git a/hcl2/hcl2.lark b/hcl2/hcl2.lark
index 0d6b40b..d26acc1 100644
--- a/hcl2/hcl2.lark
+++ b/hcl2/hcl2.lark
@@ -1,11 +1,13 @@
 start : body
 body : (new_line_or_comment? (attribute | block))* new_line_or_comment?
-attribute : identifier "=" expression
+attribute : identifier EQ expression
 block : identifier (identifier | STRING_LIT)* new_line_or_comment? "{" body "}"
 new_line_and_or_comma: new_line_or_comment | "," | "," new_line_or_comment
-new_line_or_comment: ( /\n/ | /#.*\n/ | /\/\/.*\n/ )+
+new_line_or_comment: ( NL_OR_COMMENT )+
+NL_OR_COMMENT: /\n[ \t]*/ | /#.*\n/ | /\/\/.*\n/ | /\/\*(.|\n)*?(\*\/)/
 
-identifier : /[a-zA-Z_][a-zA-Z0-9_-]*/ | IN | FOR | IF | FOR_EACH
+identifier : NAME | IN | FOR | IF | FOR_EACH
+NAME : /[a-zA-Z_][a-zA-Z0-9_-]*/
 IF : "if"
 IN : "in"
 FOR : "for"
@@ -18,8 +20,9 @@ conditional : expression "?" new_line_or_comment? expression new_line_or_comment
 ?operation : unary_op | binary_op
 !unary_op : ("-" | "!") expr_term
 binary_op : expression binary_term new_line_or_comment?
-!binary_operator : "==" | "!=" | "<" | ">" | "<=" | ">=" | "-" | "*" | "/" | "%" | "&&" | "||" | "+"
+!binary_operator : BINARY_OP
 binary_term : binary_operator new_line_or_comment? expression
+BINARY_OP : "==" | "!=" | "<" | ">" | "<=" | ">=" | "-" | "*" | "/" | "%" | "&&" | "||" | "+"
 
 expr_term : "(" new_line_or_comment? expression new_line_or_comment? ")"
             | float_lit
@@ -50,10 +53,12 @@ int_lit : DECIMAL+
             | DECIMAL+ ("." DECIMAL+)? EXP_MARK DECIMAL+
 DECIMAL : "0".."9"
 EXP_MARK : ("e" | "E") ("+" | "-")?
+EQ : /[ \t]*=(?!=|>)/
 
 tuple : "[" (new_line_or_comment* expression new_line_or_comment* ",")* (new_line_or_comment* expression)? new_line_or_comment* "]"
 object : "{" new_line_or_comment? (object_elem (new_line_and_or_comma object_elem )* new_line_and_or_comma?)? "}"
-object_elem : (identifier | expression) ("=" | ":") expression
+object_elem : (identifier | expression) ( EQ | ":") expression
+
 
 heredoc_template : /<<(?P<heredoc>[a-zA-Z][a-zA-Z0-9._-]+)\n(?:.|\n)*?(?P=heredoc)/
 heredoc_template_trim : /<<-(?P<heredoc_trim>[a-zA-Z][a-zA-Z0-9._-]+)\n(?:.|\n)*?(?P=heredoc_trim)/
@@ -78,4 +83,3 @@ full_splat : "[*]" (get_attr | index)*
 !for_cond : "if" new_line_or_comment? expression
 
 %ignore /[ \t]+/
-%ignore /\/\*(.|\n)*?(\*\/)/
diff --git a/hcl2/reconstructor.py b/hcl2/reconstructor.py
new file mode 100644
index 0000000..f6bbb9b
--- /dev/null
+++ b/hcl2/reconstructor.py
@@ -0,0 +1,162 @@
+"""A reconstructor for HCL2 implemented using Lark's experimental reconstruction functionality"""
+
+from lark import Lark
+from lark.reconstruct import Reconstructor
+from lark.utils import is_id_continue
+
+# this is duplicated from `parser` because we need different options here for
+# the reconstructor. please make sure changes are kept in sync between the two
+# if necessary.
+hcl2 = Lark.open(
+    "hcl2.lark",
+    parser="lalr",
+    # Caching must be disabled to allow for reconstruction until lark-parser/lark#1472 is fixed:
+    #
+    #   https://github.com/lark-parser/lark/issues/1472
+    #
+    # cache=str(PARSER_FILE),  # Disable/Delete file to effect changes to the grammar
+    rel_to=__file__,
+    propagate_positions=True,
+    maybe_placeholders=False,  # Needed for reconstruction
+)
+
+CHAR_SPACE_AFTER = set(',~@<>="|?)]:')
+CHAR_SPACE_BEFORE = (CHAR_SPACE_AFTER - set(",=")) | set("'")
+KEYWORDS_SPACE_AFTER = [
+    "if",
+    "in",
+    "for",
+    "for_each",
+    "==",
+    "!=",
+    "<",
+    ">",
+    "<=",
+    ">=",
+    "-",
+    "*",
+    "/",
+    "%",
+    "&&",
+    "||",
+    "+",
+]
+KEYWORDS_SPACE_BEFORE = KEYWORDS_SPACE_AFTER
+DIGITS = set("0123456789")
+NEVER_SPACE_AFTER = set("[(")
+NEVER_SPACE_BEFORE = set("]),.")
+NEVER_COMMA_BEFORE = set("])}")
+# characters that are OK to come right after an identifier with no space between
+IDENT_NO_SPACE = set("()[]")
+
+
+def _add_extra_space(prev_item, item):
+    # pylint: disable=too-many-boolean-expressions, too-many-return-statements
+    """Return True when a space belongs between two adjacent, non-empty tokens."""
+    ##### the scenarios where we explicitly disallow spaces: #####
+
+    # if we already have a space, don't add another
+    if prev_item[-1].isspace() or item[0].isspace():
+        return False
+
+    # none of the following should be separated by spaces:
+    # - groups of digits
+    # - namespaced::function::calls
+    # - characters within an identifier like array[0]()
+    if (
+        (prev_item[-1] in DIGITS and item[0] in DIGITS)
+        or item == "::"
+        or prev_item == "::"
+        or (prev_item[-1] in IDENT_NO_SPACE and item[0] in IDENT_NO_SPACE)
+    ):
+        return False
+
+    # specific characters are also blocklisted from having spaces
+    if prev_item[-1] in NEVER_SPACE_AFTER or item[0] in NEVER_SPACE_BEFORE:
+        return False
+
+    ##### the scenarios where we add spaces: #####
+
+    # scenario 1, the prev token ended with an identifier character
+    # and the next character is not an "IDENT_NO_SPACE" character
+    if is_id_continue(prev_item[-1]) and item[0] not in IDENT_NO_SPACE:
+        return True
+
+    # scenario 2, the prev token or the next token should be followed by a space
+    if (
+        prev_item[-1] in CHAR_SPACE_AFTER
+        or prev_item in KEYWORDS_SPACE_AFTER
+        or item[0] in CHAR_SPACE_BEFORE
+        or item in KEYWORDS_SPACE_BEFORE
+    ):
+        return True
+
+    # scenario 3, the previous token was a block opening brace and
+    # the next token is not a closing brace (so the block is on one
+    # line and not empty)
+    if prev_item[-1] == "{" and item[0] != "}":
+        return True
+
+    ##### otherwise, we don't add a space #####
+    return False
+
+
+def _postprocess_reconstruct(items):
+    """
+    Postprocess the stream of tokens derived from the AST during reconstruction.
+
+    For HCL2, this adds whitespace and drops commas directly before a closing bracket.
+    """
+    prev_item = ""
+    for item in items:
+        # first, handle any deferred tokens
+        if isinstance(prev_item, tuple) and prev_item[0] == "_deferred":
+            prev_item = prev_item[1]
+
+            # if the deferred token was a comma, see if we're ending a block
+            if prev_item == ",":
+                if item[:1] not in NEVER_COMMA_BEFORE:
+                    yield prev_item
+            else:
+                yield prev_item
+
+        # if we're between two tokens, determine if we need to add an extra space
+        # we need the previous item and the current item to exist to evaluate these rules
+        if prev_item and item and _add_extra_space(prev_item, item):
+            yield " "
+
+        # in some cases, we may want to defer printing the next token
+        defer_item = False
+
+        # defer commas so unintended ones can be dropped; slicing tolerates empty tokens
+        if item[:1] == ",":
+            item = ("_deferred", item)
+            defer_item = True
+
+        # print the actual token
+        if not defer_item:
+            yield item
+
+        # store the previous item for the next token
+        prev_item = item
+
+    # if the last token was deferred, print it before continuing
+    if isinstance(prev_item, tuple) and prev_item[0] == "_deferred":
+        yield prev_item[1]
+
+
+class HCLReconstructor:
+    """This class converts a Lark.Tree AST back into a string representing the underlying HCL code."""
+    def __init__(self, parser):
+        self._recons = Reconstructor(parser)
+
+    def reconstruct(self, tree):
+        """Convert a Lark.Tree AST back into a string representation of HCL."""
+        return self._recons.reconstruct(
+            tree,
+            _postprocess_reconstruct,
+            insert_spaces=False,
+        )
+
+
+hcl2_reconstructor = HCLReconstructor(hcl2)
diff --git a/hcl2/transformer.py b/hcl2/transformer.py
index ddc7269..866cef2 100644
--- a/hcl2/transformer.py
+++ b/hcl2/transformer.py
@@ -93,7 +93,10 @@ def object_elem(self, args: List) -> Dict:
         # This returns a dict with a single key/value pair to make it easier to merge these
         # into a bigger dict that is returned by the "object" function
         key = self.strip_quotes(args[0])
-        value = self.to_string_dollar(args[1])
+        if len(args) == 3:
+            value = self.to_string_dollar(args[2])
+        else:
+            value = self.to_string_dollar(args[1])
 
         return {key: value}
 
@@ -148,7 +151,7 @@ def attribute(self, args: List) -> Attribute:
         key = str(args[0])
         if key.startswith('"') and key.endswith('"'):
             key = key[1:-1]
-        value = self.to_string_dollar(args[1])
+        value = self.to_string_dollar(args[2])
         return Attribute(key, value)
 
     def conditional(self, args: List) -> str:
diff --git a/test/unit/test_reconstruct.py b/test/unit/test_reconstruct.py
new file mode 100644
index 0000000..b9545de
--- /dev/null
+++ b/test/unit/test_reconstruct.py
@@ -0,0 +1,112 @@
+""" Test reconstructing hcl files"""
+
+import json
+from pathlib import Path
+from unittest import TestCase
+
+import hcl2
+
+
+HELPERS_DIR = Path(__file__).absolute().parent.parent / "helpers"
+HCL2_DIR = HELPERS_DIR / "terraform-config"
+HCL2_FILES = [str(file.relative_to(HCL2_DIR)) for file in HCL2_DIR.iterdir()]
+JSON_DIR = HELPERS_DIR / "terraform-config-json"
+
+
+class TestReconstruct(TestCase):
+    """Test reconstructing a variety of hcl files"""
+
+    # print any differences fully to the console
+    maxDiff = None
+
+    def test_write_terraform(self):
+        """Test reconstructing a set of hcl2 files, to make sure they parse to the same structure"""
+        # a generator test method is never executed by unittest, so call the check here
+        for hcl_path in HCL2_FILES:
+            with self.subTest(hcl_path=hcl_path):
+                self.check_terraform(hcl_path)
+
+    def test_write_terraform_exact(self):
+        """Test that hcl2 files reconstruct exactly the same, including whitespace."""
+
+        # the reconstruction process is not precise, so some files do not
+        # reconstruct their whitespace exactly the same, but they are
+        # syntactically equivalent. This list is a target for further
+        # improvements to the whitespace handling of the reconstruction
+        # algorithm.
+        inexact_files = [
+            # the reconstructor loses commas on the last element in an array,
+            # even if they're in the input file
+            "iam.tf",
+            "variables.tf",
+            # the reconstructor doesn't preserve indentation within comments
+            # perfectly
+            "multiline_expressions.tf",
+            # the reconstructor doesn't preserve the line that a ternary is
+            # broken on.
+            "route_table.tf",
+        ]
+
+        for hcl_path in HCL2_FILES:
+            if hcl_path not in inexact_files:
+                with self.subTest(hcl_path=hcl_path):
+                    self.check_whitespace(hcl_path)
+
+    def check_terraform(self, hcl_path_str: str):
+        """
+        Loads a single hcl2 file, parses it, reconstructs it,
+        parses the reconstructed file, and compares with the expected json
+        """
+        hcl_path = (HCL2_DIR / hcl_path_str).absolute()
+        json_path = JSON_DIR / hcl_path.relative_to(HCL2_DIR).with_suffix(".json")
+        with hcl_path.open("r") as hcl_file, json_path.open("r") as json_file:
+            hcl_file_content = hcl_file.read()
+            try:
+                hcl_ast = hcl2.parses(hcl_file_content)
+            except Exception as exc:
+                assert False, f"failed to tokenize terraform in `{hcl_path_str}`: {exc}"
+
+            try:
+                hcl_reconstructed = hcl2.writes(hcl_ast)
+            except Exception as exc:
+                assert (
+                    False
+                ), f"failed to reconstruct terraform in `{hcl_path_str}`: {exc}"
+
+            try:
+                hcl2_dict = hcl2.loads(hcl_reconstructed)
+            except Exception as exc:
+                assert (
+                    False
+                ), f"failed to tokenize terraform in file reconstructed from `{hcl_path_str}`: {exc}"
+
+            json_dict = json.load(json_file)
+            self.assertDictEqual(
+                hcl2_dict,
+                json_dict,
+                f"failed comparing {hcl_path_str} with reconstructed version",
+            )
+
+    def check_whitespace(self, hcl_path_str: str):
+        """Tests that the reconstructed file matches the original file exactly."""
+        hcl_path = (HCL2_DIR / hcl_path_str).absolute()
+        with hcl_path.open("r") as hcl_file:
+            hcl_file_content = hcl_file.read()
+            try:
+                hcl_ast = hcl2.parses(hcl_file_content)
+            except Exception as exc:
+                assert False, f"failed to tokenize terraform in `{hcl_path_str}`: {exc}"
+
+            try:
+                hcl_reconstructed = hcl2.writes(hcl_ast)
+            except Exception as exc:
+                assert (
+                    False
+                ), f"failed to reconstruct terraform in `{hcl_path_str}`: {exc}"
+
+            self.assertMultiLineEqual(
+                hcl_reconstructed,
+                hcl_file_content,
+                f"file {hcl_path_str} does not match its reconstructed version "
+                "exactly. this is usually whitespace related.",
+            )