Support full reconstruction of HCL from parse tree (#169)

amplify-education · Oct 7, 2024 · 1dff4d7 · 1dff4d7
1 parent 92f2ee2
commit 1dff4d7
Show file tree

Hide file tree

Showing 7 changed files with 334 additions and 9 deletions.
diff --git a/.gitignore b/.gitignore
@@ -121,3 +121,6 @@ node_modules/
 # Don't commit the generated parser
 lark_parser.py
 .lark_cache.bin
+
+# ASDF tool-versions file
+.tool-versions
diff --git a/hcl2/__init__.py b/hcl2/__init__.py
@@ -5,4 +5,4 @@
 except ImportError:
  __version__ = "unknown"
 
-from .api import load, loads
+from .api import load, loads, parse, parses, transform, writes, AST
diff --git a/hcl2/api.py b/hcl2/api.py
@@ -1,6 +1,7 @@
 """The API that will be exposed to users of this package"""
 from typing import TextIO
 
+from lark.tree import Tree as AST
 from hcl2.parser import hcl2
 from hcl2.transformer import DictTransformer
 
@@ -26,3 +27,43 @@ def loads(text: str, with_meta=False) -> dict:
  # Append a new line as a temporary fix
  tree = hcl2.parse(text + "\n")
  return DictTransformer(with_meta=with_meta).transform(tree)
+
+
+def parse(file: TextIO) -> AST:
+ """Load HCL2 syntax tree from a file.
+ :param file: File with hcl2 to be parsed into a syntax tree.
+ :return: The syntax tree produced by `parses` for the file's full contents.
+ """
+ return parses(file.read())
+
+
+def parses(text: str) -> AST:
+ """Load HCL2 syntax tree from a string.
+ :param text: Text with hcl2 to be parsed into a syntax tree.
+ :return: The syntax tree returned by the uncached parser in `hcl2.reconstructor`.
+
+ Unlike `loads`, the text is handed to the parser unchanged (no trailing
+ newline is appended).
+ """
+ # defer this import until this method is called, due to the performance hit
+ # of rebuilding the grammar without cache
+ from hcl2.reconstructor import ( # pylint: disable=import-outside-toplevel
+ hcl2 as uncached_hcl2,
+ )
+
+ return uncached_hcl2.parse(text)
+
+
+def transform(ast: AST, with_meta=False) -> dict:
+ """Convert an HCL2 AST to a dictionary.
+ :param ast: HCL2 syntax tree, output from `parse` or `parses`
+ :param with_meta: Forwarded unchanged to `DictTransformer`
+ (NOTE(review): presumably adds metadata to the result, as in `loads` -- confirm).
+ :return: The result of running `DictTransformer` over the tree.
+ """
+ return DictTransformer(with_meta=with_meta).transform(ast)
+
+
+def writes(ast: AST) -> str:
+ """Convert an HCL2 syntax tree to a string.
+ :param ast: HCL2 syntax tree, output from `parse` or `parses`
+ :return: The text produced by `hcl2_reconstructor.reconstruct` for the tree.
+ """
+ # defer this import until this method is called, due to the performance hit
+ # of rebuilding the grammar without cache
+ from hcl2.reconstructor import ( # pylint: disable=import-outside-toplevel
+ hcl2_reconstructor,
+ )
+
+ return hcl2_reconstructor.reconstruct(ast)
diff --git a/hcl2/hcl2.lark b/hcl2/hcl2.lark
@@ -1,11 +1,13 @@
 start : body
 body : (new_line_or_comment? (attribute | block))* new_line_or_comment?
-attribute : identifier "=" expression
+attribute : identifier EQ expression
 block : identifier (identifier | STRING_LIT)* new_line_or_comment? "{" body "}"
 new_line_and_or_comma: new_line_or_comment | "," | "," new_line_or_comment
-new_line_or_comment: ( /\n/ | /#.*\n/ | /\/\/.*\n/ )+
+new_line_or_comment: ( NL_OR_COMMENT )+
+NL_OR_COMMENT: /\n[ \t]*/ | /#.*\n/ | /\/\/.*\n/ | /\/\*(.|\n)*?(\*\/)/
 
-identifier : /[a-zA-Z_][a-zA-Z0-9_-]*/ | IN | FOR | IF | FOR_EACH
+identifier : NAME | IN | FOR | IF | FOR_EACH
+NAME : /[a-zA-Z_][a-zA-Z0-9_-]*/
 IF : "if"
 IN : "in"
 FOR : "for"
@@ -18,8 +20,9 @@ conditional : expression "?" new_line_or_comment? expression new_line_or_comment
 ?operation : unary_op | binary_op
 !unary_op : ("-" | "!") expr_term
 binary_op : expression binary_term new_line_or_comment?
-!binary_operator : "==" | "!=" | "<" | ">" | "<=" | ">=" | "-" | "*" | "/" | "%" | "&&" | "||" | "+"
+!binary_operator : BINARY_OP
 binary_term : binary_operator new_line_or_comment? expression
+BINARY_OP : "==" | "!=" | "<" | ">" | "<=" | ">=" | "-" | "*" | "/" | "%" | "&&" | "||" | "+"
 
 expr_term : "(" new_line_or_comment? expression new_line_or_comment? ")"
  | float_lit
@@ -50,10 +53,12 @@ int_lit : DECIMAL+
  | DECIMAL+ ("." DECIMAL+)? EXP_MARK DECIMAL+
 DECIMAL : "0".."9"
 EXP_MARK : ("e" | "E") ("+" | "-")?
+EQ : /[ \t]*=(?!=|>)/
 
 tuple : "[" (new_line_or_comment* expression new_line_or_comment* ",")* (new_line_or_comment* expression)? new_line_or_comment* "]"
 object : "{" new_line_or_comment? (object_elem (new_line_and_or_comma object_elem )* new_line_and_or_comma?)? "}"
-object_elem : (identifier | expression) ("=" | ":") expression
+object_elem : (identifier | expression) ( EQ | ":") expression
+
 
 heredoc_template : /<<(?P<heredoc>[a-zA-Z][a-zA-Z0-9._-]+)\n(?:.|\n)*?(?P=heredoc)/
 heredoc_template_trim : /<<-(?P<heredoc_trim>[a-zA-Z][a-zA-Z0-9._-]+)\n(?:.|\n)*?(?P=heredoc_trim)/
@@ -78,4 +83,3 @@ full_splat : "[*]" (get_attr | index)*
 !for_cond : "if" new_line_or_comment? expression
 
 %ignore /[ \t]+/
-%ignore /\/\*(.|\n)*?(\*\/)/
diff --git a/hcl2/reconstructor.py b/hcl2/reconstructor.py
@@ -0,0 +1,162 @@
+"""A reconstructor for HCL2 implemented using Lark's experimental reconstruction functionality"""
+
+from lark import Lark
+from lark.reconstruct import Reconstructor
+from lark.utils import is_id_continue
+
+# this is duplicated from `parser` because we need different options here for
+# the reconstructor. please make sure changes are kept in sync between the two
+# if necessary.
+# Uncached parser instance. It is used both to parse text into trees
+# (`hcl2.api.parses`) and to build the reconstructor defined later in this module.
+hcl2 = Lark.open(
+ "hcl2.lark",
+ parser="lalr",
+ # Caching must be disabled to allow for reconstruction until lark-parser/lark#1472 is fixed:
+ #
+ # https://github.com/lark-parser/lark/issues/1472
+ #
+ # cache=str(PARSER_FILE), # Disable/Delete file to effect changes to the grammar
+ # locate the grammar file relative to this module rather than the working directory
+ rel_to=__file__,
+ propagate_positions=True,
+ maybe_placeholders=False, # Needed for reconstruction
+)
+
+# Lookup tables consulted by `_add_extra_space` / `_postprocess_reconstruct`.
+#
+# a space is added when the previous token ENDS with one of these characters
+CHAR_SPACE_AFTER = set(',~@<>="|?)]:')
+# a space is added when the next token STARTS with one of these characters
+CHAR_SPACE_BEFORE = (CHAR_SPACE_AFTER - set(",=")) | set("'")
+# a space is added when the previous token is exactly one of these strings
+KEYWORDS_SPACE_AFTER = [
+ "if",
+ "in",
+ "for",
+ "for_each",
+ "==",
+ "!=",
+ "<",
+ ">",
+ "<=",
+ ">=",
+ "-",
+ "*",
+ "/",
+ "%",
+ "&&",
+ "||",
+ "+",
+]
+# same list object as above: a space is added when the next token is exactly one of these
+KEYWORDS_SPACE_BEFORE = KEYWORDS_SPACE_AFTER
+# used to keep adjacent groups of digits together
+DIGITS = set("0123456789")
+# no space is ever added after a token ending with one of these characters ...
+NEVER_SPACE_AFTER = set("[(")
+# ... or before a token starting with one of these characters
+NEVER_SPACE_BEFORE = set("]),.")
+# a deferred "," is dropped when the next token starts with one of these characters
+NEVER_COMMA_BEFORE = set("])}")
+# characters that are OK to come right after an identifier with no space between
+IDENT_NO_SPACE = set("()[]")
+
+
+def _add_extra_space(prev_item, item):
+ """
+ Decide whether a single space should be emitted between two adjacent tokens.
+
+ :param prev_item: the previously seen token text; must be non-empty (its last character is inspected)
+ :param item: the token about to be emitted; must be non-empty (its first character is inspected)
+ :return: True if a space should be inserted between the two tokens, False otherwise
+
+ The "never add a space" rules are checked first, so they take precedence over the
+ "add a space" rules that follow them.
+ """
+ # pylint: disable=too-many-boolean-expressions, too-many-return-statements
+
+ ##### the scenarios where we explicitly disallow spaces: #####
+
+ # if we already have a space, don't add another
+ if prev_item[-1].isspace() or item[0].isspace():
+ return False
+
+ # none of the following should be separated by spaces:
+ # - groups of digits
+ # - namespaced::function::calls
+ # - characters within an identifier like array[0]()
+ if (
+ (prev_item[-1] in DIGITS and item[0] in DIGITS)
+ or item == "::"
+ or prev_item == "::"
+ or (prev_item[-1] in IDENT_NO_SPACE and item[0] in IDENT_NO_SPACE)
+ ):
+ return False
+
+ # specific characters are also blocklisted from having spaces
+ if prev_item[-1] in NEVER_SPACE_AFTER or item[0] in NEVER_SPACE_BEFORE:
+ return False
+
+ ##### the scenarios where we add spaces: #####
+
+ # scenario 1, the prev token ended with an identifier character
+ # and the next character is not an "IDENT_NO_SPACE" character
+ if is_id_continue(prev_item[-1]) and not item[0] in IDENT_NO_SPACE:
+ return True
+
+ # scenario 2, the prev token or the next token should be followed by a space
+ # (the CHAR_* sets are matched against a single edge character, the KEYWORDS_*
+ # lists against the whole token)
+ if (
+ prev_item[-1] in CHAR_SPACE_AFTER
+ or prev_item in KEYWORDS_SPACE_AFTER
+ or item[0] in CHAR_SPACE_BEFORE
+ or item in KEYWORDS_SPACE_BEFORE
+ ):
+ return True
+
+ # scenario 3, the previous token was a block opening brace and
+ # the next token is not a closing brace (so the block is on one
+ # line and not empty)
+ if prev_item[-1] == "{" and item[0] != "}":
+ return True
+
+ ##### otherwise, we don't add a space #####
+ return False
+
+
+def _postprocess_reconstruct(items):
+ """
+ Postprocess the stream of tokens derived from the AST during reconstruction.
+
+ For HCL2, this is used exclusively for adding whitespace in the right locations.
+
+ :param items: iterable of token strings, in output order
+ :return: generator yielding the token strings with " " inserted where
+ `_add_extra_space` asks for it; a token starting with "," is held back by
+ one step so that a lone "," can be dropped when the next token starts with
+ a character in NEVER_COMMA_BEFORE
+ """
+ prev_item = ""
+ for item in items:
+ # first, handle any deferred tokens
+ # (a deferred token is stored as the tuple ("_deferred", <token text>))
+ if isinstance(prev_item, tuple) and prev_item[0] == "_deferred":
+ prev_item = prev_item[1]
+
+ # if the deferred token was a comma, see if we're ending a block
+ # NOTE(review): `item[0]` assumes tokens are never empty strings -- confirm
+ if prev_item == ",":
+ if item[0] not in NEVER_COMMA_BEFORE:
+ yield prev_item
+ else:
+ yield prev_item
+
+ # if we're between two tokens, determine if we need to add an extra space
+ # we need the previous item and the current item to exist to evaluate these rules
+ if prev_item and item and _add_extra_space(prev_item, item):
+ yield " "
+
+ # in some cases, we may want to defer printing the next token
+ defer_item = False
+
+ # prevent the inclusion of extra commas if they are not intended
+ if item[0] == ",":
+ item = ("_deferred", item)
+ defer_item = True
+
+ # print the actual token
+ if not defer_item:
+ yield item
+
+ # store the previous item for the next token
+ prev_item = item
+
+ # if the last token was deferred, emit it now that the input is exhausted
+ if isinstance(prev_item, tuple) and prev_item[0] == "_deferred":
+ yield prev_item[1]
+
+
+class HCLReconstructor:
+ """This class converts a Lark.Tree AST back into a string representing the underlying HCL code."""
+ def __init__(self, parser):
+ """Wrap a Lark `Reconstructor` built from the given parser.
+
+ :param parser: the Lark parser instance handed to `Reconstructor`
+ """
+ self._recons = Reconstructor(parser)
+
+ def reconstruct(self, tree):
+ """Convert a Lark.Tree AST back into a string representation of HCL.
+
+ :param tree: the syntax tree to convert
+ :return: whatever the wrapped `Reconstructor.reconstruct` returns for the tree
+ """
+ # whitespace is added by `_postprocess_reconstruct`, so Lark's own
+ # space insertion is switched off
+ return self._recons.reconstruct(
+ tree,
+ _postprocess_reconstruct,
+ insert_spaces=False,
+ )
+
+
+# module-level reconstructor built once at import time from the uncached parser;
+# `hcl2.api.writes` imports and uses this instance
+hcl2_reconstructor = HCLReconstructor(hcl2)
diff --git a/hcl2/transformer.py b/hcl2/transformer.py
@@ -93,7 +93,10 @@ def object_elem(self, args: List) -> Dict:
  # This returns a dict with a single key/value pair to make it easier to merge these
  # into a bigger dict that is returned by the "object" function
  key = self.strip_quotes(args[0])
- value = self.to_string_dollar(args[1])
+ if len(args) == 3:
+ value = self.to_string_dollar(args[2])
+ else:
+ value = self.to_string_dollar(args[1])
 
  return {key: value}
 
@@ -148,7 +151,7 @@ def attribute(self, args: List) -> Attribute:
  key = str(args[0])
  if key.startswith('"') and key.endswith('"'):
  key = key[1:-1]
- value = self.to_string_dollar(args[1])
+ value = self.to_string_dollar(args[2])
  return Attribute(key, value)
 
  def conditional(self, args: List) -> str: