Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More performance improvements #743

Merged
merged 9 commits into from
Dec 24, 2024
4 changes: 2 additions & 2 deletions pyxform/entities/entity_declaration.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def __init__(self, name: str, type: str, parameters: dict, **kwargs):
super().__init__(name=name, **kwargs)

def xml_instance(self, **kwargs):
parameters = self.get(const.PARAMETERS, {})
parameters = self.parameters

attributes = {
EC.DATASET.value: parameters.get(EC.DATASET, ""),
Expand Down Expand Up @@ -75,7 +75,7 @@ def xml_bindings(self, survey: "Survey"):
"""
See the class comment for an explanation of the logic for generating bindings.
"""
parameters = self.get(const.PARAMETERS, {})
parameters = self.parameters
entity_id_expression = parameters.get(EC.ENTITY_ID, None)
create_condition = parameters.get(EC.CREATE_IF, None)
update_condition = parameters.get(EC.UPDATE_IF, None)
Expand Down
60 changes: 29 additions & 31 deletions pyxform/parsing/expression.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
import re
from collections.abc import Iterable
from functools import lru_cache


def get_expression_lexer(name_only: bool = False) -> re.Scanner:
"""
Get a expression lexer (scanner) for parsing.
"""
def get_lexer_rules():
# ncname regex adapted from eulxml https://github.com/emory-libraries/eulxml/blob/2e1a9f71ffd1fd455bd8326ec82125e333b352e0/eulxml/xpath/lexrules.py
# (C) 2010,2011 Emory University Libraries [Apache v2.0 License]
# They in turn adapted it from https://www.w3.org/TR/REC-xml/#NT-NameStartChar
Expand All @@ -29,7 +25,7 @@ def get_expression_lexer(name_only: bool = False) -> re.Scanner:
date_time_regex = date_regex + "T" + time_regex

# Rule order is significant - match priority runs top to bottom.
lexer_rules = {
return {
# https://www.w3.org/TR/xmlschema-2/#dateTime
"DATETIME": date_time_regex,
"DATE": date_regex,
Expand All @@ -49,7 +45,7 @@ def get_expression_lexer(name_only: bool = False) -> re.Scanner:
"SYSTEM_LITERAL": r""""[^"]*"|'[^']*'""",
"COMMA": r",",
"WHITESPACE": r"\s+",
"PYXFORM_REF": r"\$\{" + ncname_regex + r"(#" + ncname_regex + r")?" + r"\}",
"PYXFORM_REF": r"\$\{(last-saved#)?" + ncname_regex + r"\}",
"FUNC_CALL": ncname_regex + r"\(",
"XPATH_PRED_START": ncname_regex + r"\[",
"XPATH_PRED_END": r"\]",
Expand All @@ -60,15 +56,21 @@ def get_expression_lexer(name_only: bool = False) -> re.Scanner:
"OTHER": r".+?", # Catch any other character so that parsing doesn't stop.
}


# Build the lexer rule table once at import time, and pre-compile the patterns
# that are also used for direct whole-string / substring matching outside the
# re.Scanner.
LEXER_RULES = get_lexer_rules()
# Matches a string that is exactly one NAME token (an XML NCName).
RE_ONLY_NCNAME = re.compile(rf"""^{LEXER_RULES["NAME"]}$""")
# Matches a string that is exactly one ${...} pyxform reference.
RE_ONLY_PYXFORM_REF = re.compile(rf"""^{LEXER_RULES["PYXFORM_REF"]}$""")
# Matches a ${...} pyxform reference anywhere in a string.
RE_ANY_PYXFORM_REF = re.compile(LEXER_RULES["PYXFORM_REF"])


def get_expression_lexer() -> re.Scanner:
    """
    Get an expression lexer (scanner) for parsing.

    :return: A ``re.Scanner`` whose callbacks emit :class:`ExpLexerToken`
        instances carrying the token name, value, and match positions.
    """

    def get_tokenizer(name):
        # Bind the rule name into a per-rule callback; the leftover
        # ``name_only`` branch from the removed signature is gone, so every
        # rule now yields a full ExpLexerToken.
        def tokenizer(scan, value) -> ExpLexerToken:
            return ExpLexerToken(name, value, scan.match.start(), scan.match.end())

        return tokenizer

    lexicon = [(v, get_tokenizer(k)) for k, v in LEXER_RULES.items()]
    # re.Scanner is undocumented but has been around since at least 2003
    # https://mail.python.org/pipermail/python-dev/2003-April/035075.html
    return re.Scanner(lexicon)
Expand All @@ -84,9 +86,8 @@ def __init__(self, name: str, value: str, start: int, end: int) -> None:
self.end: int = end


# Scanner takes a few 100ms to compile so use these shared instances.
_EXPRESSION_LEXER = get_expression_lexer()
_TOKEN_NAME_LEXER = get_expression_lexer(name_only=True)


@lru_cache(maxsize=128)
Expand All @@ -103,32 +104,29 @@ def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]:
return tokens, remainder


def is_single_token_expression(expression: str, token_types: Iterable[str]) -> bool:
    """
    Does the expression contain a single token of one of the provided token types?

    :param expression: The expression text to scan.
    :param token_types: Token type names that the single token may match.
    """
    if not expression:
        return False
    names, _ = _TOKEN_NAME_LEXER.scan(expression.strip())
    return len(names) == 1 and names[0] in token_types


def is_pyxform_reference(value: str) -> bool:
    """
    Does the input string contain only a valid Pyxform reference? e.g. ${my_question}

    :param value: String that may be a single pyxform reference.
    :return: True if the whole string is one ``${...}`` reference.
    """
    # The merged diff left two alternative return paths here; keep the new
    # regex-based one, and wrap in bool() so a falsy ``value`` ("" or None)
    # yields False rather than leaking the falsy value itself.
    # Needs 3 characters for "${}", plus a name inside.
    return bool(value and len(value) > 3 and RE_ONLY_PYXFORM_REF.match(value))


def is_xml_tag(value: str) -> bool:
    """
    Does the input string contain only a valid XML tag / element name?

    :param value: String that may be an XML element name.
    :return: True if the whole string is a valid name.
    """
    # The merged diff left an unreachable second return; keep the new
    # regex-based check. bool() ensures an empty string returns False
    # instead of "" (the annotation promises a bool).
    return bool(value and RE_ONLY_NCNAME.match(value))


def has_last_saved(value: str) -> bool:
    """
    Does the input string contain a valid '#last-saved' Pyxform reference? e.g. ${last-saved#my_question}

    :param value: String that may contain a last-saved reference.
    :return: True if a valid reference is found.
    """
    # Needs 14 characters for "${last-saved#}", plus a name inside. The cheap
    # length and substring checks short-circuit before the regex search.
    # bool() is required: without it the function returns the re.Match object
    # (or None / ""), contradicting the declared -> bool return type.
    return bool(
        value
        and len(value) > 14
        and "${last-saved#" in value
        and RE_ANY_PYXFORM_REF.search(value)
    )
11 changes: 8 additions & 3 deletions pyxform/parsing/instance_expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,13 @@ def find_boundaries(xml_text: str) -> list[tuple[int, int]]:
:param xml_text: XML text that may contain an instance expression.
:return: Tokens in instance expression, and the string position boundaries.
"""
tokens, _ = parse_expression(xml_text)
if not tokens:
return []
instance_enter = False
path_enter = False
pred_enter = False
last_token = None
tokens, _ = parse_expression(xml_text)
boundaries = []

for t in tokens:
Expand Down Expand Up @@ -96,8 +98,11 @@ def replace_with_output(xml_text: str, context: "SurveyElement", survey: "Survey
:param survey: The Survey that the context is in.
:return: The possibly modified string.
"""
# 9 = len("instance(")
if 9 >= len(xml_text):
return xml_text
boundaries = find_boundaries(xml_text=xml_text)
if 0 < len(boundaries):
if boundaries:
new_strings = []
for start, end in boundaries:
old_str = xml_text[start:end]
Expand All @@ -116,6 +121,6 @@ def replace_with_output(xml_text: str, context: "SurveyElement", survey: "Survey
# expression positions due to incremental replacement.
offset = 0
for s, e, o, n in new_strings:
xml_text = xml_text[: s + offset] + n + xml_text[e + offset :]
xml_text = f"{xml_text[: s + offset]}{n}{xml_text[e + offset :]}"
offset += len(n) - len(o)
return xml_text
Loading
Loading