Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More performance improvements #743

Merged
merged 9 commits into from
Dec 24, 2024
4 changes: 2 additions & 2 deletions pyxform/entities/entity_declaration.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def __init__(self, name: str, type: str, parameters: dict, **kwargs):
super().__init__(name=name, **kwargs)

def xml_instance(self, **kwargs):
parameters = self.get(const.PARAMETERS, {})
parameters = self.parameters

attributes = {
EC.DATASET.value: parameters.get(EC.DATASET, ""),
Expand Down Expand Up @@ -75,7 +75,7 @@ def xml_bindings(self, survey: "Survey"):
"""
See the class comment for an explanation of the logic for generating bindings.
"""
parameters = self.get(const.PARAMETERS, {})
parameters = self.parameters
entity_id_expression = parameters.get(EC.ENTITY_ID, None)
create_condition = parameters.get(EC.CREATE_IF, None)
update_condition = parameters.get(EC.UPDATE_IF, None)
Expand Down
60 changes: 29 additions & 31 deletions pyxform/parsing/expression.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
import re
from collections.abc import Iterable
from functools import lru_cache


def get_expression_lexer(name_only: bool = False) -> re.Scanner:
"""
Get a expression lexer (scanner) for parsing.
"""
def get_lexer_rules():
# ncname regex adapted from eulxml https://github.com/emory-libraries/eulxml/blob/2e1a9f71ffd1fd455bd8326ec82125e333b352e0/eulxml/xpath/lexrules.py
# (C) 2010,2011 Emory University Libraries [Apache v2.0 License]
# They in turn adapted it from https://www.w3.org/TR/REC-xml/#NT-NameStartChar
Expand All @@ -29,7 +25,7 @@ def get_expression_lexer(name_only: bool = False) -> re.Scanner:
date_time_regex = date_regex + "T" + time_regex

# Rule order is significant - match priority runs top to bottom.
lexer_rules = {
return {
# https://www.w3.org/TR/xmlschema-2/#dateTime
"DATETIME": date_time_regex,
"DATE": date_regex,
Expand All @@ -49,7 +45,7 @@ def get_expression_lexer(name_only: bool = False) -> re.Scanner:
"SYSTEM_LITERAL": r""""[^"]*"|'[^']*'""",
"COMMA": r",",
"WHITESPACE": r"\s+",
"PYXFORM_REF": r"\$\{" + ncname_regex + r"(#" + ncname_regex + r")?" + r"\}",
"PYXFORM_REF": r"\$\{(last-saved#)?" + ncname_regex + r"\}",
"FUNC_CALL": ncname_regex + r"\(",
"XPATH_PRED_START": ncname_regex + r"\[",
"XPATH_PRED_END": r"\]",
Expand All @@ -60,15 +56,21 @@ def get_expression_lexer(name_only: bool = False) -> re.Scanner:
"OTHER": r".+?", # Catch any other character so that parsing doesn't stop.
}


# Build the lexer rule table once at import time, and pre-compile the patterns
# that are also used for direct whole-string / substring matching outside the
# re.Scanner.
LEXER_RULES = get_lexer_rules()
# Matches a string that is exactly one NAME token (an XML NCName).
RE_ONLY_NCNAME = re.compile(rf"""^{LEXER_RULES["NAME"]}$""")
# Matches a string that is exactly one ${...} pyxform reference.
RE_ONLY_PYXFORM_REF = re.compile(rf"""^{LEXER_RULES["PYXFORM_REF"]}$""")
# Matches a ${...} pyxform reference anywhere in a string.
RE_ANY_PYXFORM_REF = re.compile(LEXER_RULES["PYXFORM_REF"])


def get_expression_lexer() -> re.Scanner:
    """
    Get an expression lexer (scanner) for parsing.

    :return: A ``re.Scanner`` whose callbacks emit :class:`ExpLexerToken`
        instances carrying the token name, value, and match positions.
    """

    def get_tokenizer(name):
        # Bind the rule name into a per-rule callback; the leftover
        # ``name_only`` branch from the removed signature is gone, so every
        # rule now yields a full ExpLexerToken.
        def tokenizer(scan, value) -> ExpLexerToken:
            return ExpLexerToken(name, value, scan.match.start(), scan.match.end())

        return tokenizer

    lexicon = [(v, get_tokenizer(k)) for k, v in LEXER_RULES.items()]
    # re.Scanner is undocumented but has been around since at least 2003
    # https://mail.python.org/pipermail/python-dev/2003-April/035075.html
    return re.Scanner(lexicon)
Expand All @@ -84,9 +86,8 @@ def __init__(self, name: str, value: str, start: int, end: int) -> None:
self.end: int = end


# Scanner takes a few 100ms to compile so use these shared instances.
_EXPRESSION_LEXER = get_expression_lexer()
_TOKEN_NAME_LEXER = get_expression_lexer(name_only=True)


@lru_cache(maxsize=128)
Expand All @@ -103,32 +104,29 @@ def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]:
return tokens, remainder


def is_single_token_expression(expression: str, token_types: Iterable[str]) -> bool:
    """
    Does the expression contain a single token of one of the provided token types?

    :param expression: The expression text to scan.
    :param token_types: Token type names that the single token may match.
    """
    if not expression:
        return False
    names, _ = _TOKEN_NAME_LEXER.scan(expression.strip())
    return len(names) == 1 and names[0] in token_types


def is_pyxform_reference(value: str) -> bool:
    """
    Does the input string contain only a valid Pyxform reference? e.g. ${my_question}

    :param value: String that may be a single pyxform reference.
    :return: True if the whole string is one ``${...}`` reference.
    """
    # The merged diff left two alternative return paths here; keep the new
    # regex-based one, and wrap in bool() so a falsy ``value`` ("" or None)
    # yields False rather than leaking the falsy value itself.
    # Needs 3 characters for "${}", plus a name inside.
    return bool(value and len(value) > 3 and RE_ONLY_PYXFORM_REF.match(value))


def is_xml_tag(value: str) -> bool:
    """
    Does the input string contain only a valid XML tag / element name?

    :param value: String that may be an XML element name.
    :return: True if the whole string is a valid name.
    """
    # The merged diff left an unreachable second return; keep the new
    # regex-based check. bool() ensures an empty string returns False
    # instead of "" (the annotation promises a bool).
    return bool(value and RE_ONLY_NCNAME.match(value))


def has_last_saved(value: str) -> bool:
    """
    Does the input string contain a valid '#last-saved' Pyxform reference? e.g. ${last-saved#my_question}

    :param value: String that may contain a last-saved reference.
    :return: True if a valid reference is found.
    """
    # Needs 14 characters for "${last-saved#}", plus a name inside. The cheap
    # length and substring checks short-circuit before the regex search.
    # bool() is required: without it the function returns the re.Match object
    # (or None / ""), contradicting the declared -> bool return type.
    return bool(
        value
        and len(value) > 14
        and "${last-saved#" in value
        and RE_ANY_PYXFORM_REF.search(value)
    )
11 changes: 8 additions & 3 deletions pyxform/parsing/instance_expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,13 @@ def find_boundaries(xml_text: str) -> list[tuple[int, int]]:
:param xml_text: XML text that may contain an instance expression.
:return: Tokens in instance expression, and the string position boundaries.
"""
tokens, _ = parse_expression(xml_text)
if not tokens:
return []
instance_enter = False
path_enter = False
pred_enter = False
last_token = None
tokens, _ = parse_expression(xml_text)
boundaries = []

for t in tokens:
Expand Down Expand Up @@ -96,8 +98,11 @@ def replace_with_output(xml_text: str, context: "SurveyElement", survey: "Survey
:param survey: The Survey that the context is in.
:return: The possibly modified string.
"""
# 9 = len("instance(")
if 9 >= len(xml_text):
return xml_text
boundaries = find_boundaries(xml_text=xml_text)
if 0 < len(boundaries):
if boundaries:
new_strings = []
for start, end in boundaries:
old_str = xml_text[start:end]
Expand All @@ -116,6 +121,6 @@ def replace_with_output(xml_text: str, context: "SurveyElement", survey: "Survey
# expression positions due to incremental replacement.
offset = 0
for s, e, o, n in new_strings:
xml_text = xml_text[: s + offset] + n + xml_text[e + offset :]
xml_text = f"{xml_text[: s + offset]}{n}{xml_text[e + offset :]}"
offset += len(n) - len(o)
return xml_text
Loading
Loading