diff --git a/pyxform/entities/entities_parsing.py b/pyxform/entities/entities_parsing.py index 017b67511..34f44e7c7 100644 --- a/pyxform/entities/entities_parsing.py +++ b/pyxform/entities/entities_parsing.py @@ -3,7 +3,8 @@ from pyxform import constants as const from pyxform.aliases import yes_no from pyxform.errors import PyXFormError -from pyxform.xlsparseutils import find_sheet_misspellings, is_valid_xml_tag +from pyxform.parsing.expression import is_xml_tag +from pyxform.validators.pyxform.sheet_misspellings import find_sheet_misspellings EC = const.EntityColumns @@ -73,7 +74,7 @@ def get_validated_dataset_name(entity): f"Invalid entity list name: '{dataset}'. Names may not include periods." ) - if not is_valid_xml_tag(dataset): + if not is_xml_tag(dataset): if isinstance(dataset, bytes): dataset = dataset.decode("utf-8") @@ -118,7 +119,7 @@ def validate_entity_saveto( f"{error_start} the entity property name '{save_to}' starts with reserved prefix {const.ENTITIES_RESERVED_PREFIX}." ) - if not is_valid_xml_tag(save_to): + if not is_xml_tag(save_to): if isinstance(save_to, bytes): save_to = save_to.decode("utf-8") diff --git a/pyxform/parsing/expression.py b/pyxform/parsing/expression.py index af9198594..29df4fa1d 100644 --- a/pyxform/parsing/expression.py +++ b/pyxform/parsing/expression.py @@ -37,9 +37,9 @@ def get_expression_lexer() -> re.Scanner: "TIME": time_regex, "NUMBER": r"-?\d+\.\d*|-?\.\d+|-?\d+", # https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex - "OPS_MATH": r"[\*\+\-]|mod|div", + "OPS_MATH": r"[\*\+\-]| mod | div ", "OPS_COMP": r"\=|\!\=|\<|\>|\<=|>=", - "OPS_BOOL": r"and|or", + "OPS_BOOL": r" and | or ", "OPS_UNION": r"\|", "OPEN_PAREN": r"\(", "CLOSE_PAREN": r"\)", @@ -107,3 +107,17 @@ def is_single_token_expression(expression: str, token_types: Iterable[str]) -> b return True else: return False + + +def is_pyxform_reference(value: str) -> bool: + """ + Does the input string contain only a valid Pyxform reference? e.g. ${my_question} + """ + return is_single_token_expression(expression=value, token_types=("PYXFORM_REF",)) + + +def is_xml_tag(value: str) -> bool: + """ + Does the input string contain only a valid XML tag / element name? + """ + return is_single_token_expression(expression=value, token_types=("NAME",)) diff --git a/pyxform/survey_element.py b/pyxform/survey_element.py index 504e83d0b..3468f92f9 100644 --- a/pyxform/survey_element.py +++ b/pyxform/survey_element.py @@ -11,6 +11,7 @@ from pyxform import aliases as alias from pyxform import constants as const from pyxform.errors import PyXFormError +from pyxform.parsing.expression import is_xml_tag from pyxform.question_type_dictionary import QUESTION_TYPE_DICT from pyxform.utils import ( BRACKETED_TAG_REGEX, @@ -19,7 +20,6 @@ node, ) from pyxform.xls2json import print_pyobj_to_json -from pyxform.xlsparseutils import is_valid_xml_tag if TYPE_CHECKING: from pyxform.utils import DetachableElement @@ -140,7 +140,7 @@ def add_children(self, children): SUPPORTED_MEDIA = ("image", "big-image", "audio", "video") def validate(self): - if not is_valid_xml_tag(self.name): + if not is_xml_tag(self.name): invalid_char = re.search(INVALID_XFORM_TAG_REGEXP, self.name) raise PyXFormError( f"The name '{self.name}' contains an invalid character '{invalid_char.group(0)}'. Names {const.XML_IDENTIFIER_ERROR_MESSAGE}" diff --git a/pyxform/validators/pyxform/choices.py b/pyxform/validators/pyxform/choices.py new file mode 100644 index 000000000..c97298e75 --- /dev/null +++ b/pyxform/validators/pyxform/choices.py @@ -0,0 +1,59 @@ +from pyxform import constants +from pyxform.errors import PyXFormError + +INVALID_NAME = ( + "[row : {row}] On the 'choices' sheet, the 'name' value is invalid. " + "Choices must have a name. " + "Learn more: https://xlsform.org/en/#setting-up-your-worksheets" +) +INVALID_LABEL = ( + "[row : {row}] On the 'choices' sheet, the 'label' value is invalid. " + "Choices should have a label. " + "Learn more: https://xlsform.org/en/#setting-up-your-worksheets" +) +INVALID_HEADER = ( + "[row : 1] On the 'choices' sheet, the '{column}' value is invalid. " + "Column headers must not be empty and must not contain spaces. " + "Learn more: https://xlsform.org/en/#setting-up-your-worksheets" +) +INVALID_DUPLICATE = ( + "[row : {row}] On the 'choices' sheet, the 'name' value is invalid. " + "Choice names must be unique for each choice list. " + "If this is intentional, use the setting 'allow_choice_duplicates'. " + "Learn more: https://xlsform.org/#choice-names." +) + + +def validate_headers( + headers: tuple[tuple[str, ...], ...], warnings: list[str] +) -> list[str]: + def check(): + for header in headers: + header = header[0] + if header != constants.LIST_NAME_S and (" " in header or header == ""): + warnings.append(INVALID_HEADER.format(column=header)) + yield header + + return list(check()) + + +def validate_choices( + options: list[dict], warnings: list[str], allow_duplicates: bool = False +) -> None: + seen_options = set() + duplicate_errors = [] + for option in options: + if "name" not in option: + raise PyXFormError(INVALID_NAME.format(row=option["__row"])) + elif "label" not in option: + warnings.append(INVALID_LABEL.format(row=option["__row"])) + + if not allow_duplicates: + name = option["name"] + if name in seen_options: + duplicate_errors.append(INVALID_DUPLICATE.format(row=option["__row"])) + else: + seen_options.add(name) + + if 0 < len(duplicate_errors): + raise PyXFormError("\n".join(duplicate_errors)) diff --git a/pyxform/validators/pyxform/question_types.py b/pyxform/validators/pyxform/question_types.py index 7ec18edde..3c62673f9 100644 --- a/pyxform/validators/pyxform/question_types.py +++ b/pyxform/validators/pyxform/question_types.py @@ -3,7 +3,7 @@ """ from pyxform.errors import PyXFormError -from pyxform.parsing.expression import is_single_token_expression +from pyxform.parsing.expression import is_pyxform_reference from pyxform.utils import PYXFORM_REFERENCE_REGEX BACKGROUND_GEOPOINT_CALCULATION = "[row : {r}] For 'background-geopoint' questions, the 'calculation' column must be empty." @@ -25,9 +25,7 @@ def validate_background_geopoint_calculation(row: dict, row_num: int) -> bool: def validate_background_geopoint_trigger(row: dict, row_num: int) -> bool: """A background-geopoint must have a trigger.""" - if not row.get("trigger", False) or not is_single_token_expression( - expression=row["trigger"], token_types=["PYXFORM_REF"] - ): + if not row.get("trigger", False) or not is_pyxform_reference(value=row["trigger"]): raise PyXFormError(TRIGGER_INVALID.format(r=row_num, t=row["type"])) return True diff --git a/pyxform/xlsparseutils.py b/pyxform/validators/pyxform/sheet_misspellings.py similarity index 76% rename from pyxform/xlsparseutils.py rename to pyxform/validators/pyxform/sheet_misspellings.py index 280f706a0..c83fef313 100644 --- a/pyxform/xlsparseutils.py +++ b/pyxform/validators/pyxform/sheet_misspellings.py @@ -1,14 +1,8 @@ -import re from collections.abc import KeysView from pyxform import constants from pyxform.utils import levenshtein_distance -# http://www.w3.org/TR/REC-xml/ -TAG_START_CHAR = r"[a-zA-Z:_]" -TAG_CHAR = r"[a-zA-Z:_0-9\-.]" -XFORM_TAG_REGEXP = re.compile(rf"^{TAG_START_CHAR}{TAG_CHAR}*$") - def find_sheet_misspellings(key: str, keys: "KeysView") -> "str | None": """ @@ -36,10 +30,3 @@ def find_sheet_misspellings(key: str, keys: "KeysView") -> "str | None": return msg else: return None - - -def is_valid_xml_tag(tag): - """ - Use a regex to see if there are any invalid characters (i.e. spaces). - """ - return re.search(XFORM_TAG_REGEXP, tag) diff --git a/pyxform/xls2json.py b/pyxform/xls2json.py index 3b260e22b..7e4e6ebe8 100644 --- a/pyxform/xls2json.py +++ b/pyxform/xls2json.py @@ -6,7 +6,6 @@ import os import re import sys -from collections import Counter from typing import IO, Any from pyxform import aliases, constants @@ -21,15 +20,16 @@ validate_entity_saveto, ) from pyxform.errors import PyXFormError -from pyxform.parsing.expression import is_single_token_expression +from pyxform.parsing.expression import is_pyxform_reference, is_xml_tag from pyxform.utils import PYXFORM_REFERENCE_REGEX, coalesce, default_is_dynamic +from pyxform.validators.pyxform import choices as vc from pyxform.validators.pyxform import parameters_generic, select_from_file from pyxform.validators.pyxform import question_types as qt from pyxform.validators.pyxform.android_package_name import validate_android_package_name from pyxform.validators.pyxform.pyxform_reference import validate_pyxform_reference_syntax +from pyxform.validators.pyxform.sheet_misspellings import find_sheet_misspellings from pyxform.validators.pyxform.translations_checks import SheetTranslations from pyxform.xls2json_backends import csv_to_dict, xls_to_dict, xlsx_to_dict -from pyxform.xlsparseutils import find_sheet_misspellings, is_valid_xml_tag SMART_QUOTES = {"\u2018": "'", "\u2019": "'", "\u201c": '"', "\u201d": '"'} RE_SMART_QUOTES = re.compile(r"|".join(re.escape(old) for old in SMART_QUOTES)) @@ -175,7 +175,12 @@ def dealias_types(dict_array): return dict_array -def clean_text_values(sheet_name: str, data: list[dict], strip_whitespace: bool = False): +def clean_text_values( + sheet_name: str, + data: list[dict], + strip_whitespace: bool = False, + add_row_number: bool = False, +) -> list[dict]: """ Go though the dict array and strips all text values. Also replaces multiple spaces with single spaces. @@ -192,6 +197,8 @@ def clean_text_values(sheet_name: str, data: list[dict], strip_whitespace: bool validate_pyxform_reference_syntax( value=value, sheet_name=sheet_name, row_number=row_number, key=key ) + if add_row_number: + row["__row"] = row_number return data @@ -513,7 +520,11 @@ def workbook_to_json( # ########## Choices sheet ########## choices_sheet = workbook_dict.get(constants.CHOICES, []) - choices_sheet = clean_text_values(sheet_name=constants.CHOICES, data=choices_sheet) + choices_sheet = clean_text_values( + sheet_name=constants.CHOICES, + data=choices_sheet, + add_row_number=True, + ) choices_sheet = dealias_and_group_headers( dict_array=choices_sheet, header_aliases=aliases.list_header, @@ -523,73 +534,27 @@ def workbook_to_json( choices = group_dictionaries_by_key( list_of_dicts=choices_sheet.data, key=constants.LIST_NAME_S ) - if 0 < len(choices): - json_dict[constants.CHOICES] = choices # To combine the warning into one message, the check for missing choices translation # columns is run with Survey sheet below. - # Make sure all the options have the required properties: - warnedabout = set() - for list_name, options in choices.items(): + # Warn and remove invalid headers in case the form uses headers for notes. + invalid_headers = vc.validate_headers(choices_sheet.headers, warnings) + allow_duplicates = aliases.yes_no.get( + settings.get("allow_choice_duplicates", False), False + ) + for options in choices.values(): + vc.validate_choices( + options=options, + warnings=warnings, + allow_duplicates=allow_duplicates, + ) for option in options: - if "name" not in option: - info = "[list_name : " + list_name + "]" - raise PyXFormError( - "On the choices sheet there is a option with no name. " + info - ) - if "label" not in option: - info = "[list_name : " + list_name + "]" - warnings.append( - "On the choices sheet there is a option with no label. " + info - ) - # chrislrobert's fix for a cryptic error message: - # see: https://code.google.com/p/opendatakit/issues/detail?id=833&start=200 - option_keys = list(option.keys()) - for headername in option_keys: - # Using warnings and removing the bad columns - # instead of throwing errors because some forms - # use choices column headers for notes. - if " " in headername: - if headername not in warnedabout: - warnedabout.add(headername) - warnings.append( - "On the choices sheet there is " - + 'a column ("' - + headername - + '") with an illegal header. ' - + "Headers cannot include spaces." - ) - del option[headername] - elif headername == "": - warnings.append( - "On the choices sheet there is a value" - + " in a column with no header." - ) - del option[headername] - list_name_choices = [option.get("name") for option in options] - if len(list_name_choices) != len(set(list_name_choices)): - duplicate_setting = settings.get("allow_choice_duplicates") - for v in Counter(list_name_choices).values(): - if v > 1: - if not duplicate_setting or duplicate_setting.capitalize() != "Yes": - choice_duplicates = [ - item - for item, count in Counter(list_name_choices).items() - if count > 1 - ] - - if choice_duplicates: - raise PyXFormError( - "The name column for the '{}' choice list contains these duplicates: {}. Duplicate names " - "will be impossible to identify in analysis unless a previous value in a cascading " - "select differentiates them. If this is intentional, you can set the " - "allow_choice_duplicates setting to 'yes'. Learn more: https://xlsform.org/#choice-names.".format( - list_name, - ", ".join( - [f"'{dupe}'" for dupe in choice_duplicates] - ), - ) - ) + for invalid_header in invalid_headers: + option.pop(invalid_header, None) + del option["__row"] + + if 0 < len(choices): + json_dict[constants.CHOICES] = choices # ########## Entities sheet ########### entities_sheet = workbook_dict.get(constants.ENTITIES, []) @@ -945,7 +910,7 @@ def workbook_to_json( ROW_FORMAT_STRING % row_number + " Question or group with no name." ) question_name = str(row[constants.NAME]) - if not is_valid_xml_tag(question_name): + if not is_xml_tag(question_name): if isinstance(question_name, bytes): question_name = question_name.decode("utf-8") @@ -1022,10 +987,7 @@ def workbook_to_json( repeat_count_expression = new_json_dict.get("control", {}).get("jr:count") if repeat_count_expression: # Simple expressions don't require a new node, they can reference directly. - simple_expression = is_single_token_expression( - expression=repeat_count_expression, token_types=["PYXFORM_REF"] - ) - if not simple_expression: + if not is_pyxform_reference(value=repeat_count_expression): generated_node_name = new_json_dict["name"] + "_count" parent_children_array.append( { diff --git a/tests/parsing/__init__.py b/tests/parsing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/parsing/test_expression.py b/tests/parsing/test_expression.py new file mode 100644 index 000000000..1fe3ad42c --- /dev/null +++ b/tests/parsing/test_expression.py @@ -0,0 +1,49 @@ +from pyxform.parsing.expression import is_xml_tag + +from tests.pyxform_test_case import PyxformTestCase + +positive = [ + ("A", "Single uppercase letter"), + ("ab", "Lowercase letters"), + ("_u", "Leading underscore"), + ("A12", "Leading uppercase letter with digit"), + ("A-1.23", "Leading uppercase letter with hyphen, period, and digit"), + ("Name123-456", "Mixed case, digits, hyphen"), + ("𐐀n", "Leading unicode"), + ("Αλ", "Following unicode"), + ("name:name", "NCName, colon, NCName"), + ("name_with_colon:_and_extras", "NCName, colon, NCName (non-letter characters)"), + # Other special character tokens are excluded by ncname_regex. + ("nameor", "Contains another parser token (or)"), + ("nameand", "Contains another parser token (and)"), + ("namemod", "Contains another parser token (mod)"), + ("namediv", "Contains another parser token (div)"), +] + +negative = [ + ("", "Empty string"), + (" ", "Space"), + ("123name", "Leading digit"), + ("-name", "Leading hyphen"), + (".name", "Leading period"), + (":name", "Leading colon"), + ("name$", "Invalid character ($)"), + ("name with space", "Invalid character (space)"), + ("na@me", "Invalid character (@)"), + ("na#me", "Invalid character (#)"), + ("name:name:name", "Invalid structure (multiple colons)"), +] + + +class TestExpression(PyxformTestCase): + def test_is_xml_tag__positive(self): + """Should accept positive match cases i.e. valid xml tag names.""" + for case, description in positive: + with self.subTest(case=case, description=description): + self.assertTrue(is_xml_tag(case)) + + def test_is_xml_tag__negative(self): + """Should reject negative match cases i.e. invalid xml tag names.""" + for case, description in negative: + with self.subTest(case=case, description=description): + self.assertFalse(is_xml_tag(case)) diff --git a/tests/test_fields.py b/tests/test_fields.py index 6c44fc9eb..555915352 100644 --- a/tests/test_fields.py +++ b/tests/test_fields.py @@ -2,6 +2,8 @@ Test duplicate survey question field name. """ +from pyxform.validators.pyxform import choices as vc + from tests.pyxform_test_case import PyxformTestCase from tests.xpath_helpers.choices import xpc from tests.xpath_helpers.questions import xpq @@ -57,9 +59,7 @@ def test_duplicate_choices_without_setting(self): | | list | b | option c | """, errored=True, - error__contains=[ - "The name column for the 'list' choice list contains these duplicates: 'b'" - ], + error__contains=[vc.INVALID_DUPLICATE.format(row=4)], ) def test_multiple_duplicate_choices_without_setting(self): @@ -77,8 +77,10 @@ def test_multiple_duplicate_choices_without_setting(self): """, errored=True, error__contains=[ - "The name column for the 'list' choice list contains these duplicates: 'a', 'b'" + vc.INVALID_DUPLICATE.format(row=3), + vc.INVALID_DUPLICATE.format(row=5), ], + debug=True, ) def test_duplicate_choices_with_setting_not_set_to_yes(self): @@ -97,9 +99,7 @@ def test_duplicate_choices_with_setting_not_set_to_yes(self): | | Duplicates | Bob | """, errored=True, - error__contains=[ - "The name column for the 'list' choice list contains these duplicates: 'b'" - ], + error__contains=[vc.INVALID_DUPLICATE.format(row=4)], ) def test_duplicate_choices_with_allow_choice_duplicates_setting(self): diff --git a/tests/test_sheet_columns.py b/tests/test_sheet_columns.py index a5b85a42f..3a7e708d1 100644 --- a/tests/test_sheet_columns.py +++ b/tests/test_sheet_columns.py @@ -2,6 +2,8 @@ Test XLSForm sheet names. """ +from pyxform.validators.pyxform import choices as vc + from tests.pyxform_test_case import PyxformTestCase from tests.utils import prep_for_xml_contains from tests.xpath_helpers.choices import xpc @@ -156,7 +158,7 @@ def test_invalid_choices_sheet_fails(self): ] ), errored=True, - error__contains=["option with no name"], + error__contains=[vc.INVALID_NAME.format(row=2)], ) def test_missing_list_name(self): diff --git a/tests/test_xlsform_spec.py b/tests/test_xlsform_spec.py index c3442fae2..891c591b5 100644 --- a/tests/test_xlsform_spec.py +++ b/tests/test_xlsform_spec.py @@ -1,3 +1,5 @@ +from pyxform.validators.pyxform import choices as vc + from tests.pyxform_test_case import PyxformTestCase @@ -61,10 +63,10 @@ def test_warnings__count(self): ) self.maxDiff = 2000 expected = [ - "On the choices sheet there is a option with no label. [list_name : a_b]", - "On the choices sheet there is a option with no label. [list_name : a_b]", - "On the choices sheet there is a option with no label. [list_name : animals]", - "On the choices sheet there is a option with no label. [list_name : animals]", + vc.INVALID_LABEL.format(row=4), + vc.INVALID_LABEL.format(row=5), + vc.INVALID_LABEL.format(row=6), + vc.INVALID_LABEL.format(row=7), "[row : 9] Repeat has no label: {'name': 'repeat_test', 'type': 'begin repeat'}", "[row : 10] Group has no label: {'name': 'group_test', 'type': 'begin group'}", "[row : 17] Group has no label: {'name': 'name', 'type': 'begin group'}", diff --git a/tests/xform_test_case/test_bugs.py b/tests/xform_test_case/test_bugs.py index c022a8121..c5db20d1f 100644 --- a/tests/xform_test_case/test_bugs.py +++ b/tests/xform_test_case/test_bugs.py @@ -10,6 +10,7 @@ from pyxform.errors import PyXFormError from pyxform.utils import has_external_choices from pyxform.validators.odk_validate import ODKValidateError, check_xform +from pyxform.validators.pyxform import choices as vc from pyxform.xls2json import SurveyReader, parse_file_to_workbook_dict from pyxform.xls2json_backends import xlsx_to_dict from pyxform.xls2xform import convert @@ -79,7 +80,11 @@ def test_conversion(self): ) # The "column with no header" warning is probably not reachable since XLS/X # pre-processing ignores any columns without a header. - observed = [w for w in warnings if "Headers cannot include spaces" in w] + observed = [ + w + for w in warnings + if w == vc.INVALID_HEADER.format(column="header with spaces") + ] self.assertEqual(1, len(observed), warnings) def test_values_with_spaces_are_cleaned(self):