From dff36c621f2dafd6a13faee9528dd736afe20b7b Mon Sep 17 00:00:00 2001 From: IanCa Date: Thu, 4 Apr 2024 18:58:18 -0500 Subject: [PATCH] Add support for caching prerelease schemas. Generates a warning on validation --- hed/errors/error_types.py | 2 + hed/errors/schema_error_messages.py | 5 + hed/schema/hed_cache.py | 173 +++++++++++++++++----------- hed/schema/hed_schema_io.py | 7 ++ hed/schema/schema_compliance.py | 22 ++++ tests/schema/test_hed_schema_io.py | 3 +- 6 files changed, 144 insertions(+), 68 deletions(-) diff --git a/hed/errors/error_types.py b/hed/errors/error_types.py index e6c9fc1d..3f35833e 100644 --- a/hed/errors/error_types.py +++ b/hed/errors/error_types.py @@ -125,6 +125,8 @@ class SchemaWarnings: SCHEMA_NON_PLACEHOLDER_HAS_CLASS = 'SCHEMA_NON_PLACEHOLDER_HAS_CLASS' SCHEMA_PROLOGUE_CHARACTER_INVALID = "SCHEMA_PROLOGUE_CHARACTER_INVALID" + SCHEMA_PRERELEASE_VERSION_USED = "SCHEMA_PRERELEASE_VERSION_USED" + class SchemaAttributeErrors: SCHEMA_ATTRIBUTE_INVALID = 'SCHEMA_ATTRIBUTE_INVALID' diff --git a/hed/errors/schema_error_messages.py b/hed/errors/schema_error_messages.py index 4995abc9..e88a275d 100644 --- a/hed/errors/schema_error_messages.py +++ b/hed/errors/schema_error_messages.py @@ -23,6 +23,11 @@ def schema_error_unknown_attribute(attribute_name, source_tag): f"or was used outside of it's defined class." +@hed_error(SchemaWarnings.SCHEMA_PRERELEASE_VERSION_USED, default_severity=ErrorSeverity.WARNING) +def schema_error_SCHEMA_PRERELEASE_VERSION_USED(current_version, known_versions): + return f"Schema version {current_version} used, which is prerelease or unofficial. Known versions are: {', '.join(known_versions)}" + + @hed_error(SchemaWarnings.SCHEMA_PROLOGUE_CHARACTER_INVALID, default_severity=ErrorSeverity.WARNING, actual_code=SchemaWarnings.SCHEMA_CHARACTER_INVALID) def schema_error_invalid_character_prologue(char_index, source_string, section_name): diff --git a/hed/schema/hed_cache.py b/hed/schema/hed_cache.py index d51eeafa..584b95bb 100644 --- a/hed/schema/hed_cache.py +++ b/hed/schema/hed_cache.py @@ -28,10 +28,13 @@ HED_XML_PREFIX = 'HED' HED_XML_EXTENSION = '.xml' hedxml_suffix = "/hedxml" # The suffix for schema and library schema at the given urls +prerelease_suffix = "/prerelease" # The prerelease schemas at the given URLs -DEFAULT_HED_LIST_VERSIONS_URL = "https://api.github.com/repos/hed-standard/hed-schemas/contents/standard_schema/hedxml" +DEFAULT_HED_LIST_VERSIONS_URL = "https://api.github.com/repos/hed-standard/hed-schemas/contents/standard_schema" LIBRARY_HED_URL = "https://api.github.com/repos/hed-standard/hed-schemas/contents/library_schemas" -DEFAULT_URL_LIST = (DEFAULT_HED_LIST_VERSIONS_URL, LIBRARY_HED_URL,) +DEFAULT_URL_LIST = (DEFAULT_HED_LIST_VERSIONS_URL,) +DEFAULT_LIBRARY_URL_LIST = (LIBRARY_HED_URL,) + DEFAULT_SKIP_FOLDERS = ('deprecated',) @@ -62,7 +65,7 @@ def get_cache_directory(): return HED_CACHE_DIRECTORY -def get_hed_versions(local_hed_directory=None, library_name=None): +def get_hed_versions(local_hed_directory=None, library_name=None, check_prerelease=False): """ Get the HED versions in the hed directory. Parameters: @@ -70,6 +73,7 @@ def get_hed_versions(local_hed_directory=None, library_name=None): library_name (str or None): An optional schema library name. None retrieves the standard schema only. Pass "all" to retrieve all standard and library schemas as a dict. + check_prerelease (bool): If True, results can include prerelease schemas Returns: list or dict: List of version numbers or dictionary {library_name: [versions]}. @@ -83,6 +87,8 @@ def get_hed_versions(local_hed_directory=None, library_name=None): all_hed_versions = {} local_directory = local_hed_directory + if check_prerelease and not local_directory.endswith(prerelease_suffix): + local_directory += prerelease_suffix try: hed_files = os.listdir(local_directory) except FileNotFoundError: @@ -104,14 +110,14 @@ def get_hed_versions(local_hed_directory=None, library_name=None): return all_hed_versions -def get_hed_version_path(xml_version, library_name=None, local_hed_directory=None): +def get_hed_version_path(xml_version, library_name=None, local_hed_directory=None, check_prerelease=False): """ Get HED XML file path in a directory. Only returns filenames that exist. Parameters: library_name (str or None): Optional the schema library name. xml_version (str): Returns this version if it exists local_hed_directory (str): Path to local hed directory. Defaults to HED_CACHE_DIRECTORY - + check_prerelease(bool): Also check for prerelease schemas Returns: str: The path to the latest HED version the hed directory. @@ -119,11 +125,11 @@ def get_hed_version_path(xml_version, library_name=None, local_hed_directory=Non if not local_hed_directory: local_hed_directory = HED_CACHE_DIRECTORY - hed_versions = get_hed_versions(local_hed_directory, library_name) + hed_versions = get_hed_versions(local_hed_directory, library_name, check_prerelease) if not hed_versions or not xml_version: return None if xml_version in hed_versions: - return _create_xml_filename(xml_version, library_name, local_hed_directory) + return _create_xml_filename(xml_version, library_name, local_hed_directory, check_prerelease) def cache_local_versions(cache_folder): @@ -148,11 +154,12 @@ def cache_local_versions(cache_folder): return -1 -def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, skip_folders=DEFAULT_SKIP_FOLDERS, cache_folder=None): +def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, hed_library_urls=DEFAULT_LIBRARY_URL_LIST, skip_folders=DEFAULT_SKIP_FOLDERS, cache_folder=None): """ Cache all schemas at the given URLs. Parameters: - hed_base_urls (str or list): Path or list of paths. + hed_base_urls (str or list): Path or list of paths. These should point to a single folder. + hed_library_urls (str or list): Path or list of paths. These should point to a folder containing library folders. skip_folders (list): A list of subfolders to skip over when downloading. cache_folder (str): The folder holding the cache. @@ -170,8 +177,10 @@ def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, skip_folders=DEFAULT_SKIP if not cache_folder: cache_folder = HED_CACHE_DIRECTORY - if not isinstance(hed_base_urls, (list, tuple)): + if isinstance(hed_base_urls, str): hed_base_urls = [hed_base_urls] + if isinstance(hed_library_urls, str): + hed_library_urls = [hed_library_urls] os.makedirs(cache_folder, exist_ok=True) last_timestamp = _read_last_cached_time(cache_folder) current_timestamp = time.time() @@ -182,12 +191,17 @@ def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, skip_folders=DEFAULT_SKIP try: cache_lock_filename = os.path.join(cache_folder, "cache_lock.lock") with portalocker.Lock(cache_lock_filename, timeout=1): + all_hed_versions = {} for hed_base_url in hed_base_urls: - all_hed_versions = _get_hed_xml_versions_from_url(hed_base_url, skip_folders=skip_folders, - get_libraries=True) - for library_name, hed_versions in all_hed_versions.items(): - for version, version_info in hed_versions.items(): - _cache_hed_version(version, library_name, version_info, cache_folder=cache_folder) + new_hed_versions = _get_hed_xml_versions_one_library(hed_base_url) + _merge_in_versions(all_hed_versions, new_hed_versions) + for hed_library_url in hed_library_urls: + new_hed_versions = _get_hed_xml_versions_from_url_all_libraries(hed_library_url, skip_folders=skip_folders) + _merge_in_versions(all_hed_versions, new_hed_versions) + + for library_name, hed_versions in all_hed_versions.items(): + for version, version_info in hed_versions.items(): + _cache_hed_version(version, library_name, version_info, cache_folder=cache_folder) _write_last_cached_time(current_timestamp, cache_folder) except portalocker.exceptions.LockException or ValueError or URLError: @@ -196,18 +210,6 @@ def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, skip_folders=DEFAULT_SKIP return 0 -def _cache_specific_url(hed_xml_url, cache_filename): - """Copies a specific url to the cache at the given filename""" - cache_folder = cache_filename.rpartition("/")[0] - os.makedirs(cache_folder, exist_ok=True) - temp_hed_xml_file = url_to_file(hed_xml_url) - if temp_hed_xml_file: - cache_filename = _safe_move_tmp_to_folder(temp_hed_xml_file, cache_filename) - os.remove(temp_hed_xml_file) - return cache_filename - return None - - def _copy_installed_schemas_to_cache(cache_folder): """Copies the schemas from the install folder to the cache""" installed_files = os.listdir(INSTALLED_CACHE_LOCATION) @@ -264,12 +266,13 @@ def _check_if_url(hed_xml_or_url): return False -def _create_xml_filename(hed_xml_version, library_name=None, hed_directory=None): +def _create_xml_filename(hed_xml_version, library_name=None, hed_directory=None, prerelease=False): """Returns the default file name format for the given version""" + prerelease_prefix = f"prerelease/" if prerelease else "" if library_name: - hed_xml_basename = f"{HED_XML_PREFIX}_{library_name}_{hed_xml_version}{HED_XML_EXTENSION}" + hed_xml_basename = f"{prerelease_prefix}{HED_XML_PREFIX}_{library_name}_{hed_xml_version}{HED_XML_EXTENSION}" else: - hed_xml_basename = HED_XML_PREFIX + hed_xml_version + HED_XML_EXTENSION + hed_xml_basename = prerelease_prefix + HED_XML_PREFIX + hed_xml_version + HED_XML_EXTENSION if hed_directory: hed_xml_filename = os.path.join(hed_directory, hed_xml_basename) @@ -281,15 +284,60 @@ def _sort_version_list(hed_versions): return sorted(hed_versions, key=Version, reverse=True) -def _get_hed_xml_versions_from_url(hed_base_url, library_name=None, - skip_folders=DEFAULT_SKIP_FOLDERS, get_libraries=False): +def _get_hed_xml_versions_one_folder(hed_folder_url): + url_request = make_url_request(hed_folder_url) + url_data = str(url_request.read(), 'utf-8') + loaded_json = json.loads(url_data) + + all_hed_versions = {} + for file_entry in loaded_json: + if file_entry['type'] == "dir": + continue + expression_match = version_pattern.match(file_entry["name"]) + if expression_match is not None: + version = expression_match.group(3) + found_library_name = expression_match.group(2) + if found_library_name not in all_hed_versions: + all_hed_versions[found_library_name] = {} + all_hed_versions[found_library_name][version] = file_entry["sha"], file_entry["download_url"], hed_folder_url.endswith(prerelease_suffix) + + return all_hed_versions + + +def _get_hed_xml_versions_one_library(hed_one_library_url): + all_hed_versions = {} + try: + finalized_versions = \ + _get_hed_xml_versions_one_folder(hed_one_library_url + hedxml_suffix) + _merge_in_versions(all_hed_versions, finalized_versions) + except urllib.error.URLError: + # Silently ignore ones without a hedxml section for now. + pass + try: + pre_release_folder_versions = \ + _get_hed_xml_versions_one_folder(hed_one_library_url + prerelease_suffix) + _merge_in_versions(all_hed_versions, pre_release_folder_versions) + except urllib.error.URLError: + # Silently ignore ones without a prerelease section for now. + pass + + ordered_versions = {} + for hed_library_name, hed_versions in all_hed_versions.items(): + ordered_versions1 = _sort_version_list(hed_versions) + ordered_versions2 = [(version, hed_versions[version]) for version in ordered_versions1] + ordered_versions[hed_library_name] = dict(ordered_versions2) + + return ordered_versions + + +def _get_hed_xml_versions_from_url_all_libraries(hed_base_library_url, library_name=None, skip_folders=DEFAULT_SKIP_FOLDERS): """ Get all available schemas and their hash values Parameters: - hed_base_url (str): A single GitHub API url to cache - library_name(str or None): If str, cache only the named library schemas + hed_base_library_url(str): A single GitHub API url to cache, which contains library schema folders + The subfolders should be a schema folder containing hedxml and/or prerelease folders. + library_name(str or None): If str, cache only the named library schemas. skip_folders (list): A list of sub folders to skip over when downloading. - get_libraries (bool): If True, return a dictionary of version numbers, with an entry for each library name. Returns: list or dict: List of version numbers or dictionary {library_name: [versions]}. @@ -300,46 +348,25 @@ def _get_hed_xml_versions_from_url(hed_base_url, library_name=None, - The directories on GitHub are of the form: https://api.github.com/repos/hed-standard/hed-schemas/contents/standard_schema/hedxml """ - url_request = make_url_request(hed_base_url) + url_request = make_url_request(hed_base_library_url) url_data = str(url_request.read(), 'utf-8') loaded_json = json.loads(url_data) all_hed_versions = {} for file_entry in loaded_json: if file_entry['type'] == "dir": - if hed_base_url.endswith(hedxml_suffix): - continue if file_entry['name'] in skip_folders: continue - try: - sub_folder_versions = \ - _get_hed_xml_versions_from_url(hed_base_url + "/" + file_entry['name'] + hedxml_suffix, - skip_folders=skip_folders, get_libraries=True) - except urllib.error.URLError: - # Silently ignore ones without a hedxml section for now. - continue - _merge_in_versions(all_hed_versions, sub_folder_versions) - expression_match = version_pattern.match(file_entry["name"]) - if expression_match is not None: - version = expression_match.group(3) - found_library_name = expression_match.group(2) - if not get_libraries and found_library_name != library_name: + found_library_name = file_entry['name'] + if library_name is not None and found_library_name != library_name: continue - if found_library_name not in all_hed_versions: - all_hed_versions[found_library_name] = {} - all_hed_versions[found_library_name][version] = file_entry["sha"], file_entry["download_url"] - - ordered_versions = {} - for hed_library_name, hed_versions in all_hed_versions.items(): - ordered_versions1 = _sort_version_list(hed_versions) - ordered_versions2 = [(version, hed_versions[version]) for version in ordered_versions1] - ordered_versions[hed_library_name] = dict(ordered_versions2) + single_library_versions = _get_hed_xml_versions_one_library(hed_base_library_url + "/" + found_library_name) + _merge_in_versions(all_hed_versions, single_library_versions) + continue - if get_libraries: - return ordered_versions - if library_name in ordered_versions: - return ordered_versions[library_name] - return {} + if library_name in all_hed_versions: + return all_hed_versions[library_name] + return all_hed_versions def _merge_in_versions(all_hed_versions, sub_folder_versions): @@ -393,12 +420,24 @@ def _safe_move_tmp_to_folder(temp_hed_xml_file, dest_filename): def _cache_hed_version(version, library_name, version_info, cache_folder): """Cache the given hed version""" - sha_hash, download_url = version_info + sha_hash, download_url, prerelease = version_info - possible_cache_filename = _create_xml_filename(version, library_name, cache_folder) + possible_cache_filename = _create_xml_filename(version, library_name, cache_folder, prerelease) local_sha_hash = _calculate_sha1(possible_cache_filename) if sha_hash == local_sha_hash: return possible_cache_filename return _cache_specific_url(download_url, possible_cache_filename) + + +def _cache_specific_url(hed_xml_url, cache_filename): + """Copies a specific url to the cache at the given filename""" + cache_folder = cache_filename.rpartition("/")[0] + os.makedirs(cache_folder, exist_ok=True) + temp_hed_xml_file = url_to_file(hed_xml_url) + if temp_hed_xml_file: + cache_filename = _safe_move_tmp_to_folder(temp_hed_xml_file, cache_filename) + os.remove(temp_hed_xml_file) + return cache_filename + return None diff --git a/hed/schema/hed_schema_io.py b/hed/schema/hed_schema_io.py index 7137bf02..23b2d40d 100644 --- a/hed/schema/hed_schema_io.py +++ b/hed/schema/hed_schema_io.py @@ -220,15 +220,22 @@ def _load_schema_version_sub(xml_version, schema_namespace="", xml_folder=None, f"Must specify a schema version by number, found no version on {xml_version} schema.", filename=name) try: + # 1. Try fully local copy final_hed_xml_file = hed_cache.get_hed_version_path(xml_version, library_name, xml_folder) if not final_hed_xml_file: hed_cache.cache_local_versions(xml_folder) + # 2. Cache the schemas included in hedtools and try local again final_hed_xml_file = hed_cache.get_hed_version_path(xml_version, library_name, xml_folder) hed_schema = load_schema(final_hed_xml_file, schema=schema, name=name) except HedFileError as e: if e.code == HedExceptions.FILE_NOT_FOUND: + # Cache all schemas if we haven't recently. hed_cache.cache_xml_versions(cache_folder=xml_folder) + # 3. See if we got a copy from online final_hed_xml_file = hed_cache.get_hed_version_path(xml_version, library_name, xml_folder) + # 4. Finally check for a pre-release one + if not final_hed_xml_file: + final_hed_xml_file = hed_cache.get_hed_version_path(xml_version, library_name, xml_folder, check_prerelease=True) if not final_hed_xml_file: raise HedFileError(HedExceptions.FILE_NOT_FOUND, f"HED version '{xml_version}' not found in cache: {hed_cache.get_cache_directory()}", diff --git a/hed/schema/schema_compliance.py b/hed/schema/schema_compliance.py index 703cc7cb..33b32409 100644 --- a/hed/schema/schema_compliance.py +++ b/hed/schema/schema_compliance.py @@ -8,6 +8,8 @@ get_allowed_characters_by_name, get_problem_indexes, validate_schema_description_new from hed.schema.schema_validation_util_deprecated import validate_schema_tag, validate_schema_description, verify_no_brackets from functools import partial +from hed.schema import hed_cache +from semantic_version import Version def check_compliance(hed_schema, check_for_warnings=True, name=None, error_handler=None): @@ -36,6 +38,7 @@ def check_compliance(hed_schema, check_for_warnings=True, name=None, error_handl name = hed_schema.filename error_handler.push_error_context(ErrorContext.FILE_NAME, name) + issues_list += validator.check_if_prerelease_version() issues_list += validator.check_prologue_epilogue() issues_list += validator.check_invalid_chars() issues_list += validator.check_attributes() @@ -85,6 +88,25 @@ def __init__(self, hed_schema, error_handler): self.error_handler = error_handler self._new_character_validation = hed_schema.schema_83_props + def check_if_prerelease_version(self): + issues = [] + libraries = self.hed_schema.library.split(",") + versions = self.hed_schema.version_number.split(",") + for library, version in zip(libraries, versions): + all_known_versions = hed_cache.get_hed_versions(library_name=library) + if "," not in library and not all_known_versions or Version(all_known_versions[0]) < Version(version): + issues += ErrorHandler.format_error(SchemaWarnings.SCHEMA_PRERELEASE_VERSION_USED, version, + all_known_versions) + + if self.hed_schema.with_standard: + all_known_versions = hed_cache.get_hed_versions() + if not all_known_versions or Version(all_known_versions[0]) < Version(self.hed_schema.with_standard): + issues += ErrorHandler.format_error(SchemaWarnings.SCHEMA_PRERELEASE_VERSION_USED, + self.hed_schema.with_standard, + all_known_versions) + self.error_handler.add_context_and_filter(issues) + return issues + def check_prologue_epilogue(self): issues = [] if self._new_character_validation: diff --git a/tests/schema/test_hed_schema_io.py b/tests/schema/test_hed_schema_io.py index bfd79371..bf8db95f 100644 --- a/tests/schema/test_hed_schema_io.py +++ b/tests/schema/test_hed_schema_io.py @@ -120,7 +120,8 @@ def test_verify_utf8_dupe(self): schema_path = os.path.join(base_dir, "schema_utf8_dupe.mediawiki") schema = load_schema(schema_path) issues = schema.check_compliance() - self.assertEqual(len(issues), 1) + # This can be 1 or 2, depending on if the "pre-release" warning shows up. + self.assertTrue(1 <= len(issues) <= 2) # Note it finds both of these as a duplicate self.assertTrue(schema.get_tag_entry("Wßord"))