Skip to content

Commit

Permalink
Merge pull request #901 from IanCa/develop
Browse files Browse the repository at this point in the history
Add support for caching prerelease schemas.
  • Loading branch information
VisLab authored Apr 5, 2024
2 parents 3eaaff5 + dff36c6 commit 12750cf
Show file tree
Hide file tree
Showing 6 changed files with 144 additions and 68 deletions.
2 changes: 2 additions & 0 deletions hed/errors/error_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,8 @@ class SchemaWarnings:
SCHEMA_NON_PLACEHOLDER_HAS_CLASS = 'SCHEMA_NON_PLACEHOLDER_HAS_CLASS'
SCHEMA_PROLOGUE_CHARACTER_INVALID = "SCHEMA_PROLOGUE_CHARACTER_INVALID"

SCHEMA_PRERELEASE_VERSION_USED = "SCHEMA_PRERELEASE_VERSION_USED"


class SchemaAttributeErrors:
SCHEMA_ATTRIBUTE_INVALID = 'SCHEMA_ATTRIBUTE_INVALID'
Expand Down
5 changes: 5 additions & 0 deletions hed/errors/schema_error_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@ def schema_error_unknown_attribute(attribute_name, source_tag):
f"or was used outside of it's defined class."


@hed_error(SchemaWarnings.SCHEMA_PRERELEASE_VERSION_USED, default_severity=ErrorSeverity.WARNING)
def schema_error_SCHEMA_PRERELEASE_VERSION_USED(current_version, known_versions):
    """Build the message for a schema version that is prerelease or unofficial.

    Parameters:
        current_version (str): The schema version that was actually used.
        known_versions (iterable of str): The versions to list in the message as known.

    Returns:
        str: The warning text, with the known versions joined by ", ".
    """
    known_versions_text = ', '.join(known_versions)
    return (f"Schema version {current_version} used, which is prerelease or unofficial. "
            f"Known versions are: {known_versions_text}")


@hed_error(SchemaWarnings.SCHEMA_PROLOGUE_CHARACTER_INVALID, default_severity=ErrorSeverity.WARNING,
actual_code=SchemaWarnings.SCHEMA_CHARACTER_INVALID)
def schema_error_invalid_character_prologue(char_index, source_string, section_name):
Expand Down
173 changes: 106 additions & 67 deletions hed/schema/hed_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,13 @@
HED_XML_PREFIX = 'HED'
HED_XML_EXTENSION = '.xml'
hedxml_suffix = "/hedxml" # The suffix for schema and library schema at the given urls
prerelease_suffix = "/prerelease" # The prerelease schemas at the given URLs

DEFAULT_HED_LIST_VERSIONS_URL = "https://api.github.com/repos/hed-standard/hed-schemas/contents/standard_schema/hedxml"
DEFAULT_HED_LIST_VERSIONS_URL = "https://api.github.com/repos/hed-standard/hed-schemas/contents/standard_schema"
LIBRARY_HED_URL = "https://api.github.com/repos/hed-standard/hed-schemas/contents/library_schemas"
DEFAULT_URL_LIST = (DEFAULT_HED_LIST_VERSIONS_URL, LIBRARY_HED_URL,)
DEFAULT_URL_LIST = (DEFAULT_HED_LIST_VERSIONS_URL,)
DEFAULT_LIBRARY_URL_LIST = (LIBRARY_HED_URL,)


DEFAULT_SKIP_FOLDERS = ('deprecated',)

Expand Down Expand Up @@ -62,14 +65,15 @@ def get_cache_directory():
return HED_CACHE_DIRECTORY


def get_hed_versions(local_hed_directory=None, library_name=None):
def get_hed_versions(local_hed_directory=None, library_name=None, check_prerelease=False):
""" Get the HED versions in the hed directory.
Parameters:
local_hed_directory (str): Directory to check for versions which defaults to hed_cache.
library_name (str or None): An optional schema library name.
None retrieves the standard schema only.
Pass "all" to retrieve all standard and library schemas as a dict.
check_prerelease (bool): If True, results can include prerelease schemas
Returns:
list or dict: List of version numbers or dictionary {library_name: [versions]}.
Expand All @@ -83,6 +87,8 @@ def get_hed_versions(local_hed_directory=None, library_name=None):

all_hed_versions = {}
local_directory = local_hed_directory
if check_prerelease and not local_directory.endswith(prerelease_suffix):
local_directory += prerelease_suffix
try:
hed_files = os.listdir(local_directory)
except FileNotFoundError:
Expand All @@ -104,26 +110,26 @@ def get_hed_versions(local_hed_directory=None, library_name=None):
return all_hed_versions


def get_hed_version_path(xml_version, library_name=None, local_hed_directory=None):
def get_hed_version_path(xml_version, library_name=None, local_hed_directory=None, check_prerelease=False):
""" Get HED XML file path in a directory. Only returns filenames that exist.
Parameters:
library_name (str or None): Optional the schema library name.
xml_version (str): Returns this version if it exists
local_hed_directory (str): Path to local hed directory. Defaults to HED_CACHE_DIRECTORY
check_prerelease(bool): Also check for prerelease schemas
Returns:
str: The path to the latest HED version the hed directory.
"""
if not local_hed_directory:
local_hed_directory = HED_CACHE_DIRECTORY

hed_versions = get_hed_versions(local_hed_directory, library_name)
hed_versions = get_hed_versions(local_hed_directory, library_name, check_prerelease)
if not hed_versions or not xml_version:
return None
if xml_version in hed_versions:
return _create_xml_filename(xml_version, library_name, local_hed_directory)
return _create_xml_filename(xml_version, library_name, local_hed_directory, check_prerelease)


def cache_local_versions(cache_folder):
Expand All @@ -148,11 +154,12 @@ def cache_local_versions(cache_folder):
return -1


def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, skip_folders=DEFAULT_SKIP_FOLDERS, cache_folder=None):
def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, hed_library_urls=DEFAULT_LIBRARY_URL_LIST, skip_folders=DEFAULT_SKIP_FOLDERS, cache_folder=None):
""" Cache all schemas at the given URLs.
Parameters:
hed_base_urls (str or list): Path or list of paths.
hed_base_urls (str or list): Path or list of paths. These should point to a single folder.
hed_library_urls (str or list): Path or list of paths. These should point to a folder containing library folders.
skip_folders (list): A list of subfolders to skip over when downloading.
cache_folder (str): The folder holding the cache.
Expand All @@ -170,8 +177,10 @@ def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, skip_folders=DEFAULT_SKIP
if not cache_folder:
cache_folder = HED_CACHE_DIRECTORY

if not isinstance(hed_base_urls, (list, tuple)):
if isinstance(hed_base_urls, str):
hed_base_urls = [hed_base_urls]
if isinstance(hed_library_urls, str):
hed_library_urls = [hed_library_urls]
os.makedirs(cache_folder, exist_ok=True)
last_timestamp = _read_last_cached_time(cache_folder)
current_timestamp = time.time()
Expand All @@ -182,12 +191,17 @@ def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, skip_folders=DEFAULT_SKIP
try:
cache_lock_filename = os.path.join(cache_folder, "cache_lock.lock")
with portalocker.Lock(cache_lock_filename, timeout=1):
all_hed_versions = {}
for hed_base_url in hed_base_urls:
all_hed_versions = _get_hed_xml_versions_from_url(hed_base_url, skip_folders=skip_folders,
get_libraries=True)
for library_name, hed_versions in all_hed_versions.items():
for version, version_info in hed_versions.items():
_cache_hed_version(version, library_name, version_info, cache_folder=cache_folder)
new_hed_versions = _get_hed_xml_versions_one_library(hed_base_url)
_merge_in_versions(all_hed_versions, new_hed_versions)
for hed_library_url in hed_library_urls:
new_hed_versions = _get_hed_xml_versions_from_url_all_libraries(hed_library_url, skip_folders=skip_folders)
_merge_in_versions(all_hed_versions, new_hed_versions)

for library_name, hed_versions in all_hed_versions.items():
for version, version_info in hed_versions.items():
_cache_hed_version(version, library_name, version_info, cache_folder=cache_folder)

_write_last_cached_time(current_timestamp, cache_folder)
except portalocker.exceptions.LockException or ValueError or URLError:
Expand All @@ -196,18 +210,6 @@ def cache_xml_versions(hed_base_urls=DEFAULT_URL_LIST, skip_folders=DEFAULT_SKIP
return 0


def _cache_specific_url(hed_xml_url, cache_filename):
"""Copies a specific url to the cache at the given filename"""
cache_folder = cache_filename.rpartition("/")[0]
os.makedirs(cache_folder, exist_ok=True)
temp_hed_xml_file = url_to_file(hed_xml_url)
if temp_hed_xml_file:
cache_filename = _safe_move_tmp_to_folder(temp_hed_xml_file, cache_filename)
os.remove(temp_hed_xml_file)
return cache_filename
return None


def _copy_installed_schemas_to_cache(cache_folder):
"""Copies the schemas from the install folder to the cache"""
installed_files = os.listdir(INSTALLED_CACHE_LOCATION)
Expand Down Expand Up @@ -264,12 +266,13 @@ def _check_if_url(hed_xml_or_url):
return False


def _create_xml_filename(hed_xml_version, library_name=None, hed_directory=None):
def _create_xml_filename(hed_xml_version, library_name=None, hed_directory=None, prerelease=False):
"""Returns the default file name format for the given version"""
prerelease_prefix = f"prerelease/" if prerelease else ""
if library_name:
hed_xml_basename = f"{HED_XML_PREFIX}_{library_name}_{hed_xml_version}{HED_XML_EXTENSION}"
hed_xml_basename = f"{prerelease_prefix}{HED_XML_PREFIX}_{library_name}_{hed_xml_version}{HED_XML_EXTENSION}"
else:
hed_xml_basename = HED_XML_PREFIX + hed_xml_version + HED_XML_EXTENSION
hed_xml_basename = prerelease_prefix + HED_XML_PREFIX + hed_xml_version + HED_XML_EXTENSION

if hed_directory:
hed_xml_filename = os.path.join(hed_directory, hed_xml_basename)
Expand All @@ -281,15 +284,60 @@ def _sort_version_list(hed_versions):
return sorted(hed_versions, key=Version, reverse=True)


def _get_hed_xml_versions_from_url(hed_base_url, library_name=None,
skip_folders=DEFAULT_SKIP_FOLDERS, get_libraries=False):
def _get_hed_xml_versions_one_folder(hed_folder_url):
    """ Collect the schema files listed directly inside a single remote folder.

    Parameters:
        hed_folder_url (str): URL whose response body is a JSON list of entries.
            Each entry is read for its 'type', 'name', 'sha' and 'download_url' keys.

    Returns:
        dict: {library_name: {version: (sha, download_url, is_prerelease)}}.
            is_prerelease is True when hed_folder_url ends with the prerelease suffix.

    Notes:
        - Entries of type "dir" and names not matching version_pattern are ignored.
    """
    response = make_url_request(hed_folder_url)
    folder_listing = json.loads(str(response.read(), 'utf-8'))
    # Every file found in this folder shares the same prerelease status.
    is_prerelease = hed_folder_url.endswith(prerelease_suffix)

    versions_by_library = {}
    for entry in folder_listing:
        if entry['type'] == "dir":
            continue
        name_match = version_pattern.match(entry["name"])
        if name_match is None:
            continue
        version = name_match.group(3)
        library = name_match.group(2)
        version_info = entry["sha"], entry["download_url"], is_prerelease
        versions_by_library.setdefault(library, {})[version] = version_info

    return versions_by_library


def _get_hed_xml_versions_one_library(hed_one_library_url):
    """ Gather the released and prerelease schema versions found under one schema folder.

    Parameters:
        hed_one_library_url (str): URL of a folder expected to hold hedxml and/or prerelease sub folders.

    Returns:
        dict: {library_name: {version: version_info}}, with each library's versions
            inserted in the order produced by _sort_version_list.

    Notes:
        - A urllib.error.URLError while reading either sub folder is ignored, so a
          missing hedxml or prerelease folder simply contributes nothing.
    """
    merged_versions = {}
    # Released schemas are gathered first, then the prerelease ones.
    for folder_suffix in (hedxml_suffix, prerelease_suffix):
        try:
            folder_versions = _get_hed_xml_versions_one_folder(hed_one_library_url + folder_suffix)
            _merge_in_versions(merged_versions, folder_versions)
        except urllib.error.URLError:
            # Silently ignore ones without this section for now.
            continue

    return {
        library: {version: versions[version] for version in _sort_version_list(versions)}
        for library, versions in merged_versions.items()
    }


def _get_hed_xml_versions_from_url_all_libraries(hed_base_library_url, library_name=None, skip_folders=DEFAULT_SKIP_FOLDERS):
""" Get all available schemas and their hash values
Parameters:
hed_base_url (str): A single GitHub API url to cache
library_name(str or None): If str, cache only the named library schemas
hed_base_library_url(str): A single GitHub API url to cache, which contains library schema folders
The subfolders should be a schema folder containing hedxml and/or prerelease folders.
library_name(str or None): If str, cache only the named library schemas.
skip_folders (list): A list of sub folders to skip over when downloading.
get_libraries (bool): If True, return a dictionary of version numbers, with an entry for each library name.
Returns:
list or dict: List of version numbers or dictionary {library_name: [versions]}.
Expand All @@ -300,46 +348,25 @@ def _get_hed_xml_versions_from_url(hed_base_url, library_name=None,
- The directories on GitHub are of the form:
https://api.github.com/repos/hed-standard/hed-schemas/contents/standard_schema/hedxml
"""
url_request = make_url_request(hed_base_url)
url_request = make_url_request(hed_base_library_url)
url_data = str(url_request.read(), 'utf-8')
loaded_json = json.loads(url_data)

all_hed_versions = {}
for file_entry in loaded_json:
if file_entry['type'] == "dir":
if hed_base_url.endswith(hedxml_suffix):
continue
if file_entry['name'] in skip_folders:
continue
try:
sub_folder_versions = \
_get_hed_xml_versions_from_url(hed_base_url + "/" + file_entry['name'] + hedxml_suffix,
skip_folders=skip_folders, get_libraries=True)
except urllib.error.URLError:
# Silently ignore ones without a hedxml section for now.
continue
_merge_in_versions(all_hed_versions, sub_folder_versions)
expression_match = version_pattern.match(file_entry["name"])
if expression_match is not None:
version = expression_match.group(3)
found_library_name = expression_match.group(2)
if not get_libraries and found_library_name != library_name:
found_library_name = file_entry['name']
if library_name is not None and found_library_name != library_name:
continue
if found_library_name not in all_hed_versions:
all_hed_versions[found_library_name] = {}
all_hed_versions[found_library_name][version] = file_entry["sha"], file_entry["download_url"]

ordered_versions = {}
for hed_library_name, hed_versions in all_hed_versions.items():
ordered_versions1 = _sort_version_list(hed_versions)
ordered_versions2 = [(version, hed_versions[version]) for version in ordered_versions1]
ordered_versions[hed_library_name] = dict(ordered_versions2)
single_library_versions = _get_hed_xml_versions_one_library(hed_base_library_url + "/" + found_library_name)
_merge_in_versions(all_hed_versions, single_library_versions)
continue

if get_libraries:
return ordered_versions
if library_name in ordered_versions:
return ordered_versions[library_name]
return {}
if library_name in all_hed_versions:
return all_hed_versions[library_name]
return all_hed_versions


def _merge_in_versions(all_hed_versions, sub_folder_versions):
Expand Down Expand Up @@ -393,12 +420,24 @@ def _safe_move_tmp_to_folder(temp_hed_xml_file, dest_filename):

def _cache_hed_version(version, library_name, version_info, cache_folder):
"""Cache the given hed version"""
sha_hash, download_url = version_info
sha_hash, download_url, prerelease = version_info

possible_cache_filename = _create_xml_filename(version, library_name, cache_folder)
possible_cache_filename = _create_xml_filename(version, library_name, cache_folder, prerelease)
local_sha_hash = _calculate_sha1(possible_cache_filename)

if sha_hash == local_sha_hash:
return possible_cache_filename

return _cache_specific_url(download_url, possible_cache_filename)


def _cache_specific_url(hed_xml_url, cache_filename):
    """ Copy a specific url to the cache at the given filename.

    Parameters:
        hed_xml_url (str): The url of the schema file to fetch.
        cache_filename (str): The destination path for the cached copy.

    Returns:
        str or None: The filename returned by _safe_move_tmp_to_folder,
            or None if url_to_file produced no temporary file.
    """
    # os.path.dirname honours the platform's path separators; splitting on "/" alone
    # yields an empty folder for backslash-only paths.
    cache_folder = os.path.dirname(cache_filename)
    if cache_folder:
        # A bare filename has no folder component, and os.makedirs("") would raise.
        os.makedirs(cache_folder, exist_ok=True)

    temp_hed_xml_file = url_to_file(hed_xml_url)
    if not temp_hed_xml_file:
        return None
    try:
        return _safe_move_tmp_to_folder(temp_hed_xml_file, cache_filename)
    finally:
        # Remove the temporary file even when the move into the cache fails.
        os.remove(temp_hed_xml_file)
7 changes: 7 additions & 0 deletions hed/schema/hed_schema_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,15 +220,22 @@ def _load_schema_version_sub(xml_version, schema_namespace="", xml_folder=None,
f"Must specify a schema version by number, found no version on {xml_version} schema.",
filename=name)
try:
# 1. Try fully local copy
final_hed_xml_file = hed_cache.get_hed_version_path(xml_version, library_name, xml_folder)
if not final_hed_xml_file:
hed_cache.cache_local_versions(xml_folder)
# 2. Cache the schemas included in hedtools and try local again
final_hed_xml_file = hed_cache.get_hed_version_path(xml_version, library_name, xml_folder)
hed_schema = load_schema(final_hed_xml_file, schema=schema, name=name)
except HedFileError as e:
if e.code == HedExceptions.FILE_NOT_FOUND:
# Cache all schemas if we haven't recently.
hed_cache.cache_xml_versions(cache_folder=xml_folder)
# 3. See if we got a copy from online
final_hed_xml_file = hed_cache.get_hed_version_path(xml_version, library_name, xml_folder)
# 4. Finally check for a pre-release one
if not final_hed_xml_file:
final_hed_xml_file = hed_cache.get_hed_version_path(xml_version, library_name, xml_folder, check_prerelease=True)
if not final_hed_xml_file:
raise HedFileError(HedExceptions.FILE_NOT_FOUND,
f"HED version '{xml_version}' not found in cache: {hed_cache.get_cache_directory()}",
Expand Down
22 changes: 22 additions & 0 deletions hed/schema/schema_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
get_allowed_characters_by_name, get_problem_indexes, validate_schema_description_new
from hed.schema.schema_validation_util_deprecated import validate_schema_tag, validate_schema_description, verify_no_brackets
from functools import partial
from hed.schema import hed_cache
from semantic_version import Version


def check_compliance(hed_schema, check_for_warnings=True, name=None, error_handler=None):
Expand Down Expand Up @@ -36,6 +38,7 @@ def check_compliance(hed_schema, check_for_warnings=True, name=None, error_handl
name = hed_schema.filename
error_handler.push_error_context(ErrorContext.FILE_NAME, name)

issues_list += validator.check_if_prerelease_version()
issues_list += validator.check_prologue_epilogue()
issues_list += validator.check_invalid_chars()
issues_list += validator.check_attributes()
Expand Down Expand Up @@ -85,6 +88,25 @@ def __init__(self, hed_schema, error_handler):
self.error_handler = error_handler
self._new_character_validation = hed_schema.schema_83_props

def check_if_prerelease_version(self):
    """ Warn when the schema uses a version newer than, or absent from, the known versions.

    Each library/version pair of the schema is compared against the versions reported by
    hed_cache.get_hed_versions; the same is done for the with_standard version, if set.

    Returns:
        list: The SCHEMA_PRERELEASE_VERSION_USED issues that were produced.

    Notes:
        - Assumes get_hed_versions returns the newest version first - TODO confirm.
    """
    issues = []
    library_names = self.hed_schema.library.split(",")
    version_numbers = self.hed_schema.version_number.split(",")
    for library, version in zip(library_names, version_numbers):
        known_versions = hed_cache.get_hed_versions(library_name=library)
        # Named separately so the grouping of the and/or test is explicit.
        nothing_known = "," not in library and not known_versions
        if nothing_known or Version(known_versions[0]) < Version(version):
            issues += ErrorHandler.format_error(SchemaWarnings.SCHEMA_PRERELEASE_VERSION_USED,
                                                version, known_versions)

    standard_version = self.hed_schema.with_standard
    if standard_version:
        known_standard_versions = hed_cache.get_hed_versions()
        if not known_standard_versions or Version(known_standard_versions[0]) < Version(standard_version):
            issues += ErrorHandler.format_error(SchemaWarnings.SCHEMA_PRERELEASE_VERSION_USED,
                                                standard_version, known_standard_versions)

    self.error_handler.add_context_and_filter(issues)
    return issues

def check_prologue_epilogue(self):
issues = []
if self._new_character_validation:
Expand Down
Loading

0 comments on commit 12750cf

Please sign in to comment.