From 8c8b78f71dc514abe444b6f6bf125aaf707c38ea Mon Sep 17 00:00:00 2001 From: Anke Koke Date: Wed, 27 Sep 2023 16:19:59 +0200 Subject: [PATCH 1/4] fix: remove job flag to prevent multiple mlflow runs --- niceml/dagster/jobs/jobs.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/niceml/dagster/jobs/jobs.py b/niceml/dagster/jobs/jobs.py index f36b4cfb..187116c8 100644 --- a/niceml/dagster/jobs/jobs.py +++ b/niceml/dagster/jobs/jobs.py @@ -1,5 +1,5 @@ """Module containing all dagster jobs""" -from dagster_mlflow import mlflow_tracking, end_mlflow_on_run_finished +from dagster_mlflow import mlflow_tracking from niceml.config.hydra import hydra_conf_mapping_factory from niceml.dagster.ops.analysis import analysis @@ -35,7 +35,6 @@ def job_data_generation(): df_normalization(current_data_location) -@end_mlflow_on_run_finished @job(config=hydra_conf_mapping_factory(), resource_defs={"mlflow": mlflow_tracking}) def job_train(): """Job for training an experiment""" @@ -54,7 +53,6 @@ def job_train(): exptests(exp_context) # pylint: disable=no-value-for-parameter -@end_mlflow_on_run_finished @job(config=hydra_conf_mapping_factory(), resource_defs={"mlflow": mlflow_tracking}) def job_eval(): """Job for evaluating experiment""" From 98f25df82a7275260f9588830cae727c2a21c0d5 Mon Sep 17 00:00:00 2001 From: Denis Stalz-John Date: Wed, 18 Oct 2023 10:21:26 +0200 Subject: [PATCH 2/4] fix: Replace lambda `class_extractor` in `DirClsDataInfoListing` with a private function because lambdas are not pickable (#85) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## 📥 Pull Request Description Replace lambda `class_extractor` in `DirClsDataInfoListing` with a private function because lambdas are not pickable ## 📝 Checklist Please make sure you've completed the following tasks before submitting this pull request: - [x] Pre-commit hooks were executed - [x] Changes have been reviewed by at least one other developer - [ ] Tests have been added or updated to cover the changes (only necessary if the changes affect the executable code) - [x] All tests ran successfully - [x] All merge conflicts are resolved - [ ] Documentation has been updated to reflect the changes - [ ] Any necessary migrations have been run --- niceml/data/datainfolistings/clsdatainfolisting.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/niceml/data/datainfolistings/clsdatainfolisting.py b/niceml/data/datainfolistings/clsdatainfolisting.py index 2a9eb1f3..59aba75a 100644 --- a/niceml/data/datainfolistings/clsdatainfolisting.py +++ b/niceml/data/datainfolistings/clsdatainfolisting.py @@ -31,12 +31,14 @@ def __init__( label_suffix: str = ".json", image_suffixes: Optional[List[str]] = None, ): + """Init method of LabelClsDataInfoListing""" self.sub_dir = sub_dir self.data_location = data_location self.label_suffix = label_suffix self.image_suffixes = image_suffixes or [".png", ".jpg", ".jpeg"] def list(self, data_description: DataDescription) -> List[ClsDataInfo]: + """Lists all data infos""" output_data_description: OutputVectorDataDescription = check_instance( data_description, OutputVectorDataDescription ) @@ -73,6 +75,11 @@ def list(self, data_description: DataDescription) -> List[ClsDataInfo]: return new_data_info_list +def _default_class_extractor(input_str: str) -> str: + """Default class extractor for DirClsDataInfoListing""" + return splitext(input_str)[0].rsplit("_", maxsplit=1)[-1] + + class DirClsDataInfoListing( DataInfoListing ): # pylint: disable=too-few-public-methods, too-many-arguments @@ -85,14 +92,14 @@ def __init__( class_extractor: Optional[Callable] = None, image_suffixes: Optional[List[str]] = None, ): + """Init method of DirClsDataInfoListing""" self.sub_dir = sub_dir self.location = location - self.class_extractor = class_extractor or ( - lambda x: splitext(x)[0].rsplit("_", maxsplit=1)[-1] - ) + self.class_extractor = class_extractor or _default_class_extractor self.image_suffixes = image_suffixes or [".png", ".jpg", ".jpeg"] def list(self, data_description: DataDescription) -> List[ClsDataInfo]: + """Lists all data infos""" output_data_description: OutputVectorDataDescription = check_instance( data_description, OutputVectorDataDescription ) From 61488b29cde38ecabda9c60188150724409ca757 Mon Sep 17 00:00:00 2001 From: Anke Koke <79906866+ankeko@users.noreply.github.com> Date: Wed, 8 Nov 2023 13:55:12 +0100 Subject: [PATCH 3/4] fix: Generate mkdocs graphs with multiple dependencies (#86) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## 📥 Pull Request Description Implemented option to generate pipeline graphs of dagster jobs with multiple op dependencies. ## 👀 Affected Areas Documentation ## 📝 Checklist Please make sure you've completed the following tasks before submitting this pull request: - [x] Pre-commit hooks were executed - [x] Changes have been reviewed by at least one other developer - [ ] Tests have been added or updated to cover the changes (only necessary if the changes affect the executable code) - [x] All tests ran successfully - [x] All merge conflicts are resolved - [ ] Documentation has been updated to reflect the changes - [ ] Any necessary migrations have been run --- niceml/mkdocs/mdgraph.py | 14 ++++++++++---- niceml/mkdocs/mdjob.py | 8 +++----- niceml/mkdocs/mdop.py | 8 +++----- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/niceml/mkdocs/mdgraph.py b/niceml/mkdocs/mdgraph.py index 872b20ee..f5e45bdb 100644 --- a/niceml/mkdocs/mdgraph.py +++ b/niceml/mkdocs/mdgraph.py @@ -1,14 +1,20 @@ """Module for generating a graph in mkdocs""" -from dagster import JobDefinition +from dagster import JobDefinition, DependencyDefinition, MultiDependencyDefinition def get_graph_md(job: JobDefinition) -> str: """Creates a graph as str with material for mkdocs""" deps = job.graph.dependencies graph_str = "" - for key, value in deps.items(): - for _, val2 in value.items(): - graph_str += f" {val2.node} --> {key.name};\n" + for node, node_dependencies in deps.items(): + for _, dependencies in node_dependencies.items(): + if isinstance(dependencies, DependencyDefinition): + graph_str += f" {dependencies.node} --> {node.name};\n" + elif isinstance(dependencies, MultiDependencyDefinition): + for dependency in dependencies.dependencies: + graph_str += f" {dependency.node} --> {node.name};\n" + else: + raise AttributeError("'dependencies' is not of expected type.") if len(graph_str) == 0: return "" diff --git a/niceml/mkdocs/mdjob.py b/niceml/mkdocs/mdjob.py index 858e4537..5d62e427 100644 --- a/niceml/mkdocs/mdjob.py +++ b/niceml/mkdocs/mdjob.py @@ -1,11 +1,9 @@ """Module for generating mkdocs str for jobs""" from typing import List -from dagster.core.definitions import NodeDefinition - from niceml.mkdocs.mdgraph import get_graph_md from niceml.mkdocs.mdop import get_md_op -from dagster import JobDefinition +from dagster import JobDefinition, OpDefinition def get_job_md(job: JobDefinition, include_graph: bool = True) -> str: @@ -17,13 +15,13 @@ def get_job_md(job: JobDefinition, include_graph: bool = True) -> str: graph_md = get_graph_md(job) if len(graph_md) > 0: job_md += graph_md + "\n\n" - op_list: List[NodeDefinition] = get_ops_from_job(job) + op_list: List[OpDefinition] = get_ops_from_job(job) for cur_op in op_list: job_md += get_md_op(cur_op) return job_md -def get_ops_from_job(job: JobDefinition) -> List[NodeDefinition]: +def get_ops_from_job(job: JobDefinition) -> List[OpDefinition]: """Returns all ops from job""" return job.all_node_defs diff --git a/niceml/mkdocs/mdop.py b/niceml/mkdocs/mdop.py index 92602a98..23f71195 100644 --- a/niceml/mkdocs/mdop.py +++ b/niceml/mkdocs/mdop.py @@ -1,13 +1,11 @@ """Module for generating markdown strings for dagster ops""" from typing import Dict, List -from dagster._core.definitions import NodeDefinition - from niceml.mkdocs.mdtable import get_md_table -from dagster import Field +from dagster import Field, OpDefinition -def get_md_op(op_def: NodeDefinition) -> str: +def get_md_op(op_def: OpDefinition) -> str: """generates markdown strings for dagster ops""" col_widths: List[int] = [80, 120] op_fields = get_op_fields(op_def) @@ -24,7 +22,7 @@ def get_md_op(op_def: NodeDefinition) -> str: return cur_md -def get_op_fields(op_def: NodeDefinition) -> Dict[str, Field]: +def get_op_fields(op_def: OpDefinition) -> Dict[str, Field]: """returns fields from OpDefinition""" try: return op_def.config_schema.config_type.fields From aafe59a160dd73d84697ddfa0ff26447139c32f5 Mon Sep 17 00:00:00 2001 From: Anke Koke <79906866+ankeko@users.noreply.github.com> Date: Wed, 8 Nov 2023 14:49:15 +0100 Subject: [PATCH 4/4] fix: Reload finished experiments in dashboard (#87) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## 📥 Pull Request Description If the dashboard was started during a running experiment, the experiment was visible in the dashboard but with incomplete information. Even after the experiment was finished, the Cache was not updated, so the experiment would still be incomplete in the dashboard. This issue was fixed by checking for the latest modification in the experiment files and updating the cache, if it was outdated. Additionaly, the test pipeline was fixed, by defining the execution sequence of tests. ## 👀 Affected Areas - dashboard - test pipeline ## 📝 Checklist Please make sure you've completed the following tasks before submitting this pull request: - [x] Pre-commit hooks were executed - [x] Changes have been reviewed by at least one other developer - [x] Tests have been added or updated to cover the changes (only necessary if the changes affect the executable code) - [x] All tests ran successfully - [x] All merge conflicts are resolved - [ ] Documentation has been updated to reflect the changes - [ ] Any necessary migrations have been run --------- Co-authored-by: Denis Stalz-John --- niceml/config/envconfig.py | 1 + niceml/dashboard/remotettrainutils.py | 14 +++--- niceml/experiments/experimentcontext.py | 43 +++++++++++++++-- niceml/experiments/experimentinfo.py | 10 +++- niceml/experiments/experimentmanager.py | 14 ++++++ niceml/experiments/localexperimentcache.py | 31 +++++++++++- niceml/experiments/schemas/yamlexpmember.py | 12 ++++- tests/end_to_end/components/__init__.py | 0 .../components/test_localexpcache.py | 47 +++++++++++++++++++ tests/end_to_end/conftest.py | 2 +- .../experiments/test_experimentcontext.py | 24 ++++++++++ 11 files changed, 183 insertions(+), 15 deletions(-) create mode 100644 tests/end_to_end/components/__init__.py create mode 100644 tests/end_to_end/components/test_localexpcache.py diff --git a/niceml/config/envconfig.py b/niceml/config/envconfig.py index c799a8ee..11c5b1a4 100644 --- a/niceml/config/envconfig.py +++ b/niceml/config/envconfig.py @@ -12,6 +12,7 @@ ENVIRONMENT_KEY = "ENVIRONMENT" DESCRIPTION_KEY = "DESCRIPTION" LOCAL_EXP_CACHE_PATH_KEY = "LOCAL_EXP_CACHE_PATH" +LAST_MODIFIED_KEY = "LAST_MODIFIED" def replace_id_keys(input_str: str, short_id: str, run_id: str) -> str: diff --git a/niceml/dashboard/remotettrainutils.py b/niceml/dashboard/remotettrainutils.py index 1429c6b8..f53ef78f 100644 --- a/niceml/dashboard/remotettrainutils.py +++ b/niceml/dashboard/remotettrainutils.py @@ -40,7 +40,7 @@ def select_to_load_exps( That means which are not in the experiment manager""" experiments_to_load = [] for exp_info in exp_info_list: - if exp_info not in exp_manager: + if exp_manager.is_exp_modified(exp_info.short_id, exp_info.last_modified): experiments_to_load.append(exp_info) return experiments_to_load @@ -66,7 +66,8 @@ def load_experiments( local_exp_cache: Optional[ExperimentCache] = None, ): """Load the experiments from the cloud storage and - stores them in the experiment manager. Additionally, they are saved in the local cache""" + stores them in the experiment manager. Additionally, they are saved in the local cache + """ experiments: List[ExperimentData] dir_info_list: List[str] = [] load_exp_info_list: List[ExperimentInfo] = [] @@ -78,7 +79,9 @@ def _check_and_load_cache( ) -> List[ExperimentData]: experiments_list = [] for cur_exp_info in exp_info_list: - if local_exp_cache is not None and cur_exp_info.short_id in local_exp_cache: + if local_exp_cache is not None and not local_exp_cache.should_reload( + cur_exp_info + ): initialized_df_loader: DfLoader = df_loader_factory.create_df_loader( storage, cur_exp_info.exp_filepath ) @@ -114,10 +117,7 @@ def _check_and_load_cache( ) if experiment is not None: experiments.append(experiment) - if ( - local_exp_cache is not None - and experiment.get_short_id() not in local_exp_cache - ): + if local_exp_cache is not None: local_exp_cache.save_experiment(experiment) prog_bar.progress(idx / load_exp_count) status_text.text(f"Cached {idx}/{load_exp_count} experiments") diff --git a/niceml/experiments/experimentcontext.py b/niceml/experiments/experimentcontext.py index 30540921..e81191c2 100644 --- a/niceml/experiments/experimentcontext.py +++ b/niceml/experiments/experimentcontext.py @@ -1,4 +1,5 @@ """Module for the ExperimentContext""" +import logging from dataclasses import dataclass from os.path import join from typing import Optional, Union @@ -7,6 +8,7 @@ from fsspec import AbstractFileSystem from PIL import Image +from niceml.config.envconfig import LAST_MODIFIED_KEY from niceml.config.hydra import instantiate_from_yaml from niceml.data.datadescriptions.datadescription import DataDescription from niceml.experiments.expfilenames import ExperimentFilenames, OpNames @@ -21,6 +23,7 @@ write_parquet, write_yaml, ) +from niceml.utilities.timeutils import generate_timestamp @dataclass @@ -36,6 +39,7 @@ def write_parquet( dataframe: pd.DataFrame, data_path: str, compression: Optional[str] = "gzip", + apply_last_modified: bool = True, **kwargs, ): """writes the dataframe as parquet file relative to the experiment""" @@ -48,6 +52,8 @@ def write_parquet( file_system=file_system, **kwargs, ) + if apply_last_modified: + self.update_last_modified() def read_parquet(self, data_path: str) -> pd.DataFrame: """reads the dataframe as parquet file relative to the experiment""" @@ -59,7 +65,9 @@ def read_yaml(self, data_path: str) -> dict: with open_location(self.fs_config) as (file_system, root_path): return read_yaml(join(root_path, data_path), file_system=file_system) - def write_yaml(self, data: dict, data_path: str, **kwargs): + def write_yaml( + self, data: dict, data_path: str, apply_last_modified: bool = True, **kwargs + ): """writes the yaml file relative to the experiment""" with open_location(self.fs_config) as (file_system, root_path): write_yaml( @@ -68,13 +76,21 @@ def write_yaml(self, data: dict, data_path: str, **kwargs): file_system=file_system, **kwargs, ) + if apply_last_modified: + self.update_last_modified() def read_csv(self, data_path: str) -> pd.DataFrame: """Reads a csv file relative to the experiment""" with open_location(self.fs_config) as (file_system, root_path): return read_csv(join(root_path, data_path), file_system=file_system) - def write_csv(self, data: pd.DataFrame, data_path: str, **kwargs): + def write_csv( + self, + data: pd.DataFrame, + data_path: str, + apply_last_modified: bool = True, + **kwargs, + ): """Writes a csv file relative to the experiment""" with open_location(self.fs_config) as (file_system, root_path): write_csv( @@ -83,11 +99,17 @@ def write_csv(self, data: pd.DataFrame, data_path: str, **kwargs): file_system=file_system, **kwargs, ) + if apply_last_modified: + self.update_last_modified() - def write_image(self, image: Image.Image, data_path: str): + def write_image( + self, image: Image.Image, data_path: str, apply_last_modified: bool = True + ): """Writes an image relative to the experiment""" with open_location(self.fs_config) as (file_system, root_path): write_image(image, join(root_path, data_path), file_system=file_system) + if apply_last_modified: + self.update_last_modified() def read_image(self, data_path: str) -> Image.Image: """Reads an image relative to the experiment""" @@ -114,3 +136,18 @@ def instantiate_datadescription_from_yaml(self) -> DataDescription: file_system=exp_fs, ) return data_description + + def update_last_modified(self, timestamp: Optional[str] = None): + """Updates the last modified timestamp of the experiment info""" + timestamp = timestamp or generate_timestamp() + try: + exp_info_dict = self.read_yaml(ExperimentFilenames.EXP_INFO) + exp_info_dict[LAST_MODIFIED_KEY] = timestamp + self.write_yaml( + exp_info_dict, ExperimentFilenames.EXP_INFO, apply_last_modified=False + ) + except FileNotFoundError: + logging.getLogger(__name__).warning( + "Could not update last modified timestamp, because the " + "experiment info file was not found." + ) diff --git a/niceml/experiments/experimentinfo.py b/niceml/experiments/experimentinfo.py index 18fd35f8..de85314d 100644 --- a/niceml/experiments/experimentinfo.py +++ b/niceml/experiments/experimentinfo.py @@ -15,6 +15,7 @@ EXP_TYPE_KEY, RUN_ID_KEY, SHORT_ID_KEY, + LAST_MODIFIED_KEY, ) from niceml.utilities.idutils import ALPHANUMERICLIST from niceml.utilities.ioutils import read_yaml @@ -34,6 +35,7 @@ class ExperimentInfo: description: str exp_dir: str exp_filepath: Optional[str] = None + last_modified: Optional[str] = None def as_save_dict(self) -> dict: """Returns a dictionary which can be saved to a yaml file""" @@ -46,8 +48,13 @@ def as_save_dict(self) -> dict: ENVIRONMENT_KEY: self.environment, DESCRIPTION_KEY: self.description, EXP_DIR_KEY: self.exp_dir, + LAST_MODIFIED_KEY: self.last_modified, } + def is_modified(self, other) -> bool: + """Checks if the other experiment info is modified""" + return self.last_modified != other.last_modified + def load_exp_info( exp_info_file, file_system: Optional[AbstractFileSystem] = None @@ -72,6 +79,7 @@ def experiment_info_factory(data: dict, path: Optional[str] = None) -> Experimen description=data.get(DESCRIPTION_KEY, ""), exp_dir=data.get(EXP_DIR_KEY, ""), exp_filepath=path, + last_modified=data.get(LAST_MODIFIED_KEY, None), ) @@ -91,7 +99,7 @@ def get_exp_id_from_name(input_name: str) -> str: f"ID not found anywhere starting with 'id_': {input_name}" ) cur_id = input_name[index + 3 : index + 7] - if len(cur_id) != 4: + if len(cur_id) != 4: # noqa: PLR2004 raise ExpIdNotFoundError(f"ID not complete: {input_name}") if any((x not in ALPHANUMERICLIST for x in cur_id)): raise ExpIdNotFoundError( diff --git a/niceml/experiments/experimentmanager.py b/niceml/experiments/experimentmanager.py index 66295155..85686834 100644 --- a/niceml/experiments/experimentmanager.py +++ b/niceml/experiments/experimentmanager.py @@ -36,6 +36,7 @@ def add_experiment(self, experiment: ExperimentData): self.exp_dict[experiment.get_run_id()] = experiment def __contains__(self, exp_id: Union[str, ExperimentInfo]): + """Checks if the experiment is in the manager""" if type(exp_id) == ExperimentInfo: exp_id = exp_id.short_id for experiment in self.experiments: @@ -99,6 +100,13 @@ def get_metrics(self, experiments: Optional[List[str]] = None) -> List[str]: return sorted(list(metric_set)) + def is_exp_modified(self, exp_id: str, new_time_str: str) -> bool: + """Checks if the experiment has been modified""" + if exp_id not in self.exp_dict: + return True + exp = self.get_exp_by_id(exp_id) + return exp.exp_info.is_modified(new_time_str) + def get_datasets(self) -> List[str]: """Returns a list of all datasets used in the experiments""" dataset_set: Set[str] = set() @@ -234,6 +242,7 @@ def get_metrics_visu_df( def get_value_information_dict( self, info_path: List[str], list_connection_str: str = "x" ) -> Dict[Any, List[str]]: + """Returns a dict with information about the values""" value_information_dict = defaultdict(list) for exp in self.experiments: try: @@ -254,6 +263,7 @@ def get_epochs_information_dict(self) -> Dict[int, List[str]]: return epochs_information_dict def get_datasets_information_dict(self) -> Dict[str, List[str]]: + """Returns a dict with information about the datasets""" datasets_information_dict = defaultdict(list) for exp in self.experiments: dataset = exp.get_experiment_path().split("/")[0] @@ -261,10 +271,12 @@ def get_datasets_information_dict(self) -> Dict[str, List[str]]: return datasets_information_dict def get_dataset(self, exp: ExperimentData) -> str: + """Returns the dataset of the given experiment""" dataset = exp.get_experiment_path().split("/")[0] return dataset def get_date_information_dict(self) -> Dict[date, List[str]]: + """Returns a dict with information about the dates""" date_information_dict = defaultdict(list) for exp in self.experiments: date_string = exp.exp_info.run_id.split("T")[0] @@ -273,6 +285,7 @@ def get_date_information_dict(self) -> Dict[date, List[str]]: return date_information_dict def get_experiment_type_information_dict(self) -> Dict[str, List[str]]: + """Returns a dict with information about the experiment types""" experiment_type_information_dict = defaultdict(list) for exp in self.experiments: experiment_type = exp.get_experiment_path().split("/")[-1].split("-")[0] @@ -300,6 +313,7 @@ def local_exp_manager_factory(path: str) -> ExperimentManager: def get_add_min_max(metric_name: str, mode_dict: Dict[str, str]) -> Tuple[bool, bool]: + """Returns if min and max should be added""" add_min: bool = True add_max: bool = True for key, mode in mode_dict.items(): diff --git a/niceml/experiments/localexperimentcache.py b/niceml/experiments/localexperimentcache.py index b338f611..50cea8ab 100644 --- a/niceml/experiments/localexperimentcache.py +++ b/niceml/experiments/localexperimentcache.py @@ -12,7 +12,11 @@ from niceml.data.storages.localstorage import LocalStorage from niceml.experiments.expdatastorageloader import create_expdata_from_storage from niceml.experiments.experimentdata import ExperimentData -from niceml.experiments.experimentinfo import ExpIdNotFoundError +from niceml.experiments.experimentinfo import ( + ExpIdNotFoundError, + ExperimentInfo, + load_exp_info, +) from niceml.experiments.expfilenames import ExperimentFilenames from niceml.utilities.ioutils import list_dir, write_parquet from niceml.utilities.regexutils import check_exp_name @@ -38,10 +42,21 @@ def load_experiment( ) -> ExperimentData: """Loads the experiment from the cache""" + @abstractmethod + def load_exp_info(self, exp_id: str) -> ExperimentInfo: + """Loads the experiment info from the cache""" + @abstractmethod def save_experiment(self, exp_data: ExperimentData): """Saves the experiment to the cache""" + def should_reload(self, experiment_info: ExperimentInfo) -> bool: + """Checks if the experiment should be reloaded""" + if experiment_info.short_id not in self: + return True + loaded_exp_info = self.load_exp_info(experiment_info.short_id) + return experiment_info.is_modified(loaded_exp_info) + def create_exp_file_df(exp_data: ExperimentData) -> pd.DataFrame: """Creates a dataframe with the experiment files""" @@ -100,11 +115,13 @@ class LocalExperimentCache(ExperimentCache): """Implementation of the ExperimentCache interface for local disk experiments.""" def __init__(self, store_folder: str): + """Factory method for the local experiment cache""" self.store_folder = store_folder if not isdir(self.store_folder): makedirs(self.store_folder, exist_ok=True) def __contains__(self, exp_id: str) -> bool: + """Checks if the experiment is in the cache""" try: self.find_folder_name_of_exp_id(exp_id) return True @@ -112,6 +129,7 @@ def __contains__(self, exp_id: str) -> bool: return False def get_exp_count_in_cache(self): + """Returns the amount of cached experiments""" return len(get_exp_folder_list(self.store_folder)) def find_folder_name_of_exp_id(self, exp_id) -> Optional[str]: @@ -122,6 +140,16 @@ def find_folder_name_of_exp_id(self, exp_id) -> Optional[str]: return folder_name raise ExpIdNotFoundError(f"Experiment with id: {exp_id} not in Cache") + def load_exp_info(self, exp_id: str) -> ExperimentInfo: + """Loads the experiment info from the cache""" + exp_folder = self.find_folder_name_of_exp_id(exp_id) + exp_info_file = join( + self.store_folder, exp_folder, ExperimentFilenames.EXP_INFO + ) + exp_info = load_exp_info(exp_info_file) + + return exp_info + def load_experiment( self, exp_id: str, @@ -136,6 +164,7 @@ def load_experiment( ) def save_experiment(self, exp_data: ExperimentData): + """Saves the experiment to the cache""" if self.store_folder is None: return self._create_output_folders(exp_data=exp_data) diff --git a/niceml/experiments/schemas/yamlexpmember.py b/niceml/experiments/schemas/yamlexpmember.py index ec8998dc..f461a336 100644 --- a/niceml/experiments/schemas/yamlexpmember.py +++ b/niceml/experiments/schemas/yamlexpmember.py @@ -19,6 +19,7 @@ def __init__( description: str, yaml_schema: Optional[schema.Schema] = None, ): + """Constructor of YamlMember""" super().__init__( path=path, required=required, @@ -28,6 +29,7 @@ def __init__( self.yaml_schema: Optional[schema.Schema] = yaml_schema def validate(self, exp_data: ExperimentData) -> bool: + """Validates the yaml file""" result = super().validate(exp_data) val_data = exp_data.get_loaded_yaml(self.path) val_result = self._validate_schema(val_data) @@ -47,16 +49,22 @@ class ExpInfoMember(YamlMember): """Specific member of the experiment containing the experiment info""" def __init__(self): + """Constructor of ExpInfoMember""" + short_id_len = 4 + run_id_len = 24 exp_schema = schema.Schema( { envc.EXP_NAME_KEY: str, envc.ENVIRONMENT_KEY: dict, envc.DESCRIPTION_KEY: str, envc.EXP_PREFIX_KEY: str, - envc.SHORT_ID_KEY: lambda val: isinstance(val, str) and len(val) == 4, - envc.RUN_ID_KEY: lambda val: isinstance(val, str) and len(val) == 24, + envc.SHORT_ID_KEY: lambda val: isinstance(val, str) + and len(val) == short_id_len, + envc.RUN_ID_KEY: lambda val: isinstance(val, str) + and len(val) == run_id_len, envc.EXP_TYPE_KEY: str, envc.EXP_DIR_KEY: str, + schema.Optional(envc.LAST_MODIFIED_KEY): str, } ) super().__init__( diff --git a/tests/end_to_end/components/__init__.py b/tests/end_to_end/components/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/end_to_end/components/test_localexpcache.py b/tests/end_to_end/components/test_localexpcache.py new file mode 100644 index 00000000..623d5912 --- /dev/null +++ b/tests/end_to_end/components/test_localexpcache.py @@ -0,0 +1,47 @@ +from tempfile import TemporaryDirectory + +import pytest + +from niceml.experiments.experimentdata import ExperimentData +from niceml.experiments.experimentinfo import ExperimentInfo +from niceml.experiments.localexperimentcache import ( + LocalExperimentCache, + ExperimentCache, +) +from niceml.utilities.timeutils import generate_timestamp + + +@pytest.fixture() +def local_exp_cache() -> ExperimentCache: + """Factory method for the local experiment cache""" + with TemporaryDirectory() as tmp_dir: + yield LocalExperimentCache(tmp_dir) + + +def test_localexperiment_cache( + load_eval_experiment: ExperimentData, local_exp_cache: ExperimentCache +): + assert load_eval_experiment is not None + + assert load_eval_experiment.get_short_id() not in local_exp_cache + assert local_exp_cache.get_exp_count_in_cache() == 0 + + local_exp_cache.save_experiment(load_eval_experiment) + assert local_exp_cache.get_exp_count_in_cache() == 1 + + assert load_eval_experiment.exp_info.short_id in local_exp_cache + + loaded_exp_info = local_exp_cache.load_exp_info(load_eval_experiment.get_short_id()) + + assert isinstance(loaded_exp_info, ExperimentInfo) + assert loaded_exp_info.short_id == load_eval_experiment.get_short_id() + + loaded_exp = local_exp_cache.load_experiment(load_eval_experiment.get_short_id()) + assert isinstance(loaded_exp, ExperimentData) + assert loaded_exp.get_short_id() == load_eval_experiment.get_short_id() + + assert not local_exp_cache.should_reload(loaded_exp_info) + + # change timestamp + loaded_exp_info.last_modified = generate_timestamp() + assert local_exp_cache.should_reload(loaded_exp_info) diff --git a/tests/end_to_end/conftest.py b/tests/end_to_end/conftest.py index f4c3a5a5..d5650b50 100644 --- a/tests/end_to_end/conftest.py +++ b/tests/end_to_end/conftest.py @@ -24,7 +24,7 @@ def tmp_dir() -> str: @pytest.fixture(scope="session") -def trainregression(tmp_dir: str): +def trainregression(tmp_dir: str, generate_number_data: str): os.environ["EXPERIMENT_URI"] = tmp_dir os.environ["EPOCHS"] = "1" config_path = ( diff --git a/tests/unit/niceml/experiments/test_experimentcontext.py b/tests/unit/niceml/experiments/test_experimentcontext.py index 4e6f09ae..53911744 100644 --- a/tests/unit/niceml/experiments/test_experimentcontext.py +++ b/tests/unit/niceml/experiments/test_experimentcontext.py @@ -6,7 +6,10 @@ import pytest from PIL import Image +from niceml.config.envconfig import LAST_MODIFIED_KEY from niceml.experiments.experimentcontext import ExperimentContext +from niceml.experiments.experimentdata import ExperimentData +from niceml.experiments.expfilenames import ExperimentFilenames from niceml.utilities.fsspec.locationutils import LocationConfig @@ -77,3 +80,24 @@ def test_read_write_image(experiment_context, exp_tmp_dir): result = experiment_context.read_image(image_path) result_np = np.array(result) assert np.array_equal(np_image, result_np) + + +def test_update_last_modified( + experiment_context: ExperimentContext, + exp_tmp_dir: str, + experiment_data: ExperimentData, +): + # Generate a timestamp to be used for updating last modified + timestamp = "2022-01-01T00:00:00" + + experiment_context.write_yaml( + experiment_data.exp_info.as_save_dict(), ExperimentFilenames.EXP_INFO + ) + # Call the update_last_modified method + experiment_context.update_last_modified(timestamp) + + # Read the updated experiment info file + updated_exp_info = experiment_context.read_yaml(ExperimentFilenames.EXP_INFO) + + # Assert the last modified timestamp is updated + assert updated_exp_info[LAST_MODIFIED_KEY] == timestamp