From 0ffff5de2096f42679eee43fba9724e95aece7b8 Mon Sep 17 00:00:00 2001 From: Ekaterina Noskova Date: Fri, 19 Apr 2024 20:42:16 +0200 Subject: [PATCH] Add lower bounds for populations' splits, fix documentation about it --- docs/source/user_manual/installation.rst | 40 +++++- .../user_manual/set_model/set_model.rst | 7 +- .../set_model/set_model_struct.rst | 22 ++- example_params | 17 ++- gadma/cli/params_template | 15 +- gadma/cli/settings.py | 3 + gadma/cli/settings_storage.py | 131 ++++++++++++++++-- gadma/engines/dadi_moments_common.py | 50 +++---- gadma/models/demographic_model.py | 14 +- gadma/models/tree_demographic_model.py | 10 +- 10 files changed, 239 insertions(+), 70 deletions(-) diff --git a/docs/source/user_manual/installation.rst b/docs/source/user_manual/installation.rst index ed39f9e9..6db9a55d 100644 --- a/docs/source/user_manual/installation.rst +++ b/docs/source/user_manual/installation.rst @@ -17,18 +17,18 @@ GADMA requires the following dependencies (`requirements/minimal.txt`): * Python3 * NumPy (>= 1.2.0) * Scipy (>= 0.6.0) -* ruamel.yaml +* ruamel.yaml (<0.18.0) * ``dadi`` (>= 1.7.0) * ``moments`` (>= 1.0.0) * ``momi`` -* ``moments.LD`` (manual installation of ``moments`` with `--ld-extension` flag) +* ``moments.LD`` (is installed alongside with ``moments``) * nlopt (for ``dadi``) * Cython (for ``moments``) * mpmath (for ``moments``) To draw demographic models one should also install the following packages (`requirements/minimal.txt`): -* matplotlib (>= 0.98.1) +* matplotlib (>= 0.98.1, <3.5) * Pillow (>= 4.2.1) - optional * ``moments`` (>= 1.0.0) @@ -51,11 +51,10 @@ To run Bayesian optimization `smac` of specified version is requered (`requireme ``momi`` package sometimes is not installed correctly for Windows and MacOS. If ``momi`` is not available please install it manually following the installation instructions in `momi's manual `_. .. note:: - ``momentsLD`` - the extension of ``moments``, should be installed manually following the installation instructions in `moments's manual `_. + ``momentsLD`` - the extension of ``moments``, it is installed together with ``moments``. Getting help for engine installation ------------------------------------ - If there are some troubles installing the engine, please, first of all check the table below for the ability to install this engine on your system. You are always welcome to `open an issue `_ on GitHub for getting help. GADMA has automatic tests on GitHUb for engines on different systems (Linux, Windows, MacOS). The following table indicates (according to our tests) if engine could be installed on specified system: @@ -88,7 +87,7 @@ GADMA has automatic tests on GitHUb for engines on different systems (Linux, Win Installing the latest release ------------------------------ -The latest release of GADMA is easily installed via ``pip`` or ``conda`` (``bioconda``): +The latest release of GADMA can be easily installed via ``pip`` or ``conda`` (``bioconda``): .. code-block:: console @@ -114,6 +113,35 @@ or $ conda install -c bioconda moments +Troubleshooting +--------------- + +If you experience problems with dependencies, we recommend to create an empty `conda environment `_: + +.. code-block:: console + + $ conda create -n gadma_env python=3.10 + $ conda activate gadma_env + +It is possible to install versions that are used for testing by downloading file `minimal.txt` from `here `_ and install requirements using: + +.. code-block:: console + + $ pip install -r minimal.txt + $ pip install gadma + +For **MacOS with M processor** we suggest the following recipe (credit to `@Enricobazzi `_): + +.. code-block:: console + + $ pip install git+https://github.com/MomentsLD/moments.git + $ conda install -c conda-forge dadi + $ conda install -c conda-forge scikit-allel + $ pip install gadma + $ pip uninstall ruamel.yaml + $ pip install "ruamel.yaml<0.18.0" + $ pip uninstall matplotlib + $ pip install "matplotlib<3.5" Manual installation ----------------------------- diff --git a/docs/source/user_manual/set_model/set_model.rst b/docs/source/user_manual/set_model/set_model.rst index 90dbfb22..46e82568 100644 --- a/docs/source/user_manual/set_model/set_model.rst +++ b/docs/source/user_manual/set_model/set_model.rst @@ -23,7 +23,7 @@ Types of demographic models GADMA could infer two base types of demographic models: -* `Demographic model with structire `__ (up to 3 populations). It is a more flexible model type as dynamics of population size change could be inferred and it has a lot of options for parameters. +* `Demographic model with structure `__ (up to 3 populations). It is a more flexible model type as dynamics of population size change could be inferred and it has a lot of options for parameters. .. admonition:: Related options @@ -42,8 +42,11 @@ GADMA could infer two base types of demographic models: * ``Inbreeding`` infers inbreeding coefficients (only for ``dadi`` engine). * ``Selection`` infers selection coefficients. * ``Ancestral size as parameter`` disables multinomial approach of ``dadi`` and ``moments`` engines when ancestral size is inferred implicitly. + * ``Lower bound of first split`` limits lower bound of the most ancient split. * ``Upper bound of first split`` limits upper bound of the most ancient split. - * ``Upper bound of second split`` limits upper bounds of next to the most ancient split. + * ``Lower bound of second split`` limits lower bound of the next to the most ancient split. + * ``Upper bound of second split`` limits upper bound of the next to the most ancient split. + * `Custom demographic model `__. It is a usual user-specified model like in ``dadi``, ``moments`` and other tools for demographic inference. Using such a model will give more control over parameters and could be used for inference of more than 3 populations but is less flexible. diff --git a/docs/source/user_manual/set_model/set_model_struct.rst b/docs/source/user_manual/set_model/set_model_struct.rst index d4e0fc8a..bda666fb 100644 --- a/docs/source/user_manual/set_model/set_model_struct.rst +++ b/docs/source/user_manual/set_model/set_model_struct.rst @@ -200,16 +200,28 @@ Split could be set in two ways: # param file Split fractions: True # for 1) point -Upper bound of split -_____________________ +Upper and lower bounds of splits +________________________________ -To limit time of some split one should specify an option in the parameter file. Splits are numbered from the most ancient. So split 1 is split that occurred with the ancient population and split 2 is the next division of the second population (exist only for three populations). There are two appropriate options: ``Upper bound of first split`` and ``Upper bound of second split``. +It is possible to limit time of split events in the demographic model with structure. In order to do that one should specify one or multiple options in the parameter file that refer to lower and upper bounds of split events. Splits are numbered from the most ancient, so split 1 is a split event that occurred with the ancient population and split 2 is the next division of the second population (exist only for three populations). There are three options corresponded to split times: ``Lower bound of first split``, ``Upper bound of first split``, ``Lower bound of second split`` and ``Upper bound of second split``. -One should translate time from years into genetic units, therefore divide it by ``2 * T_g``, where ``T_g`` is time (in years) for one generation. For example, one wants to limit the last split to 2000 years. Time for one generation is estimated as 24 years, then one should specify in the parameter file: +Bounds should be specified in GENERATIONS. In order to translate time from years to generations, divide it by ``T_g``, where ``T_g`` is time (in years) for one generation. For example, assume we want the last split to be between 1000 and 2000 years. Time for one generation is estimated to be 24 years. Therefore we construct the following parameter file: .. code-block:: none # param_file ... - Upper bound of second split : 41.666 + Lower bound of second split : 41.666 + Upper bound of second split : 83.333 + ... + + It is allowed to set any of those four options, just make sure they make sense. It is possible to set only one bound or one lower and one upper bounds for different splits: + + .. code-block:: none + + # param_file + ... + # In that particular case upper bound for the second split exists automatically + Upper bound of first split : 30 + Lower bound of second split : 10 ... diff --git a/example_params b/example_params index 88378912..79d1ceee 100644 --- a/example_params +++ b/example_params @@ -12,7 +12,7 @@ # If it is resumed from other directory and output directory # isn't set, GADMA will add '_resumed' for previous output # directory. -Output directory: my_example_run +Output directory: my_example_run_2 #!!! @@ -224,22 +224,31 @@ Inbreeding: False # Default: False Ancestral size as parameter: False -# It is possible to limit the time of splits. +# It is possible to limit the time of splits by bounds' specification. # Split 1 is the most ancient split. -# !Note that time is in genetic units (2 * time for 1 generation): +# !Note that time is in generations: # e.g. we want to limit by 150 kya, time for one generation is -# 25 years, then bound will be 150000 / (2*25) = 3000. +# 25 years, then bound will be 150000 / 25 = 6000. +# +# Lower bound for split 1 (in case of 2 or 3 populations). +# Default: None +Lower bound of first split: Null # # Upper bound for split 1 (in case of 2 or 3 populations). # Default: None Upper bound of first split: Null +# Lower bound for split 2 (in case of 3 populations). +# Default: None +Lower bound of second split: Null +# # Upper bound for split 2 (in case of 3 populations). # Default: None Upper bound of second split: Null + #!!! # Local optimization. # diff --git a/gadma/cli/params_template b/gadma/cli/params_template index 8146f4d1..67091d55 100644 --- a/gadma/cli/params_template +++ b/gadma/cli/params_template @@ -224,22 +224,31 @@ Inbreeding : # Default: False Ancestral size as parameter : -# It is possible to limit the time of splits. +# It is possible to limit the time of splits by bounds' specification. # Split 1 is the most ancient split. -# !Note that time is in genetic units (2 * time for 1 generation): +# !Note that time is in generations: # e.g. we want to limit by 150 kya, time for one generation is -# 25 years, then bound will be 150000 / (2*25) = 3000. +# 25 years, then bound will be 150000 / 25 = 6000. +# +# Lower bound for split 1 (in case of 2 or 3 populations). +# Default: None +Lower bound of first split : # # Upper bound for split 1 (in case of 2 or 3 populations). # Default: None Upper bound of first split : +# Lower bound for split 2 (in case of 3 populations). +# Default: None +Lower bound of second split : +# # Upper bound for split 2 (in case of 3 populations). # Default: None Upper bound of second split : + #!!! # Local optimization. # diff --git a/gadma/cli/settings.py b/gadma/cli/settings.py index 1221ac6e..265e97ef 100644 --- a/gadma/cli/settings.py +++ b/gadma/cli/settings.py @@ -66,7 +66,10 @@ # Time bounds upper_bound_of_first_split = None +lower_bound_of_first_split = None upper_bound_of_second_split = None +lower_bound_of_second_split = None + # Glocal optimizer global_optimizer = "Genetic_algorithm" diff --git a/gadma/cli/settings_storage.py b/gadma/cli/settings_storage.py index 26728d73..5bf84f25 100644 --- a/gadma/cli/settings_storage.py +++ b/gadma/cli/settings_storage.py @@ -198,7 +198,9 @@ class SettingsStorage(object): _float_attrs = ['theta0', 'time_for_generation', 'eps', 'const_of_time_in_drawing', 'vmin', 'min_n', 'max_n', 'min_t', 'max_t', 'min_m', 'max_m', + 'lower_bound_of_first_split', 'upper_bound_of_first_split', + 'lower_bound_of_second_split', 'upper_bound_of_second_split', 'const_for_mutation_strength', 'const_for_mutation_rate', @@ -575,7 +577,30 @@ def __setattr__(self, name, value): continue if len(getattr(self, attr_name)) != value: raise ValueError(f"Length of {attr_name} should be equal " - f"to {self.number_of_populations}.") + f"to {value}.") + if value < 2: + if self.lower_bound_of_first_split is not None: + raise ValueError( + "Do not specify `Lower bound of first split` as there" + f" is no split for 1 population" + ) + if self.upper_bound_of_first_split is not None: + raise ValueError( + "Do not specify `Upper bound of first split` as there" + f" is no split for 1 population" + ) + if value < 3: + if self.lower_bound_of_second_split is not None: + raise ValueError( + "Do not specify `Lower bound of second split` as there" + f" is no second split for {value} populations" + ) + if self.upper_bound_of_second_split is not None: + raise ValueError( + "Do not specify `Upper bound of second split` as there" + f" is no second split for {value} populations" + ) + # 3.7 If we set fractions or size of generation then we create/update # GA options that depend on these values. elif name == 'fractions' or name == 'size_of_generation': @@ -1238,26 +1263,51 @@ def get_engine_args(self, engine_id=None): args = () return args - def get_linear_constrain_for_model(self, model): + def get_linear_constrains_for_model(self, model): """ - Returns linear constrain for model based of setted upper bound of - splits. NOT WORKING. + Returns linear constrain for model based of setted upper and lower + bounds of splits. MAYBE NOT WORKING """ if (self.upper_bound_of_first_split is None and - self.upper_bound_of_second_split is None): + self.lower_bound_of_first_split is None and + self.upper_bound_of_second_split is None and + self.lower_bound_of_second_split is None): return None A = list() ub = list() - if (self.upper_bound_of_first_split is not None): + lb = list() + # check for the first split + if (self.upper_bound_of_first_split is not None or + self.lower_bound_of_first_split is not None): A1, b1 = model.get_involved_for_split_time_vars(1) A.append(A1) - ub.append(self.upper_bound_of_first_split / 2 - b1) - if (self.upper_bound_of_second_split is not None): + if self.upper_bound_of_first_split is not None: + ub.append(self.upper_bound_of_first_split / 2 - b1) + else: + ub.append(np.inf) + if self.lower_bound_of_first_split is not None: + lb.append(self.lower_bound_of_first_split / 2 - b1) + else: + lb.append(-np.inf) + # check for the first split + if (self.upper_bound_of_second_split is not None or + self.lower_bound_of_second_split is not None): A2, b2 = model.get_involved_for_split_time_vars(2) A.append(A2) - ub.append(self.upper_bound_of_second_split / 2 - b2) - lb = - np.inf * np.ones(len(ub)) - return LinearConstrain(np.array(A), np.array(lb), np.array(ub)) + if self.upper_bound_of_second_split is not None: + ub.append(self.upper_bound_of_second_split / 2 - b2) + else: + ub.append(np.inf) + if self.lower_bound_of_second_split is not None: + lb.append(self.lower_bound_of_second_split / 2 - b2) + else: + lb.append(-np.inf) + constrains = [] + for a, l, u in zip(A, lb, ub): + constrains.append( + LinearConstrain(np.array(a), np.array(l), np.array(u)) + ) + return constrains def get_model(self): """ @@ -1294,8 +1344,8 @@ def get_model(self): recombination_rate=rec_rate, has_inbr=create_inbr ) - constrain = self.get_linear_constrain_for_model(model) - model.linear_constrain = constrain + constrains = self.get_linear_constrains_for_model(model) + model.linear_constrains = constrains return model elif ((self.custom_filename is not None or self.model_func is not None) and @@ -1687,6 +1737,61 @@ def is_valid(self): labels) = check_and_return_projections_and_labels( self.data_holder) + # check for lower and upper bounds of population splits + # Check that bounds are specified when they are allowed + if (hasattr(self, "number_of_populations") and + self.number_of_populations < 2): + if self.lower_bound_of_first_split is not None: + raise ValueError( + "Do not specify `Lower bound of first split` as there is" + f" no split for 1 population" + ) + if self.upper_bound_of_first_split is not None: + raise ValueError( + "Do not specify `Upper bound of first split` as there is" + f" no split for 1 population" + ) + + if (hasattr(self, "number_of_populations") and + self.number_of_populations < 3): + if self.lower_bound_of_second_split is not None: + raise ValueError( + "Do not specify `Lower bound of second split` as there" + f" is no second split for {self.number_of_populations}" + " populations" + ) + if self.upper_bound_of_second_split is not None: + raise ValueError( + "Do not specify `Upper bound of second split` as there" + f" is no second split for {self.number_of_populations}" + " populations" + ) + # Check that values are not contraversal + if (self.lower_bound_of_first_split is not None and + self.upper_bound_of_first_split is not None): + if (self.lower_bound_of_first_split > + self.upper_bound_of_first_split): + raise ValueError( + "`Lower bound of first split` should be less than `Upper" + " bound of first split`" + ) + if (self.lower_bound_of_second_split is not None and + self.upper_bound_of_second_split is not None): + if (self.lower_bound_of_second_split > + self.upper_bound_of_second_split): + raise ValueError( + "`Lower bound of second split` should be less than `Upper" + " bound of second split`" + ) + if (self.upper_bound_of_first_split is not None and + self.lower_bound_of_second_split is not None): + if (self.upper_bound_of_first_split >= + self.lower_bound_of_second_split): + raise ValueError( + "`Lower bound of second split` should be less than `Upper" + " bound of first split`" + ) + def print_citations(self): """ Prints neccessary citations according to current settings. diff --git a/gadma/engines/dadi_moments_common.py b/gadma/engines/dadi_moments_common.py index 27fca696..e69492ff 100644 --- a/gadma/engines/dadi_moments_common.py +++ b/gadma/engines/dadi_moments_common.py @@ -79,32 +79,32 @@ def _get_theta_from_sfs(self, values, model_sfs): model_sfs, self.data) Nanc = self.get_N_ancestral_from_theta(1.0) or 1.0 theta0 = 1 / Nanc - if self.model.linear_constrain is None: + if self.model.linear_constrains is None: return optimal_theta # If we have constrains we deal with them: - upper_lb = None - lower_ub = None - Ax = self.model.linear_constrain._get_value(values) - theta_lb = theta0 * self.model.linear_constrain.lb / Ax - theta_ub = theta0 * self.model.linear_constrain.ub / Ax - if np.any(theta_lb > theta_ub): - inds = np.where(theta_lb > theta_ub) - raise ValueError(f"Lower bounds for {inds} constrains in model" - f" are greater than upper bounds." - f"Please check linear constrain:\n" - f"{self.model.linear_constrain}\n" - f"and values:\n{values}.") - theta_upper_lb = max(theta_lb) - theta_lower_ub = min(theta_ub) - if theta_upper_lb > theta_lower_ub: - raise ValueError(f"Upper lower bound ({upper_lb}) in greater than" - f" lower upper bound ({lower_ub}). Please check " - f" linear constrain:\n" - f"{self.model.linear_constrain}\n" - f"and values:\n{values}.") - - optimal_theta = max(optimal_theta, theta_upper_lb) - optimal_theta = min(optimal_theta, theta_lower_ub) + for constrain in self.model.linear_constrains: + upper_lb = None + lower_ub = None + Ax = constrain._get_value(values) + theta_lb = theta0 * constrain.lb / Ax + theta_ub = theta0 * constrain.ub / Ax + if np.any(theta_lb > theta_ub): + inds = np.where(theta_lb > theta_ub) + raise ValueError( + f"The lower bounds for {inds} inds in the model constrain" + " are greater than the upper bounds. Please check the" + f"linear constrain:\n{constrain}\nand values:\n{values}" + ) + theta_upper_lb = max(theta_lb) + theta_lower_ub = min(theta_ub) + if theta_upper_lb > theta_lower_ub: + raise ValueError( + f"The upper lower bound ({upper_lb}) is greater than" + f"the lower upper bound ({lower_ub}). Please check the" + f" linear constrain:\n{constrain}\n and values:\n{values}" + ) + optimal_theta = max(optimal_theta, theta_upper_lb) + optimal_theta = min(optimal_theta, theta_lower_ub) return optimal_theta # for i, (lb, ub, val) in enumerate(zip(self.model.linear_constrain.lb, # self.model.linear_constrain.ub, @@ -260,7 +260,7 @@ def evaluate(self, values, grid_sizes): # Otherwise we continue and write failed results at the end # TODO: process it - if not self.multinom and self.model.linear_constrain is not None: + if not self.multinom and self.model.linear_constrains is not None: raise ValueError(f"{self.id} engine could not process constrains " "on demographic model parameters (bounds of time " "splits) in not-multinom mode.") diff --git a/gadma/models/demographic_model.py b/gadma/models/demographic_model.py index a223ac9d..c0f61b63 100644 --- a/gadma/models/demographic_model.py +++ b/gadma/models/demographic_model.py @@ -26,13 +26,13 @@ class DemographicModel(Model): they have multinom mode when this size is generated automatically from the rest of the parameters. :type has_anc_size: bool - :param linear_constrain: linear constrain on parameters. - :type linear_constrain: :class:`gadma.optimizers.LinearConstrain` + :param linear_constrains: list of linear constrains on parameters. + :type linear_constrains: list of :class:`gadma.optimizers.LinearConstrain` """ def __init__(self, gen_time=None, theta0=None, mutation_rate=None, recombination_rate=None, Nref=None, - has_anc_size=False, linear_constrain=None): + has_anc_size=False, linear_constrains=None): super(DemographicModel, self).__init__(raise_excep=False) self.gen_time = gen_time self.Nref = Nref # rescaling factor @@ -41,7 +41,7 @@ def __init__(self, gen_time=None, theta0=None, mutation_rate=None, self.recombination_rate = recombination_rate self.has_anc_size = has_anc_size self.fixed_vars = {} - self.linear_constrain = linear_constrain + self.linear_constrains = linear_constrains def add_variable(self, variable): """ @@ -180,7 +180,7 @@ class EpochDemographicModel(DemographicModel): def __init__(self, gen_time=None, theta0=None, mutation_rate=None, recombination_rate=None, Nref=None, - has_anc_size=None, Nanc_size=None, linear_constrain=None, + has_anc_size=None, Nanc_size=None, linear_constrains=None, inbreeding_args=None): if has_anc_size is None: has_anc_size = Nanc_size is not None @@ -193,7 +193,7 @@ def __init__(self, gen_time=None, theta0=None, mutation_rate=None, recombination_rate=recombination_rate, Nref=Nref, has_anc_size=has_anc_size, - linear_constrain=linear_constrain + linear_constrains=linear_constrains ) if not has_anc_size: assert Nanc_size is None, str(Nanc_size) @@ -538,7 +538,7 @@ def get_value(entity): recombination_rate=self.recombination_rate, gen_time=self.gen_time, theta0=self.theta0, - linear_constrain=self.linear_constrain + linear_constrains=self.linear_constrains ) if len(self.events) == 0: model.add_leaf( diff --git a/gadma/models/tree_demographic_model.py b/gadma/models/tree_demographic_model.py index d225a2ec..a4b45a4b 100644 --- a/gadma/models/tree_demographic_model.py +++ b/gadma/models/tree_demographic_model.py @@ -22,8 +22,8 @@ class TreeDemographicModel(DemographicModel): :type gen_time: float :param recombination_rate: Recombination rate per generation per base :type recombination_rate: float - :param linear_constrain: linear constrain on parameters. - :type linear_constrain: :class:`gadma.optimizers.LinearConstrain` + :param linear_constrains: list of linear constrains on parameters. + :type linear_constrains: list of :class:`gadma.optimizers.LinearConstrain` :note: Model works properly provided that you have only one ancestral population, otherwise model behavior is undefined. @@ -35,7 +35,7 @@ def __init__(self, mutation_rate=None, recombination_rate=None, theta0=None, - linear_constrain=None): + linear_constrains=None): self.events = list() self.rec_rate = recombination_rate self.gen_time = gen_time @@ -45,7 +45,7 @@ def __init__(self, theta0=theta0, mutation_rate=mutation_rate, recombination_rate=recombination_rate, - linear_constrain=linear_constrain + linear_constrains=linear_constrains ) def __eq__(self, other): @@ -337,7 +337,7 @@ def is_from_one_epoch(x, y): Nanc_size=Nanc_var, mutation_rate=self.mutation_rate, recombination_rate=self.recombination_rate, - linear_constrain=self.linear_constrain + linear_constrains=self.linear_constrains ) values = self.translate_values(