diff --git a/hstrat/__main__.py b/hstrat/__main__.py
index e84d9d0ba..802b468a6 100644
--- a/hstrat/__main__.py
+++ b/hstrat/__main__.py
@@ -4,11 +4,15 @@
 print(f"hstrat v{get_hstrat_version()}")
 print()
 print("Available commands (stabilized API):")
+print("$ python3 -m hstrat.dataframe.surface_build_tree")
 print("$ python3 -m hstrat.dataframe.surface_unpack_reconstruct")
 print("$ python3 -m hstrat.dataframe.surface_postprocess_trie")
 print()
 print("Available commands (experimental API):")
 print("$ python3 -m hstrat._auxiliary_lib._alifestd_as_newick_asexual")
+print(
+    "$ python3 -m hstrat._auxiliary_lib._alifestd_downsample_tips_asexual"
+)
 print(
     "$ python3 -m hstrat._auxiliary_lib._alifestd_try_add_ancestor_list_col"
 )
diff --git a/hstrat/_auxiliary_lib/_alifestd_downsample_tips_asexual.py b/hstrat/_auxiliary_lib/_alifestd_downsample_tips_asexual.py
index 9aca774f6..51a102d71 100644
--- a/hstrat/_auxiliary_lib/_alifestd_downsample_tips_asexual.py
+++ b/hstrat/_auxiliary_lib/_alifestd_downsample_tips_asexual.py
@@ -1,13 +1,24 @@
-import random
+import argparse
+import functools
+import logging
+import sys
 import typing
 
+from joinem._dataframe_cli import _add_parser_base, _run_dataframe_cli
+import numpy as np
 import pandas as pd
 
 from ._alifestd_find_leaf_ids import alifestd_find_leaf_ids
+from ._alifestd_has_contiguous_ids import alifestd_has_contiguous_ids
 from ._alifestd_prune_extinct_lineages_asexual import (
     alifestd_prune_extinct_lineages_asexual,
 )
 from ._alifestd_try_add_ancestor_id_col import alifestd_try_add_ancestor_id_col
+from ._configure_prod_logging import configure_prod_logging
+from ._delegate_polars_implementation import delegate_polars_implementation
+from ._format_cli_description import format_cli_description
+from ._get_hstrat_version import get_hstrat_version
+from ._log_context_duration import log_context_duration
 from ._with_rng_state_context import with_rng_state_context
@@ -17,8 +28,13 @@ def _alifestd_downsample_tips_asexual_impl(
 ) -> pd.DataFrame:
     """Implementation detail for alifestd_downsample_tips_asexual."""
     tips = alifestd_find_leaf_ids(phylogeny_df)
-    kept = random.sample(tips, min(n_downsample, len(tips)))
-    phylogeny_df["extant"] = phylogeny_df["id"].isin(kept)
+    kept = np.random.choice(tips, min(n_downsample, len(tips)), replace=False)
+    if alifestd_has_contiguous_ids(phylogeny_df):
+        extant = np.zeros(len(phylogeny_df), dtype=bool)
+        extant[kept] = True
+        phylogeny_df["extant"] = extant
+    else:
+        phylogeny_df["extant"] = phylogeny_df["id"].isin(kept)
 
     return alifestd_prune_extinct_lineages_asexual(
         phylogeny_df, mutate=True
@@ -31,8 +47,9 @@ def alifestd_downsample_tips_asexual(
     mutate: bool = False,
     seed: typing.Optional[int] = None,
 ) -> pd.DataFrame:
-    """Subsample phylogeny containing `num_tips` tips. If `num_tips` is greater
-    than the number of tips in the phylogeny, the whole phylogeny is returned.
+    """Create a subsample phylogeny containing `num_tips` tips. If `num_tips`
+    is greater than the number of tips in the phylogeny, the whole phylogeny is
+    returned.
 
     Only supports asexual phylogenies.
     """
@@ -56,3 +73,69 @@ def alifestd_downsample_tips_asexual(
     )
 
     return impl(phylogeny_df, n_downsample)
+
+
+_raw_description = """Create a subsample phylogeny containing `num_tips` tips.
+
+If `num_tips` is greater than the number of tips in the phylogeny, the whole phylogeny is returned.
+
+Data is assumed to be in alife standard format.
+Only supports asexual phylogenies.
+
+Additional Notes
+================
+- Requires 'ancestor_id' column to be present in input DataFrame.
+Otherwise, no action is taken.
+
+- Use `--eager-read` if modifying data file inplace.
+
+- This CLI entrypoint is experimental and may be subject to change.
+"""
+
+
+def _create_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        add_help=False,
+        description=format_cli_description(_raw_description),
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+    parser = _add_parser_base(
+        parser=parser,
+        dfcli_module="hstrat._auxiliary_lib._alifestd_downsample_tips_asexual",
+        dfcli_version=get_hstrat_version(),
+    )
+    parser.add_argument(
+        "-n",
+        default=sys.maxsize,
+        type=int,
+        help="Number of tips to subsample.",
+    )
+    parser.add_argument(
+        "--seed",
+        default=None,
+        dest="seed",
+        help="Integer seed for deterministic behavior.",
+        type=int,
+    )
+    return parser
+
+
+if __name__ == "__main__":
+    configure_prod_logging()
+
+    parser = _create_parser()
+    args, __ = parser.parse_known_args()
+    with log_context_duration(
+        "hstrat._auxiliary_lib._alifestd_downsample_tips_asexual", logging.info
+    ):
+        _run_dataframe_cli(
+            base_parser=parser,
+            output_dataframe_op=delegate_polars_implementation()(
+                functools.partial(
+                    alifestd_downsample_tips_asexual,
+                    n_downsample=args.n,
+                    seed=args.seed,
+                ),
+            ),
+            overridden_arguments="ignore",  # seed is overridden
+        )
diff --git a/hstrat/_auxiliary_lib/_alifestd_find_leaf_ids.py b/hstrat/_auxiliary_lib/_alifestd_find_leaf_ids.py
index a06b8b788..1cf553121 100644
--- a/hstrat/_auxiliary_lib/_alifestd_find_leaf_ids.py
+++ b/hstrat/_auxiliary_lib/_alifestd_find_leaf_ids.py
@@ -1,5 +1,3 @@
-import typing
-
 import numpy as np
 import ordered_set as ods
 import pandas as pd
@@ -9,7 +7,7 @@
 from ._alifestd_try_add_ancestor_id_col import alifestd_try_add_ancestor_id_col
 
 
-def alifestd_find_leaf_ids(phylogeny_df: pd.DataFrame) -> typing.List[int]:
+def alifestd_find_leaf_ids(phylogeny_df: pd.DataFrame) -> np.ndarray:
     """What ids are not listed in any `ancestor_list`?
 
     Input dataframe is not mutated by this operation.
@@ -20,15 +18,14 @@ def alifestd_find_leaf_ids(phylogeny_df: pd.DataFrame) -> typing.List[int]:
 
     if "ancestor_id" in phylogeny_df:
         # root is self ref, but must exclude to handle only-root phylo
-        internal_node_idxs = phylogeny_df.loc[
-            phylogeny_df["ancestor_id"] != phylogeny_df["id"],
-            "ancestor_id",
-        ].to_numpy()
+        internal_node_idxs = phylogeny_df["ancestor_id"].to_numpy()[
+            phylogeny_df["ancestor_id"] != phylogeny_df["id"]
+        ]
 
         leaf_pos_filter = np.ones(len(phylogeny_df), dtype=np.bool_)
         leaf_pos_filter[internal_node_idxs] = False
-        return phylogeny_df.loc[leaf_pos_filter, "id"].to_list()
+        return np.flatnonzero(leaf_pos_filter)
 
     all_ids = ods.OrderedSet(phylogeny_df["id"])
     internal_ids = (
@@ -50,4 +47,4 @@ def alifestd_find_leaf_ids(phylogeny_df: pd.DataFrame) -> typing.List[int]:
             ]
         )
     )
-    return list(all_ids - internal_ids)
+    return np.fromiter(all_ids - internal_ids, dtype=int)
diff --git a/hstrat/_auxiliary_lib/_alifestd_prune_extinct_lineages_asexual.py b/hstrat/_auxiliary_lib/_alifestd_prune_extinct_lineages_asexual.py
index d9d97d014..d666a00e9 100644
--- a/hstrat/_auxiliary_lib/_alifestd_prune_extinct_lineages_asexual.py
+++ b/hstrat/_auxiliary_lib/_alifestd_prune_extinct_lineages_asexual.py
@@ -3,8 +3,70 @@
 import numpy as np
 import pandas as pd
 
+from ._alifestd_has_contiguous_ids import alifestd_has_contiguous_ids
+from ._alifestd_is_topologically_sorted import alifestd_is_topologically_sorted
 from ._alifestd_try_add_ancestor_id_col import alifestd_try_add_ancestor_id_col
 from ._alifestd_unfurl_lineage_asexual import alifestd_unfurl_lineage_asexual
+from ._jit import jit
+from ._unfurl_lineage_with_contiguous_ids import (
+    unfurl_lineage_with_contiguous_ids,
+)
+
+
+def _create_has_extant_descendant_noncontiguous(
+    phylogeny_df: pd.DataFrame,
+    extant_mask: np.ndarray,
+) -> np.ndarray:
+    """Implementation detail for alifestd_prune_extinct_lineages_asexual."""
+
+    phylogeny_df["has_extant_descendant"] = False
+    for extant_id in phylogeny_df.loc[extant_mask, "id"]:
+        for lineage_id in alifestd_unfurl_lineage_asexual(
+            phylogeny_df,
+            int(extant_id),
+            mutate=True,
+        ):
+            if phylogeny_df.loc[lineage_id, "has_extant_descendant"]:
+                break
+
+            phylogeny_df.loc[lineage_id, "has_extant_descendant"] = True
+
+    return phylogeny_df["has_extant_descendant"]
+
+
+@jit(nopython=True)
+def _create_has_extant_descendant_contiguous(
+    ancestor_ids: np.ndarray,
+    extant_mask: np.ndarray,
+) -> np.ndarray:
+    """Implementation detail for alifestd_prune_extinct_lineages_asexual."""
+
+    has_extant_descendant = np.zeros_like(extant_mask)
+    for extant_id in np.flatnonzero(extant_mask):
+        for lineage_id in unfurl_lineage_with_contiguous_ids(
+            ancestor_ids,
+            int(extant_id),
+        ):
+            if has_extant_descendant[lineage_id]:
+                break
+
+            has_extant_descendant[lineage_id] = True
+
+    return has_extant_descendant
+
+
+@jit(nopython=True)
+def _create_has_extant_descendant_contiguous_sorted(
+    ancestor_ids: np.ndarray,
+    extant_mask: np.ndarray,
+) -> np.ndarray:
+    """Implementation detail for alifestd_prune_extinct_lineages_asexual."""
+
+    has_extant_descendant = extant_mask.copy()
+    for id_ in range(len(ancestor_ids) - 1, -1, -1):
+        has_extant_descendant[ancestor_ids[id_]] |= has_extant_descendant[id_]
+
+    return has_extant_descendant
 
 
 def alifestd_prune_extinct_lineages_asexual(
@@ -45,7 +107,10 @@ def alifestd_prune_extinct_lineages_asexual(
         phylogeny_df = phylogeny_df.copy()
 
     phylogeny_df = alifestd_try_add_ancestor_id_col(phylogeny_df, mutate=True)
-    phylogeny_df.set_index("id", drop=False, inplace=True)
+    if alifestd_has_contiguous_ids(phylogeny_df):
+        phylogeny_df.reset_index(drop=True, inplace=True)
+    else:
+        phylogeny_df.index = phylogeny_df["id"]
 
     extant_mask = None
     if "extant" in phylogeny_df:
@@ -58,22 +123,26 @@ def alifestd_prune_extinct_lineages_asexual(
     else:
         raise ValueError('Need "extant" or "destruction_time" column.')
 
-    phylogeny_df["has_extant_descendant"] = False
-
-    for extant_id in phylogeny_df.loc[extant_mask, "id"]:
-        for lineage_id in alifestd_unfurl_lineage_asexual(
+    if not alifestd_has_contiguous_ids(phylogeny_df):
+        has_extant_descendant = _create_has_extant_descendant_noncontiguous(
             phylogeny_df,
-            int(extant_id),
-            mutate=True,
-        ):
-            if phylogeny_df.loc[lineage_id, "has_extant_descendant"]:
-                break
-
-            phylogeny_df.loc[lineage_id, "has_extant_descendant"] = True
+            extant_mask,
+        )
+    elif not alifestd_is_topologically_sorted(phylogeny_df):
+        has_extant_descendant = _create_has_extant_descendant_contiguous(
+            phylogeny_df["ancestor_id"].to_numpy(dtype=np.uint64),
+            extant_mask.to_numpy(dtype=bool),
+        )
+    else:
+        has_extant_descendant = (
+            _create_has_extant_descendant_contiguous_sorted(
+                phylogeny_df["ancestor_id"].to_numpy(dtype=np.uint64),
+                extant_mask.to_numpy(dtype=bool),
+            )
+        )
 
-    drop_filter = ~phylogeny_df["has_extant_descendant"]
+    phylogeny_df = phylogeny_df[has_extant_descendant].reset_index(drop=True)
     phylogeny_df.drop(
-        phylogeny_df.index[drop_filter], inplace=True, axis="rows"
+        columns="has_extant_descendant", errors="ignore", inplace=True
     )
-    phylogeny_df.drop("has_extant_descendant", inplace=True, axis="columns")
-    return phylogeny_df.reset_index(drop=True)
+    return phylogeny_df
diff --git a/hstrat/_auxiliary_lib/_alifestd_try_add_ancestor_list_col.py b/hstrat/_auxiliary_lib/_alifestd_try_add_ancestor_list_col.py
index fbd79e297..1c92dbf85 100644
--- a/hstrat/_auxiliary_lib/_alifestd_try_add_ancestor_list_col.py
+++ b/hstrat/_auxiliary_lib/_alifestd_try_add_ancestor_list_col.py
@@ -74,6 +74,7 @@ def alifestd_try_add_ancestor_list_col(
 def _create_parser() -> argparse.ArgumentParser:
     """Create parser for CLI entrypoint."""
     parser = argparse.ArgumentParser(
+        add_help=False,
         description=format_cli_description(_raw_description),
         formatter_class=argparse.RawTextHelpFormatter,
     )
diff --git a/hstrat/_auxiliary_lib/_coerce_to_pandas.py b/hstrat/_auxiliary_lib/_coerce_to_pandas.py
index e656844ac..c2702623b 100644
--- a/hstrat/_auxiliary_lib/_coerce_to_pandas.py
+++ b/hstrat/_auxiliary_lib/_coerce_to_pandas.py
@@ -1,6 +1,7 @@
 import typing
 
 import pandas as pd
+import polars as pl
 
 _supported_iterables = tuple, set, list, frozenset
 _supported_mappings = dict
@@ -10,6 +11,9 @@ def coerce_to_pandas(obj: typing.Any, *, recurse: bool = False) -> typing.Any:
     """
     If a Polars type is detected, coerce it to corresponding Pandas type.
     """
+    if isinstance(obj, pl.LazyFrame):
+        obj = obj.collect()
+
     if hasattr(obj, "__dataframe__"):
         return pd.api.interchange.from_dataframe(obj, allow_copy=True)
     elif hasattr(obj, "to_pandas"):
diff --git a/hstrat/_auxiliary_lib/_delegate_polars_implementation.py b/hstrat/_auxiliary_lib/_delegate_polars_implementation.py
index 6d66e24a6..80e67743b 100644
--- a/hstrat/_auxiliary_lib/_delegate_polars_implementation.py
+++ b/hstrat/_auxiliary_lib/_delegate_polars_implementation.py
@@ -13,7 +13,9 @@
 from ._coerce_to_polars import coerce_to_polars
 from ._warn_once import warn_once
 
-DataFrame_T = typing.TypeVar("DataFrame_T", pd.DataFrame, pl.DataFrame)
+DataFrame_T = typing.TypeVar(
+    "DataFrame_T", pd.DataFrame, pl.DataFrame, pl.LazyFrame
+)
 Series_T = typing.TypeVar("Series_T", pd.Series, pl.Series)
 
 
@@ -25,7 +27,7 @@ def _detect_pandas(arg: typing.Any, recurse: bool) -> bool:
     """
     if isinstance(arg, (pd.DataFrame, pd.Series)):
         return True
-    elif isinstance(arg, (pl.DataFrame, pl.Series, str)):
+    elif isinstance(arg, (pl.DataFrame, pl.LazyFrame, pl.Series, str)):
         return False
     elif recurse and isinstance(arg, _supported_mappings):
         return any(_detect_pandas(v, recurse) for v in arg.values())
@@ -46,7 +48,7 @@ def _detect_polars(arg: typing.Any, recurse: bool) -> bool:
     If `recurse` is True, then this function will recursively check for
     Polars members in mappings and iterables.
     """
-    if isinstance(arg, (pl.DataFrame, pl.Series)):
+    if isinstance(arg, (pl.DataFrame, pl.LazyFrame, pl.Series)):
         return True
     elif isinstance(arg, (pd.DataFrame, pd.Series, str)):
         return False
@@ -95,7 +97,7 @@ def delegating_function(*args, **kwargs) -> typing.Any:
         any_pandas = any(map(detect_pandas_, (*args, *kwargs.values())))
         any_polars = any(map(detect_polars_, (*args, *kwargs.values())))
         logging.info("begin delgate_polars_implementation")
-        logging.info("- detected {any_pandas=} {any_polars=}")
+        logging.info(f"- detected {any_pandas=} {any_polars=}")
 
         if any_pandas and any_polars:
             raise TypeError("mixing pandas and polars types is disallowed")
diff --git a/hstrat/dataframe/surface_build_tree.py b/hstrat/dataframe/surface_build_tree.py
index 0adfc2a33..bee9a8654 100644
--- a/hstrat/dataframe/surface_build_tree.py
+++ b/hstrat/dataframe/surface_build_tree.py
@@ -95,6 +95,7 @@ def _create_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(
+        add_help=False,
         description=format_cli_description(raw_message),
         formatter_class=argparse.RawTextHelpFormatter,
     )
@@ -132,7 +133,7 @@ def _create_parser() -> argparse.ArgumentParser:
     args, __ = parser.parse_known_args()
 
     logging.info(
-        f"instantiating trie postprocess functor: "
+        "instantiating trie postprocess functor: "
         f"`{args.trie_postprocessor}`",
     )
     trie_postprocessor = eval(args.trie_postprocessor, {"hstrat": hstrat})
diff --git a/hstrat/dataframe/surface_postprocess_trie.py b/hstrat/dataframe/surface_postprocess_trie.py
index 0ee04038f..d73d80ec6 100644
--- a/hstrat/dataframe/surface_postprocess_trie.py
+++ b/hstrat/dataframe/surface_postprocess_trie.py
@@ -103,6 +103,7 @@ def _create_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(
+        add_help=False,
         description=format_cli_description(raw_message),
         formatter_class=argparse.RawTextHelpFormatter,
     )
@@ -134,7 +135,7 @@ def _create_parser() -> argparse.ArgumentParser:
     args, __ = parser.parse_known_args()
 
     logging.info(
-        f"instantiating trie postprocess functor: "
+        "instantiating trie postprocess functor: "
         f"`{args.trie_postprocessor}`",
     )
     trie_postprocessor = eval(args.trie_postprocessor, {"hstrat": hstrat})
diff --git a/hstrat/dataframe/surface_unpack_reconstruct.py b/hstrat/dataframe/surface_unpack_reconstruct.py
index 40a53d634..2c632f8c5 100644
--- a/hstrat/dataframe/surface_unpack_reconstruct.py
+++ b/hstrat/dataframe/surface_unpack_reconstruct.py
@@ -129,6 +129,7 @@ def _create_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(
+        add_help=False,
         description=format_cli_description(raw_message),
         formatter_class=argparse.RawTextHelpFormatter,
     )
diff --git a/pyproject.toml b/pyproject.toml
index 905462c1a..9e50556c2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,7 +24,7 @@ dependencies = [
   "downstream>=1.5.1",
   "iterpop>=0.3.4",
   "interval_search>=0.3.1",
-  "joinem>=0.9.1",
+  "joinem>=0.9.2",
   "keyname>=0.4.1",
   "lazy_loader>=0.4",
   "lru-dict>=1.1.7",
diff --git a/requirements-dev/py310/requirements-all.txt b/requirements-dev/py310/requirements-all.txt
index 46cce7a10..c4f818c51 100644
--- a/requirements-dev/py310/requirements-all.txt
+++ b/requirements-dev/py310/requirements-all.txt
@@ -131,7 +131,7 @@ jinja2==3.1.4
     # via
     #   nbconvert
     #   nbsphinx
     #   sphinx
-joinem==0.9.1
+joinem==0.9.2
     # via
     #   hstrat (../../pyproject.toml)
     #   downstream
diff --git a/requirements-dev/py310/requirements-docs.txt b/requirements-dev/py310/requirements-docs.txt
index 202aeb498..f36e4860d 100644
--- a/requirements-dev/py310/requirements-docs.txt
+++ b/requirements-dev/py310/requirements-docs.txt
@@ -102,7 +102,7 @@ jinja2==3.1.4
     # via
     #   nbconvert
     #   nbsphinx
     #   sphinx
-joinem==0.9.1
+joinem==0.9.2
     # via
     #   hstrat (../../pyproject.toml)
     #   downstream
diff --git a/requirements-dev/py310/requirements-jit.txt b/requirements-dev/py310/requirements-jit.txt
index 55ffd8dd3..95bdf1b13 100644
--- a/requirements-dev/py310/requirements-jit.txt
+++ b/requirements-dev/py310/requirements-jit.txt
@@ -52,7 +52,7 @@ iterpop==0.4.1
     # via
     #   hstrat (../../pyproject.toml)
     #   alifedata-phyloinformatics-convert
-joinem==0.9.1
+joinem==0.9.2
     # via
     #   hstrat (../../pyproject.toml)
     #   downstream
diff --git a/requirements-dev/py310/requirements-minimal.txt b/requirements-dev/py310/requirements-minimal.txt
index 3f15e0d67..4d1747460 100644
--- a/requirements-dev/py310/requirements-minimal.txt
+++ b/requirements-dev/py310/requirements-minimal.txt
@@ -48,7 +48,7 @@ iterpop==0.4.1
     # via
     #   hstrat (../../pyproject.toml)
     #   alifedata-phyloinformatics-convert
-joinem==0.9.1
+joinem==0.9.2
     # via
     #   hstrat (../../pyproject.toml)
     #   downstream
diff --git a/requirements-dev/py310/requirements-release.txt b/requirements-dev/py310/requirements-release.txt
index 14cc613d2..5b883f869 100644
--- a/requirements-dev/py310/requirements-release.txt
+++ b/requirements-dev/py310/requirements-release.txt
@@ -61,7 +61,7 @@ iterpop==0.4.1
    # via
    #   hstrat (../../pyproject.toml)
    #   alifedata-phyloinformatics-convert
-joinem==0.9.1
+joinem==0.9.2
    # via
    #   hstrat (../../pyproject.toml)
    #   downstream
diff --git a/requirements-dev/py310/requirements-testing.txt b/requirements-dev/py310/requirements-testing.txt
index a522e317f..0d1e7eab9 100644
--- a/requirements-dev/py310/requirements-testing.txt
+++ b/requirements-dev/py310/requirements-testing.txt
@@ -71,7 +71,7 @@ iterpop==0.4.1
    # via
    #   hstrat (../../pyproject.toml)
    #   alifedata-phyloinformatics-convert
-joinem==0.9.1
+joinem==0.9.2
    # via
    #   hstrat (../../pyproject.toml)
    #   downstream
diff --git a/requirements-dev/py311/requirements-all.txt b/requirements-dev/py311/requirements-all.txt
index 3097e0eb1..cc7be2bc7 100644
--- a/requirements-dev/py311/requirements-all.txt
+++ b/requirements-dev/py311/requirements-all.txt
@@ -129,7 +129,7 @@ jinja2==3.1.4
    # via
    #   nbconvert
    #   nbsphinx
    #   sphinx
-joinem==0.9.1
+joinem==0.9.2
    # via
    #   hstrat (../../pyproject.toml)
    #   downstream
diff --git a/requirements-dev/py311/requirements-docs.txt b/requirements-dev/py311/requirements-docs.txt
index ce902807f..04262e3a8 100644
--- a/requirements-dev/py311/requirements-docs.txt
+++ b/requirements-dev/py311/requirements-docs.txt
@@ -102,7 +102,7 @@ jinja2==3.1.4
    # via
    #   nbconvert
    #   nbsphinx
    #   sphinx
-joinem==0.9.1
+joinem==0.9.2
    # via
    #   hstrat (../../pyproject.toml)
    #   downstream
diff --git a/requirements-dev/py311/requirements-jit.txt b/requirements-dev/py311/requirements-jit.txt
index f8488bbe0..38cb5163d 100644
--- a/requirements-dev/py311/requirements-jit.txt
+++ b/requirements-dev/py311/requirements-jit.txt
@@ -52,7 +52,7 @@ iterpop==0.4.1
    # via
    #   hstrat (../../pyproject.toml)
    #   alifedata-phyloinformatics-convert
-joinem==0.9.1
+joinem==0.9.2
    # via
    #   hstrat (../../pyproject.toml)
    #   downstream
diff --git a/requirements-dev/py311/requirements-minimal.txt b/requirements-dev/py311/requirements-minimal.txt
index d883f78ca..f075a35bc 100644
--- a/requirements-dev/py311/requirements-minimal.txt
+++ b/requirements-dev/py311/requirements-minimal.txt
@@ -48,7 +48,7 @@ iterpop==0.4.1
    # via
    #   hstrat (../../pyproject.toml)
    #   alifedata-phyloinformatics-convert
-joinem==0.9.1
+joinem==0.9.2
    # via
    #   hstrat (../../pyproject.toml)
    #   downstream
diff --git a/requirements-dev/py311/requirements-release.txt b/requirements-dev/py311/requirements-release.txt
index 8aa8636e6..8d7df8eec 100644
--- a/requirements-dev/py311/requirements-release.txt
+++ b/requirements-dev/py311/requirements-release.txt
@@ -61,7 +61,7 @@ iterpop==0.4.1
    # via
    #   hstrat (../../pyproject.toml)
    #   alifedata-phyloinformatics-convert
-joinem==0.9.1
+joinem==0.9.2
    # via
    #   hstrat (../../pyproject.toml)
    #   downstream
diff --git a/requirements-dev/py311/requirements-testing.txt b/requirements-dev/py311/requirements-testing.txt
index 45b27978a..a4f32d5fa 100644
--- a/requirements-dev/py311/requirements-testing.txt
+++ b/requirements-dev/py311/requirements-testing.txt
@@ -69,7 +69,7 @@ iterpop==0.4.1
    # via
    #   hstrat (../../pyproject.toml)
    #   alifedata-phyloinformatics-convert
-joinem==0.9.1
+joinem==0.9.2
    # via
    #   hstrat (../../pyproject.toml)
    #   downstream
diff --git a/requirements-dev/py312/requirements-all.txt b/requirements-dev/py312/requirements-all.txt
index 3097e0eb1..cc7be2bc7 100644
--- a/requirements-dev/py312/requirements-all.txt
+++ b/requirements-dev/py312/requirements-all.txt
@@ -129,7 +129,7 @@ jinja2==3.1.4
    # via
    #   nbconvert
    #   nbsphinx
    #   sphinx
-joinem==0.9.1
+joinem==0.9.2
    # via
    #   hstrat (../../pyproject.toml)
    #   downstream
diff --git a/requirements-dev/py312/requirements-docs.txt b/requirements-dev/py312/requirements-docs.txt
index ce902807f..04262e3a8 100644
--- a/requirements-dev/py312/requirements-docs.txt
+++ b/requirements-dev/py312/requirements-docs.txt
@@ -102,7 +102,7 @@ jinja2==3.1.4
    # via
    #   nbconvert
    #   nbsphinx
    #   sphinx
-joinem==0.9.1
+joinem==0.9.2
    # via
    #   hstrat (../../pyproject.toml)
    #   downstream
diff --git a/requirements-dev/py312/requirements-jit.txt b/requirements-dev/py312/requirements-jit.txt
index 2ca2e7e5f..ada8d2cb0 100644
--- a/requirements-dev/py312/requirements-jit.txt
+++ b/requirements-dev/py312/requirements-jit.txt
@@ -52,7 +52,7 @@ iterpop==0.4.1
    # via
    #   hstrat (../../pyproject.toml)
    #   alifedata-phyloinformatics-convert
-joinem==0.9.1
+joinem==0.9.2
    # via
    #   hstrat (../../pyproject.toml)
    #   downstream
diff --git a/requirements-dev/py312/requirements-minimal.txt b/requirements-dev/py312/requirements-minimal.txt
index d883f78ca..f075a35bc 100644
--- a/requirements-dev/py312/requirements-minimal.txt
+++ b/requirements-dev/py312/requirements-minimal.txt
@@ -48,7 +48,7 @@ iterpop==0.4.1
    # via
    #   hstrat (../../pyproject.toml)
    #   alifedata-phyloinformatics-convert
-joinem==0.9.1
+joinem==0.9.2
    # via
    #   hstrat (../../pyproject.toml)
    #   downstream
diff --git a/requirements-dev/py312/requirements-release.txt b/requirements-dev/py312/requirements-release.txt
index 8aa8636e6..8d7df8eec 100644
--- a/requirements-dev/py312/requirements-release.txt
+++ b/requirements-dev/py312/requirements-release.txt
@@ -61,7 +61,7 @@ iterpop==0.4.1
    # via
    #   hstrat (../../pyproject.toml)
    #   alifedata-phyloinformatics-convert
-joinem==0.9.1
+joinem==0.9.2
    # via
    #   hstrat (../../pyproject.toml)
    #   downstream
diff --git a/requirements-dev/py312/requirements-testing.txt b/requirements-dev/py312/requirements-testing.txt
index 45b27978a..a4f32d5fa 100644
--- a/requirements-dev/py312/requirements-testing.txt
+++ b/requirements-dev/py312/requirements-testing.txt
@@ -69,7 +69,7 @@ iterpop==0.4.1
    # via
    #   hstrat (../../pyproject.toml)
    #   alifedata-phyloinformatics-convert
-joinem==0.9.1
+joinem==0.9.2
    # via
    #   hstrat (../../pyproject.toml)
    #   downstream
diff --git a/tests/test_hstrat/test_auxiliary_lib/test_alifestd_as_newick_asexual_cli.py b/tests/test_hstrat/test_auxiliary_lib/test_alifestd_as_newick_asexual_cli.py
index b8bda2b16..2c1467abc 100644
--- a/tests/test_hstrat/test_auxiliary_lib/test_alifestd_as_newick_asexual_cli.py
+++ b/tests/test_hstrat/test_auxiliary_lib/test_alifestd_as_newick_asexual_cli.py
@@ -7,6 +7,18 @@
 assets = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets")
 
 
+def test_alifestd_as_newick_asexual_cli_help():
+    subprocess.run(
+        [
+            "python3",
+            "-m",
+            "hstrat._auxiliary_lib._alifestd_as_newick_asexual",
+            "--help",
+        ],
+        check=True,
+    )
+
+
 def test_alifestd_as_newick_asexual_cli_version():
     subprocess.run(
         [
diff --git a/tests/test_hstrat/test_auxiliary_lib/test_alifestd_downsample_tips_asexual_cli.py b/tests/test_hstrat/test_auxiliary_lib/test_alifestd_downsample_tips_asexual_cli.py
new file mode 100644
index 000000000..d9acfaa87
--- /dev/null
+++ b/tests/test_hstrat/test_auxiliary_lib/test_alifestd_downsample_tips_asexual_cli.py
@@ -0,0 +1,67 @@
+import os
+import pathlib
+import subprocess
+
+assets = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets")
+
+
+def test_alifestd_downsample_tips_asexual_cli_help():
+    subprocess.run(
+        [
+            "python3",
+            "-m",
+            "hstrat._auxiliary_lib._alifestd_downsample_tips_asexual",
+            "--help",
+        ],
+        check=True,
+    )
+
+
+def test_alifestd_downsample_tips_asexual_cli_version():
+    subprocess.run(
+        [
+            "python3",
+            "-m",
+            "hstrat._auxiliary_lib._alifestd_downsample_tips_asexual",
+            "--version",
+        ],
+        check=True,
+    )
+
+
+def test_alifestd_downsample_tips_asexual_cli_csv():
+    output_file = "/tmp/hstrat_alifestd_downsample_tips_asexual.pqt"
+    pathlib.Path(output_file).unlink(missing_ok=True)
+    subprocess.run(
+        [
+            "python3",
+            "-m",
+            "hstrat._auxiliary_lib._alifestd_downsample_tips_asexual",
+            "-n",
+            "1",
+            output_file,
+        ],
+        check=True,
+        input=f"{assets}/example-standard-toy-asexual-phylogeny.csv".encode(),
+    )
+    assert os.path.exists(output_file)
+
+
+def test_alifestd_downsample_tips_asexual_cli_parquet():
+    output_file = "/tmp/hstrat_alifestd_downsample_tips_asexual.pqt"
+    pathlib.Path(output_file).unlink(missing_ok=True)
+    subprocess.run(
+        [
+            "python3",
+            "-m",
+            "hstrat._auxiliary_lib._alifestd_downsample_tips_asexual",
+            "-n",
+            "1",
+            "--seed",
+            "50_000_000",
+            output_file,
+        ],
+        check=True,
+        input=f"{assets}/example-standard-toy-asexual-phylogeny.csv".encode(),
+    )
+    assert os.path.exists(output_file)
diff --git a/tests/test_hstrat/test_auxiliary_lib/test_alifestd_find_leaf_ids.py b/tests/test_hstrat/test_auxiliary_lib/test_alifestd_find_leaf_ids.py
index a3f6929d7..b4d226a39 100644
--- a/tests/test_hstrat/test_auxiliary_lib/test_alifestd_find_leaf_ids.py
+++ b/tests/test_hstrat/test_auxiliary_lib/test_alifestd_find_leaf_ids.py
@@ -56,7 +56,7 @@ def test_alifestd_find_leaf_ids_empty(phylogeny_df, apply):
     phylogeny_df = phylogeny_df.copy()
     phylogeny_df = apply(phylogeny_df)
 
-    assert alifestd_find_leaf_ids(phylogeny_df.iloc[-1:0, :]) == []
+    assert alifestd_find_leaf_ids(phylogeny_df.iloc[-1:0, :]).tolist() == []
 
 
 @pytest.mark.parametrize(
@@ -100,7 +100,7 @@ def test_alifestd_find_leaf_ids_singleton(phylogeny_df, apply):
     phylogeny_df = apply(phylogeny_df)
     phylogeny_df.sort_values("id", ascending=True, inplace=True)
 
-    assert alifestd_find_leaf_ids(phylogeny_df.iloc[0:1, :]) == [
+    assert alifestd_find_leaf_ids(phylogeny_df.iloc[0:1, :]).tolist() == [
         phylogeny_df.iloc[0].at["id"]
     ]
 
@@ -120,14 +120,14 @@ def test_alifestd_find_leaf_ids_tworoots():
                 phylo2.iloc[0:1, :],
             ]
         )
-    ) == [phylo1.iloc[0].at["id"]] + [phylo2.iloc[0].at["id"]]
+    ).tolist() == [phylo1.iloc[0].at["id"]] + [phylo2.iloc[0].at["id"]]
 
 
 def test_alifestd_find_leaf_ids_empty2():
     phylo1 = pd.read_csv(f"{assets_path}/nk_ecoeaselection.csv")
-    assert alifestd_find_leaf_ids(phylo1[-1:0]) == []
+    assert alifestd_find_leaf_ids(phylo1[-1:0]).tolist() == []
     phylo1["ancestor_id"] = 0
-    assert alifestd_find_leaf_ids(phylo1[-1:0]) == []
+    assert alifestd_find_leaf_ids(phylo1[-1:0]).tolist() == []
 
 
 def _test_alifestd_find_leaf_ids_impl(phylogeny_df):
@@ -141,10 +141,10 @@ def _test_alifestd_find_leaf_ids_impl(phylogeny_df):
         ]
         leaf_ids.sort(key=phylogeny_df_.index.get_loc)
 
-        assert leaf_ids == alifestd_find_leaf_ids(phylogeny_df)
+        assert leaf_ids == alifestd_find_leaf_ids(phylogeny_df).tolist()
     else:
         # sexual phylogenies
-        leaf_ids = alifestd_find_leaf_ids(phylogeny_df)
+        leaf_ids = alifestd_find_leaf_ids(phylogeny_df).tolist()
         assert sorted(leaf_ids, key=phylogeny_df_.index.get_loc) == leaf_ids
 
     all_ids = set(phylogeny_df["id"])
diff --git a/tests/test_hstrat/test_auxiliary_lib/test_coerce_to_pandas.py b/tests/test_hstrat/test_auxiliary_lib/test_coerce_to_pandas.py
new file mode 100644
index 000000000..cf3efa74d
--- /dev/null
+++ b/tests/test_hstrat/test_auxiliary_lib/test_coerce_to_pandas.py
@@ -0,0 +1,37 @@
+import pandas as pd
+import polars as pl
+
+from hstrat._auxiliary_lib import coerce_to_pandas
+
+
+def test_coerce_to_pandas_polars_lazyframe():
+    lf = pl.LazyFrame({"a": [1, 2, 3]})
+    result = coerce_to_pandas(lf)
+
+    assert isinstance(result, pd.DataFrame)
+    pd.testing.assert_frame_equal(result, pd.DataFrame({"a": [1, 2, 3]}))
+
+
+def test_coerce_to_pandas_polars_dataframe():
+    df = pl.DataFrame({"x": [10, 20], "y": [30, 40]})
+    result = coerce_to_pandas(df)
+    assert isinstance(result, pd.DataFrame)
+    pd.testing.assert_frame_equal(
+        result, pd.DataFrame({"x": [10, 20], "y": [30, 40]})
+    )
+
+
+def test_coerce_to_pandas_recurse_iterable():
+    data = (pl.DataFrame({"a": [1, 2]}), 42, pl.DataFrame({"b": [3, 4]}))
+    result = coerce_to_pandas(data, recurse=True)
+    assert isinstance(result, tuple)
+    assert len(result) == 3
+    assert isinstance(result[0], pd.DataFrame)
+    assert result[1] == 42
+    assert isinstance(result[2], pd.DataFrame)
+
+
+def test_coerce_to_pandas_no_coercion_needed():
+    data = [1, 2, 3, "no-polars-here"]
+    result = coerce_to_pandas(data)
+    assert result == data
diff --git a/tests/test_hstrat/test_auxiliary_lib/test_delegate_polars_implementation.py b/tests/test_hstrat/test_auxiliary_lib/test_delegate_polars_implementation.py
index 0705c9dab..a491371e6 100644
--- a/tests/test_hstrat/test_auxiliary_lib/test_delegate_polars_implementation.py
+++ b/tests/test_hstrat/test_auxiliary_lib/test_delegate_polars_implementation.py
@@ -38,6 +38,15 @@ def dummy_func(
         coerce_to_polars(
             pd.read_csv(f"{assets_path}/nk_tournamentselection.csv")
         ),
+        coerce_to_polars(
+            pd.read_csv(f"{assets_path}/nk_ecoeaselection.csv")
+        ).lazy(),
+        coerce_to_polars(
+            pd.read_csv(f"{assets_path}/nk_lexicaseselection.csv")
+        ).lazy(),
+        coerce_to_polars(
+            pd.read_csv(f"{assets_path}/nk_tournamentselection.csv")
+        ).lazy(),
     ],
 )
 @pytest.mark.parametrize(
@@ -72,8 +81,10 @@ def test_coercion_and_error(
             dummy_func({"df": df}, [series], 1234)
     else:
         new_df, new_series, _ = dummy_func({"df": df}, [series], "asdf")
-        assert type(new_df) == type(df)
-        assert type(new_series) == type(series)
+        assert isinstance(new_df, type(df)) or isinstance(
+            new_df, type(df.collect())
+        )
+        assert isinstance(new_series, type(series))
 
 
 SignalException = type("", (Exception,), {})
diff --git a/tests/test_hstrat/test_dataframe/test_surface_build_tree_cli.py b/tests/test_hstrat/test_dataframe/test_surface_build_tree_cli.py
index 236149e78..624da6896 100644
--- a/tests/test_hstrat/test_dataframe/test_surface_build_tree_cli.py
+++ b/tests/test_hstrat/test_dataframe/test_surface_build_tree_cli.py
@@ -5,6 +5,18 @@
 assets = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets")
 
 
+def test_surface_build_tree_cli_help():
+    subprocess.run(
+        [
+            "python3",
+            "-m",
+            "hstrat.dataframe.surface_build_tree",
+            "--help",
+        ],
+        check=True,
+    )
+
+
 def test_surface_build_tree_cli_version():
     subprocess.run(
         [
@@ -18,7 +30,7 @@ def test_surface_build_tree_cli_version():
 
 
 def test_surface_build_tree_cli_csv():
-    output_file = "/tmp/hstrat_surface_build_tree.pqt"
+    output_file = "/tmp/hstrat_surface_build_tree.csv"
     pathlib.Path(output_file).unlink(missing_ok=True)
     subprocess.run(
         [
diff --git a/tests/test_hstrat/test_dataframe/test_surface_postprocess_trie_cli.py b/tests/test_hstrat/test_dataframe/test_surface_postprocess_trie_cli.py
index f60227b3d..38f56977c 100644
--- a/tests/test_hstrat/test_dataframe/test_surface_postprocess_trie_cli.py
+++ b/tests/test_hstrat/test_dataframe/test_surface_postprocess_trie_cli.py
@@ -5,6 +5,18 @@
 assets = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets")
 
 
+def test_surface_postprocess_trie_cli_help():
+    subprocess.run(
+        [
+            "python3",
+            "-m",
+            "hstrat.dataframe.surface_postprocess_trie",
+            "--help",
+        ],
+        check=True,
+    )
+
+
 def test_surface_postprocess_trie_cli_version():
     subprocess.run(
         [
diff --git a/tests/test_hstrat/test_dataframe/test_surface_unpack_reconstruct_cli.py b/tests/test_hstrat/test_dataframe/test_surface_unpack_reconstruct_cli.py
index 139510267..8e61ef3be 100644
--- a/tests/test_hstrat/test_dataframe/test_surface_unpack_reconstruct_cli.py
+++ b/tests/test_hstrat/test_dataframe/test_surface_unpack_reconstruct_cli.py
@@ -5,6 +5,18 @@
 assets = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets")
 
 
+def test_surface_unpack_reconstruct_cli_help():
+    subprocess.run(
+        [
+            "python3",
+            "-m",
+            "hstrat.dataframe.surface_unpack_reconstruct",
+            "--help",
+        ],
+        check=True,
+    )
+
+
 def test_surface_unpack_reconstruct_cli_version():
     subprocess.run(
         [
diff --git a/tests/test_hstrat/test_test_drive/test_descend_template_phylogeny_alifestd.py b/tests/test_hstrat/test_test_drive/test_descend_template_phylogeny_alifestd.py
index 6f4c54d4c..38762b119 100644
--- a/tests/test_hstrat/test_test_drive/test_descend_template_phylogeny_alifestd.py
+++ b/tests/test_hstrat/test_test_drive/test_descend_template_phylogeny_alifestd.py
@@ -107,7 +107,7 @@ def test_descend_template_phylogeny(
     )
     assert [n.id for n in sorted_leaf_nodes] == alifestd_find_leaf_ids(
         phylogeny_df
-    )
+    ).tolist()
     for extant_ids, sorted_extant_nodes in (
         (None, sorted_leaf_nodes),
        (map(lambda node: node.id, sampled_tree_nodes), sampled_tree_nodes),