Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Debug h5ad saving #580

Merged
merged 23 commits into from
Nov 16, 2023
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 78 additions & 0 deletions dynamo/data_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
from functools import reduce

import pandas as pd
from anndata import (
AnnData,
read,
Expand All @@ -19,6 +20,7 @@
from tqdm import tqdm

from .dynamo_logger import main_info
from .tools.Markov import KernelMarkovChain


def make_dir(path: str, can_exist=True):
Expand Down Expand Up @@ -80,6 +82,22 @@ def convert2float(adata, columns, var=False):
adata.obs[i] = data.copy()


def convert_velocity_params_type(adata: AnnData) -> None:
    """Convert the velocity parameters into numeric type to avoid error when saving to h5ad.

    The data type of the parameters in the Pandas Series is set to object due to the initialization of parameters with
    None values. However, this choice of data type can lead to errors during the process of saving the anndata object.
    Therefore, it becomes necessary to convert these parameters to a numeric data type to ensure proper handling and
    compatibility.

    Args:
        adata: the AnnData object whose velocity-parameter columns in ``.var`` are coerced in place.
    """
    # Fixed: "alpha_a" was listed twice in the original list; the duplicate is removed.
    # Values that cannot be parsed as numbers (e.g. the None placeholders) become NaN
    # because of errors="coerce", which h5ad can serialize.
    velocity_param_names = [
        "beta", "gamma", "half_life", "alpha_b", "alpha_r2", "gamma_b", "gamma_r2", "gamma_logLL",
        "delta_b", "delta_r2", "bs", "bf", "uu0", "ul0", "su0", "sl0", "alpha", "a", "b",
        "alpha_a", "alpha_i", "cost", "logLL", "U0", "S0", "total0",
    ]
    for param_name in velocity_param_names:
        if param_name in adata.var.columns:
            adata.var[param_name] = pd.to_numeric(adata.var[param_name], errors='coerce')


def load_NASC_seq(dir, type="TPM", delimiter="_", colnames=None, dropna=False):
"""Function to create an anndata object from NASC-seq pipeline

Expand Down Expand Up @@ -334,3 +352,63 @@ def export_rank_xlsx(adata, path="rank_info.xlsx", ext="excel", rank_prefix="ran
if key[: len(rank_prefix)] == rank_prefix:
main_info("saving sheet: " + str(key))
adata.uns[key].to_excel(writer, sheet_name=str(key))


def export_kmc(adata: AnnData) -> None:
    """Save the parameters of kmc and delete the kmc object from anndata."""
    kmc = adata.uns["kmc"]
    # Store the plain (serializable) attributes of the KernelMarkovChain so the
    # object itself does not have to be written to h5ad.
    attr_names = ("P", "Idx", "eignum", "D", "U", "W", "W_inv", "Kd")
    adata.uns["kmc_params"] = {name: getattr(kmc, name) for name in attr_names}
    # Drop the non-serializable object only after its parameters were captured.
    adata.uns.pop("kmc")
Comment on lines +341 to +354
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not add this to the place where kmc is saved to adata.uns?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this way, we can keep a kmc object when we run the Dynamo analysis. If we need it, we just read it from the adata.uns.

Umap object uses a different saving strategy, which means umap parameters will be saved instead of the object itself. My idea is that it is possible to avoid creating a umap in the pipeline even if we run the umap dimension reduction. Unless the user wants to perform inverse transform in a specific analysis like fate, they don't need this Umap instance. While the creation of KMC is inevitable if the user enables the kmc method in tl.cell_velocities. Since we have already created KMC, we can keep it for future usage until saving to h5ad.



def import_kmc(adata: AnnData) -> None:
    """Construct the kmc object using the parameters saved."""
    params = adata.uns["kmc_params"]
    # Rebuild the KernelMarkovChain from its saved constructor arguments, then
    # restore the remaining fitted attributes one by one.
    kmc = KernelMarkovChain(P=params["P"], Idx=params["Idx"])
    for attr in ("eignum", "D", "U", "W", "W_inv", "Kd"):
        setattr(kmc, attr, params[attr])
    adata.uns["kmc"] = kmc
    # Remove the raw parameter dict only after the object was reconstructed.
    adata.uns.pop("kmc_params")


def export_h5ad(adata: AnnData, path: str = "data/processed_data.h5ad") -> None:
    """Export the anndata object to h5ad.

    Prepares the object for serialization before writing: coerces object-typed
    velocity parameters to numeric, replaces the KMC object with its plain
    parameters, and converts list-valued fate results ("prediction" and "t")
    into str-keyed dicts, since h5ad cannot store ragged lists of arrays.

    Args:
        adata: the AnnData object to save; it is modified in place before writing.
        path: the target h5ad file path.
    """
    convert_velocity_params_type(adata)
    if "kmc" in adata.uns.keys():
        export_kmc(adata)
    # Idiom fix: filter directly instead of building a list padded with None.
    fate_keys = [key for key in adata.uns_keys() if key.startswith("fate")]
    for key in fate_keys:
        for field in ("prediction", "t"):
            if field in adata.uns[key].keys():
                # Keys are the stringified list indices so import_h5ad can restore order.
                adata.uns[key][field] = {str(index): array for index, array in enumerate(adata.uns[key][field])}
    adata.write_h5ad(path)


def import_h5ad(path: str = "data/processed_data.h5ad") -> AnnData:
    """Import a Dynamo h5ad object into anndata.

    Reverses the transformations applied by `export_h5ad`: rebuilds the KMC
    object from its saved parameters and converts the str-keyed fate dicts
    ("prediction" and "t") back into lists.

    Args:
        path: the h5ad file path to read.

    Returns:
        The restored AnnData object.
    """
    adata = read_h5ad(path)
    if "kmc_params" in adata.uns.keys():
        import_kmc(adata)
    fate_keys = [key for key in adata.uns_keys() if key.startswith("fate")]
    for key in fate_keys:
        for field in ("prediction", "t"):
            if field in adata.uns[key].keys():
                # Bug fix: HDF5 groups iterate keys alphabetically ("0", "1", "10", "2", ...),
                # so relying on dict insertion order breaks with >= 10 entries. Sorting the
                # stringified indices numerically always restores the original list order.
                field_dict = adata.uns[key][field]
                adata.uns[key][field] = [field_dict[index] for index in sorted(field_dict, key=int)]
    return adata

7 changes: 5 additions & 2 deletions dynamo/plot/topography.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ def plot_nullclines(
NCx, NCy = None, None

# if nullcline is not previously calculated, calculate and plot them
if vecfld_dict is None or "nullcline" not in vecfld_dict.keys():
if vecfld_dict is None or "NCx" not in vecfld_dict.keys() or "NCy" not in vecfld_dict.keys():
if vecfld_dict is not None:
X_basis = vecfld_dict["X"][:, :2]
min_, max_ = X_basis.min(0), X_basis.max(0)
Expand All @@ -265,7 +265,10 @@ def plot_nullclines(

NCx, NCy = vecfld2d.NCx, vecfld.NCy
else:
NCx, NCy = vecfld_dict["nullcline"][0], vecfld_dict["nullcline"][1]
NCx, NCy = (
[vecfld_dict["NCx"][index] for index in vecfld_dict["NCx"]],
[vecfld_dict["NCy"][index] for index in vecfld_dict["NCy"]],
Comment on lines +268 to +270
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what will be the behaviors for this? will this reorder the x/y coordinates of the nullclines?
[vecfld_dict["NCx"][index] for index in vecfld_dict["NCx"]],
[vecfld_dict["NCy"][index] for index in vecfld_dict["NCy"]],

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will read the nullclines from the dictionary to form a list of x and y coordinates. The order should be the same. Here is a reference indicating regular dictionaries have kept their items in the same order that they were inserted into the underlying dictionary since python 3.6.

)

if ax is None:
ax = plt.gca()
Expand Down
28 changes: 23 additions & 5 deletions dynamo/prediction/fate.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
main_info_insert_adata,
main_warning,
)
from ..preprocessing.pca import pca_inverse_transform
from ..tools.connectivity import construct_mapper_umap
from ..tools.utils import fetch_states, getTseq
from ..vectorfield import vector_field_function
from ..vectorfield.utils import vecfld_from_adata, vector_transformation
Expand Down Expand Up @@ -164,18 +166,34 @@ def fate(
# this requires umap 0.4; reverse project to PCA space.
if prediction.ndim == 1:
prediction = prediction[None, :]
exprs = adata.uns["umap_fit"]["fit"].inverse_transform(prediction)

params = adata.uns["umap_fit"]
mapper = construct_mapper_umap(
params["X_data"],
n_components=params["umap_kwargs"]["n_components"],
metric=params["umap_kwargs"]["metric"],
min_dist=params["umap_kwargs"]["min_dist"],
spread=params["umap_kwargs"]["spread"],
max_iter=params["umap_kwargs"]["max_iter"],
alpha=params["umap_kwargs"]["alpha"],
gamma=params["umap_kwargs"]["gamma"],
negative_sample_rate=params["umap_kwargs"]["negative_sample_rate"],
init_pos=params["umap_kwargs"]["init_pos"],
random_state=params["umap_kwargs"]["random_state"],
umap_kwargs=params["umap_kwargs"],
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

umap_kwargs=params["umap_kwargs"]
will this lead to pass duplicated arguments to construct_mapper_umap?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

params["umap_kwargs"] may pass duplicate arguments. But this should not raise an error because in construct_mapper_umap we use update_dict.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

umap_kwargs=params["umap_kwargs"]
will this lead to pass duplicated arguments to construct_mapper_umap?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

params["umap_kwargs"] may pass duplicate arguments. But this should not raise an error because in construct_mapper_umap we use update_dict.

)
exprs = mapper.inverse_transform(prediction)

# further reverse project back to raw expression space
PCs = adata.uns["PCs"].T
if PCs.shape[0] == exprs.shape[1]:
exprs = np.expm1(exprs @ PCs + adata.uns["pca_mean"])

ndim = adata.uns["umap_fit"]["fit"]._raw_data.shape[1]
ndim = mapper._raw_data.shape[1]

if "X" in adata.obsm_keys():
if ndim == adata.obsm[DKM.X_PCA].shape[1]: # lift the dimension up again
exprs = adata.uns["pca_fit"].inverse_transform(prediction)
exprs = pca_inverse_transform(prediction, PCs=PCs.T, mean=adata.uns["pca_mean"])

if adata.var.use_for_dynamics.sum() == exprs.shape[1]:
valid_genes = adata.var_names[adata.var.use_for_dynamics]
Expand All @@ -189,12 +207,12 @@ def fate(

adata.uns[fate_key] = {
"init_states": init_states,
"init_cells": init_cells,
"init_cells": list(init_cells),
"average": average,
"t": t,
"prediction": prediction,
# "VecFld": VecFld,
"VecFld_true": VecFld_true,
# "VecFld_true": VecFld_true,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

VecFld_true means the groundtruth vector field.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will we need this in the pipeline?

"genes": valid_genes,
}
if exprs is not None:
Expand Down
2 changes: 1 addition & 1 deletion dynamo/prediction/least_action_path.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,7 +601,7 @@ def least_action(

adata.uns[LAP_key] = {
"init_states": init_states,
"init_cells": init_cells,
"init_cells": list(init_cells),
Comment on lines 604 to +605
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why list only applies to init_cells instead of init_states?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

init_cells can be an Index (so we convert it to a list) while init_states is an array (a more compatible data type).

"t": t,
"mftp": mftp,
"prediction": prediction,
Expand Down
3 changes: 2 additions & 1 deletion dynamo/preprocessing/cell_cycle.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,8 @@ def get_cell_phase(
else:
cell_phase_genes = gene_list

adata.uns["cell_phase_genes"] = cell_phase_genes
adata.uns["cell_phase_genes_order"] = [key for key in cell_phase_genes]
Sichao25 marked this conversation as resolved.
Show resolved Hide resolved
adata.uns["cell_phase_genes"] = dict(cell_phase_genes)
# score each cell cycle phase and Z-normalize
phase_scores = pd.DataFrame(batch_group_score(adata, layer, cell_phase_genes))
normalized_phase_scores = phase_scores.sub(phase_scores.mean(axis=1), axis=0).div(phase_scores.std(axis=1), axis=0)
Expand Down
31 changes: 31 additions & 0 deletions dynamo/preprocessing/pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,3 +358,34 @@ def top_pca_genes(
main_info_insert_adata_var(adata_store_key, indent_level=2)
adata.var[adata_store_key] = genes
return adata


def pca_transform(data: Union[np.ndarray, csr_matrix], PCs: np.ndarray) -> np.ndarray:
    """Transform the data with given principal components.

    Args:
        data: raw data to transform.
        PCs: the principal components.

    Returns:
        The transformed data.
    """
    # Densify sparse input; otherwise work on a copy so the caller's array is untouched.
    dense = data.toarray().copy() if issparse(data) else data.copy()
    # NOTE(review): centering uses the mean of *this* data rather than the mean stored
    # from the original PCA fit -- confirm this is intended for out-of-sample projection.
    centered = dense - dense.mean(0)
    return centered @ PCs


def pca_inverse_transform(data: Union[np.ndarray, csr_matrix], PCs: np.ndarray, mean: np.ndarray) -> np.ndarray:
    """Inverse transform the data with given principal components.

    Args:
        data: raw data to transform.
        PCs: the principal components.
        mean: the mean used to fit the PCA.

    Returns:
        The inverse transformed data.
    """
    # Densify sparse input; otherwise copy so the caller's array is not modified.
    dense = data.toarray().copy() if issparse(data) else data.copy()
    # Project back to the original feature space and undo the centering.
    return dense @ PCs.T + mean
Sichao25 marked this conversation as resolved.
Show resolved Hide resolved
35 changes: 24 additions & 11 deletions dynamo/tools/cell_velocities.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from ..configuration import DKM
from ..dynamo_logger import LoggerManager, main_info, main_warning
from ..utils import areinstance
from .connectivity import _gen_neighbor_keys, adj_to_knn, check_and_recompute_neighbors
from .connectivity import _gen_neighbor_keys, adj_to_knn, check_and_recompute_neighbors, construct_mapper_umap
from .dimension_reduction import reduceDimension
from .graph_calculus import calc_gaussian_weight, fp_operator, graphize_velocity
from .Markov import ContinuousTimeMarkovChain, KernelMarkovChain, velocity_on_grid
Expand Down Expand Up @@ -519,23 +519,35 @@ def cell_velocities(
adata.obsp["discrete_vector_field"] = E

elif method == "transform":
umap_trans, n_pca_components = (
adata.uns["umap_fit"]["fit"],
adata.uns["umap_fit"]["n_pca_components"],
params = adata.uns["umap_fit"]
umap_trans = construct_mapper_umap(
params["X_data"],
n_components=params["umap_kwargs"]["n_components"],
metric=params["umap_kwargs"]["metric"],
min_dist=params["umap_kwargs"]["min_dist"],
spread=params["umap_kwargs"]["spread"],
max_iter=params["umap_kwargs"]["max_iter"],
alpha=params["umap_kwargs"]["alpha"],
gamma=params["umap_kwargs"]["gamma"],
negative_sample_rate=params["umap_kwargs"]["negative_sample_rate"],
init_pos=params["umap_kwargs"]["init_pos"],
random_state=params["umap_kwargs"]["random_state"],
umap_kwargs=params["umap_kwargs"],
)

if "pca_fit" not in adata.uns_keys() or type(adata.uns["pca_fit"]) == str:
CM = adata.X[:, adata.var.use_for_dynamics.values]
CM = adata.X[:, adata.var.use_for_dynamics.values]
if "PCs" not in adata.uns_keys():
from ..preprocessing.pca import pca

adata, pca_fit, X_pca = pca(adata, CM, n_pca_components, "X", return_all=True)
adata.uns["pca_fit"] = pca_fit
adata, pca_fit, X_pca = pca(adata, CM, params["n_pca_components"], "X", return_all=True)

X_pca, pca_fit = adata.obsm[DKM.X_PCA], adata.uns["pca_fit"]
from ..preprocessing.pca import pca_transform

X_pca, pca_PCs = adata.obsm[DKM.X_PCA], adata.uns["PCs"]
V = adata[:, adata.var.use_for_dynamics.values].layers[vkey] if vkey in adata.layers.keys() else None
CM, V = CM.A if sp.issparse(CM) else CM, V.A if sp.issparse(V) else V
V[np.isnan(V)] = 0
Y_pca = pca_fit.transform(CM + V)
Y_pca = pca_transform(CM + V, PCs=pca_PCs)

Y = umap_trans.transform(Y_pca)

Expand All @@ -562,7 +574,8 @@ def cell_velocities(
else:
transition_key = add_transition_key

adata.obsp[transition_key] = T
if method != "transform":
adata.obsp[transition_key] = T
if add_velocity_key is None:
velocity_key, grid_velocity_key = "velocity_" + basis, "grid_velocity_" + basis
else:
Expand Down
Loading