diff --git a/q2_fmt/__init__.py b/q2_fmt/__init__.py index c5ddb7e..4993ca1 100644 --- a/q2_fmt/__init__.py +++ b/q2_fmt/__init__.py @@ -9,14 +9,14 @@ from ._version import get_versions from ._engraftment import cc, group_timepoints -from ._peds import (sample_peds, feature_peds, peds, heatmap, - peds_simulation, sample_pprs) +from ._peds import (pedf, prdf, heatmap, + pedf_permutation_test, pprf) from ._ancombc import detect_donor_indicators __version__ = get_versions()['version'] del get_versions -__all__ = ['cc', 'sample_peds', 'feature_peds', - 'peds', 'heatmap', 'group_timepoints', 'peds_simulation', - 'sample_pprs', 'detect_donor_indicators'] +__all__ = ['cc', 'pedf', 'prdf', + 'heatmap', 'group_timepoints', 'pedf_permutation_test', + 'pprf', 'detect_donor_indicators'] diff --git a/q2_fmt/_ancombc.py b/q2_fmt/_ancombc.py index 2b76169..16ee54d 100644 --- a/q2_fmt/_ancombc.py +++ b/q2_fmt/_ancombc.py @@ -8,8 +8,7 @@ import pandas as pd from qiime2 import Metadata -# TODO: Change Import path. 
-from q2_fmt._peds import _check_for_time_column, _check_reference_column +from q2_fmt._util import _check_for_time_column, _check_reference_column def detect_donor_indicators(ctx, table, reference_column, time_column, diff --git a/q2_fmt/_examples.py b/q2_fmt/_examples.py index 8fee5ad..acff0ff 100644 --- a/q2_fmt/_examples.py +++ b/q2_fmt/_examples.py @@ -11,10 +11,6 @@ import qiime2 -from qiime2.plugin.util import transform - -from q2_stats.types import TabularDataResourceDirFmt - def _get_data_from_tests(path): return pkg_resources.resource_filename('q2_fmt.tests', @@ -59,15 +55,15 @@ def feature_table_factory(): ) -def peds_md_factory(): +def pedf_md_factory(): return qiime2.Metadata.load( _get_data_from_tests('md-peds-usage.txt') ) -def peds_dist_factory(): +def pedf_dist_factory(): return qiime2.Artifact.import_data( - "Dist1D[Ordered, Matched] % Properties('peds')", + "Dist1D[Ordered, Matched] % Properties('pedf')", _get_data_from_tests('peds_dist.table.jsonl') ) @@ -152,12 +148,12 @@ def cc_baseline(use): raincloud.assert_output_type('Visualization') -def peds_method(use): - md = use.init_metadata('md', peds_md_factory) +def pedf_method(use): + md = use.init_metadata('md', pedf_md_factory) table = use.init_artifact('table', feature_table_factory) - peds_group_dists, = use.action( - use.UsageAction('fmt', 'sample_peds'), + pedf_group_dists, = use.action( + use.UsageAction('fmt', 'pedf'), use.UsageInputs( table=table, metadata=md, @@ -166,21 +162,21 @@ def peds_method(use): subject_column='SubjectID' ), use.UsageOutputNames( - peds_dists='peds_dist' + pedf_dists='pedf_dist' ) ) - peds_group_dists.assert_output_type("Dist1D[Ordered, Matched] %" - " Properties('peds')") + pedf_group_dists.assert_output_type("Dist1D[Ordered, Matched] %" + " Properties('pedf')") -def feature_peds_method(use): - md = use.init_metadata('md', peds_md_factory) +def prdf_method(use): + md = use.init_metadata('md', pedf_md_factory) table = use.init_artifact('table', 
feature_table_factory) - peds_group_dists, = use.action( - use.UsageAction('fmt', 'feature_peds'), + prdf_group_dists, = use.action( + use.UsageAction('fmt', 'prdf'), use.UsageInputs( table=table, metadata=md, @@ -189,46 +185,22 @@ def feature_peds_method(use): subject_column='SubjectID' ), use.UsageOutputNames( - peds_dists='peds_dist' + prdf_dists='prdf_dist' ) ) - peds_group_dists.assert_output_type("Dist1D[Ordered, Matched] %" - " Properties('peds')") - - -# PEDS pipeline -def peds_pipeline_sample(use): - md = use.init_metadata('md', peds_md_factory) - table = use.init_artifact('table', feature_table_factory) - - peds_heatmap, = use.action( - use.UsageAction('fmt', 'peds'), - use.UsageInputs( - table=table, - metadata=md, - peds_metric='sample', - time_column='time_point', - reference_column='Donor', - subject_column='SubjectID' - ), - use.UsageOutputNames( - heatmap='heatmap', - - ) - ) - - peds_heatmap.assert_output_type('Visualization') + prdf_group_dists.assert_output_type("Dist1D[Ordered, Matched] %" + " Properties('prdf')") def heatmap(use): - peds_dist = use.init_artifact('peds_dist', peds_dist_factory) + pedf_dist = use.init_artifact('pedf_dist', pedf_dist_factory) - peds_heatmap, = use.action( + pedf_heatmap, = use.action( use.UsageAction('fmt', 'heatmap'), use.UsageInputs( - data=peds_dist, + data=pedf_dist, ), use.UsageOutputNames( visualization='heatmap', @@ -236,15 +208,15 @@ def heatmap(use): ) ) - peds_heatmap.assert_output_type('Visualization') + pedf_heatmap.assert_output_type('Visualization') -def simulation_peds_method(use): - md = use.init_metadata('md', peds_md_factory) +def perm_pedf_method(use): + md = use.init_metadata('md', pedf_md_factory) table = use.init_artifact('table', feature_table_factory) - actual_sample_peds, peds_stats, global_stats = use.action( - use.UsageAction('fmt', 'peds_simulation'), + actual_sample_pedf, pedf_stats, global_stats = use.action( + use.UsageAction('fmt', 'pedf_permutation_test'), use.UsageInputs( 
table=table, metadata=md, @@ -255,23 +227,23 @@ def simulation_peds_method(use): sampling_depth=400 ), use.UsageOutputNames( - actual_sample_peds='actual_sample_peds', + actual_sample_pedf='actual_sample_pedf', per_subject_stats='per_subject_stats', global_stats='global_stats' ) ) - peds_stats.assert_output_type("StatsTable[Pairwise]") + pedf_stats.assert_output_type("StatsTable[Pairwise]") global_stats.assert_output_type("StatsTable[Pairwise]") -def pprs_method(use): - md = use.init_metadata('md', peds_md_factory) +def pprf_method(use): + md = use.init_metadata('md', pedf_md_factory) table = use.init_artifact('table', feature_table_factory) - pprs_group_dists, = use.action( - use.UsageAction('fmt', 'sample_pprs'), + pprf_group_dists, = use.action( + use.UsageAction('fmt', 'pprf'), use.UsageInputs( table=table, metadata=md, @@ -281,17 +253,17 @@ def pprs_method(use): filter_missing_references=False ), use.UsageOutputNames( - pprs_dists='pprs_dist' + pprf_dists='pprf_dist' ) ) - pprs_group_dists.assert_output_type("Dist1D[Ordered, Matched] %" - " Properties('pprs')") + pprf_group_dists.assert_output_type("Dist1D[Ordered, Matched] %" + " Properties('pprf')") def detect_donor_indicators_method(use): - md = use.init_metadata('md', peds_md_factory) + md = use.init_metadata('md', pedf_md_factory) table = use.init_artifact('table', feature_table_factory) differentials, da_barplot = use.action( diff --git a/q2_fmt/_peds.py b/q2_fmt/_peds.py index 7d6b860..19d08fe 100644 --- a/q2_fmt/_peds.py +++ b/q2_fmt/_peds.py @@ -37,38 +37,6 @@ from q2_stats.plots.raincloud import _make_stats -def peds(ctx, table, metadata, peds_metric, time_column, reference_column, - subject_column, filter_missing_references=False, - drop_incomplete_subjects=False, drop_incomplete_timepoints=None, - level_delimiter=None, sampling_depth=None, num_resamples=0): - - heatmap = ctx.get_action('fmt', 'heatmap') - - results = [] - - if peds_metric == 'sample': - sample_peds = ctx.get_action('fmt', 
'sample_peds') - peds_dist = sample_peds( - table=table, metadata=metadata, time_column=time_column, - subject_column=subject_column, reference_column=reference_column, - filter_missing_references=filter_missing_references, - sampling_depth=sampling_depth, num_resamples=num_resamples,) - - else: - feature_peds = ctx.get_action('fmt', 'feature_peds') - peds_dist = feature_peds( - table=table, metadata=metadata, time_column=time_column, - subject_column=subject_column, reference_column=reference_column, - filter_missing_references=filter_missing_references, - sampling_depth=sampling_depth, num_resamples=num_resamples) - results += heatmap(data=peds_dist[0], - level_delimiter=level_delimiter, - drop_incomplete_subjects=drop_incomplete_subjects, - drop_incomplete_timepoints=drop_incomplete_timepoints) - - return tuple(results) - - def heatmap(output_dir: str, data: pd.DataFrame, level_delimiter: str = None, per_subject_stats: pd.DataFrame = None, @@ -80,8 +48,8 @@ def heatmap(output_dir: str, data: pd.DataFrame, per_subject_stats is None) except AssertionError as e: raise AssertionError("The input data provided was created with" - " `fmt sample_pprs`. This is not compatible with" - " statistics created from `fmt peds-simulation`" + " `fmt pprf`. This is not compatible with" + " statistics created from `fmt pedf-simulation`" " because they are created from separate" " references (i.e. baseline and donor)") from e @@ -102,7 +70,7 @@ def heatmap(output_dir: str, data: pd.DataFrame, gradient = "measure" if "all possible recipients with feature" in data.columns: if drop_incomplete_subjects or drop_incomplete_timepoints: - warnings.warn('Feature PEDS was selected as the PEDS metric, which' + warnings.warn('PRDF was selected as the proportion metric, which' ' does not accept `drop_incomplete_subjects` or' ' `drop_incomplete_timepoints` as parameters. 
One' ' (or both) of these parameters were detected in' @@ -148,11 +116,11 @@ def heatmap(output_dir: str, data: pd.DataFrame, table1=table1)) -def sample_peds(table: pd.DataFrame, metadata: qiime2.Metadata, - time_column: str, reference_column: str, subject_column: str, - filter_missing_references: bool = False, - sampling_depth: int = None, - num_resamples: int = 0) -> (pd.DataFrame): +def pedf(table: pd.DataFrame, metadata: qiime2.Metadata, + time_column: str, reference_column: str, subject_column: str, + filter_missing_references: bool = False, + sampling_depth: int = None, + num_resamples: int = 0) -> (pd.DataFrame): # making sure that samples exist in the table ids_with_data = table.index @@ -189,24 +157,24 @@ def sample_peds(table: pd.DataFrame, metadata: qiime2.Metadata, # no samping depth num_resamples = 1 for x in range(num_resamples): - peds_df = pd.DataFrame(columns=['id', + pedf_df = pd.DataFrame(columns=['id', 'transfered_donor_features', 'total_donor_features', 'donor', 'subject', 'group']) if sampling_depth: table = _subsample(table, sampling_depth) - peds_df = _compute_peds(peds_df=peds_df, peds_type="Sample", - peds_time=np.nan, - reference_series=used_references, - table=table, metadata=metadata_df, - time_column=time_column, - subject_column=subject_column, - reference_column=reference_column) + pedf_df = _compute_proportion(df=pedf_df, type="PEDF", + time=np.nan, + reference_series=used_references, + table=table, metadata=metadata_df, + time_column=time_column, + subject_column=subject_column, + reference_column=reference_column) # Set the index for pd matching when concating and summing - peds_df = peds_df.set_index('id') - num_list.append(peds_df['transfered_donor_features']) - denom_list.append(peds_df['total_donor_features']) + pedf_df = pedf_df.set_index('id') + num_list.append(pedf_df['transfered_donor_features']) + denom_list.append(pedf_df['total_donor_features']) numerator_df = pd.concat(num_list, axis=1) denominator_df = 
pd.concat(denom_list, axis=1) @@ -216,49 +184,49 @@ def sample_peds(table: pd.DataFrame, metadata: qiime2.Metadata, with warnings.catch_warnings(): warnings.simplefilter("ignore") - peds = median_numerator_series/median_denominator_series + pedf = median_numerator_series/median_denominator_series - peds_df['measure'] = peds - peds_df['transfered_donor_features'] = median_numerator_series - peds_df['total_donor_features'] = median_denominator_series - peds_df = peds_df.reset_index() + pedf_df['measure'] = pedf + pedf_df['transfered_donor_features'] = median_numerator_series + pedf_df['total_donor_features'] = median_denominator_series + pedf_df = pedf_df.reset_index() - peds_df['id'].attrs.update({ + pedf_df['id'].attrs.update({ 'title': metadata_df.index.name, 'description': 'Sample IDs' }) - peds_df['measure'].attrs.update({ - 'title': "Sample PEDS", - 'description': 'Proportional Engraftment of Donor Strains' + pedf_df['measure'].attrs.update({ + 'title': "PEDF", + 'description': 'Proportional Engraftment of Donor Features' }) - peds_df['group'].attrs.update({ + pedf_df['group'].attrs.update({ 'title': time_column, 'description': 'Time' }) - peds_df['subject'].attrs.update({ + pedf_df['subject'].attrs.update({ 'title': subject_column, 'description': 'Subject IDs linking samples across time' }) - peds_df['transfered_donor_features'].attrs.update({ + pedf_df['transfered_donor_features'].attrs.update({ 'title': "Transfered Reference Features", 'description': '...' }) - peds_df['total_donor_features'].attrs.update({ + pedf_df['total_donor_features'].attrs.update({ 'title': "Total Reference Features", 'description': '...' 
}) - peds_df['donor'].attrs.update({ + pedf_df['donor'].attrs.update({ 'title': reference_column, 'description': 'Donor' }) - return peds_df + return pedf_df -def feature_peds(table: pd.DataFrame, metadata: qiime2.Metadata, - time_column: str, reference_column: str, subject_column: str, - filter_missing_references: bool = False, - sampling_depth: int = None, - num_resamples: int = 0) -> (pd.DataFrame): +def prdf(table: pd.DataFrame, metadata: qiime2.Metadata, + time_column: str, reference_column: str, subject_column: str, + filter_missing_references: bool = False, + sampling_depth: int = None, + num_resamples: int = 0) -> (pd.DataFrame): # making sure that samples exist in the table ids_with_data = table.index metadata = metadata.filter_ids(ids_to_keep=ids_with_data) @@ -289,7 +257,7 @@ def feature_peds(table: pd.DataFrame, metadata: qiime2.Metadata, # no samping depth num_resamples = 1 for x in range(num_resamples): - peds_df =\ + prdf_df =\ pd.DataFrame(columns=['id', 'recipients with feature', 'all possible recipients with feature', 'group', 'subject']) @@ -297,17 +265,17 @@ def feature_peds(table: pd.DataFrame, metadata: qiime2.Metadata, table = _subsample(table, sampling_depth) for time, time_metadata in metadata_df.groupby(time_column): - peds_df = _compute_peds(peds_df=peds_df, peds_type="Feature", - peds_time=time, - reference_series=used_references, - table=table, - metadata=time_metadata, - time_column=time_column, - subject_column=subject_column, - reference_column=reference_column) - peds_df = peds_df.set_index('id') - num_list.append(peds_df['recipients with feature']) - denom_list.append(peds_df['all possible recipients with feature']) + prdf_df = _compute_proportion(df=prdf_df, type="PRDF", + time=time, + reference_series=used_references, + table=table, + metadata=time_metadata, + time_column=time_column, + subject_column=subject_column, + reference_column=reference_column) + prdf_df = prdf_df.set_index('id') + num_list.append(prdf_df['recipients 
with feature']) + denom_list.append(prdf_df['all possible recipients with feature']) numerator_df = pd.concat(num_list, axis=1) denominator_df = pd.concat(denom_list, axis=1) @@ -316,38 +284,38 @@ def feature_peds(table: pd.DataFrame, metadata: qiime2.Metadata, with warnings.catch_warnings(): warnings.simplefilter("ignore") - peds = median_numerator_series/median_denominator_series + prdf = median_numerator_series/median_denominator_series - peds_df['measure'] = peds.values - peds_df['recipients with feature'] = median_numerator_series.values - peds_df['all possible recipients with feature'] = \ + prdf_df['measure'] = prdf.values + prdf_df['recipients with feature'] = median_numerator_series.values + prdf_df['all possible recipients with feature'] = \ median_denominator_series.values - peds_df = peds_df.reset_index() + prdf_df = prdf_df.reset_index() - peds_df['id'].attrs.update({ + prdf_df['id'].attrs.update({ 'title': "Feature ID", 'description': '' }) - peds_df['measure'].attrs.update({ - 'title': "Feature PEDS", - 'description': 'Proportional Engraftment of Donor Strains' + prdf_df['measure'].attrs.update({ + 'title': "PRDF", + 'description': 'Proportion of Recipients with Donor Feature' }) - peds_df['group'].attrs.update({ + prdf_df['group'].attrs.update({ 'title': time_column, 'description': 'Time' }) - peds_df['subject'].attrs.update({ + prdf_df['subject'].attrs.update({ 'title': "Feature ID", 'description': '' }) - return peds_df + return prdf_df -def _compute_peds(peds_df: pd.Series, peds_type: str, peds_time: int, - reference_series: pd.Series, table: pd.Series, - metadata: qiime2.Metadata, time_column: str, - subject_column: str, - reference_column: str = None) -> (pd.DataFrame): +def _compute_proportion(df: pd.Series, type: str, time: int, + reference_series: pd.Series, table: pd.Series, + metadata: qiime2.Metadata, time_column: str, + subject_column: str, + reference_column: str = None) -> (pd.DataFrame): table = table > 0 reference_overlap = 
reference_series.isin(table.index) try: @@ -365,33 +333,34 @@ def _compute_peds(peds_df: pd.Series, peds_type: str, peds_time: int, recip_df=recip_df, reference_column=reference_column) maskedrecip = donormask & recip_df - if peds_type == "Sample" or peds_type == "PPRS": + if type == "PEDF" or type == "PPRF": num_sum = np.sum(maskedrecip, axis=1) donor_sum = np.sum(donormask, axis=1) for count, sample in enumerate(recip_df.index): sample_row = metadata.loc[sample] - peds_df.loc[len(peds_df)] = [sample, num_sum[count], - donor_sum[count], - sample_row[reference_column], - sample_row[subject_column], - sample_row[time_column]] + df.loc[len(df)] = [sample, num_sum[count], + donor_sum[count], + sample_row[reference_column], + sample_row[subject_column], + sample_row[time_column]] - elif peds_type == "Feature": + elif type == "PRDF": num_sum = np.sum(maskedrecip, axis=0) donor_sum = np.sum(donormask, axis=0) for count, feature in enumerate(recip_df.columns): - peds_df.loc[len(peds_df)] = [feature, num_sum[count], - donor_sum[count], peds_time, feature] - peds_df = peds_df.dropna() + df.loc[len(df)] = [feature, num_sum[count], + donor_sum[count], time, feature] + df = df.dropna() else: - raise KeyError('There was an error finding which PEDS methods to use') - return peds_df + raise KeyError('There was an error finding which proportion method to' + ' use') + return df -def sample_pprs(table: pd.DataFrame, metadata: qiime2.Metadata, - time_column: str, baseline_timepoint: str, subject_column: str, - filter_missing_references: bool, sampling_depth: int = None, - num_resamples: int = 0) -> (pd.DataFrame): +def pprf(table: pd.DataFrame, metadata: qiime2.Metadata, + time_column: str, baseline_timepoint: str, subject_column: str, + filter_missing_references: bool, sampling_depth: int = None, + num_resamples: int = 0) -> (pd.DataFrame): # making sure that samples exist in the table ids_with_data = table.index metadata = metadata.filter_ids(ids_to_keep=ids_with_data) @@ -427,25 
+396,25 @@ def sample_pprs(table: pd.DataFrame, metadata: qiime2.Metadata, # no samping depth num_resamples = 1 for x in range(num_resamples): - peds_df = pd.DataFrame(columns=['id', + pprf_df = pd.DataFrame(columns=['id', 'transfered_baseline_features', 'total_baseline_features', 'baseline', 'subject', 'group']) if sampling_depth: table = _subsample(table, sampling_depth) - peds_df = _compute_peds(peds_df=peds_df, peds_type='PPRS', - peds_time=np.nan, - reference_series=used_references, - table=table, metadata=baseline_metadata, - time_column=time_column, - subject_column=subject_column, - reference_column=used_references.name) + pprf_df = _compute_proportion(df=pprf_df, type='PPRF', + time=np.nan, + reference_series=used_references, + table=table, metadata=baseline_metadata, + time_column=time_column, + subject_column=subject_column, + reference_column=used_references.name) # Set the index for pd matching when concating and summing - peds_df = peds_df.set_index('id') + pprf_df = pprf_df.set_index('id') - num_list.append(peds_df['transfered_baseline_features']) - denom_list.append(peds_df['total_baseline_features']) + num_list.append(pprf_df['transfered_baseline_features']) + denom_list.append(pprf_df['total_baseline_features']) numerator_df = pd.concat(num_list, axis=1) denominator_df = pd.concat(denom_list, axis=1) @@ -455,51 +424,51 @@ def sample_pprs(table: pd.DataFrame, metadata: qiime2.Metadata, with warnings.catch_warnings(): warnings.simplefilter("ignore") - pprs = median_numerator_series/median_denominator_series + pprf = median_numerator_series/median_denominator_series - peds_df['measure'] = pprs - peds_df['transfered_baseline_features'] = median_numerator_series - peds_df['total_baseline_features'] = median_denominator_series - peds_df = peds_df.reset_index() + pprf_df['measure'] = pprf + pprf_df['transfered_baseline_features'] = median_numerator_series + pprf_df['total_baseline_features'] = median_denominator_series + pprf_df = pprf_df.reset_index() - 
peds_df['id'].attrs.update({ + pprf_df['id'].attrs.update({ 'title': metadata_df.index.name, 'description': 'Sample IDs' }) - peds_df['measure'].attrs.update({ - 'title': 'PPRS', - 'description': 'Proportional Persistence of Recipient Strains' + pprf_df['measure'].attrs.update({ + 'title': 'PPRF', + 'description': 'Proportional Persistence of Recipient Features' }) - peds_df['group'].attrs.update({ + pprf_df['group'].attrs.update({ 'title': time_column, 'description': 'Time' }) - peds_df['subject'].attrs.update({ + pprf_df['subject'].attrs.update({ 'title': subject_column, 'description': 'Subject IDs linking samples across time' }) - peds_df['transfered_baseline_features'].attrs.update({ + pprf_df['transfered_baseline_features'].attrs.update({ 'title': "Transfered Reference Features", 'description': '...' }) - peds_df['total_baseline_features'].attrs.update({ + pprf_df['total_baseline_features'].attrs.update({ 'title': "Total Reference Features", 'description': '...' }) - peds_df['baseline'].attrs.update({ + pprf_df['baseline'].attrs.update({ 'title': used_references.name, 'description': 'recipeint baseline' }) - return peds_df + return pprf_df -def peds_simulation(table: pd.DataFrame, metadata: qiime2.Metadata, - time_column: str, reference_column: str, - subject_column: str, sampling_depth: int, - filter_missing_references: bool = False, - peds_rarefaction: bool = True, - num_resamples: int = 999, - ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame): +def pedf_permutation_test(table: pd.DataFrame, metadata: qiime2.Metadata, + time_column: str, reference_column: str, + subject_column: str, sampling_depth: int, + filter_missing_references: bool = False, + pedf_rarefaction: bool = True, + num_resamples: int = 999, + ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame): ids_with_data = table.index metadata = metadata.filter_ids(ids_to_keep=ids_with_data) @@ -530,22 +499,21 @@ def peds_simulation(table: pd.DataFrame, metadata: qiime2.Metadata, " and needs more than one 
recipient" " to successfully shuffle.") - if not peds_rarefaction: - actual_peds_num_resamples = 1 + if not pedf_rarefaction: + actual_pedf_num_resamples = 1 else: - actual_peds_num_resamples = num_resamples + actual_pedf_num_resamples = num_resamples - peds = sample_peds( - table=table, metadata=metadata, - time_column=time_column, - reference_column=reference_column, - subject_column=subject_column, - filter_missing_references=filter_missing_references, - num_resamples=actual_peds_num_resamples, - sampling_depth=sampling_depth - ) + pedf_df = pedf( + table=table, metadata=metadata, + time_column=time_column, + reference_column=reference_column, + subject_column=subject_column, + filter_missing_references=filter_missing_references, + num_resamples=actual_pedf_num_resamples, + sampling_depth=sampling_depth) - actual_peds = peds[['id', 'measure']].set_index('id')['measure'] + actual_pedf = pedf_df[['id', 'measure']].set_index('id')['measure'] # Mismatch simulation: recip_df = _create_recipient_table(used_references, metadata_df, table) @@ -582,15 +550,15 @@ def peds_simulation(table: pd.DataFrame, metadata: qiime2.Metadata, recip_mask = _mask_recipient(donor_mask, rarefied_recip_simulated_table) # Numerator for PEDS Calc. (Number of Donor features in the Recipient) num_engrafted_donor_features = np.sum(recip_mask.values, axis=1) - # Denominator for PEDS Calc. (Number of unique features in the Donor) + # Denominator for PEDF Calc. (Number of unique features in the Donor) num_donor_features = np.sum(donor_mask, axis=1) # This ignores warnings that come from dividing by 0. - # mismatched_peds will be Nan if the denominator is 0 and thats reasonable. + # mismatched_pedf will be Nan if the denominator is 0 and thats reasonable. 
with warnings.catch_warnings(): warnings.simplefilter("ignore") - mismatched_peds = num_engrafted_donor_features/num_donor_features - per_sub_stats = _per_subject_stats(mismatched_peds, - actual_peds) + mismatched_pedf = num_engrafted_donor_features/num_donor_features + per_sub_stats = _per_subject_stats(mismatched_pedf, + actual_pedf) global_stats = _global_stats(per_sub_stats['p-value']) - return peds, per_sub_stats, global_stats + return pedf_df, per_sub_stats, global_stats diff --git a/q2_fmt/plugin_setup.py b/q2_fmt/plugin_setup.py index 88c8dd5..aeb9757 100644 --- a/q2_fmt/plugin_setup.py +++ b/q2_fmt/plugin_setup.py @@ -82,19 +82,29 @@ ' used to filter by a subset of `metadata`, such as a specific value' ' in one of `metadata` columns.') -per_subject_stats = ('Table describing significance of PEDS scores compared to' +per_subject_stats = ('Table describing significance of PEDF scores compared to' ' mismatched donor-recipient pairs on a per-subject' ' basis.') -global_stats = ('Table describing significance of PEDS scores across all' +global_stats = ('Table describing significance of PEDF scores across all' ' subjects.') -peds_table = 'The `table` to calculate PEDS on.' -peds_dists = ('The distributions for the PEDS measure, grouped by the selected' +pedf_table = ('The `table` to calculate PEDF on.') +pprf_table = ('The `table` to calculate PPRF on.') +prdf_table = ('The `table` to calculate PRDF on.') +pedf_dists = ('The distributions for the PEDF measure, grouped by the selected' ' `time_column`. Also contains the numerator and denominator for' - ' PEDS calulations. May also contain subject IDs, if' + ' PEDF calulations. May also contain subject IDs, if' + ' `subject_column` is provided in `metadata`.') +pprf_dists = ('The distributions for the PPRF measure, grouped by the selected' + ' `time_column`. Also contains the numerator and denominator for' + ' PEDF calulations. 
May also contain subject IDs, if' + ' `subject_column` is provided in `metadata`.') +prdf_dists = ('The distributions for the PRDF measure, grouped by the selected' + ' `time_column`. Also contains the numerator and denominator for' + ' PEDF calulations. May also contain subject IDs, if' ' `subject_column` is provided in `metadata`.') num_resample = ('Number of iterations for rarefying. If there are' ' more than one resampling the values will be' - ' averaged using median and the PEDS proportion will' + ' averaged using median and the PEDF proportion will' ' be calculated on the expected values. i.e. the' ' median numerator and median denominator') sampling_depth = ('Number of observations that each sample' @@ -233,48 +243,14 @@ } ) -plugin.pipelines.register_function( - function=q2_fmt.peds, - inputs={'table': FeatureTable[Frequency | RelativeFrequency | - PresenceAbsence]}, - parameters={'metadata': Metadata, - 'peds_metric': Str % Choices('feature', 'sample'), - 'time_column': Str, 'reference_column': Str, - 'subject_column': Str, - 'filter_missing_references': Bool, - 'drop_incomplete_subjects': Bool, - 'drop_incomplete_timepoints': List[Str], - 'level_delimiter': Str, - 'num_resamples': Int % Range(0, 999), - 'sampling_depth': Int % Range(1, None)}, - outputs=[('heatmap', Visualization)], - input_descriptions={'table': peds_table}, - parameter_descriptions={ - 'metadata': metadata, - 'peds_metric': 'PEDS metric to run.', - 'time_column': time_column, - 'reference_column': reference_column, - 'subject_column': subject_column, - 'filter_missing_references': filter_missing_references, - 'drop_incomplete_subjects': drop_incomplete_subjects, - 'drop_incomplete_timepoints': drop_incomplete_timepoints, - 'level_delimiter': level_delimiter, - 'num_resamples': num_resample, - 'sampling_depth': sampling_depth}, - output_descriptions={'heatmap': 'PEDS heatmap visualization'}, - name='PEDS pipeline to calculate feature or sample PEDS', - description='Runs a pipeline to 
calculate sample or feature PEDS,' - ' and generate the relevant heatmap', - examples={'peds_pipeline': ex.peds_pipeline_sample} -) - plugin.visualizers.register_function( function=q2_fmt.heatmap, - inputs={'data': Dist1D[Ordered, Matched] % Properties("peds") | - Dist1D[Ordered, Matched] % Properties("pprs"), + inputs={'data': Dist1D[Ordered, Matched] % Properties("pedf") | + Dist1D[Ordered, Matched] % Properties("pprf") | + Dist1D[Ordered, Matched] % Properties("prdf"), 'per_subject_stats': StatsTable[Pairwise], 'global_stats': StatsTable[Pairwise]}, - input_descriptions={'data': 'PEDS or PPRS output to plot', + input_descriptions={'data': 'PEDF or PPRF output to plot', 'per_subject_stats': per_subject_stats, 'global_stats': global_stats}, parameters={'level_delimiter': Str, @@ -285,13 +261,13 @@ 'drop_incomplete_timepoints': drop_incomplete_timepoints, 'drop_incomplete_subjects': drop_incomplete_subjects}, name=' Proportional Features Heatmap', - description='Plot heatmap for PEDS or PPRS value over time', + description='Plot heatmap for PEDF, PRDF or PPRF value over time', examples={ 'heatmap': ex.heatmap} ) plugin.methods.register_function( - function=q2_fmt.sample_peds, + function=q2_fmt.pedf, inputs={'table': FeatureTable[Frequency | RelativeFrequency | PresenceAbsence]}, parameters={'metadata': Metadata, 'time_column': Str, @@ -299,8 +275,8 @@ 'filter_missing_references': Bool, 'num_resamples': Int % Range(0, 999), 'sampling_depth': Int % Range(1, None)}, - outputs=[('peds_dists', Dist1D[Ordered, Matched] % Properties("peds"))], - input_descriptions={'table': peds_table}, + outputs=[('pedf_dists', Dist1D[Ordered, Matched] % Properties("pedf"))], + input_descriptions={'table': pedf_table}, parameter_descriptions={ 'metadata': metadata, 'time_column': time_column, @@ -310,19 +286,21 @@ 'num_resamples': num_resample, 'sampling_depth': sampling_depth}, output_descriptions={ - 'peds_dists': peds_dists + 'pedf_dists': pedf_dists }, - name='Proportional 
Engraftment of Donor Strains (Features) in each' - ' recipient sample', + name='Proportional Engraftment of Donor Features in each' + ' recipient sample. This is adapted from aggarwala et al. 2021' + ' stainer manuscript, which coined the term PEDS. PEDF uses the same' + ' ideas but is applied to features generally.', description='Calculates percentage of microbes that where found in the ' 'donated material that are found in the recipient.', citations=[citations['aggarwala_precise_2021']], examples={ - 'peds_methods': ex.peds_method + 'pedf_methods': ex.pedf_method } ) plugin.methods.register_function( - function=q2_fmt.feature_peds, + function=q2_fmt.prdf, inputs={'table': FeatureTable[Frequency | RelativeFrequency | PresenceAbsence]}, parameters={'metadata': Metadata, 'time_column': Str, @@ -330,8 +308,8 @@ 'filter_missing_references': Bool, 'num_resamples': Int % Range(0, 999), 'sampling_depth': Int % Range(1, None)}, - outputs=[('peds_dists', Dist1D[Ordered, Matched] % Properties("peds"))], - input_descriptions={'table': peds_table}, + outputs=[('prdf_dists', Dist1D[Ordered, Matched] % Properties("prdf"))], + input_descriptions={'table': prdf_table}, parameter_descriptions={ 'metadata': metadata, 'time_column': time_column, @@ -342,18 +320,18 @@ 'sampling_depth': sampling_depth }, output_descriptions={ - 'peds_dists': peds_dists + 'prdf_dists': prdf_dists }, - name='Porportional Engraftment of Donor Strains per feature', + name='Porportion of Recipients with Donor Feature', description='Calculates how many recipients recieved a given' - ' donated material feature ', + ' donated microbiome feature ', examples={ - 'peds_methods': ex.feature_peds_method + 'prdf_methods': ex.prdf_method } ) plugin.methods.register_function( - function=q2_fmt.sample_pprs, + function=q2_fmt.pprf, inputs={'table': FeatureTable[Frequency | RelativeFrequency | PresenceAbsence]}, parameters={'metadata': Metadata, 'time_column': Str, @@ -361,9 +339,9 @@ 'filter_missing_references': Bool, 
'num_resamples': Int % Range(0, 999), 'sampling_depth': Int % Range(1, None)}, - outputs=[('pprs_dists', Dist1D[Ordered, Matched] % Properties("pprs"))], + outputs=[('pprf_dists', Dist1D[Ordered, Matched] % Properties("pprf"))], input_descriptions={ - 'table': 'The `table` to calculate PPRS on.'}, + 'table': pprf_table}, parameter_descriptions={ 'metadata': metadata, 'time_column': time_column, @@ -374,21 +352,21 @@ 'sampling_depth': sampling_depth }, output_descriptions={ - 'pprs_dists': 'The distributions for the PPRS measure, grouped by' - ' the selected `time_column`. Also contains the' - ' numerator and denominator for PPRS calulations.' + 'pprf_dists': pprf_dists }, - name='Proportional Persistence of Recipient Strains (Features) in each' - ' recipient sample', + name='Proportional Persistence of Recipient Features in each' + ' recipient sample. This is adapted from Aggarwala et al. 2021' + ' Strainer manuscript, which coined the term PPRS. PPRF uses the same' + ' ideas but is applied to features generally.', description='Calculates percentage of microbes that were found in the' ' baseline recipient and persist following FMT' ' intervention.', citations=[citations['aggarwala_precise_2021']], - examples={'pprs_methods': ex.pprs_method} + examples={'pprf_methods': ex.pprf_method} ) plugin.methods.register_function( - function=q2_fmt.peds_simulation, + function=q2_fmt.pedf_permutation_test, inputs={'table': FeatureTable[Frequency | RelativeFrequency | PresenceAbsence]}, parameters={'metadata': Metadata, @@ -397,13 +375,14 @@ 'subject_column': T_subject, 'filter_missing_references': Bool, 'num_resamples': Int % Range(99, None), - 'peds_rarefaction': Bool, + 'pedf_rarefaction': Bool, 'sampling_depth': Int % Range(1, None), }, - outputs=[('actual_sample_peds', - Dist1D[Ordered, Matched] % Properties("peds")), + outputs=[('actual_sample_pedf', + Dist1D[Ordered, Matched] % Properties("pedf")), ('per_subject_stats', StatsTable[Pairwise]), ('global_stats', 
StatsTable[Pairwise])], + input_descriptions={'table': pedf_table}, parameter_descriptions={ 'metadata': metadata, 'time_column': time_column, @@ -414,23 +393,23 @@ ' simulation on and the number of rarefactions to' ' preform', 'sampling_depth': sampling_depth, - 'peds_rarefaction': 'If False, the feature-table for the actual peds' + 'pedf_rarefaction': 'If False, the feature-table for the actual pedf' ' will be rarefied intstead of using rarefaction.' ' This will make it faster to run this method but' ' the actual values may be slightly less' ' comparable to the simulated values which will' - ' be undergo rarefaction `num_resamples` of times', + ' undergo rarefaction `num_resamples` of times', }, output_descriptions={ - 'actual_sample_peds': peds_dists, + 'actual_sample_pedf': pedf_dists, 'per_subject_stats': per_subject_stats, 'global_stats': global_stats }, - name='PEDS Monte Carlo simulation', - description='A Monte Carlo simulation that randomizes the relationships' - ' between donors and recipients, to test whether the PEDS' + name='PEDF Permutation Test', + description='A permutation test that randomizes the relationships' + ' between donors and recipients, to test whether the PEDF' ' score between a recipient and their actual donor is' - ' significantly higher than PEDS scores between other' + ' significantly higher than PEDF scores between other' ' recipients paired with random donors. This is intended to' ' only work in studies where there are distinct donors,' ' and will yield insignificant results if there are too' @@ -438,7 +417,7 @@ ' which indicator features that are unique to a given donor' ' are transferred to their recipients, as opposed to features' ' that are not indicative of any specific donor. Note: ' - ' PEDS Monte Carlo simulation may have dependency issues' + ' PEDF permutation test may have dependency issues' ' between samples and the simulated background' ' distribution that can make the test' ' overly conservative. 
This can be fixed by filtering down to' @@ -450,7 +429,7 @@ citations['stouffer_1949_american'], citations['Benjamini_fdr_1995']], examples={ - 'peds_methods': ex.simulation_peds_method + 'pedf_methods': ex.perm_pedf_method } ) @@ -464,7 +443,7 @@ 'baseline_timepoint': Str}, outputs=[('differentials', FeatureData[DifferentialAbundance]), ('da_barplot', Visualization)], - input_descriptions={'table': peds_table}, + input_descriptions={'table': pedf_table}, parameter_descriptions={ 'metadata': metadata, 'time_column': time_column, @@ -477,7 +456,7 @@ ' and donor samples.', 'differentials': 'The calculated per-feature' ' differentials.'}, - name='Detect Donor Indicators Features', + name='Detect Donor Indicator Features', description='Runs a pipeline to indentify differetial features between the' ' donor and the baseline recipient. This is done by filtering' ' the feature table to donor and baseline timepoints and' diff --git a/q2_fmt/tests/test_engraftment.py b/q2_fmt/tests/test_engraftment.py index 32b9526..85245c0 100644 --- a/q2_fmt/tests/test_engraftment.py +++ b/q2_fmt/tests/test_engraftment.py @@ -27,8 +27,8 @@ _check_column_type, _drop_incomplete_timepoints,) from q2_fmt._engraftment import group_timepoints -from q2_fmt._peds import (_compute_peds, sample_peds, - feature_peds, peds_simulation, sample_pprs) +from q2_fmt._peds import (_compute_proportion, pedf, + prdf, pedf_permutation_test, pprf) from q2_fmt._ancombc import get_baseline_donor_md @@ -787,18 +787,19 @@ def test_get_donor(self): 'Feature1': [1, 0, 1, 1, 1, 1], 'Feature2': [1, 1, 1, 1, 1, 1], 'Feature3': [0, 0, 1, 1, 1, 1]}).set_index('id') - peds_df = pd.DataFrame(columns=['id', + pedf_df = pd.DataFrame(columns=['id', 'transfered_donor_features', 'total_donor_features', 'donor', 'subject', 'group']) - peds_df = _compute_peds(peds_df=peds_df, peds_type="Sample", - peds_time=np.nan, - reference_series=reference_series, - table=table_df, metadata=metadata_df, - time_column="group", 
reference_column="Ref", - subject_column="subject") - peds_df = peds_df.set_index("id") - donor = peds_df.at["sample1", "donor"] + pedf_df = _compute_proportion(df=pedf_df, type="PEDF", + time=np.nan, + reference_series=reference_series, + table=table_df, metadata=metadata_df, + time_column="group", + reference_column="Ref", + subject_column="subject") + pedf_df = pedf_df.set_index("id") + donor = pedf_df.at["sample1", "donor"] self.assertEqual(donor, "donor1") def test_get_subject(self): @@ -817,19 +818,20 @@ def test_get_subject(self): 'donor1', 'donor2'], 'Feature1': [1, 0, 1, 1, 1, 1], 'Feature3': [1, 1, 1, 1, 1, 1]}).set_index('id') - peds_df = pd.DataFrame(columns=['id', + pedf_df = pd.DataFrame(columns=['id', 'transfered_donor_features', 'total_donor_features', 'donor', 'subject', 'group']) - peds_df = _compute_peds(peds_df=peds_df, peds_type="Sample", - peds_time=np.nan, - reference_series=reference_series, - table=table_df, metadata=metadata_df, - time_column="group", reference_column="Ref", - subject_column="subject") - peds_df = peds_df.set_index("id") - subject = peds_df.at["sample1", "subject"] + pedf_df = _compute_proportion(df=pedf_df, type="PEDF", + time=np.nan, + reference_series=reference_series, + table=table_df, metadata=metadata_df, + time_column="group", + reference_column="Ref", + subject_column="subject") + pedf_df = pedf_df.set_index("id") + subject = pedf_df.at["sample1", "subject"] self.assertEqual(subject, "sub1") def test_timepoint(self): @@ -849,18 +851,19 @@ def test_timepoint(self): 'Feature1': [1, 0, 1, 1, 1, 1], 'Feature2': [1, 1, 1, 1, 1, 1], 'Feature3': [0, 0, 1, 1, 1, 1]}).set_index('id') - peds_df = pd.DataFrame(columns=['id', + pedf_df = pd.DataFrame(columns=['id', 'transfered_donor_features', 'total_donor_features', 'donor', 'subject', 'group']) - peds_df = _compute_peds(peds_df=peds_df, peds_type="Sample", - peds_time=np.nan, - reference_series=reference_series, - table=table_df, metadata=metadata_df, - time_column="group", 
reference_column="Ref", - subject_column="subject") - peds_df = peds_df.set_index("id") - tp = peds_df.at["sample3", "group"] + pedf_df = _compute_proportion(df=pedf_df, type="PEDF", + time=np.nan, + reference_series=reference_series, + table=table_df, metadata=metadata_df, + time_column="group", + reference_column="Ref", + subject_column="subject") + pedf_df = pedf_df.set_index("id") + tp = pedf_df.at["sample3", "group"] self.assertEqual(tp, 1) def test_no_donors(self): @@ -950,11 +953,11 @@ def test_no_feature_overlap(self): 'Feature1': [0, 0, 1, 1, 1, 1], 'Feature2': [0, 1, 1, 1, 1, 1], 'Feature3': [0, 0, 1, 1, 1, 1]}).set_index('id') - sample_peds_df = sample_peds(table=table_df, metadata=metadata, - time_column="group", - reference_column="Ref", - subject_column="subject") - exp_peds_df = pd.DataFrame({ + pedf_df = pedf(table=table_df, metadata=metadata, + time_column="group", + reference_column="Ref", + subject_column="subject") + exp_pedf_df = pd.DataFrame({ 'id': ['sample1', 'sample2', 'sample3', 'sample4'], 'transfered_donor_features': [0.0, 1.0, 3.0, 3.0], 'total_donor_features': [3.0, 3.0, 3.0, 3.0], @@ -963,7 +966,7 @@ def test_no_feature_overlap(self): 'group': [1.0, 2.0, 3.0, 2.0], 'measure': [0, 0.333333, 1, 1] }) - pd.testing.assert_frame_equal(sample_peds_df, exp_peds_df) + pd.testing.assert_frame_equal(pedf_df, exp_pedf_df) def test_feature_overlap(self): metadata_df = pd.DataFrame({ @@ -982,18 +985,18 @@ def test_feature_overlap(self): 'Feature1': [0, 0, 1, 1, 1, 1], 'Feature2': [0, 1, 1, 1, 1, 1], 'Feature3': [0, 0, 1, 1, 1, 1]}).set_index('id') - sample_peds_df = sample_peds(table=table_df, metadata=metadata, - time_column="group", - reference_column="Ref", - subject_column="subject") - TDFs1 = sample_peds_df.set_index("id").at['sample1', - 'transfered_donor_features'] - TDFs2 = sample_peds_df.set_index("id").at['sample2', - 'transfered_donor_features'] - TDFs3 = sample_peds_df.set_index("id").at['sample3', - 'transfered_donor_features'] - 
TDFs4 = sample_peds_df.set_index("id").at['sample4', - 'transfered_donor_features'] + pedf_df = pedf(table=table_df, metadata=metadata, + time_column="group", + reference_column="Ref", + subject_column="subject") + TDFs1 = pedf_df.set_index("id").at['sample1', + 'transfered_donor_features'] + TDFs2 = pedf_df.set_index("id").at['sample2', + 'transfered_donor_features'] + TDFs3 = pedf_df.set_index("id").at['sample3', + 'transfered_donor_features'] + TDFs4 = pedf_df.set_index("id").at['sample4', + 'transfered_donor_features'] self.assertEqual(TDFs2, 1) self.assertEqual(TDFs1, 0) self.assertEqual(TDFs3, 3) @@ -1016,18 +1019,18 @@ def test_peds_calc(self): 'Feature1': [0, 0, 1, 1, 1, 1], 'Feature2': [0, 1, 1, 1, 1, 1], 'Feature3': [0, 0, 1, 1, 1, 1]}).set_index('id') - sample_peds_df = sample_peds(table=table_df, metadata=metadata, - time_column="group", - reference_column="Ref", - subject_column="subject") - TDFs1 = sample_peds_df.set_index("id").at['sample1', - 'measure'] - TDFs2 = sample_peds_df.set_index("id").at['sample2', - 'measure'] - TDFs3 = sample_peds_df.set_index("id").at['sample3', - 'measure'] - TDFs4 = sample_peds_df.set_index("id").at['sample4', - 'measure'] + pedf_df = pedf(table=table_df, metadata=metadata, + time_column="group", + reference_column="Ref", + subject_column="subject") + TDFs1 = pedf_df.set_index("id").at['sample1', + 'measure'] + TDFs2 = pedf_df.set_index("id").at['sample2', + 'measure'] + TDFs3 = pedf_df.set_index("id").at['sample3', + 'measure'] + TDFs4 = pedf_df.set_index("id").at['sample4', + 'measure'] self.assertEqual(TDFs2, 1/3) self.assertEqual(TDFs1, 0) self.assertEqual(TDFs3, 1) @@ -1052,12 +1055,12 @@ def test_unique_subjects_in_timepoints(self): 'Feature3': [0, 0, 1, 1, 1, 1]}).set_index('id') with self.assertRaisesRegex(ValueError, 'There is more than one' ' occurrence of.*Subject sub1.*[1,2,2]'): - sample_peds(table=table_df, metadata=metadata, - time_column="group", - reference_column="Ref", - subject_column="subject") + 
pedf(table=table_df, metadata=metadata, + time_column="group", + reference_column="Ref", + subject_column="subject") - def test_feature_peds_calc(self): + def test_prdf_calc(self): metadata_df = pd.DataFrame({ 'id': ['sample1', 'sample2', 'sample3', 'donor1'], @@ -1071,18 +1074,18 @@ def test_feature_peds_calc(self): 'Feature1': [0, 0, 1, 1], 'Feature2': [0, 1, 1, 1], 'Feature3': [0, 0, 1, 0]}).set_index('id') - feature_peds_df = feature_peds(table=table_df, metadata=metadata, - time_column="group", - reference_column="Ref", - subject_column="subject") - TDFs1 = feature_peds_df.set_index("id").at['Feature1', - 'measure'] - TDFs2 = feature_peds_df.set_index("id").at['Feature2', - 'measure'] + prdf_df = prdf(table=table_df, metadata=metadata, + time_column="group", + reference_column="Ref", + subject_column="subject") + TDFs1 = prdf_df.set_index("id").at['Feature1', + 'measure'] + TDFs2 = prdf_df.set_index("id").at['Feature2', + 'measure'] self.assertEqual(TDFs1, 1/3) self.assertEqual(TDFs2, 2/3) - def test_feature_peds_calc_2_tp(self): + def test_prdf_calc_2_tp(self): metadata_df = pd.DataFrame({ 'id': ['sample1', 'sample2', 'sample3', 'donor1'], @@ -1096,14 +1099,14 @@ def test_feature_peds_calc_2_tp(self): 'Feature1': [0, 0, 1, 1], 'Feature2': [0, 1, 1, 1], 'Feature3': [0, 0, 1, 0]}).set_index('id') - feature_peds_df = feature_peds(table=table_df, metadata=metadata, - time_column="group", - reference_column="Ref", - subject_column="subject") - TDFs1 = feature_peds_df.set_index("id").at['Feature1', - 'measure'].values - TDFs2 = feature_peds_df.set_index("id").at['Feature2', - 'measure'].values + prdf_df = prdf(table=table_df, metadata=metadata, + time_column="group", + reference_column="Ref", + subject_column="subject") + TDFs1 = prdf_df.set_index("id").at['Feature1', + 'measure'].values + TDFs2 = prdf_df.set_index("id").at['Feature2', + 'measure'].values np.testing.assert_array_equal(TDFs1, [.5, 0.]) np.testing.assert_array_equal(TDFs2, [.5, 1.]) @@ -1127,10 
+1130,10 @@ def test_sample_id_match(self): with self.assertRaisesRegex(ValueError, "The following IDs are not" " present in the metadata: 'd1', 'd2'," " 's1', 's2', 's3', 's4'"): - feature_peds(table=table_df, metadata=metadata, - time_column="group", - reference_column="Ref", - subject_column="subject") + prdf(table=table_df, metadata=metadata, + time_column="group", + reference_column="Ref", + subject_column="subject") def test_column_type_nonnumeric(self): metadata_df = pd.DataFrame({ @@ -1183,17 +1186,17 @@ def test_reference_series_not_in_table(self): 'Feature1': [1, 0, 1, 1, 1, 1], 'Feature2': [1, 1, 1, 1, 1, 1], 'Feature3': [0, 0, 1, 1, 1, 1]}).set_index('id') - peds_df = pd.DataFrame(columns=['id', 'measure', + pedf_df = pd.DataFrame(columns=['id', 'measure', 'transfered_donor_features', 'total_donor_features', 'donor', 'subject', 'group']) with self.assertRaisesRegex(AssertionError, ".*['1' '2'].*"): - _compute_peds(peds_df=peds_df, peds_type="Sample", - peds_time=np.nan, - reference_series=reference_series, - table=table_df, metadata=metadata_df, - time_column="group", reference_column="Ref", - subject_column="subject") + _compute_proportion(df=pedf_df, type="PEDF", + time=np.nan, + reference_series=reference_series, + table=table_df, metadata=metadata_df, + time_column="group", reference_column="Ref", + subject_column="subject") def test_column_name_is_ID(self): metadata_df = pd.DataFrame({ @@ -1224,15 +1227,15 @@ def test_rename_features_with_delim(self): 'Feature;1': [0, 0, 1, 1], 'Feature;2': [0, 1, 1, 1], 'Feature;3': [0, 0, 1, 0]}).set_index('id') - feature_peds_df = feature_peds(table=table_df, metadata=metadata, - time_column="group", - reference_column="Ref", - subject_column="subject") - _rename_features(data=feature_peds_df, level_delimiter=";") - Fs1 = feature_peds_df.set_index("id").at['Feature 1', - 'subject'] - Fs2 = feature_peds_df.set_index("id").at['Feature 2', - 'subject'] + prdf_df = prdf(table=table_df, metadata=metadata, + 
time_column="group", + reference_column="Ref", + subject_column="subject") + _rename_features(data=prdf_df, level_delimiter=";") + Fs1 = prdf_df.set_index("id").at['Feature 1', + 'subject'] + Fs2 = prdf_df.set_index("id").at['Feature 2', + 'subject'] self.assertEqual("1", Fs1) self.assertEqual("2", Fs2) @@ -1250,15 +1253,15 @@ def test_rename_features_with_no_delim(self): 'Feature1': [0, 0, 1, 1], 'Feature2': [0, 1, 1, 1], 'Feature3': [0, 0, 1, 0]}).set_index('id') - feature_peds_df = feature_peds(table=table_df, metadata=metadata, - time_column="group", - reference_column="Ref", - subject_column="subject") - _rename_features(data=feature_peds_df, level_delimiter=None) - Fs1 = feature_peds_df.set_index("id").at['Feature1', - 'subject'] - Fs2 = feature_peds_df.set_index("id").at['Feature2', - 'subject'] + prdf_df = prdf(table=table_df, metadata=metadata, + time_column="group", + reference_column="Ref", + subject_column="subject") + _rename_features(data=prdf_df, level_delimiter=None) + Fs1 = prdf_df.set_index("id").at['Feature1', + 'subject'] + Fs2 = prdf_df.set_index("id").at['Feature2', + 'subject'] self.assertEqual("Feature1", Fs1) self.assertEqual("Feature2", Fs2) @@ -1276,15 +1279,15 @@ def test_rename_features_with_wrong_delim(self): 'Feature;1': [0, 0, 1, 1], 'Feature;2': [0, 1, 1, 1], 'Feature;3': [0, 0, 1, 0]}).set_index('id') - feature_peds_df = feature_peds(table=table_df, metadata=metadata, - time_column="group", - reference_column="Ref", - subject_column="subject") - _rename_features(data=feature_peds_df, level_delimiter=":") - Fs1 = feature_peds_df.set_index("id").at['Feature;1', - 'subject'] - Fs2 = feature_peds_df.set_index("id").at['Feature;2', - 'subject'] + prdf_df = prdf(table=table_df, metadata=metadata, + time_column="group", + reference_column="Ref", + subject_column="subject") + _rename_features(data=prdf_df, level_delimiter=":") + Fs1 = prdf_df.set_index("id").at['Feature;1', + 'subject'] + Fs2 = prdf_df.set_index("id").at['Feature;2', + 
'subject'] self.assertEqual("Feature;1", Fs1) self.assertEqual("Feature;2", Fs2) @@ -1302,15 +1305,15 @@ def test_rename_features_with_blank_label(self): 'Feature;1;__': [0, 0, 1, 1], 'Feature;2': [0, 1, 1, 1], 'Feature;3': [0, 0, 1, 0]}).set_index('id') - feature_peds_df = feature_peds(table=table_df, metadata=metadata, - time_column="group", - reference_column="Ref", - subject_column="subject") - _rename_features(data=feature_peds_df, level_delimiter=";") - Fs1 = feature_peds_df.set_index("id").at['Feature 1 __', - 'subject'] - Fs2 = feature_peds_df.set_index("id").at['Feature 2', - 'subject'] + prdf_df = prdf(table=table_df, metadata=metadata, + time_column="group", + reference_column="Ref", + subject_column="subject") + _rename_features(data=prdf_df, level_delimiter=";") + Fs1 = prdf_df.set_index("id").at['Feature 1 __', + 'subject'] + Fs2 = prdf_df.set_index("id").at['Feature 2', + 'subject'] self.assertEqual("1", Fs1) self.assertEqual("2", Fs2) @@ -1331,11 +1334,11 @@ def test_peds_nan_tp(self): 'Feature1': [0, 0, 1, 1, 1, 1], 'Feature2': [0, 1, 1, 1, 1, 1], 'Feature3': [0, 0, 1, 1, 1, 1]}).set_index('id') - sample_peds_df = sample_peds(table=table_df, metadata=metadata, - time_column="group", - reference_column="Ref", - subject_column="subject") - obs_samples = sample_peds_df['id'].to_list() + pedf_df = pedf(table=table_df, metadata=metadata, + time_column="group", + reference_column="Ref", + subject_column="subject") + obs_samples = pedf_df['id'].to_list() exp_sample = ['sample1', 'sample2', 'sample4'] self.assertEqual(obs_samples, exp_sample) @@ -1359,12 +1362,12 @@ def test_peds_no_donor_in_table(self): with self.assertRaisesRegex(KeyError, "References included in the" " metadata are missing from the feature" " table.*"): - sample_peds(table=table_df, metadata=metadata, - time_column="group", - reference_column="Ref", - subject_column="subject") + pedf(table=table_df, metadata=metadata, + time_column="group", + reference_column="Ref", + 
subject_column="subject") - def test_peds_no_donor_in_table_flag(self): + def test_pedf_no_donor_in_table_flag(self): metadata_df = pd.DataFrame({ 'id': ['sample1', 'sample2', 'sample3', 'sample4', 'donor1', 'donor2'], @@ -1381,12 +1384,12 @@ def test_peds_no_donor_in_table_flag(self): 'Feature1': [0, 0, 1, 1, 1], 'Feature2': [0, 1, 1, 1, 1], 'Feature3': [0, 0, 1, 1, 1]}).set_index('id') - sample_peds_df = sample_peds(table=table_df, metadata=metadata, - time_column="group", - reference_column="Ref", - subject_column="subject", - filter_missing_references=True) - obs_samples = sample_peds_df['id'].to_list() + pedf_df = pedf(table=table_df, metadata=metadata, + time_column="group", + reference_column="Ref", + subject_column="subject", + filter_missing_references=True) + obs_samples = pedf_df['id'].to_list() exp_sample = ['sample1', 'sample2'] self.assertEqual(obs_samples, exp_sample) @@ -1402,13 +1405,13 @@ def test_pprs(self): 'sample5', 'sample6'], 'Feature1': [1, 0, 1, 0, 0, 0], 'Feature2': [0, 0, 1, 1, 0, 1]}).set_index('id') - sample_pprs_df = sample_pprs(table=table_df, metadata=metadata, - time_column="group", - subject_column="subject", - baseline_timepoint="1", - filter_missing_references=False) + pprf_df = pprf(table=table_df, metadata=metadata, + time_column="group", + subject_column="subject", + baseline_timepoint="1", + filter_missing_references=False) - exp_pprs_df = pd.DataFrame({ + exp_pprf_df = pd.DataFrame({ 'id': ['sample2', 'sample3', 'sample5', 'sample6'], 'transfered_baseline_features': [0.0, 1.0, 0.0, 1.0], 'total_baseline_features': [1.0, 1.0, 1.0, 1.0], @@ -1417,9 +1420,9 @@ def test_pprs(self): 'group': [2.0, 3.0, 2.0, 3.0], 'measure': [0.0, 1.0, 0.0, 1.0] }) - pd.testing.assert_frame_equal(sample_pprs_df, exp_pprs_df) + pd.testing.assert_frame_equal(pprf_df, exp_pprf_df) - def test_pprs_baseline_sub(self): + def test_pprf_baseline_sub(self): metadata_df = pd.DataFrame({ 'id': ['sample1', 'sample2', 'sample3', 'sample4', 'pre1', 'pre2'], @@ 
-1436,11 +1439,11 @@ def test_pprs_baseline_sub(self): 'Feature3': [0, 0, 1, 1, 1, 1]}).set_index('id') with self.assertRaisesRegex(AssertionError, "No baseline samples" " were connected via subject. .*"): - sample_pprs(table=table_df, metadata=metadata, - time_column="group", - baseline_timepoint="0", - subject_column="subject", - filter_missing_references=False) + pprf(table=table_df, metadata=metadata, + time_column="group", + baseline_timepoint="0", + subject_column="subject", + filter_missing_references=False) def test_create_used_references(self): reference_series = pd.Series(data=['donor1', 'donor1'], @@ -1476,11 +1479,11 @@ def test_pprs_missing_baseline(self): 'Feature2': [0, 0, 1, 1, 0, 1]}).set_index('id') with self.assertRaisesRegex(KeyError, "Missing references for the" " associated sample data. .*"): - sample_pprs(table=table_df, metadata=metadata, - time_column="group", - subject_column="subject", - baseline_timepoint="1", - filter_missing_references=False) + pprf(table=table_df, metadata=metadata, + time_column="group", + subject_column="subject", + baseline_timepoint="1", + filter_missing_references=False) def test_median(self): df = pd.DataFrame({ @@ -1567,17 +1570,17 @@ def test_peds_boots(self): 'Feature1': [0, 0, 1, 1, 1, 1], 'Feature2': [0, 1, 1, 1, 1, 1], 'Feature3': [0, 0, 1, 1, 1, 1]}).set_index('id') - sample_peds_df = sample_peds(table=table_df, metadata=metadata, - time_column="group", - reference_column="Ref", - subject_column="subject", - num_resamples=1, - sampling_depth=3) - - TDFs3 = sample_peds_df.set_index("id").at['sample3', - 'measure'] - TDFs4 = sample_peds_df.set_index("id").at['sample4', - 'measure'] + pedf_df = pedf(table=table_df, metadata=metadata, + time_column="group", + reference_column="Ref", + subject_column="subject", + num_resamples=1, + sampling_depth=3) + + TDFs3 = pedf_df.set_index("id").at['sample3', + 'measure'] + TDFs4 = pedf_df.set_index("id").at['sample4', + 'measure'] self.assertEqual(TDFs3, 1) 
self.assertEqual(TDFs4, 1) @@ -1645,13 +1648,13 @@ def test_high_donor_overlap(self): 'Feature3': [0, 0, 10, 0, 0, 10]}).set_index('id') metadata = Metadata(metadata_df) - _, stats, _ = peds_simulation(metadata=metadata, - table=table_df, - time_column="group", - reference_column="Ref", - subject_column="subject", - num_resamples=999, - sampling_depth=9) + _, stats, _ = pedf_permutation_test(metadata=metadata, + table=table_df, + time_column="group", + reference_column="Ref", + subject_column="subject", + num_resamples=999, + sampling_depth=9) real_median = np.median(stats["A:measure"].values) fake_median = np.median(stats["B:measure"].values) self.assertGreater(real_median, fake_median) @@ -1681,13 +1684,13 @@ def test_low_donor_overlap(self): 'Feature3': [0, 0, 10, 10, 10, 0]}).set_index('id') metadata = Metadata(metadata_df) - _, stats, _ = peds_simulation(metadata=metadata, - table=table_df, - time_column="group", - reference_column="Ref", - subject_column="subject", - num_resamples=999, - sampling_depth=9) + _, stats, _ = pedf_permutation_test(metadata=metadata, + table=table_df, + time_column="group", + reference_column="Ref", + subject_column="subject", + num_resamples=999, + sampling_depth=9) real_median = np.median(stats["A:measure"].values) fake_median = np.median(stats["B:measure"].values) @@ -1713,13 +1716,13 @@ def test_single_donor(self): with self.assertRaisesRegex(AssertionError, "There is only one" " donated microbiome in your data. *"): - peds_simulation(metadata=metadata, - table=table_df, - time_column="group", - reference_column="Ref", - subject_column="subject", - num_resamples=999, - sampling_depth=1) + pedf_permutation_test(metadata=metadata, + table=table_df, + time_column="group", + reference_column="Ref", + subject_column="subject", + num_resamples=999, + sampling_depth=1) def test_create_mismatched_pairs(self): metadata_df = pd.DataFrame({