Skip to content

Commit

Permalink
Merge pull request Reed-CompBio#101 from ntalluri/ensemble
Browse files Browse the repository at this point in the history
Ensemble Networks
  • Loading branch information
agitter authored Jul 20, 2023
2 parents 7808e59 + 533e137 commit ca6bbea
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 7 deletions.
4 changes: 3 additions & 1 deletion Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def make_final_input(wildcards):
final_input.extend(expand('{out_dir}{sep}{dataset}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))

if len(final_input) == 0:
# No analysis added yet, so add reconstruction output files if they exist.
Expand Down Expand Up @@ -258,12 +259,13 @@ rule ml_analysis:
hac_clusters_vertical = SEP.join([out_dir, '{dataset}-hac-clusters-vertical.txt']),
hac_image_horizontal = SEP.join([out_dir, '{dataset}-hac-horizontal.png']),
hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-hac-clusters-horizontal.txt']),

ensemble_network_file = SEP.join([out_dir,'{dataset}-ensemble-pathway.txt'])
run:
summary_df = ml.summarize_networks(input.pathways)
ml.pca(summary_df, output.pca_image, output.pca_variance, output.pca_coordinates, **pca_params)
ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params)
ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params)
ml.ensemble_network(summary_df, output.ensemble_network_file)

# Remove the output directory
rule clean:
Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ channels:
- conda-forge
dependencies:
- adjusttext=0.7.3.1
- bioconda::snakemake-minimal=7.18.2
- bioconda::snakemake-minimal=7.19.1
- docker-py=5.0
- matplotlib=3.5
- networkx=2.8
Expand Down
20 changes: 18 additions & 2 deletions src/analysis/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,8 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra
concated_df = pd.concat(edge_dataframes, axis=1, join='outer')
concated_df = concated_df.fillna(0)
concated_df = concated_df.astype('int64')

return concated_df


def create_palette(column_names):
"""
Generates a dictionary mapping each column name (algorithm name)
Expand Down Expand Up @@ -289,3 +287,21 @@ def hac_horizontal(dataframe: pd.DataFrame, output_png: str, output_file: str, l
clusters_df.to_csv(output_file, sep='\t', index=False)
make_required_dirs(output_png)
plt.savefig(output_png, bbox_inches="tight", dpi=DPI)


def ensemble_network(dataframe: pd.DataFrame, output_file: str):
    """
    Build an ensemble pathway by averaging the binary edge-indicator columns per edge.
    The mean over pathways is the fraction of pathways that contain each edge; edges
    with higher frequencies recur across reconstructions and are therefore more likely
    to be robust, which supports downstream frequency-based filtering of the network.
    @param dataframe: binary dataframe of edge presence and absence in each pathway from summarize_networks
    @param output_file: the filename to save the ensemble network
    """
    # Mean across pathway columns = per-edge appearance frequency; the index holds the edge keys.
    frequencies = dataframe.mean(axis=1, numeric_only=True)
    ensemble_df = frequencies.reset_index()
    ensemble_df.columns = ['Edges', 'Frequency']
    # Edge keys are '<node1><NODE_SEP><node2>' strings; split them back into two node columns.
    split_nodes = ensemble_df['Edges'].str.split(NODE_SEP, expand=True, regex=False)
    ensemble_df['Node1'] = split_nodes[0]
    ensemble_df['Node2'] = split_nodes[1]
    # Keep only the node pair and its frequency, in output column order.
    ensemble_df = ensemble_df[['Node1', 'Node2', 'Frequency']]
    make_required_dirs(output_file)
    # Tab-separated, no index and no header row, matching the expected ensemble file format.
    ensemble_df.to_csv(output_file, sep='\t', index=False, header=False)
7 changes: 7 additions & 0 deletions test/ml/expected/expected-ensemble-network.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
A B 0.6666666666666666
C D 0.6666666666666666
E F 0.6666666666666666
L M 0.6666666666666666
M N 0.3333333333333333
O P 0.3333333333333333
P Q 0.3333333333333333
16 changes: 13 additions & 3 deletions test/ml/test_ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@ def test_summarize_networks(self):
def test_pca(self):
dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt'])
ml.pca(dataframe, OUT_DIR + 'pca.png', OUT_DIR + 'pca-variance.txt',
OUT_DIR + 'pca-coordinates.csv')
coord = pd.read_table(OUT_DIR + 'pca-coordinates.csv')
OUT_DIR + 'pca-coordinates.tsv')
coord = pd.read_table(OUT_DIR + 'pca-coordinates.tsv')
coord = coord.round(5) # round values to 5 digits to account for numeric differences across machines
expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates.csv')
expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates.tsv')
expected = expected.round(5)

assert coord.equals(expected)
Expand All @@ -48,3 +48,13 @@ def test_hac_vertical(self):

assert filecmp.cmp(OUT_DIR + 'hac-clusters-vertical.txt', EXPECT_DIR + 'expected-hac-vertical-clusters.txt')

def test_ensemble_network(self):
    """Run ensemble_network on the three sample pathways and compare against the expected file."""
    pathway_files = [INPUT_DIR + 'test-data-s1/s1.txt',
                     INPUT_DIR + 'test-data-s2/s2.txt',
                     INPUT_DIR + 'test-data-s3/s3.txt']
    dataframe = ml.summarize_networks(pathway_files)
    ml.ensemble_network(dataframe, OUT_DIR + 'ensemble-network.tsv')

    # Round to 5 digits to account for numeric differences across machines
    en = pd.read_table(OUT_DIR + 'ensemble-network.tsv').round(5)
    expected = pd.read_table(EXPECT_DIR + 'expected-ensemble-network.tsv').round(5)

    assert en.equals(expected)

0 comments on commit ca6bbea

Please sign in to comment.