Skip to content

Commit

Permalink
Merge pull request Reed-CompBio#101 from ntalluri/ensemble
Browse files Browse the repository at this point in the history
Ensemble Networks
  • Loading branch information
agitter authored Jul 20, 2023
2 parents 7808e59 + 533e137 commit ca6bbea
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 7 deletions.
4 changes: 3 additions & 1 deletion Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def make_final_input(wildcards):
final_input.extend(expand('{out_dir}{sep}{dataset}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))

if len(final_input) == 0:
# No analysis added yet, so add reconstruction output files if they exist.
Expand Down Expand Up @@ -258,12 +259,13 @@ rule ml_analysis:
hac_clusters_vertical = SEP.join([out_dir, '{dataset}-hac-clusters-vertical.txt']),
hac_image_horizontal = SEP.join([out_dir, '{dataset}-hac-horizontal.png']),
hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-hac-clusters-horizontal.txt']),

ensemble_network_file = SEP.join([out_dir,'{dataset}-ensemble-pathway.txt'])
run:
summary_df = ml.summarize_networks(input.pathways)
ml.pca(summary_df, output.pca_image, output.pca_variance, output.pca_coordinates, **pca_params)
ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params)
ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params)
ml.ensemble_network(summary_df, output.ensemble_network_file)

# Remove the output directory
rule clean:
Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ channels:
- conda-forge
dependencies:
- adjusttext=0.7.3.1
- bioconda::snakemake-minimal=7.18.2
- bioconda::snakemake-minimal=7.19.1
- docker-py=5.0
- matplotlib=3.5
- networkx=2.8
Expand Down
20 changes: 18 additions & 2 deletions src/analysis/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,8 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra
concated_df = pd.concat(edge_dataframes, axis=1, join='outer')
concated_df = concated_df.fillna(0)
concated_df = concated_df.astype('int64')

return concated_df


def create_palette(column_names):
"""
Generates a dictionary mapping each column name (algorithm name)
Expand Down Expand Up @@ -289,3 +287,21 @@ def hac_horizontal(dataframe: pd.DataFrame, output_png: str, output_file: str, l
clusters_df.to_csv(output_file, sep='\t', index=False)
make_required_dirs(output_png)
plt.savefig(output_png, bbox_inches="tight", dpi=DPI)


def ensemble_network(dataframe: pd.DataFrame, output_file: str):
    """
    Build an ensemble pathway by averaging the binary edge-indicator columns per edge.
    The mean over pathways is the fraction of pathways that contain each edge; edges
    with higher frequencies recur across reconstructions and are therefore more likely
    to be robust, which supports downstream frequency-based filtering of the network.
    @param dataframe: binary dataframe of edge presence and absence in each pathway from summarize_networks
    @param output_file: the filename to save the ensemble network
    """
    # Mean across pathway columns = per-edge appearance frequency; the index holds the edge keys.
    frequencies = dataframe.mean(axis=1, numeric_only=True)
    ensemble_df = frequencies.reset_index()
    ensemble_df.columns = ['Edges', 'Frequency']
    # Edge keys are '<node1><NODE_SEP><node2>' strings; split them back into two node columns.
    split_nodes = ensemble_df['Edges'].str.split(NODE_SEP, expand=True, regex=False)
    ensemble_df['Node1'] = split_nodes[0]
    ensemble_df['Node2'] = split_nodes[1]
    # Keep only the node pair and its frequency, in output column order.
    ensemble_df = ensemble_df[['Node1', 'Node2', 'Frequency']]
    make_required_dirs(output_file)
    # Tab-separated, no index and no header row, matching the expected ensemble file format.
    ensemble_df.to_csv(output_file, sep='\t', index=False, header=False)
7 changes: 7 additions & 0 deletions test/ml/expected/expected-ensemble-network.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
A B 0.6666666666666666
C D 0.6666666666666666
E F 0.6666666666666666
L M 0.6666666666666666
M N 0.3333333333333333
O P 0.3333333333333333
P Q 0.3333333333333333
16 changes: 13 additions & 3 deletions test/ml/test_ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@ def test_summarize_networks(self):
def test_pca(self):
dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt'])
ml.pca(dataframe, OUT_DIR + 'pca.png', OUT_DIR + 'pca-variance.txt',
OUT_DIR + 'pca-coordinates.csv')
coord = pd.read_table(OUT_DIR + 'pca-coordinates.csv')
OUT_DIR + 'pca-coordinates.tsv')
coord = pd.read_table(OUT_DIR + 'pca-coordinates.tsv')
coord = coord.round(5) # round values to 5 digits to account for numeric differences across machines
expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates.csv')
expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates.tsv')
expected = expected.round(5)

assert coord.equals(expected)
Expand All @@ -48,3 +48,13 @@ def test_hac_vertical(self):

assert filecmp.cmp(OUT_DIR + 'hac-clusters-vertical.txt', EXPECT_DIR + 'expected-hac-vertical-clusters.txt')

def test_ensemble_network(self):
    """Run ensemble_network on the three sample pathways and compare against the expected file."""
    pathway_files = [INPUT_DIR + 'test-data-s1/s1.txt',
                     INPUT_DIR + 'test-data-s2/s2.txt',
                     INPUT_DIR + 'test-data-s3/s3.txt']
    dataframe = ml.summarize_networks(pathway_files)
    ml.ensemble_network(dataframe, OUT_DIR + 'ensemble-network.tsv')

    # Round to 5 digits to account for numeric differences across machines
    en = pd.read_table(OUT_DIR + 'ensemble-network.tsv').round(5)
    expected = pd.read_table(EXPECT_DIR + 'expected-ensemble-network.tsv').round(5)

    assert en.equals(expected)

0 comments on commit ca6bbea

Please sign in to comment.