Skip to content

Commit

Permalink
Merge pull request Reed-CompBio#143 from ntalluri/debugging
Browse files Browse the repository at this point in the history
Debugging ML
  • Loading branch information
agitter authored Jun 14, 2024
2 parents 5616f76 + ab55402 commit 1f59293
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 5 deletions.
20 changes: 15 additions & 5 deletions spras/analysis/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra
edge_tuples = []
for file in file_paths:
try:
# collecting and sorting the edge pairs per algortihm
# collecting and sorting the edge pairs per algorithm
with open(file, 'r') as f:
lines = f.readlines()

Expand All @@ -61,9 +61,8 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra
p = PurePath(file)
edge_tuples.append((p.parts[-2], edges))

except FileNotFoundError:
print(file, ' not found during ML analysis') # should not hit this
continue
except FileNotFoundError as exc:
raise FileNotFoundError(str(file) + ' not found during ML analysis') from exc

# initially construct separate dataframes per algorithm
edge_dataframes = []
Expand All @@ -81,8 +80,19 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra
concated_df = pd.concat(edge_dataframes, axis=1, join='outer')
concated_df = concated_df.fillna(0)
concated_df = concated_df.astype('int64')

# don't do ml post-processing if there is an empty dataframe or the number of samples is <= 1
if concated_df.empty:
raise ValueError("ML post-processing cannot proceed because the summarize network dataframe is empty.\nWe "
"suggest setting ml include: false in the configuration file to avoid this error.")
if min(concated_df.shape) <= 1:
raise ValueError(f"ML post-processing cannot proceed because the available number of pathways is insufficient. "
f"The ml post-processing requires more than one pathway, but currently "
f"there are only {min(concated_df.shape)} pathways.")

return concated_df


def create_palette(column_names):
"""
Generates a dictionary mapping each column name (algorithm name)
Expand Down Expand Up @@ -141,7 +151,7 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord:

# saving the coordinates of each algorithm
make_required_dirs(output_coord)
coordinates_df = pd.DataFrame(X_pca, columns = ['PC' + str(i) for i in range(1, components+1)])
coordinates_df = pd.DataFrame(X_pca, columns=['PC' + str(i) for i in range(1, components+1)])
coordinates_df.insert(0, 'algorithm', columns.tolist())
coordinates_df.to_csv(output_coord, sep='\t', index=False)

Expand Down
1 change: 1 addition & 0 deletions test/ml/input/test-data-single/single.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
L M 1 U
9 changes: 9 additions & 0 deletions test/ml/test_ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from pathlib import Path

import pandas as pd
import pytest

import spras.analysis.ml as ml

Expand All @@ -25,6 +26,14 @@ def test_summarize_networks(self):
dataframe.to_csv(OUT_DIR + 'dataframe.csv')
assert filecmp.cmp(OUT_DIR + 'dataframe.csv', EXPECT_DIR + 'expected-dataframe.csv', shallow=False)

def test_summarize_networks_empty(self):
    """Check that summarize_networks raises ValueError instead of returning an empty dataframe."""
    # NOTE(review): test-data-empty/empty.txt is presumably an empty pathway file - confirm against the fixture
    with pytest.raises(ValueError):  # an empty summary dataframe cannot be used for ML post-processing
        ml.summarize_networks([INPUT_DIR + 'test-data-empty/empty.txt'])

def test_single_line(self):
    """Check that summarize_networks raises ValueError for a single one-line pathway file."""
    # single.txt holds exactly one edge line, so the summary dataframe has a single row;
    # summarize_networks rejects any dataframe whose smallest dimension is <= 1
    with pytest.raises(ValueError):  # a single-row dataframe is insufficient for ML post-processing
        ml.summarize_networks([INPUT_DIR + 'test-data-single/single.txt'])

def test_pca(self):
dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt'])
ml.pca(dataframe, OUT_DIR + 'pca.png', OUT_DIR + 'pca-variance.txt',
Expand Down

0 comments on commit 1f59293

Please sign in to comment.