diff --git a/.github/workflows/test-spras.yml b/.github/workflows/test-spras.yml
index 0fc4e9ed..e2dbf3dd 100644
--- a/.github/workflows/test-spras.yml
+++ b/.github/workflows/test-spras.yml
@@ -81,6 +81,7 @@ jobs:
           docker pull reedcompbio/pathlinker:latest
           docker pull reedcompbio/meo:latest
           docker pull reedcompbio/mincostflow:latest
+          docker pull reedcompbio/allpairs:latest
           docker pull reedcompbio/domino:latest
       - name: Build Omics Integrator 1 Docker image
         uses: docker/build-push-action@v1
@@ -127,6 +128,15 @@ jobs:
           tags: latest
           cache_froms: reedcompbio/mincostflow:latest
           push: false
+      - name: Build All Pairs Shortest Paths Docker image
+        uses: docker/build-push-action@v1
+        with:
+          path: docker-wrappers/AllPairs/.
+          dockerfile: docker-wrappers/AllPairs/Dockerfile
+          repository: reedcompbio/allpairs
+          tags: latest
+          cache_froms: reedcompbio/allpairs:latest
+          push: false
       - name: Build DOMINO Docker image
         uses: docker/build-push-action@v1
         with:
diff --git a/config/config.yaml b/config/config.yaml
index 4b817af3..12718d17 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -72,6 +72,11 @@
       flow: [1] # The flow must be an int
       capacity: [1]
 
+  - name: "allpairs"
+    params:
+      include: true
+      directed: false
+
   - name: "domino"
     params:
       include: true
@@ -80,6 +85,7 @@
       slice_threshold: [0.3]
       module_threshold: [0.05]
 
+
 # Here we specify which pathways to run and other file location information.
 # DataLoader.py can currently only load a single dataset
 # Assume that if a dataset label does not change, the lists of associated input files do not change
diff --git a/docker-wrappers/AllPairs/Dockerfile b/docker-wrappers/AllPairs/Dockerfile
new file mode 100644
index 00000000..d75572e2
--- /dev/null
+++ b/docker-wrappers/AllPairs/Dockerfile
@@ -0,0 +1,7 @@
+# AllPairs wrapper
+FROM python:3.9-alpine3.16
+
+WORKDIR /AllPairs
+
+RUN pip install networkx==2.6.3
+COPY all-pairs-shortest-paths.py /AllPairs/all-pairs-shortest-paths.py
diff --git a/docker-wrappers/AllPairs/README.md b/docker-wrappers/AllPairs/README.md
new file mode 100644
index 00000000..03cdc681
--- /dev/null
+++ b/docker-wrappers/AllPairs/README.md
@@ -0,0 +1,28 @@
+# All Pairs Shortest Paths Docker image
+
+A Docker image for All Pairs Shortest Paths that is available on [DockerHub](https://hub.docker.com/repository/docker/reedcompbio/allpairs).
+This algorithm was implemented by the SPRAS team and relies on the NetworkX [`shortest_path`](https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.generic.shortest_path.html) function.
+
+To create the Docker image, run:
+```
+docker build -t reedcompbio/allpairs -f Dockerfile .
+```
+from this directory.
+
+To inspect the installed Python packages:
+```
+docker run reedcompbio/allpairs pip list
+```
+
+
+## Testing
+Test code is located in `test/AllPairs`.
+The `input` subdirectory contains a sample network and source/target file, along with a second network and source/target file used to check the correctness of All Pairs Shortest Paths.
+The expected output graphs for the sample networks are in the `expected` subdirectory.
+
+The Docker wrapper can be tested with `pytest -k test_ap.py` from the root of the SPRAS repository.
+
+
+## Notes
+- Because the implementation is short, the `all-pairs-shortest-paths.py` script lives directly in SPRAS under `docker-wrappers/AllPairs`.
+- Sample input network and source/target files are located under `test/AllPairs/input`.
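For manual debugging outside of SPRAS, the containerized script can also be invoked by hand. The sketch below is illustrative only: it assumes the image tag from the README, the script path and command-line flags defined in the Dockerfile and `all-pairs-shortest-paths.py` in this patch, and an arbitrary `/data` mount point. SPRAS builds an equivalent command internally in `AllPairs.run`.
```
# Run the wrapper directly on the sample inputs added later in this patch.
# The /data mount point and the output location are illustrative choices.
docker run --rm -v "$(pwd)/test/AllPairs/input:/data" reedcompbio/allpairs \
    python /AllPairs/all-pairs-shortest-paths.py \
    --network /data/sample-in-net.txt \
    --nodes /data/sample-in-nodetypes.txt \
    --output /data/sample-out.txt
```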
diff --git a/docker-wrappers/AllPairs/all-pairs-shortest-paths.py b/docker-wrappers/AllPairs/all-pairs-shortest-paths.py
new file mode 100644
index 00000000..dd6b835c
--- /dev/null
+++ b/docker-wrappers/AllPairs/all-pairs-shortest-paths.py
@@ -0,0 +1,82 @@
+"""
+All Pairs Shortest Paths pathway reconstruction algorithm.
+The algorithm takes a network and a list of sources and targets as input.
+It outputs the union of the edges on the shortest paths between every source and every target.
+"""
+
+import argparse
+from pathlib import Path
+
+import networkx as nx
+
+
+def parse_arguments():
+    """
+    Process command line arguments.
+    @return arguments
+    """
+    parser = argparse.ArgumentParser(
+        description="All Pairs Shortest Paths pathway reconstruction"
+    )
+    parser.add_argument("--network", type=Path, required=True, help="Network file of the form <node1> <node2> <weight>."
+                                                                    " Tab-delimited.")
+    parser.add_argument("--nodes", type=Path, required=True, help="Nodes file of the form <node> <source|target>. "
+                                                                  "Tab-delimited.")
+    parser.add_argument("--output", type=Path, required=True, help="Output file")
+
+    return parser.parse_args()
+
+
+def allpairs(network_file: Path, nodes_file: Path, output_file: Path):
+    if not network_file.exists():
+        raise OSError(f"Network file {str(network_file)} does not exist")
+    if not nodes_file.exists():
+        raise OSError(f"Nodes file {str(nodes_file)} does not exist")
+    if output_file.exists():
+        print(f"Output file {str(output_file)} will be overwritten")
+
+    # Create the parent directories for the output file if needed
+    output_file.parent.mkdir(parents=True, exist_ok=True)
+
+    # Read the list of nodes
+    sources = set()
+    targets = set()
+    with nodes_file.open() as nodes_f:
+        for line in nodes_f:
+            row = line.strip().split(sep='\t')
+            if row[1] == 'source':
+                sources.add(row[0])
+            elif row[1] == 'target':
+                targets.add(row[0])
+
+    # There should be at least one source and one target
+    assert len(sources) > 0, 'There are no sources.'
+    assert len(targets) > 0, 'There are no targets.'
+
+    # Read the graph and assert that all the sources and targets are in the network
+    graph = nx.read_weighted_edgelist(network_file, delimiter='\t')
+    assert len(sources.intersection(graph.nodes())) == len(sources), 'At least one source is not in the interactome.'
+    assert len(targets.intersection(graph.nodes())) == len(targets), 'At least one target is not in the interactome.'
+
+    # Finally, compute the shortest path for each source-target pair and record the union as a subgraph
+    output = nx.Graph()
+    for source in sources:
+        for target in targets:
+            p = nx.shortest_path(graph, source, target, weight='weight')
+            nx.add_path(output, p)
+
+    # Write the subgraph as a list of edges
+    nx.write_edgelist(output, output_file, data=False, delimiter='\t')
+    print(f"Wrote output file to {str(output_file)}")
+
+
+def main():
+    """
+    Parse arguments and run pathway reconstruction
+    """
+    args = parse_arguments()
+    allpairs(args.network, args.nodes, args.output)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docker-wrappers/LocalNeighborhood/local_neighborhood.py b/docker-wrappers/LocalNeighborhood/local_neighborhood.py
index bf26f345..2a2b6096 100644
--- a/docker-wrappers/LocalNeighborhood/local_neighborhood.py
+++ b/docker-wrappers/LocalNeighborhood/local_neighborhood.py
@@ -29,7 +29,7 @@ def local_neighborhood(network_file: Path, nodes_file: Path, output_file: Path):
     if not nodes_file.exists():
         raise OSError(f"Nodes file {str(nodes_file)} does not exist")
     if output_file.exists():
-        print(f"Output files {str(output_file)} will be overwritten")
+        print(f"Output file {str(output_file)} will be overwritten")
 
     # Create the parent directories for the output file if needed
     output_file.parent.mkdir(parents=True, exist_ok=True)
diff --git a/src/allpairs.py b/src/allpairs.py
new file mode 100644
index 00000000..10808415
--- /dev/null
+++ b/src/allpairs.py
@@ -0,0 +1,103 @@
+import warnings
+from pathlib import Path
+
+import pandas as pd
+
+from src.prm import PRM
+from src.util import prepare_volume, run_container
+
+__all__ = ['AllPairs']
+
+
+class AllPairs(PRM):
+    required_inputs = ['nodetypes', 'network']
+
+    @staticmethod
+    def generate_inputs(data, filename_map):
+        """
+        Access fields from the dataset and write the required input files
+        @param data: dataset
+        @param filename_map: a dict mapping file types in the required_inputs to the filename for that type
+        """
+        for input_type in AllPairs.required_inputs:
+            if input_type not in filename_map:
+                raise ValueError(f"{input_type} filename is missing")
+
+        # Get sources and targets for the node input file
+        # Code borrowed from pathlinker.py
+        sources_targets = data.request_node_columns(["sources", "targets"])
+        if sources_targets is None:
+            raise ValueError("All Pairs Shortest Paths requires sources and targets")
+
+        both_series = sources_targets.sources & sources_targets.targets
+        for _index, row in sources_targets[both_series].iterrows():
+            warn_msg = row.NODEID + " has been labeled as both a source and a target."
+            warnings.warn(warn_msg, stacklevel=2)
+
+        # Create the nodetypes file
+        input_df = sources_targets[["NODEID"]].copy()
+        input_df.columns = ["#Node"]
+        input_df.loc[sources_targets["sources"] == True, "Node type"] = "source"
+        input_df.loc[sources_targets["targets"] == True, "Node type"] = "target"
+
+        input_df.to_csv(filename_map["nodetypes"], sep="\t", index=False, columns=["#Node", "Node type"])
+
+        # This is pretty memory intensive. We might want to keep the interactome centralized.
+        data.get_interactome().to_csv(filename_map["network"], sep="\t", index=False,
+                                      columns=["Interactor1", "Interactor2", "Weight"],
+                                      header=["#Interactor1", "Interactor2", "Weight"])
+
+    @staticmethod
+    def run(nodetypes=None, network=None, output_file=None, singularity=False):
+        """
+        Run All Pairs Shortest Paths with Docker
+        @param nodetypes: input node types with sources and targets (required)
+        @param network: input network file (required)
+        @param singularity: if True, run using the Singularity container instead of the Docker container
+        @param output_file: path to the output pathway file (required)
+        """
+        if not nodetypes or not network or not output_file:
+            raise ValueError('Required All Pairs Shortest Paths arguments are missing')
+
+        work_dir = '/apsp'
+
+        # Each volume is a tuple (src, dest)
+        volumes = list()
+
+        bind_path, node_file = prepare_volume(nodetypes, work_dir)
+        volumes.append(bind_path)
+
+        bind_path, network_file = prepare_volume(network, work_dir)
+        volumes.append(bind_path)
+
+        # Create the parent directories for the output file if needed
+        Path(output_file).parent.mkdir(parents=True, exist_ok=True)
+        bind_path, mapped_out_file = prepare_volume(output_file, work_dir)
+        volumes.append(bind_path)
+
+        command = ['python',
+                   '/AllPairs/all-pairs-shortest-paths.py',
+                   '--network', network_file,
+                   '--nodes', node_file,
+                   '--output', mapped_out_file]
+
+        print('Running All Pairs Shortest Paths with arguments: {}'.format(' '.join(command)), flush=True)
+
+        container_framework = 'singularity' if singularity else 'docker'
+        out = run_container(container_framework,
+                            'reedcompbio/allpairs',
+                            command,
+                            volumes,
+                            work_dir)
+        print(out)
+
+    @staticmethod
+    def parse_output(raw_pathway_file, standardized_pathway_file):
+        """
+        Convert a predicted pathway into the universal format
+        @param raw_pathway_file: pathway file produced by an algorithm's run function
+        @param standardized_pathway_file: the same pathway written in the universal format
+        """
+        df = pd.read_csv(raw_pathway_file, sep='\t', header=None)
+        df['Rank'] = 1  # add a rank column of 1s since the edges are not ranked
+        df.to_csv(standardized_pathway_file, header=False, index=False, sep='\t')
diff --git a/src/runner.py b/src/runner.py
index 2a63a699..64a5aa88 100644
--- a/src/runner.py
+++ b/src/runner.py
@@ -1,6 +1,6 @@
-from src.dataset import Dataset
-
 # supported algorithm imports
+from src.allpairs import AllPairs as allpairs
+from src.dataset import Dataset
 from src.domino import DOMINO as domino
 from src.meo import MEO as meo
 from src.mincostflow import MinCostFlow as mincostflow
diff --git a/test/AllPairs/expected/correctness-expected.txt b/test/AllPairs/expected/correctness-expected.txt
new file mode 100644
index 00000000..d2d88c3e
--- /dev/null
+++ b/test/AllPairs/expected/correctness-expected.txt
@@ -0,0 +1,3 @@
+A	B
+A	E
+B	C
diff --git a/test/AllPairs/expected/sample-out-expected.txt b/test/AllPairs/expected/sample-out-expected.txt
new file mode 100644
index 00000000..8985411e
--- /dev/null
+++ b/test/AllPairs/expected/sample-out-expected.txt
@@ -0,0 +1,10 @@
+S1	A
+S1	B
+A	E
+A	F
+E	T1
+T1	F
+F	T2
+F	B
+B	S2
+S2	T3
diff --git a/test/AllPairs/expected/zero-length-expected.txt b/test/AllPairs/expected/zero-length-expected.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/test/AllPairs/input/correctness-network.txt b/test/AllPairs/input/correctness-network.txt
new file mode 100644
index 00000000..661980a8
--- /dev/null
+++ b/test/AllPairs/input/correctness-network.txt
@@ -0,0 +1,6 @@
+#Node1	Node2
+A	B	1
+B	C	1
+C	D	1
+D	E	1
+A	E	1
diff --git a/test/AllPairs/input/correctness-nodetypes.txt b/test/AllPairs/input/correctness-nodetypes.txt
new file mode 100644
index 00000000..f804a182
--- /dev/null
+++ b/test/AllPairs/input/correctness-nodetypes.txt
@@ -0,0 +1,5 @@
+#Node	Node type
+A	source
+B	source
+C	target
+E	target
diff --git a/test/AllPairs/input/sample-in-net.txt b/test/AllPairs/input/sample-in-net.txt
new file mode 100644
index 00000000..6286f346
--- /dev/null
+++ b/test/AllPairs/input/sample-in-net.txt
@@ -0,0 +1,21 @@
+#Node1	Node2
+S1	A	0.5
+A	E	0.5
+E	T1	0.5
+E	F	0.5
+F	E	0.5
+F	A	0.5
+T1	F	0.5
+F	T2	0.5
+B	S1	0.5
+B	F	0.5
+B	C	0.5
+S2	B	0.5
+S2	C	0.5
+S2	T3	0.5
+C	G	0.5
+G	C	0.5
+C	F	0.5
+G	F	0.5
+G	T2	0.5
+G	T3	0.5
diff --git a/test/AllPairs/input/sample-in-nodetypes.txt b/test/AllPairs/input/sample-in-nodetypes.txt
new file mode 100644
index 00000000..05f54a1c
--- /dev/null
+++ b/test/AllPairs/input/sample-in-nodetypes.txt
@@ -0,0 +1,6 @@
+#Node	Node type
+S1	source
+S2	source
+T1	target
+T2	target
+T3	target
diff --git a/test/AllPairs/input/zero-length-network.txt b/test/AllPairs/input/zero-length-network.txt
new file mode 100644
index 00000000..661980a8
--- /dev/null
+++ b/test/AllPairs/input/zero-length-network.txt
@@ -0,0 +1,6 @@
+#Node1	Node2
+A	B	1
+B	C	1
+C	D	1
+D	E	1
+A	E	1
diff --git a/test/AllPairs/input/zero-length-nodetypes.txt b/test/AllPairs/input/zero-length-nodetypes.txt
new file mode 100644
index 00000000..0e36b81d
--- /dev/null
+++ b/test/AllPairs/input/zero-length-nodetypes.txt
@@ -0,0 +1,3 @@
+#Node	Node type
+A	source
+A	target
diff --git a/test/AllPairs/test_ap.py b/test/AllPairs/test_ap.py
new file mode 100644
index 00000000..f1c82f85
--- /dev/null
+++ b/test/AllPairs/test_ap.py
@@ -0,0 +1,105 @@
+import filecmp
+import shutil
+from pathlib import Path
+
+import pytest
+
+from src.allpairs import AllPairs
+
+TEST_DIR = 'test/AllPairs/'
+OUT_DIR = TEST_DIR+'output/'
+EXPECTED_DIR = TEST_DIR+'expected/'
+
+
+class TestAllPairs:
+    """
+    Run all pairs shortest paths (AllPairs) tests in the Docker image
+    """
+    def test_allpairs(self):
+        out_path = Path(OUT_DIR+'sample-out.txt')
+        out_path.unlink(missing_ok=True)
+        # Only include required arguments
+        AllPairs.run(
+            nodetypes=TEST_DIR+'input/sample-in-nodetypes.txt',
+            network=TEST_DIR+'input/sample-in-net.txt',
+            output_file=str(out_path)
+        )
+        assert out_path.exists()
+
+    def test_allpairs_missing(self):
+        # Test the expected error is raised when required arguments are missing
+        with pytest.raises(ValueError):
+            # No nodetypes
+            AllPairs.run(
+                network=TEST_DIR + 'input/sample-in-net.txt',
+                output_file=OUT_DIR+'sample-out.txt')
+
+    # Only run Singularity test if the binary is available on the system
+    # spython is only available on Unix, but do not explicitly skip non-Unix platforms
+    @pytest.mark.skipif(not shutil.which('singularity'), reason='Singularity not found on system')
+    def test_allpairs_singularity(self):
+        out_path = Path(OUT_DIR+'sample-out.txt')
+        out_path.unlink(missing_ok=True)
+        # Only include required arguments and run with Singularity
+        AllPairs.run(
+            nodetypes=TEST_DIR+'input/sample-in-nodetypes.txt',
+            network=TEST_DIR+'input/sample-in-net.txt',
+            output_file=str(out_path),
+            singularity=True
+        )
+        assert out_path.exists()
+
+    def test_allpairs_correctness(self):
+        """
+        Tests algorithm correctness of all-pairs-shortest-paths.py by using AllPairs.run
+        The shortest paths are:
+        A-B-C
+        A-E
+        B-C
+        B-A-E
+        so the union of the unique edges in these paths will be returned as the pathway.
+        """
+        out_path = Path(OUT_DIR+'correctness-out.txt')
+        out_path.unlink(missing_ok=True)
+
+        AllPairs.run(
+            nodetypes=TEST_DIR+'input/correctness-nodetypes.txt',
+            network=TEST_DIR+'input/correctness-network.txt',
+            output_file=OUT_DIR+'correctness-out.txt'
+        )
+        assert out_path.exists()
+
+        with open(out_path, 'r') as f:
+            edge_pairs = f.readlines()
+        output_edges = []
+        for edge in edge_pairs:
+            node1, node2 = sorted(edge.split())
+            output_edges.append((node1, node2))
+        output_edges.sort()
+
+        with open(EXPECTED_DIR+'correctness-expected.txt', 'r') as f:
+            c_edge_pairs = f.readlines()
+        correct_edges = []
+        for edge in c_edge_pairs:
+            node1, node2 = edge.split()
+            correct_edges.append((node1, node2))
+
+        assert output_edges == correct_edges
+
+    def test_allpairs_zero_length(self):
+        """
+        Tests algorithm correctness of all-pairs-shortest-paths.py by using AllPairs.run
+        The test case has a single source and target that are the same node, so the only path has
+        zero length.
+        Therefore, the output pathway has no edges.
+        """
+        out_path = Path(OUT_DIR+'zero-length-out.txt')
+        out_path.unlink(missing_ok=True)
+
+        AllPairs.run(
+            nodetypes=TEST_DIR+'input/zero-length-nodetypes.txt',
+            network=TEST_DIR+'input/zero-length-network.txt',
+            output_file=OUT_DIR+'zero-length-out.txt'
+        )
+
+        assert filecmp.cmp(OUT_DIR+'zero-length-out.txt', EXPECTED_DIR+'zero-length-expected.txt', shallow=False)
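As a supplementary sketch of the reasoning spelled out in the `test_allpairs_correctness` docstring, the snippet below applies the same NetworkX calls used in `all-pairs-shortest-paths.py` to the edges from `correctness-network.txt`; the hard-coded edge list and node roles simply mirror those test fixtures.
```
# Minimal sketch of the computation the correctness test expects.
import networkx as nx

# Edges and node roles copied from test/AllPairs/input/correctness-network.txt
# and correctness-nodetypes.txt
graph = nx.Graph()
graph.add_weighted_edges_from([
    ("A", "B", 1), ("B", "C", 1), ("C", "D", 1), ("D", "E", 1), ("A", "E", 1),
])
sources = {"A", "B"}
targets = {"C", "E"}

# Union of the edges on each source-target shortest path,
# mirroring the loop in all-pairs-shortest-paths.py
output = nx.Graph()
for source in sources:
    for target in targets:
        path = nx.shortest_path(graph, source, target, weight="weight")
        nx.add_path(output, path)

print(sorted(tuple(sorted(edge)) for edge in output.edges()))
# [('A', 'B'), ('A', 'E'), ('B', 'C')], matching correctness-expected.txt
```
The A-to-C path resolves to A-B-C and the B-to-E path to B-A-E because the alternative routes through D cost one extra unit-weight edge, which is why only three unique edges appear in the expected pathway.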