Merge branch 'master' of github.com:ntalluri/spras into direction

ntalluri · Aug 30, 2023 · a214cc8 · a214cc8
2 parents 3f173b5 + c6dc2d9
commit a214cc8
Show file tree

Hide file tree

Showing 18 changed files with 404 additions and 3 deletions.
diff --git a/.github/workflows/test-spras.yml b/.github/workflows/test-spras.yml
@@ -81,6 +81,7 @@ jobs:
         docker pull reedcompbio/pathlinker:latest
         docker pull reedcompbio/meo:latest
         docker pull reedcompbio/mincostflow:latest
+        docker pull reedcompbio/allpairs:latest
         docker pull reedcompbio/domino:latest
     - name: Build Omics Integrator 1 Docker image
       uses: docker/build-push-action@v1
@@ -127,6 +128,15 @@ jobs:
         tags: latest
         cache_froms: reedcompbio/mincostflow:latest
         push: false
+    - name: Build All Pairs Shortest Paths Docker image
+      uses: docker/build-push-action@v1
+      with:
+        path: docker-wrappers/AllPairs/.
+        dockerfile: docker-wrappers/AllPairs/Dockerfile
+        repository: reedcompbio/allpairs
+        tags: latest
+        cache_froms: reedcompbio/allpairs:latest
+        push: false
     - name: Build DOMINO Docker image
       uses: docker/build-push-action@v1
       with:

diff --git a/config/config.yaml b/config/config.yaml
@@ -72,6 +72,11 @@
                     flow: [1] # The flow must be an int
                     capacity: [1]
 
+        - name: "allpairs"
+          params:
+                include: true
+                directed: false
+
         - name: "domino"
           params:
                 include: true
@@ -80,6 +85,7 @@
                     slice_threshold: [0.3]
                     module_threshold: [0.05]
 
+
  # Here we specify which pathways to run and other file location information.
  # DataLoader.py can currently only load a single dataset
  # Assume that if a dataset label does not change, the lists of associated input files do not change

diff --git a/docker-wrappers/AllPairs/Dockerfile b/docker-wrappers/AllPairs/Dockerfile
@@ -0,0 +1,7 @@
+# AllPairs wrapper
+# Image that holds the All Pairs Shortest Paths script and its NetworkX dependency
+FROM python:3.9-alpine3.16
+
+WORKDIR /AllPairs
+
+# Install the pinned NetworkX release imported by all-pairs-shortest-paths.py
+# --no-cache-dir keeps pip's download cache out of the image layer
+RUN pip install --no-cache-dir networkx==2.6.3
+COPY all-pairs-shortest-paths.py /AllPairs/all-pairs-shortest-paths.py
diff --git a/docker-wrappers/AllPairs/README.md b/docker-wrappers/AllPairs/README.md
@@ -0,0 +1,28 @@
+# All Pairs Shortest Paths Docker image
+
+A Docker image for All Pairs Shortest Paths that is available on [DockerHub](https://hub.docker.com/r/reedcompbio/allpairs).
+This algorithm was implemented by the SPRAS team and relies on the NetworkX [`shortest_path`](https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.shortest_paths.generic.shortest_path.html) function.
+
+To create the Docker image run:
+```
+docker build -t reedcompbio/allpairs -f Dockerfile .
+```
+from this directory.
+
+To inspect the installed Python packages:
+```
+docker run reedcompbio/allpairs pip list
+```
+
+
+## Testing
+Test code is located in `test/AllPairs`.
+The `input` subdirectory contains a sample network and source/target file, a smaller network and source/target file used to check the correctness of All Pairs Shortest Paths, and a zero-length case in which the same node is both the source and the target.
+The expected output graphs for the sample networks are in the `expected` subdirectory.
+
+The Docker wrapper can be tested with `pytest -k test_ap.py` from the root of the SPRAS repository.
+
+
+## Notes
+- The `all-pairs-shortest-paths.py` code is short, so it is stored directly in the SPRAS repository under `docker-wrappers/AllPairs`.
+- Samples of an input network and source/target file are located under test/AllPairs/input.
diff --git a/docker-wrappers/AllPairs/all-pairs-shortest-paths.py b/docker-wrappers/AllPairs/all-pairs-shortest-paths.py
@@ -0,0 +1,82 @@
+"""
+All Pairs Shortest Paths pathway reconstruction algorithm.
+The algorithm takes a network and a list of sources and targets as input.
+It outputs the shortest possible path between every source and every target.
+"""
+
+import argparse
+from pathlib import Path
+
+import networkx as nx
+
+
+def parse_arguments():
+    """
+    Build the command line parser and parse the arguments given to the script.
+    Every option is required and is converted to a pathlib.Path.
+    @return the parsed arguments with the attributes network, nodes, and output
+    """
+    # Option name -> help text; all options share the same type and are all mandatory
+    path_options = {
+        "--network": "Network file of the form <node1> <node2> <weight>. Tab-delimited.",
+        "--nodes": "Nodes file of the form <node> <source-or-target>. Tab-delimited.",
+        "--output": "Output file",
+    }
+
+    cli = argparse.ArgumentParser(description="All Pairs Shortest Paths pathway reconstruction")
+    for flag, help_text in path_options.items():
+        cli.add_argument(flag, type=Path, required=True, help=help_text)
+
+    return cli.parse_args()
+
+
+def _read_sources_targets(nodes_file: Path):
+    """
+    Read the node types file and collect the nodes labeled 'source' and 'target'.
+    Lines with fewer than two tab-delimited columns (such as blank lines) are skipped, and rows with
+    any other label (such as the '#Node' header row) are ignored.
+    @param nodes_file: tab-delimited file of the form <node> <source-or-target>
+    @return a tuple (sources, targets) of sets of node names
+    """
+    sources = set()
+    targets = set()
+    with nodes_file.open() as nodes_f:
+        for line in nodes_f:
+            row = line.strip().split(sep='\t')
+            if len(row) < 2:
+                # Blank or malformed line; indexing row[1] would raise an IndexError
+                continue
+            if row[1] == 'source':
+                sources.add(row[0])
+            elif row[1] == 'target':
+                targets.add(row[0])
+    return sources, targets
+
+
+def allpairs(network_file: Path, nodes_file: Path, output_file: Path):
+    """
+    Compute a weighted shortest path between every source and every target and write the edges of
+    the union of these paths to the output file, one tab-delimited edge per line.
+    A source that is also a target yields a single-node path, which adds no edges to the output.
+    @param network_file: tab-delimited edge list of the form <node1> <node2> <weight>
+    @param nodes_file: tab-delimited file of the form <node> <source-or-target>
+    @param output_file: file to write the edges to; overwritten if it exists
+    Raises OSError if an input file does not exist.
+    Raises ValueError if there are no sources, no targets, or a source or target is not in the network.
+    NetworkX raises NetworkXNoPath if some source cannot reach some target.
+    """
+    if not network_file.exists():
+        raise OSError(f"Network file {str(network_file)} does not exist")
+    if not nodes_file.exists():
+        raise OSError(f"Nodes file {str(nodes_file)} does not exist")
+    if output_file.exists():
+        print(f"Output file {str(output_file)} will be overwritten")
+
+    # Create the parent directories for the output file if needed
+    output_file.parent.mkdir(parents=True, exist_ok=True)
+
+    # Read the list of nodes
+    sources, targets = _read_sources_targets(nodes_file)
+
+    # There should be at least one source and one target
+    # Raise explicitly instead of asserting because assertions are removed when Python runs with -O
+    if not sources:
+        raise ValueError('There are no sources.')
+    if not targets:
+        raise ValueError('There are no targets.')
+
+    # Read graph & check all the sources/targets are in network
+    graph = nx.read_weighted_edgelist(network_file, delimiter='\t')
+    missing_sources = sorted(node for node in sources if node not in graph)
+    if missing_sources:
+        raise ValueError(f'At least one source is not in the interactome: {missing_sources}')
+    missing_targets = sorted(node for node in targets if node not in graph)
+    if missing_targets:
+        raise ValueError(f'At least one target is not in the interactome: {missing_targets}')
+
+    # Finally, compute all-pairs-shortest-paths and record the subgraph.
+    # Visit the pairs in sorted order so the order of the output edges does not depend on set iteration order
+    output = nx.Graph()
+    for source in sorted(sources):
+        for target in sorted(targets):
+            p = nx.shortest_path(graph, source, target, weight='weight')
+            nx.add_path(output, p)
+
+    # Write the subgraph as a list of edges.
+    nx.write_edgelist(output, output_file, data=False, delimiter='\t')
+    print(f"Wrote output file to {str(output_file)}")
+
+
+def main():
+    """
+    Entry point: read the command line options and run the pathway reconstruction on them
+    """
+    cli_args = parse_arguments()
+    allpairs(network_file=cli_args.network, nodes_file=cli_args.nodes, output_file=cli_args.output)
+
+
+# Only run the reconstruction when this file is executed as a script
+if __name__ == "__main__":
+    main()
diff --git a/docker-wrappers/LocalNeighborhood/local_neighborhood.py b/docker-wrappers/LocalNeighborhood/local_neighborhood.py
@@ -29,7 +29,7 @@ def local_neighborhood(network_file: Path, nodes_file: Path, output_file: Path):
     if not nodes_file.exists():
         raise OSError(f"Nodes file {str(nodes_file)} does not exist")
     if output_file.exists():
-        print(f"Output files {str(output_file)} will be overwritten")
+        print(f"Output file {str(output_file)} will be overwritten")
 
     # Create the parent directories for the output file if needed
     output_file.parent.mkdir(parents=True, exist_ok=True)

diff --git a/src/allpairs.py b/src/allpairs.py
@@ -0,0 +1,103 @@
+import warnings
+from pathlib import Path
+
+import pandas as pd
+
+from src.prm import PRM
+from src.util import prepare_volume, run_container
+
+__all__ = ['AllPairs']
+
+
+class AllPairs(PRM):
+    """
+    SPRAS wrapper for the All Pairs Shortest Paths pathway reconstruction algorithm.
+    Writes the algorithm's input files, runs the algorithm in a container, and converts the
+    algorithm's output into the universal pathway format.
+    """
+    required_inputs = ['nodetypes', 'network']
+
+    @staticmethod
+    def generate_inputs(data, filename_map):
+        """
+        Access fields from the dataset and write the required input files
+        @param data: dataset
+        @param filename_map: a dict mapping file types in the required_inputs to the filename for that type
+        Raises ValueError if a required filename is missing or the dataset has no sources and targets
+        """
+        for input_type in AllPairs.required_inputs:
+            if input_type not in filename_map:
+                # f-string so the name of the missing input type is reported
+                raise ValueError(f"{input_type} filename is missing")
+
+        # Get sources and targets for node input file
+        # Borrowed code from pathlinker.py
+        sources_targets = data.request_node_columns(["sources", "targets"])
+        if sources_targets is None:
+            raise ValueError("All Pairs Shortest Paths requires sources and targets")
+
+        both_series = sources_targets.sources & sources_targets.targets
+        for _index, row in sources_targets[both_series].iterrows():
+            warn_msg = row.NODEID + " has been labeled as both a source and a target."
+            warnings.warn(warn_msg, stacklevel=2)
+
+        # Create nodetype file
+        # NOTE: a node labeled as both a source and a target is written only as a target because the
+        # second assignment below overwrites the first
+        input_df = sources_targets[["NODEID"]].copy()
+        input_df.columns = ["#Node"]
+        input_df.loc[sources_targets["sources"] == True, "Node type"] = "source"  # noqa: E712
+        input_df.loc[sources_targets["targets"] == True, "Node type"] = "target"  # noqa: E712
+
+        input_df.to_csv(filename_map["nodetypes"], sep="\t", index=False, columns=["#Node", "Node type"])
+
+        # This is pretty memory intensive. We might want to keep the interactome centralized.
+        data.get_interactome().to_csv(filename_map["network"], sep="\t", index=False,
+                                      columns=["Interactor1", "Interactor2", "Weight"],
+                                      header=["#Interactor1", "Interactor2", "Weight"])
+
+    @staticmethod
+    def run(nodetypes=None, network=None, output_file=None, singularity=False):
+        """
+        Run All Pairs Shortest Paths with Docker
+        @param nodetypes: input node types with sources and targets (required)
+        @param network: input network file (required)
+        @param output_file: path to the output pathway file (required)
+        @param singularity: if True, run using the Singularity container instead of the Docker container
+        Raises ValueError if a required argument is missing
+        """
+        if not nodetypes or not network or not output_file:
+            raise ValueError('Required All Pairs Shortest Paths arguments are missing')
+
+        work_dir = '/apsp'
+
+        # Each volume is a tuple (src, dest)
+        volumes = []
+
+        bind_path, node_file = prepare_volume(nodetypes, work_dir)
+        volumes.append(bind_path)
+
+        bind_path, network_file = prepare_volume(network, work_dir)
+        volumes.append(bind_path)
+
+        # Create the parent directories for the output file if needed
+        Path(output_file).parent.mkdir(parents=True, exist_ok=True)
+        bind_path, mapped_out_file = prepare_volume(output_file, work_dir)
+        volumes.append(bind_path)
+
+        command = ['python',
+                   '/AllPairs/all-pairs-shortest-paths.py',
+                   '--network', network_file,
+                   '--nodes', node_file,
+                   '--output', mapped_out_file]
+
+        print(f"Running All Pairs Shortest Paths with arguments: {' '.join(command)}", flush=True)
+
+        container_framework = 'singularity' if singularity else 'docker'
+        out = run_container(container_framework,
+                            'reedcompbio/allpairs',
+                            command,
+                            volumes,
+                            work_dir)
+        print(out)
+
+    @staticmethod
+    def parse_output(raw_pathway_file, standardized_pathway_file):
+        """
+        Convert a predicted pathway into the universal format
+        An empty raw pathway file produces an empty standardized pathway file.
+        @param raw_pathway_file: pathway file produced by an algorithm's run function
+        @param standardized_pathway_file: the same pathway written in the universal format
+        """
+        try:
+            df = pd.read_csv(raw_pathway_file, sep='\t', header=None)
+        except pd.errors.EmptyDataError:
+            # The pathway has no edges, for example when the only source is also the only target.
+            # pandas cannot parse an empty file, so write an empty standardized pathway instead.
+            with open(standardized_pathway_file, 'w'):
+                pass
+            return
+
+        df['Rank'] = 1  # add a rank column of 1s since the edges are not ranked.
+        df.to_csv(standardized_pathway_file, header=False, index=False, sep='\t')
diff --git a/src/runner.py b/src/runner.py
@@ -1,6 +1,6 @@
-from src.dataset import Dataset
-
 # supported algorithm imports
+from src.allpairs import AllPairs as allpairs
+from src.dataset import Dataset
 from src.domino import DOMINO as domino
 from src.meo import MEO as meo
 from src.mincostflow import MinCostFlow as mincostflow

diff --git a/test/AllPairs/expected/correctness-expected.txt b/test/AllPairs/expected/correctness-expected.txt
@@ -0,0 +1,3 @@
+A	B
+A	E
+B	C
diff --git a/test/AllPairs/expected/sample-out-expected.txt b/test/AllPairs/expected/sample-out-expected.txt
@@ -0,0 +1,10 @@
+S1	A
+S1	B
+A	E
+A	F
+E	T1
+T1	F
+F	T2
+F	B
+B	S2
+S2	T3
diff --git a/test/AllPairs/expected/zero-length-expected.txt b/test/AllPairs/expected/zero-length-expected.txt
diff --git a/test/AllPairs/input/correctness-network.txt b/test/AllPairs/input/correctness-network.txt
@@ -0,0 +1,6 @@
+#Node1	Node2
+A	B	1
+B	C	1
+C	D	1
+D	E	1
+A	E	1
diff --git a/test/AllPairs/input/correctness-nodetypes.txt b/test/AllPairs/input/correctness-nodetypes.txt
@@ -0,0 +1,5 @@
+#Node	Node type
+A	source
+B	source
+C	target
+E	target
diff --git a/test/AllPairs/input/sample-in-net.txt b/test/AllPairs/input/sample-in-net.txt
@@ -0,0 +1,21 @@
+#Node1	Node2
+S1	A	0.5
+A	E	0.5
+E	T1	0.5
+E	F	0.5
+F	E	0.5
+F	A	0.5
+T1	F	0.5
+F	T2	0.5
+B	S1	0.5
+B	F	0.5
+B	C	0.5
+S2	B	0.5
+S2	C	0.5
+S2	T3	0.5
+C	G	0.5
+G	C	0.5
+C	F	0.5
+G	F	0.5
+G	T2	0.5
+G	T3	0.5
diff --git a/test/AllPairs/input/sample-in-nodetypes.txt b/test/AllPairs/input/sample-in-nodetypes.txt
@@ -0,0 +1,6 @@
+#Node	Node type
+S1	source
+S2	source
+T1	target
+T2	target
+T3	target
diff --git a/test/AllPairs/input/zero-length-network.txt b/test/AllPairs/input/zero-length-network.txt
@@ -0,0 +1,6 @@
+#Node1	Node2
+A	B	1
+B	C	1
+C	D	1
+D	E	1
+A	E	1
diff --git a/test/AllPairs/input/zero-length-nodetypes.txt b/test/AllPairs/input/zero-length-nodetypes.txt
@@ -0,0 +1,3 @@
+#Node	Node type
+A	source
+A	target