Merge pull request Reed-CompBio#142 from ntalluri/header

Adding Header Lines
ntalluri · Jul 10, 2024 · 98d7e35 · 98d7e35
2 parents 0557289 + d6b019a
commit 98d7e35
Show file tree

Hide file tree

Showing 66 changed files with 214 additions and 90 deletions.
diff --git a/.github/workflows/test-spras.yml b/.github/workflows/test-spras.yml
@@ -83,7 +83,7 @@ jobs:
         docker pull reedcompbio/mincostflow:latest
         docker pull reedcompbio/allpairs:v2
         docker pull reedcompbio/domino:latest
-        docker pull reedcompbio/py4cytoscape:v2
+        docker pull reedcompbio/py4cytoscape:v3
         docker pull reedcompbio/spras:v0.1.0
     - name: Build Omics Integrator 1 Docker image
       uses: docker/build-push-action@v1
@@ -154,8 +154,8 @@ jobs:
         path: docker-wrappers/Cytoscape/.
         dockerfile: docker-wrappers/Cytoscape/Dockerfile
         repository: reedcompbio/py4cytoscape
-        tags: v2
-        cache_froms: reedcompbio/py4cytoscape:latest
+        tags: v3
+        cache_froms: reedcompbio/py4cytoscape:v3
         push: false
     - name: Build SPRAS Docker image
       uses: docker/build-push-action@v1

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -154,9 +154,10 @@ Use the `run_container` utility function to run the command in the container `<u
 
 Implement the `parse_output` function.
 The edges in the Local Neighborhood output have the same format as the input, `<vertex1>|<vertex2>`.
-Convert these to be tab-separated vertex pairs followed by a tab and a `1` at the end of every line, which indicates all edges have the same rank.
-See the `add_rank_column` function in `src.util.py`.
-The output should have the format `<vertex1> <vertex2> 1`.
+Convert these to tab-separated vertex pairs, each followed by a tab and a `1` and then a tab and a `U` at the end of every line, which indicates that all edges have the same rank and are undirected.
+See the `add_rank_column` and `raw_pathway_df` functions in `src.util.py` and the `reinsert_direction_col_undirected` function in `src.interactome.py`.
+When the output file is created, make sure it is written with `header=True` and the column names `['Node1', 'Node2', 'Rank', 'Direction']`.
+The output should have the format `<vertex1> <vertex2> 1 U`.
 
 ### Step 4: Make the Local Neighborhood wrapper accessible through SPRAS
 Import the new class `LocalNeighborhood` in `src/runner.py` so the wrapper functions can be accessed.

diff --git a/doc/output.md b/doc/output.md
@@ -0,0 +1,17 @@
+## File formats
+
+### Pathway output format
+Output pathway files in the standard SPRAS format include a header row and rows providing attributes for each edge.
+The header row contains the tab-separated column names `Node1`, `Node2`, `Rank`, and `Direction`.
+Each row lists the two nodes that are connected with an edge, the rank for that edge, and a directionality column to indicate whether the edge is directed or undirected.
+The directionality values are either a 'U' for an undirected edge or a 'D' for a directed edge, where the direction is from Node1 to Node2.
+Pathways that do not contain ranked edges can output all 1s in the Rank column.
+
+For example:
+```
+Node1	Node2	Rank	Direction
+A	B	1	D
+B	C	1	D
+B	D	2	U
+D	A	3	U
+```
diff --git a/docker-wrappers/Cytoscape/README.md b/docker-wrappers/Cytoscape/README.md
@@ -20,6 +20,7 @@ The Docker wrapper can be tested with `pytest`.
 ## Versions:
 - v1: Use supervisord to launch Cytoscape from a Python subprocess, then connect to Cytoscape with py4cytoscape. Only loads undirected pathways. Compatible with Singularity in local testing (Apptainer version 1.2.2-1.el7) but fails in GitHub Actions.
 - v2: Add support for edge direction column.
+- v3: Add support for header lines in pathway files.
 
 ## TODO
 - Add an auth file for `xvfb-run`

diff --git a/docker-wrappers/Cytoscape/cytoscape_util.py b/docker-wrappers/Cytoscape/cytoscape_util.py
@@ -116,7 +116,9 @@ def load_pathways(pathways: List[str], output: str) -> None:
         suid = p4c.networks.import_network_from_tabular_file(
             file=path,
             column_type_list='s,t,x,ea',
-            delimiters='\t'
+            delimiters='\t',
+            first_row_as_column_names=True,
+
         )
         p4c.networks.rename_network(name, network=suid)
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "spras"
-version = "0.1.0"
+version = "0.2.0"
 description = "Signaling Pathway Reconstruction Analysis Streamliner"
 authors = [
   { name = "Anthony Gitter", email = "[email protected]" },

diff --git a/spras/allpairs.py b/spras/allpairs.py
@@ -1,14 +1,13 @@
 import warnings
 from pathlib import Path
 
-import pandas as pd
-
 from spras.containers import prepare_volume, run_container
 from spras.interactome import (
     convert_directed_to_undirected,
     reinsert_direction_col_undirected,
 )
 from spras.prm import PRM
+from spras.util import add_rank_column, raw_pathway_df
 
 __all__ = ['AllPairs']
 
@@ -110,7 +109,9 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
         @param raw_pathway_file: pathway file produced by an algorithm's run function
         @param standardized_pathway_file: the same pathway written in the universal format
         """
-        df = pd.read_csv(raw_pathway_file, sep='\t', header=None)
-        df['Rank'] = 1  # add a rank column of 1s since the edges are not ranked.
-        df = reinsert_direction_col_undirected(df)
-        df.to_csv(standardized_pathway_file, header=False, index=False, sep='\t')
+        df = raw_pathway_df(raw_pathway_file, sep='\t', header=None)
+        if not df.empty:
+            df = add_rank_column(df)
+            df = reinsert_direction_col_undirected(df)
+            df.columns = ['Node1', 'Node2', 'Rank', 'Direction']
+        df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t')
diff --git a/spras/analysis/cytoscape.py b/spras/analysis/cytoscape.py
@@ -48,7 +48,7 @@ def run_cytoscape(pathways: List[Union[str, PurePath]], output_file: str, contai
 
     print('Running Cytoscape with arguments: {}'.format(' '.join(command)), flush=True)
 
-    container_suffix = "py4cytoscape:v2"
+    container_suffix = "py4cytoscape:v3"
     out = run_container(container_framework,
                         container_suffix,
                         command,

diff --git a/spras/analysis/graphspace.py b/spras/analysis/graphspace.py
@@ -77,21 +77,21 @@ def load_graph(path: str) -> Tuple[Union[nx.Graph, nx.DiGraph], bool]:
     directed = False
 
     try:
-        pathways = pd.read_csv(path, sep="\t", header=None)
+        pathways = pd.read_csv(path, sep="\t", header=0)
     except pd.errors.EmptyDataError:
         print(f"The file {path} is empty.")
         return G, directed
-    pathways.columns = ["Interactor1", "Interactor2", "Rank", "Direction"]
+
     mask_u = pathways['Direction'] == 'U'
     mask_d = pathways['Direction'] == 'D'
     pathways.drop(columns=["Direction"])
 
     if mask_u.all():
-        G = nx.from_pandas_edgelist(pathways, "Interactor1", "Interactor2", ["Rank"])
+        G = nx.from_pandas_edgelist(pathways, "Node1", "Node2", ["Rank"])
         directed = False
 
     elif mask_d.all():
-        G = nx.from_pandas_edgelist(pathways, "Interactor1", "Interactor2", ["Rank"], create_using=nx.DiGraph())
+        G = nx.from_pandas_edgelist(pathways, "Node1", "Node2", ["Rank"], create_using=nx.DiGraph())
         directed = True
     else:
         print(f"{path} could not be visualized. GraphSpace does not support mixed direction type graphs currently")

diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py
@@ -41,10 +41,13 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra
             with open(file, 'r') as f:
                 lines = f.readlines()
 
+            if len(lines) > 0:
+                lines.pop(0)  # skip header line
+
             edges = []
             for line in lines:
                 parts = line.split('\t')
-                if len(parts) > 0:  # in case of empty line in file
+                if len(parts) == 4:  # empty lines not allowed but empty files are allowed
                     node1 = parts[0]
                     node2 = parts[1]
                     direction = str(parts[3]).strip()
@@ -54,8 +57,10 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra
                     elif direction == "D":
                         # node order does matter for directed edges
                         edges.append(DIR_CONST.join([node1, node2]))
-                    else:
-                        ValueError(f"direction is {direction}, rather than U or D")
+                    elif direction != 'Direction':
+                        raise ValueError(f"direction is {direction}, rather than U or D")
+                elif len(parts) != 0:
+                    raise ValueError(f"In file {file}, expected line {line} to have 4 values, but found {len(parts)} values.")
 
             # getting the algorithm name
             p = PurePath(file)

diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py
@@ -33,8 +33,12 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame) ->
 
     # Iterate through each network file path
     for file_path in sorted(file_paths):
-        # Load in the network
-        nw = nx.read_edgelist(file_path, data=(('weight', float), ('Direction',str)))
+
+        with open(file_path, 'r') as f:
+            lines = f.readlines()[1:]  # skip the header line
+
+        nw = nx.read_edgelist(lines, data=(('weight', float), ('Direction', str)))
+
         # Save the network name, number of nodes, number edges, and number of connected components
         nw_name = str(file_path)
         number_nodes = nw.number_of_nodes()

diff --git a/spras/domino.py b/spras/domino.py
@@ -205,8 +205,11 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
             edges_df['source'] = edges_df['source'].apply(post_domino_id_transform)
             edges_df['target'] = edges_df['target'].apply(post_domino_id_transform)
             edges_df = reinsert_direction_col_undirected(edges_df)
+            edges_df.columns = ['Node1', 'Node2', 'Rank', 'Direction']
+        else:
+            edges_df = pd.DataFrame(columns=['Node1', 'Node2', 'Rank', 'Direction'])
 
-        edges_df.to_csv(standardized_pathway_file, sep='\t', header=False, index=False)
+        edges_df.to_csv(standardized_pathway_file, sep='\t', header=True, index=False)
 
 
 def pre_domino_id_transform(node_id):
@@ -225,9 +228,4 @@ def post_domino_id_transform(node_id):
     @param node_id: the node id to transform
     @return the node id without the prefix, if it was present, otherwise the original node id
     """
-    # Use removeprefix if SPRAS ever requires Python >= 3.9
-    # https://docs.python.org/3/library/stdtypes.html#str.removeprefix
-    if node_id.startswith(ID_PREFIX):
-        return node_id[ID_PREFIX_LEN:]
-    else:
-        return node_id
+    return node_id.removeprefix(ID_PREFIX)
diff --git a/spras/meo.py b/spras/meo.py
@@ -1,14 +1,12 @@
 from pathlib import Path
 
-import pandas as pd
-
 from spras.containers import prepare_volume, run_container
 from spras.interactome import (
     add_directionality_constant,
     reinsert_direction_col_directed,
 )
 from spras.prm import PRM
-from spras.util import add_rank_column
+from spras.util import add_rank_column, raw_pathway_df
 
 __all__ = ['MEO', 'write_properties']
 
@@ -181,13 +179,14 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
         @param standardized_pathway_file: the same pathway written in the universal format
         """
         # Columns Source Type Target Oriented Weight
-        df = pd.read_csv(raw_pathway_file, sep='\t')
-        # Keep only edges that were assigned an orientation (direction)
-        df = df.loc[df['Oriented']]
-        # TODO what should be the edge rank?
-        # Would need to load the paths output file to rank edges correctly
-        df = add_rank_column(df)
-        df = reinsert_direction_col_directed(df)
-
-        df.to_csv(standardized_pathway_file, columns=['Source', 'Target', 'Rank', "Direction"], header=False,
-                  index=False, sep='\t')
+        df = raw_pathway_df(raw_pathway_file, sep='\t', header=0)
+        if not df.empty:
+            # Keep only edges that were assigned an orientation (direction)
+            df = df.loc[df['Oriented']]
+            # TODO what should be the edge rank?
+            # Would need to load the paths output file to rank edges correctly
+            df = add_rank_column(df)
+            df = reinsert_direction_col_directed(df)
+            df.drop(columns=['Type', 'Oriented', 'Weight'], inplace=True)
+            df.columns = ['Node1', 'Node2', 'Rank', "Direction"]
+        df.to_csv(standardized_pathway_file, index=False, sep='\t', header=True)
diff --git a/spras/mincostflow.py b/spras/mincostflow.py
@@ -1,14 +1,12 @@
 from pathlib import Path
 
-import pandas as pd
-
 from spras.containers import prepare_volume, run_container
 from spras.interactome import (
     convert_undirected_to_directed,
     reinsert_direction_col_undirected,
 )
 from spras.prm import PRM
-from spras.util import add_rank_column
+from spras.util import add_rank_column, raw_pathway_df
 
 __all__ = ['MinCostFlow']
 
@@ -150,10 +148,11 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
         @param standardized_pathway_file: the same pathway written in the universal format
         """
 
-        df = pd.read_csv(raw_pathway_file, sep='\t', header=None)
-        df = add_rank_column(df)
-        # TODO update MinCostFlow version to support mixed graphs
-        # Currently directed edges in the input will be converted to undirected edges in the output
-        df = reinsert_direction_col_undirected(df)
-        df.to_csv(standardized_pathway_file, header=False, index=False, sep='\t')
-
+        df = raw_pathway_df(raw_pathway_file, sep='\t', header=None)
+        if not df.empty:
+            df = add_rank_column(df)
+            # TODO update MinCostFlow version to support mixed graphs
+            # Currently directed edges in the input will be converted to undirected edges in the output
+            df = reinsert_direction_col_undirected(df)
+            df.columns = ['Node1', 'Node2', 'Rank', "Direction"]
+        df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t')
diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py
@@ -1,11 +1,9 @@
 from pathlib import Path
 
-import pandas as pd
-
 from spras.containers import prepare_volume, run_container
 from spras.interactome import reinsert_direction_col_mixed
 from spras.prm import PRM
-from spras.util import add_rank_column
+from spras.util import add_rank_column, raw_pathway_df
 
 __all__ = ['OmicsIntegrator1', 'write_conf']
 
@@ -191,16 +189,12 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
         # I'm assuming from having read the documentation that we will be passing in optimalForest.sif
         # as raw_pathway_file, in which case the format should be edge1 interactiontype edge2.
         # if that assumption is wrong we will need to tweak things
-        try:
-            df = pd.read_csv(raw_pathway_file, sep='\t', header=None)
-        except pd.errors.EmptyDataError:
-            with open(standardized_pathway_file, 'w'):
-                pass
-            return
-
-        df.columns = ["Edge1", "InteractionType", "Edge2"]
-        df = add_rank_column(df)
-        df = reinsert_direction_col_mixed(df, "InteractionType", "pd", "pp")
-
-        df.to_csv(standardized_pathway_file, columns=['Edge1', 'Edge2', 'Rank', "Direction"], header=False, index=False,
-                  sep='\t')
+        df = raw_pathway_df(raw_pathway_file, sep='\t', header=None)
+        if not df.empty:
+            df.columns = ["Edge1", "InteractionType", "Edge2"]
+            df = add_rank_column(df)
+            df = reinsert_direction_col_mixed(df, "InteractionType", "pd", "pp")
+            df.drop(columns=['InteractionType'], inplace=True)
+            df.columns = ['Node1', 'Node2', 'Rank', 'Direction']
+
+        df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t')
diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py
@@ -149,12 +149,13 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
         # Omicsintegrator2 returns a single line file if no network is found
         num_lines = sum(1 for line in open(raw_pathway_file))
         if num_lines < 2:
-            with open(standardized_pathway_file, 'w'):
-                pass
-            return
-        df = pd.read_csv(raw_pathway_file, sep='\t')
-        df = df[df['in_solution'] == True]  # Check whether this column can be empty before revising this line
-        df = df.take([0, 1], axis=1)
-        df = add_rank_column(df)
-        df = reinsert_direction_col_undirected(df)
-        df.to_csv(standardized_pathway_file, header=False, index=False, sep='\t')
+            df = pd.DataFrame(columns=['Node1', 'Node2', 'Rank', 'Direction'])
+        else:
+            df = pd.read_csv(raw_pathway_file, sep='\t', header=0)
+            df = df[df['in_solution'] == True]  # Check whether this column can be empty before revising this line
+            df = df.take([0, 1], axis=1)
+            df = add_rank_column(df)
+            df = reinsert_direction_col_undirected(df)
+            df.columns = ['Node1', 'Node2', 'Rank', "Direction"]
+
+        df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t')
diff --git a/spras/pathlinker.py b/spras/pathlinker.py
@@ -1,14 +1,13 @@
 import warnings
 from pathlib import Path
 
-import pandas as pd
-
 from spras.containers import prepare_volume, run_container
 from spras.interactome import (
     convert_undirected_to_directed,
     reinsert_direction_col_directed,
 )
 from spras.prm import PRM
+from spras.util import raw_pathway_df
 
 __all__ = ['PathLinker']
 
@@ -136,7 +135,10 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
         @param raw_pathway_file: pathway file produced by an algorithm's run function
         @param standardized_pathway_file: the same pathway written in the universal format
         """
-        # What about multiple raw_pathway_files
-        df = pd.read_csv(raw_pathway_file, sep='\t').take([0, 1, 2], axis=1)
-        df = reinsert_direction_col_directed(df)
-        df.to_csv(standardized_pathway_file, header=False, index=False, sep='\t')
+        # What about multiple raw_pathway_files?
+        df = raw_pathway_df(raw_pathway_file, sep='\t', header=0)
+        if not df.empty:
+            df = df.take([0, 1, 2], axis=1)
+            df = reinsert_direction_col_directed(df)
+            df.columns = ['Node1', 'Node2', 'Rank', "Direction"]
+        df.to_csv(standardized_pathway_file, header=True, index=False, sep='\t')