UniRef90 transform #170

Open

wants to merge 14 commits into master
2 changes: 2 additions & 0 deletions .gitignore
@@ -163,3 +163,5 @@ data/transformed/uniprot_genome_features/*.tsv
kg_microbe/transform_utils/uniprot/tmp/relevant_file_content.txt
kg_microbe/transform_utils/uniprot/tmp/nodes_and_edges/*
data/transformed/uniprot_genome_features/uniprot_kgx.zip
data/transformed/Uniref/edges.tsv
data/transformed/Uniref/nodes.tsv
2 changes: 2 additions & 0 deletions kg_microbe/transform.py
@@ -11,6 +11,7 @@
from kg_microbe.transform_utils.rhea.rhea import RheaMappingsTransform
from kg_microbe.transform_utils.traits.traits import TraitsTransform
from kg_microbe.transform_utils.uniprot.uniprot import UniprotTransform
from kg_microbe.transform_utils.uniref.uniref import UnirefTransform

DATA_SOURCES = {
# "DrugCentralTransform": DrugCentralTransform,
@@ -28,6 +29,7 @@
"RheaMappingsTransform": RheaMappingsTransform,
"BactoTraitsTransform": BactoTraitsTransform,
"UniprotTransform": UniprotTransform,
"UnirefTransform": UnirefTransform,
}


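For orientation, here is a hypothetical sketch of how the DATA_SOURCES registry above is typically consumed; the driver code itself is outside this diff, and the data/ paths are assumptions based on the repository layout, not part of the PR:

from pathlib import Path

from kg_microbe.transform import DATA_SOURCES

# Look up the class registered under "UnirefTransform" and run it.
# The input/output paths below are assumed, not taken from this diff.
transform_cls = DATA_SOURCES["UnirefTransform"]
transform_cls(input_dir=Path("data/raw"), output_dir=Path("data/transformed")).run()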
7 changes: 7 additions & 0 deletions kg_microbe/transform_utils/constants.py
@@ -123,6 +123,8 @@
RHEA_OLD_PREFIX = "OBO:rhea_"
RHEA_NEW_PREFIX = "RHEA:"
ASSAY_PREFIX = "assay:"
UNIREF_90_PREFIX = "UniRef90:"

RHEA_URI = "http://purl.obolibrary.org/obo/rhea_"
DEBIO_OBO_PREFIX = "OBO:debio_"
DEBIO_NEW_PREFIX = "debio:"
@@ -163,6 +165,8 @@
NCBI_TO_SUBSTRATE_EDGE = "biolink:consumes"
RHEA_TO_EC_EDGE = "biolink:enabled_by"
RHEA_TO_GO_EDGE = "biolink:enables"
NCBI_TO_CLUSTER_EDGE = "biolink:occurs_in"


NCBI_CATEGORY = "biolink:OrganismTaxon"
MEDIUM_CATEGORY = "biolink:ChemicalEntity"
@@ -179,6 +183,7 @@
ATTRIBUTE_CATEGORY = "biolink:Attribute"
METABOLITE_CATEGORY = "biolink:ChemicalEntity"
SUBSTRATE_CATEGORY = "biolink:ChemicalEntity"
CLUSTER_CATEGORY = "biolink:ProteinFamily"

HAS_PART = "BFO:0000051"
IS_GROWN_IN = "BAO:0002924"
@@ -194,6 +199,7 @@
ASSESSED_ACTIVITY_RELATIONSHIP = "NCIT:C153110"
CLOSE_MATCH = "skos:closeMatch"
ASSOCIATED_WITH = "PATO:0001668"
OCCURS_IN = "BFO:0000066"

ID_COLUMN = "id"
NAME_COLUMN = "name"
@@ -410,6 +416,7 @@
GO_PREFIX,
MEDIADIVE_MEDIUM_PREFIX,
STRAIN_PREFIX,
UNIREF_90_PREFIX,
]

HAS_PARTICIPANT_PREDICATE = "biolink:has_participant"
5 changes: 5 additions & 0 deletions kg_microbe/transform_utils/uniref/__init__.py
@@ -0,0 +1,5 @@
"""Uniref transform."""

from .uniref import UnirefTransform

__all__ = ["UnirefTransform"]
115 changes: 115 additions & 0 deletions kg_microbe/transform_utils/uniref/uniref.py
@@ -0,0 +1,115 @@
"""UniRef Transformation Module."""

import csv
import gc
import os
import sys
from pathlib import Path
from typing import Optional, Union

from oaklib import get_adapter
from tqdm import tqdm

from kg_microbe.transform_utils.constants import (
CLUSTER_CATEGORY,
NCBI_CATEGORY,
NCBI_TO_CLUSTER_EDGE,
NCBITAXON_PREFIX,
OCCURS_IN,
UNIREF_90_PREFIX,
)
from kg_microbe.transform_utils.transform import Transform
from kg_microbe.utils.dummy_tqdm import DummyTqdm
from kg_microbe.utils.pandas_utils import drop_duplicates

csv.field_size_limit(sys.maxsize - 1)  # avoid "_csv.Error: field larger than field limit (131072)"


class UnirefTransform(Transform):

"""UniRef Transformation Class."""

def __init__(
self,
input_dir: Optional[Path] = None,
output_dir: Optional[Path] = None,
):
"""Instantiate part."""
source_name = "Uniref"
super().__init__(source_name, input_dir, output_dir)
self.ncbi_impl = get_adapter("sqlite:obo:ncbitaxon")

    def run(self, data_file: Optional[Union[Path, str]] = None, show_status: bool = True):
"""Run the transformation."""
input_file = os.path.join(
self.input_base_dir, "uniref90_api_subset.tsv"
) # must exist already

progress_class = tqdm if show_status else DummyTqdm

with open(input_file, "r") as tsvfile, open(self.output_node_file, "w") as nodes_file, open(
self.output_edge_file, "w"
) as edges_file:
# Create a CSV reader specifying the delimiter as a tab character
tsvreader = csv.DictReader(tsvfile, delimiter="\t")
node_writer = csv.writer(nodes_file, delimiter="\t")
edge_writer = csv.writer(edges_file, delimiter="\t")
            source = UNIREF_90_PREFIX.strip(":")  # "UniRef90:" -> "UniRef90"

# Write the header for the files
node_writer.writerow(self.node_header)
edge_writer.writerow(self.edge_header)

with progress_class(desc="Processing clusters...") as progress:
# Iterate over each row in the TSV file
for row in tsvreader:
# Extract the desired fields
                    cluster_id = row["Cluster ID"].replace("_", ":")  # UniRef90_X -> UniRef90:X
                    # Use removeprefix, not lstrip: lstrip("Cluster:") strips a
                    # character *set* and can eat leading letters of the name itself.
                    cluster_name = row["Cluster Name"].removeprefix("Cluster:").strip()
ncbitaxon_ids = [
NCBITAXON_PREFIX + x.strip() for x in row["Organism IDs"].split(";") if x
]
ncbi_labels = [
ncbi_label.strip()
for ncbi_label in row["Organisms"].split(";")
if ncbi_label
]
nodes_data_to_write = [
[ncbitaxon_id, NCBI_CATEGORY, ncbi_label]
for ncbitaxon_id, ncbi_label in zip(
ncbitaxon_ids, ncbi_labels, strict=False
)
]
                    # Pad each node row with None to match the node header width
                    nodes_data_to_write = [
                        sublist + [None] * (len(self.node_header) - 3)
                        for sublist in nodes_data_to_write
                    ]
node_writer.writerows(nodes_data_to_write)
gc.collect()

# Write the cluster node
cluster_node_data = [cluster_id, CLUSTER_CATEGORY, cluster_name]
cluster_node_data.extend([None] * (len(self.node_header) - 3))
node_writer.writerow(cluster_node_data)

# Write the edge for the cluster
edges_data_to_write = [
[
ncbitaxon_id,
NCBI_TO_CLUSTER_EDGE,
cluster_id,
OCCURS_IN,
source,
]
for ncbitaxon_id in ncbitaxon_ids
]
edge_writer.writerows(edges_data_to_write)
gc.collect()

                    progress.set_description(f"Processing Cluster: {cluster_id}")
                    # Advance the progress bar by one row per cluster.
                    progress.update(1)

drop_duplicates(self.output_node_file)
drop_duplicates(self.output_edge_file)
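As a quick sanity check, the new transform can also be exercised directly. This is a minimal sketch assuming the standard data/ layout and that data/raw/uniref90_api_subset.tsv already exists (the code notes it "must exist already"):

from pathlib import Path

from kg_microbe.transform_utils.uniref.uniref import UnirefTransform

# Assumed paths; the input TSV must provide the columns read in run():
# "Cluster ID", "Cluster Name", "Organism IDs", "Organisms".
transform = UnirefTransform(input_dir=Path("data/raw"), output_dir=Path("data/transformed"))
transform.run(show_status=True)
# Writes nodes.tsv and edges.tsv under data/transformed/Uniref/ and then
# de-duplicates them, matching the .gitignore and merge.yaml entries in this PR.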
7 changes: 7 additions & 0 deletions merge.yaml
@@ -105,6 +105,13 @@ merged_graph:
filename:
- data/transformed/uniprot_genome_features/nodes.tsv
- data/transformed/uniprot_genome_features/edges.tsv
uniref:
input:
name: "Uniref"
format: tsv
filename:
- data/transformed/Uniref/nodes.tsv
- data/transformed/Uniref/edges.tsv
operations:
- name: kgx.graph_operations.summarize_graph.generate_graph_stats
args:
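For reference, the rows these two files should contain, judging from the writer calls in uniref.py (values below are illustrative placeholders; the real header columns come from Transform.node_header and edge_header):

nodes.tsv:  NCBITaxon:562         biolink:OrganismTaxon    Escherichia coli
            UniRef90:<cluster>    biolink:ProteinFamily    <cluster name>
edges.tsv:  NCBITaxon:562         biolink:occurs_in        UniRef90:<cluster>    BFO:0000066    UniRef90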