diff --git a/.gitignore b/.gitignore index d7387cd4..13d4b504 100644 --- a/.gitignore +++ b/.gitignore @@ -163,3 +163,5 @@ data/transformed/uniprot_genome_features/*.tsv kg_microbe/transform_utils/uniprot/tmp/relevant_file_content.txt kg_microbe/transform_utils/uniprot/tmp/nodes_and_edges/* data/transformed/uniprot_genome_features/uniprot_kgx.zip +data/transformed/Uniref/edges.tsv +data/transformed/Uniref/nodes.tsv diff --git a/kg_microbe/transform.py b/kg_microbe/transform.py index 608c5943..16b663fd 100644 --- a/kg_microbe/transform.py +++ b/kg_microbe/transform.py @@ -11,6 +11,7 @@ from kg_microbe.transform_utils.rhea.rhea import RheaMappingsTransform from kg_microbe.transform_utils.traits.traits import TraitsTransform from kg_microbe.transform_utils.uniprot.uniprot import UniprotTransform +from kg_microbe.transform_utils.uniref.uniref import UnirefTransform DATA_SOURCES = { # "DrugCentralTransform": DrugCentralTransform, @@ -28,6 +29,7 @@ "RheaMappingsTransform": RheaMappingsTransform, "BactoTraitsTransform": BactoTraitsTransform, "UniprotTransform": UniprotTransform, + "UnirefTransform": UnirefTransform, } diff --git a/kg_microbe/transform_utils/constants.py b/kg_microbe/transform_utils/constants.py index e1c770c0..b9e43991 100644 --- a/kg_microbe/transform_utils/constants.py +++ b/kg_microbe/transform_utils/constants.py @@ -123,6 +123,8 @@ RHEA_OLD_PREFIX = "OBO:rhea_" RHEA_NEW_PREFIX = "RHEA:" ASSAY_PREFIX = "assay:" +UNIREF_90_PREFIX = "UniRef90:" + RHEA_URI = "http://purl.obolibrary.org/obo/rhea_" DEBIO_OBO_PREFIX = "OBO:debio_" DEBIO_NEW_PREFIX = "debio:" @@ -163,6 +165,8 @@ NCBI_TO_SUBSTRATE_EDGE = "biolink:consumes" RHEA_TO_EC_EDGE = "biolink:enabled_by" RHEA_TO_GO_EDGE = "biolink:enables" +NCBI_TO_CLUSTER_EDGE = "biolink:occurs_in" + NCBI_CATEGORY = "biolink:OrganismTaxon" MEDIUM_CATEGORY = "biolink:ChemicalEntity" @@ -179,6 +183,7 @@ ATTRIBUTE_CATEGORY = "biolink:Attribute" METABOLITE_CATEGORY = "biolink:ChemicalEntity" SUBSTRATE_CATEGORY = 
class UnirefTransform(Transform):
    """Transform a UniRef90 cluster table into node and edge TSV files.

    For every input row this writes one node per listed organism, one node for
    the cluster itself, and one organism-to-cluster edge per organism.
    """

    def __init__(
        self,
        input_dir: Optional[Path] = None,
        output_dir: Optional[Path] = None,
    ):
        """Instantiate the transform.

        :param input_dir: Input directory, forwarded to ``Transform.__init__``.
        :param output_dir: Output directory, forwarded to ``Transform.__init__``.
        """
        source_name = "Uniref"
        super().__init__(source_name, input_dir, output_dir)
        # NOTE(review): ``run`` does not use this adapter; it is kept because it
        # is a public attribute that other code may read.
        self.ncbi_impl = get_adapter("sqlite:obo:ncbitaxon")

    @staticmethod
    def _split_values(field: Optional[str]) -> list:
        """Split a ``;``-separated field into stripped, non-empty values."""
        # Filter on the stripped value so whitespace-only entries (for example
        # after a trailing "; ") do not produce empty identifiers or labels.
        stripped = (part.strip() for part in (field or "").split(";"))
        return [value for value in stripped if value]

    def run(self, data_file: Union[Optional[Path], Optional[str]] = None, show_status: bool = True):
        """Run the transformation.

        Reads ``uniref90_api_subset.tsv`` from ``self.input_base_dir``, writes
        ``self.output_node_file`` and ``self.output_edge_file``, and finally
        passes both output files to ``drop_duplicates``.

        :param data_file: Currently ignored; the input file name is fixed.
        :param show_status: Use ``tqdm`` for progress reporting when True,
            otherwise ``DummyTqdm``.
        """
        input_file = os.path.join(
            self.input_base_dir, "uniref90_api_subset.tsv"
        )  # must exist already

        progress_class = tqdm if show_status else DummyTqdm
        source = UNIREF_90_PREFIX.rstrip(":")
        # Every node row carries id, category and name; the remaining header
        # columns are left empty.
        node_padding = [None] * (len(self.node_header) - 3)

        # newline="" is what the csv module expects for both reading and writing;
        # the explicit encoding keeps output independent of the locale.
        with open(input_file, "r", newline="", encoding="utf-8") as tsvfile, open(
            self.output_node_file, "w", newline="", encoding="utf-8"
        ) as nodes_file, open(
            self.output_edge_file, "w", newline="", encoding="utf-8"
        ) as edges_file:
            tsvreader = csv.DictReader(tsvfile, delimiter="\t")
            node_writer = csv.writer(nodes_file, delimiter="\t")
            edge_writer = csv.writer(edges_file, delimiter="\t")

            # Write the header for the files
            node_writer.writerow(self.node_header)
            edge_writer.writerow(self.edge_header)

            with progress_class(desc="Processing clusters...") as progress:
                for row in tsvreader:
                    # Only the first underscore separates prefix from accession.
                    cluster_id = row["Cluster ID"].replace("_", ":", 1)
                    # removeprefix drops the literal "Cluster:" label; lstrip
                    # would instead strip any leading run of those characters.
                    cluster_name = row["Cluster Name"].strip().removeprefix("Cluster:").strip()
                    ncbitaxon_ids = [
                        NCBITAXON_PREFIX + taxon_id
                        for taxon_id in self._split_values(row["Organism IDs"])
                    ]
                    ncbi_labels = self._split_values(row["Organisms"])

                    # NOTE(review): ids and labels are paired by position and the
                    # longer list is truncated; this assumes both columns list
                    # organisms in the same order - confirm against the input.
                    organism_rows = [
                        [ncbitaxon_id, NCBI_CATEGORY, ncbi_label] + node_padding
                        for ncbitaxon_id, ncbi_label in zip(
                            ncbitaxon_ids, ncbi_labels, strict=False
                        )
                    ]
                    node_writer.writerows(organism_rows)

                    # Write the cluster node
                    node_writer.writerow([cluster_id, CLUSTER_CATEGORY, cluster_name] + node_padding)

                    # Write one organism -> cluster edge per organism id
                    edge_writer.writerows(
                        [ncbitaxon_id, NCBI_TO_CLUSTER_EDGE, cluster_id, OCCURS_IN, source]
                        for ncbitaxon_id in ncbitaxon_ids
                    )

                    progress.set_description(f"Processing Cluster: {cluster_id}")
                    # One input row was processed, so advance the bar by one.
                    progress.update(1)

        drop_duplicates(self.output_node_file)
        drop_duplicates(self.output_edge_file)