From 5c18c122138d14ce19fabc1db68111a1dc0c8859 Mon Sep 17 00:00:00 2001 From: Joe Flack Date: Tue, 11 Jun 2024 18:21:35 -0400 Subject: [PATCH] HGNC ROBOT template - Update: Refactor method to do this from ShellScript / awk to Python / pandas. General - Update: .gitignore: Simplified ignores for files at root. - Add: Utility function to handle < > around URIs --- .gitignore | 14 +++---- makefile | 17 +++----- omim2obo/mondo_genes_robot_tsv.py | 64 +++++++++++++++++++++++++++++++ omim2obo/utils/utils.py | 16 ++++++++ 4 files changed, 91 insertions(+), 20 deletions(-) create mode 100644 omim2obo/mondo_genes_robot_tsv.py create mode 100644 omim2obo/utils/utils.py diff --git a/.gitignore b/.gitignore index ae2ebb0..7475617 100644 --- a/.gitignore +++ b/.gitignore @@ -26,12 +26,8 @@ allelicVariants.txt allelicVariants.tsv # Outputs -omim.ttl -omim.sssom.tsv -omim.sssom.log.txt -omim.json -*.sssom.owl -mondo_exactmatch_omim.sssom.tsv -mondo_exactmatch_omimps.sssom.tsv -omim.owl -mondo_genes.robot.tsv +/*.json +/*.owl +/*.tsv +/*.ttl +/*.txt diff --git a/makefile b/makefile index eb97c04..e983116 100644 --- a/makefile +++ b/makefile @@ -35,18 +35,13 @@ omim.owl: omim.ttl mondo_exactmatch_omim.sssom.owl mondo_exactmatch_omimps.sssom query --update sparql/hgnc_links.ru \ convert -f ofn -o $@ -mondo_genes.robot.tsv: omim.owl - # Create a TSV of relational information for gene and disease classes +# Create a TSV of relational information for gene and disease classes +mondo_genes.tsv: omim.owl robot query -i omim.owl --query sparql/mondo_genes.sparql $@ - # Insert the source_code column as the second to last column - awk 'BEGIN {FS=OFS="\t"} {if (NR==1) {$$(NF+1)=$$(NF); $$(NF-1)="?source_code";} else {$$(NF+1)=$$(NF); $$(NF-1)="MONDO:OMIM";}} 1' $@ > temp_file && mv temp_file $@ - # Remove the first character, a question mark (?), from each field in the header. This is an artefact of the SPARQL query. 
# ----------------------------------------------------------------------------------------------------------------------
# Patch residue: remainder of the `makefile` hunk. Kept verbatim, one diff line per comment line.
# ----------------------------------------------------------------------------------------------------------------------
# - awk 'BEGIN {FS=OFS="\t"} NR==1 {for (i=1; i<=NF; i++) $$i=substr($$i, 2)} {print}' $@ > temp_file && mv temp_file $@
# - # Remove < and > characters from specified columns
# - awk 'BEGIN {FS=OFS="\t"} NR>1 {gsub(/^<|>$$/, "", $$1); gsub(/^<|>$$/, "", $$2); gsub(/^<|>$$/, "", $$5)} {print}' $@ > temp_file && mv temp_file $@
# - # Insert ROBOT subheader
# - robot_subheader="ID\tSC 'has material basis in germline mutation in' some %\t>A oboInOwl:source\t>A oboInOwl:source\t" && \
# - sed 1a"$$robot_subheader" $@ > temp_file && mv temp_file $@
# +
# +# Create a TSV of relational information for gene and disease classes, as a ROBOT template
# +mondo_genes.robot.tsv: mondo_genes.tsv
# + python -m omim2obo.mondo_genes_robot_tsv --inpath $< --outpath $@
#
#  cleanup:
#  @rm -f omim.json
#
# ----------------------------------------------------------------------------------------------------------------------
# diff --git a/omim2obo/mondo_genes_robot_tsv.py b/omim2obo/mondo_genes_robot_tsv.py
# new file mode 100644
# index 0000000..8c49ada
# --- /dev/null
# +++ b/omim2obo/mondo_genes_robot_tsv.py
# @@ -0,0 +1,64 @@
# ----------------------------------------------------------------------------------------------------------------------
"""Create: mondo_genes.robot.tsv: a TSV of relational information for gene and disease classes"""
from argparse import ArgumentParser
from pathlib import Path
from typing import Dict, Union

import pandas as pd

from omim2obo.utils.utils import remove_angle_brackets


# ROBOT template strings, keyed by column name. This becomes the row directly below the header row of the output.
ROBOT_SUBHEADER = {
    'mondo_id': 'ID',
    'hgnc_id': "SC 'has material basis in germline mutation in' some %",
    'omim_disease_xref': '>A oboInOwl:source',
    'source_code': '>A oboInOwl:source',
    'omim_gene': '',
}


def mondo_genes_robot_tsv(inpath: Union[Path, str], outpath: Union[Path, str]) -> pd.DataFrame:
    """Create: mondo_genes.robot.tsv

    Reads the tab-separated file at `inpath`, normalizes it, prepends the ROBOT subheader row, and writes the result
    to `outpath` as a tab-separated file.

    :param inpath: Path to a TSV having (after removal of any leading '?' from the header fields) at least the
      columns: mondo_id, hgnc_id, omim_gene, omim_disease_xref.
    :param outpath: Path to write the ROBOT template TSV to.
    :return: The DataFrame that was written to `outpath`; its first row is the ROBOT subheader.
    """
    df = pd.read_csv(inpath, sep='\t')

    # Remove the first character, a question mark (?), from each field in the header; an artefact of the SPARQL query.
    df = df.rename(columns={col: col[1:] for col in df.columns if col.startswith('?')})

    # Add source_code column
    df['source_code'] = 'MONDO:OMIM'

    # Remove < and > characters from specified columns
    for col in ('mondo_id', 'hgnc_id', 'omim_gene'):
        df[col] = remove_angle_brackets(list(df[col]))

    # Format col order
    df = df[['mondo_id', 'hgnc_id', 'omim_disease_xref', 'source_code', 'omim_gene']]

    # Sort the data rows *before* the subheader is prepended, so that the subheader row can never be moved away from
    # the top by the sort (previously its position depended on how 'ID' collated against the mondo_id values).
    df = df.sort_values(by=['mondo_id', 'hgnc_id', 'omim_gene', 'omim_disease_xref'])

    # Insert ROBOT subheader as the first row; ignore_index avoids two rows sharing index label 0.
    df = pd.concat([pd.DataFrame([ROBOT_SUBHEADER]), df], ignore_index=True)

    df.to_csv(outpath, sep='\t', index=False)
    # Return what was written, rather than an empty DataFrame, so the declared return value is usable by callers.
    return df


def cli():
    """Command line interface: parse --inpath / --outpath and run mondo_genes_robot_tsv() with them."""
    parser = ArgumentParser(
        prog='mondo-genes-robot-tsv',
        description='Create a ROBOT template TSV of relational information for gene and disease classes')
    parser.add_argument(
        '-i', '--inpath', required=True,
        help='Path to file with such relational information, but not yet formatted as a ROBOT template.')
    parser.add_argument(
        '-o', '--outpath', required=True,
        help='Path to save output.')
    # The argparse destinations ('inpath', 'outpath') match the function's parameter names, so they unpack directly.
    d: Dict = vars(parser.parse_args())
    mondo_genes_robot_tsv(**d)


if __name__ == '__main__':
    cli()

# ----------------------------------------------------------------------------------------------------------------------
# diff --git a/omim2obo/utils/utils.py b/omim2obo/utils/utils.py
# new file mode 100644
# index 0000000..abd119f
# --- /dev/null
# +++ b/omim2obo/utils/utils.py
# @@ -0,0 +1,16 @@
# ----------------------------------------------------------------------------------------------------------------------
"""Misc utilities"""
from typing import List, Union


# todo: also in mondo-ingest.
# Refactor into mondolib: https://github.com/monarch-initiative/mondolib/issues/13
def remove_angle_brackets(uris: Union[str, List[str]]) -> Union[str, List[str]]:
    """Remove angle brackets from URIs, e.g.:
    <https://omim.org/entry/100050> --> https://omim.org/entry/100050

    At most one leading '<' and one trailing '>' are removed per value. Values that are not strings (e.g. None, or
    the float NaN that represents a missing cell) are passed through unchanged instead of raising.

    :param uris: A single URI string, or a list of URIs.
    :return: A string if a string was passed; otherwise a list with the values in their original order.
    """

    def _strip(value):
        """Strip one enclosing '<' / '>' from a single value; leave non-strings untouched."""
        if not isinstance(value, str):
            return value
        if value.startswith('<'):
            value = value[1:]
        if value.endswith('>'):
            value = value[:-1]
        return value

    if isinstance(uris, str):
        return _strip(uris)
    return [_strip(x) for x in uris]