Skip to content

Commit

Permalink
HGNC ROBOT template
Browse files Browse the repository at this point in the history
- Update: Refactored the method that creates the template, from shell script / awk to Python / pandas.

General
- Update: .gitignore: Simplified ignores for files at root.
- Add: Utility function to handle < > around URIs
  • Loading branch information
joeflack4 committed Jun 11, 2024
1 parent 47d744c commit 5c18c12
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 20 deletions.
14 changes: 5 additions & 9 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,8 @@ allelicVariants.txt
allelicVariants.tsv

# Outputs
omim.ttl
omim.sssom.tsv
omim.sssom.log.txt
omim.json
*.sssom.owl
mondo_exactmatch_omim.sssom.tsv
mondo_exactmatch_omimps.sssom.tsv
omim.owl
mondo_genes.robot.tsv
/*.json
/*.owl
/*.tsv
/*.ttl
/*.txt
17 changes: 6 additions & 11 deletions makefile
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,13 @@ omim.owl: omim.ttl mondo_exactmatch_omim.sssom.owl mondo_exactmatch_omimps.sssom
query --update sparql/hgnc_links.ru \
convert -f ofn -o $@

mondo_genes.robot.tsv: omim.owl
# Create a TSV of relational information for gene and disease classes
# Create a TSV of relational information for gene and disease classes
mondo_genes.tsv: omim.owl
robot query -i omim.owl --query sparql/mondo_genes.sparql $@
# Insert the source_code column as the second to last column
awk 'BEGIN {FS=OFS="\t"} {if (NR==1) {$$(NF+1)=$$(NF); $$(NF-1)="?source_code";} else {$$(NF+1)=$$(NF); $$(NF-1)="MONDO:OMIM";}} 1' $@ > temp_file && mv temp_file $@
# Remove the first character, a question mark (?), from each field in the header. This is an artefact of the SPARQL query.
awk 'BEGIN {FS=OFS="\t"} NR==1 {for (i=1; i<=NF; i++) $$i=substr($$i, 2)} {print}' $@ > temp_file && mv temp_file $@
# Remove < and > characters from specified columns
awk 'BEGIN {FS=OFS="\t"} NR>1 {gsub(/^<|>$$/, "", $$1); gsub(/^<|>$$/, "", $$2); gsub(/^<|>$$/, "", $$5)} {print}' $@ > temp_file && mv temp_file $@
# Insert ROBOT subheader
robot_subheader="ID\tSC 'has material basis in germline mutation in' some %\t>A oboInOwl:source\t>A oboInOwl:source\t" && \
sed 1a"$$robot_subheader" $@ > temp_file && mv temp_file $@

# Create a TSV of relational information for gene and disease classes, as a ROBOT template
mondo_genes.robot.tsv: mondo_genes.tsv
python -m omim2obo.mondo_genes_robot_tsv --inpath $< --outpath $@

cleanup:
@rm -f omim.json
Expand Down
64 changes: 64 additions & 0 deletions omim2obo/mondo_genes_robot_tsv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
"""Create: mondo_genes.robot.tsv: a TSV of relational information for gene and disease classes"""
from argparse import ArgumentParser
from pathlib import Path
from typing import Dict, Union

import pandas as pd

from omim2obo.utils.utils import remove_angle_brackets


# Second header row of the ROBOT template: maps each output column name to the
# ROBOT template string placed directly beneath it. `omim_gene` gets an empty
# template string.
ROBOT_SUBHEADER = dict(
    mondo_id='ID',
    hgnc_id="SC 'has material basis in germline mutation in' some %",
    omim_disease_xref='>A oboInOwl:source',
    source_code='>A oboInOwl:source',
    omim_gene='',
)


def mondo_genes_robot_tsv(inpath: Union[Path, str], outpath: Union[Path, str]) -> pd.DataFrame:
    """Create: mondo_genes.robot.tsv

    Reads the tab-separated file at `inpath`, cleans up its header and URI columns, adds a constant `source_code`
    column, sorts the rows, prepends the ROBOT subheader row, and writes the result to `outpath` as a TSV.

    :param inpath: Path to the input TSV. Expected to provide the columns `mondo_id`, `hgnc_id`,
      `omim_disease_xref` and `omim_gene`, optionally each prefixed by a question mark.
    :param outpath: Path the ROBOT template TSV is written to.
    :return: The DataFrame that was written, with the ROBOT subheader as its first row.
    :raises KeyError: If one of the expected columns is missing from the input.
    """
    # Read every column as text so identifier-like values are never coerced to numbers.
    df = pd.read_csv(inpath, sep='\t', dtype=str)

    # Remove the first character, a question mark (?), from each field in the header; an artefact of the SPARQL query.
    df.rename(columns={col: col[1:] for col in df.columns if col.startswith('?')}, inplace=True)

    # Add source_code column
    df['source_code'] = 'MONDO:OMIM'

    # Remove < and > characters from specified columns. Empty cells are read as NaN (not str); leave those untouched
    # rather than failing on them.
    uri_cols = ['mondo_id', 'hgnc_id', 'omim_gene']
    for col in uri_cols:
        df[col] = [remove_angle_brackets(x) if isinstance(x, str) else x for x in df[col]]

    # Format col order
    col_order = ['mondo_id', 'hgnc_id', 'omim_disease_xref', 'source_code', 'omim_gene']
    df = df[col_order]

    # Sort the data rows only. This happens before the subheader is inserted so that the subheader's position never
    # depends on how its text happens to compare against the data values.
    df = df.sort_values(by=['mondo_id', 'hgnc_id', 'omim_gene', 'omim_disease_xref'])

    # Insert ROBOT subheader as the first row
    subheader = pd.DataFrame([ROBOT_SUBHEADER], columns=col_order)
    df = pd.concat([subheader, df], ignore_index=True)

    df.to_csv(outpath, sep='\t', index=False)
    return df


def cli():
    """Command line interface: parse `--inpath` / `--outpath` and build the ROBOT template TSV."""
    # (flags, help text) for each required option
    option_specs = (
        (('-i', '--inpath'),
         'Path to file with such relational information, but not yet formatted as a ROBOT template.'),
        (('-o', '--outpath'),
         'Path to save output.'),
    )
    parser = ArgumentParser(
        prog='mondo-genes-robot-tsv',
        description='Create a ROBOT template TSV of relational information for gene and disease classes')
    for flags, help_text in option_specs:
        parser.add_argument(*flags, required=True, help=help_text)

    # Parsed option names match the keyword parameters of mondo_genes_robot_tsv().
    kwargs: Dict = vars(parser.parse_args())
    mondo_genes_robot_tsv(**kwargs)


# Run the command line interface only when this module is executed directly
# (e.g. `python -m omim2obo.mondo_genes_robot_tsv`), not when it is imported.
if __name__ == '__main__':
    cli()
16 changes: 16 additions & 0 deletions omim2obo/utils/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
"""Misc utilities"""
from typing import List, Union


# todo: also in mondo-ingest. Refactor into mondolib: https://github.com/monarch-initiative/mondolib/issues/13
def remove_angle_brackets(uris: Union[str, List[str]]) -> Union[str, List[str]]:
    """Strip one leading '<' and one trailing '>' from a URI, or from each URI in a list, e.g.:
    <https://omim.org/entry/100050> --> https://omim.org/entry/100050

    A single string in gives a single string back; any other iterable of strings gives a list back."""

    def _unwrap(uri: str) -> str:
        """Drop a leading '<' and then a trailing '>' if present."""
        if uri.startswith('<'):
            uri = uri[1:]
        if uri.endswith('>'):
            uri = uri[:-1]
        return uri

    if isinstance(uris, str):
        return _unwrap(uris)
    return [_unwrap(uri) for uri in uris]

0 comments on commit 5c18c12

Please sign in to comment.