-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Update: Refactor method to do this from ShellScript / awk to Python / pandas.

General
- Update: .gitignore: Simplified ignores for files at root.
- Add: Utility function to handle < > around URIs
- Loading branch information
Showing
4 changed files
with
91 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
"""Create: mondo_genes.robot.tsv: a TSV of relational information for gene and disease classes""" | ||
from argparse import ArgumentParser | ||
from pathlib import Path | ||
from typing import Dict, Union | ||
|
||
import pandas as pd | ||
|
||
from omim2obo.utils.utils import remove_angle_brackets | ||
|
||
|
||
# ROBOT template strings, keyed by output column name. mondo_genes_robot_tsv() concatenates this mapping with
# the data rows as a single extra row (the "subheader") of the output TSV.
# NOTE(review): the meanings below follow the ROBOT template syntax as I understand it -- confirm against the
# ROBOT docs: 'ID' marks the term identifier column, 'SC ... some %' builds a subclass-of expression with the
# cell value substituted for '%', and '>A' annotates the axiom produced by the column to its left.
ROBOT_SUBHEADER = {
    'mondo_id': 'ID',
    'hgnc_id': "SC 'has material basis in germline mutation in' some %",
    'omim_disease_xref': '>A oboInOwl:source',
    'source_code': '>A oboInOwl:source',
    'omim_gene': '',  # no template string: presumably kept in the file for reference only -- confirm
}
|
||
|
||
def mondo_genes_robot_tsv(inpath: Union[Path, str], outpath: Union[Path, str]) -> pd.DataFrame:
    """Create: mondo_genes.robot.tsv

    Reads a tab-separated file, reformats it as a ROBOT template and writes it out as a TSV.

    :param inpath: Path to the input TSV. Its header fields may carry a leading '?' (SPARQL variable names).
    :param outpath: Path the ROBOT template TSV is written to.
    :return: The table that was written, with the ROBOT subheader as its first row.
    """
    df = pd.read_csv(inpath, sep='\t')

    # Remove the first character, a question mark (?), from each field in the header; an artefact of the SPARQL query.
    df = df.rename(columns={col: col[1:] for col in df.columns if col.startswith('?')})

    # Add source_code column
    df['source_code'] = 'MONDO:OMIM'

    # Remove < and > characters from specified columns
    uri_cols = ['mondo_id', 'hgnc_id', 'omim_gene']
    for col in uri_cols:
        df[col] = remove_angle_brackets(list(df[col]))

    # Sort the data rows *before* adding the subheader. Sorting afterwards would treat the subheader as
    # ordinary data, so it would only stay on top if 'ID' happened to sort before every mondo_id value.
    df = df.sort_values(by=['mondo_id', 'hgnc_id', 'omim_gene', 'omim_disease_xref'])

    # Insert ROBOT subheader as the first row
    df = pd.concat([pd.DataFrame([ROBOT_SUBHEADER]), df], ignore_index=True)

    # Format col order
    df = df[['mondo_id', 'hgnc_id', 'omim_disease_xref', 'source_code', 'omim_gene']]

    df.to_csv(outpath, sep='\t', index=False)
    # Return the table that was written rather than an empty placeholder frame.
    return df
|
||
|
||
def cli():
    """Command line interface.

    Parses the required options -i/--inpath and -o/--outpath from the command line and passes them, as keyword
    arguments of the same names, to mondo_genes_robot_tsv().
    """
    parser = ArgumentParser(
        prog='mondo-genes-robot-tsv',
        description='Create a ROBOT template TSV of relational information for gene and disease classes')
    parser.add_argument(
        '-i', '--inpath', required=True,
        help='Path to file with such relational information, but not yet formatted as a ROBOT template.')
    parser.add_argument(
        '-o', '--outpath', required=True,
        help='Path to save output.')
    # vars() turns the parsed Namespace into a dict: {'inpath': ..., 'outpath': ...}
    d: Dict = vars(parser.parse_args())
    mondo_genes_robot_tsv(**d)
|
||
|
||
# Run the command line interface only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    cli()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
"""Misc utilities""" | ||
from typing import List, Union | ||
|
||
|
||
# todo: also in mondo-ingest. Refactor into mondolib: https://github.com/monarch-initiative/mondolib/issues/13
def remove_angle_brackets(uris: Union[str, List[str]]) -> Union[str, List[str]]:
    """Remove angle brackets from URIs, e.g.:
    <https://omim.org/entry/100050> --> https://omim.org/entry/100050

    A leading '<' and a trailing '>' are each removed independently, so a value carrying only one of them is
    still cleaned. Entries that are not strings (e.g. the NaN that pandas uses for an empty cell) are passed
    through unchanged instead of raising AttributeError.

    :param uris: A single URI string, or a list of them.
    :return: A string if a string was passed in; otherwise a new list of the same length and order.
    """
    str_input = isinstance(uris, str)
    uris = [uris] if str_input else uris
    uris2 = []
    for x in uris:
        if isinstance(x, str):
            x = x[1:] if x.startswith('<') else x
            x = x[:-1] if x.endswith('>') else x
        uris2.append(x)
    return uris2[0] if str_input else uris2