Skip to content

Commit

Permalink
ROBOT template xref tweaks
Browse files Browse the repository at this point in the history
- Delete: Logic with duplicative MEDGENCUI rows.
- Add: Logic to filter out all MEDGENCUI rows, guarding against the possible future case where one of them gets mapped to Mondo.
- Add: 2 new columns
- Update: flipped xref/source so that MEDGEN UIDs have no source, and UMLS CUIs have MedGen as source. It was the opposite before.
- Add: MeSH mappings
- Bug fix: MeSH prefix spelling / case

General
- Add: Refactor so that the mapping set is loaded via get_mapping_set(), a function that handles much of the repetitive loading logic.
  • Loading branch information
joeflack4 committed Apr 18, 2024
1 parent 0b8e063 commit 2fd2336
Show file tree
Hide file tree
Showing 6 changed files with 66 additions and 26 deletions.
2 changes: 1 addition & 1 deletion config/medgen.sssom-metadata.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ creator_id: orcid:0000-0002-2906-7319
curie_map:
GTR: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/GTR/
HP: http://purl.obolibrary.org/obo/HP_
MESH: http://identifiers.org/mesh/
mesh: http://identifiers.org/mesh/
MONDO: http://purl.obolibrary.org/obo/MONDO_
MEDGEN: http://purl.obolibrary.org/obo/MEDGEN_
MEDGENCUI: http://purl.obolibrary.org/obo/MEDGENCUI_
Expand Down
2 changes: 1 addition & 1 deletion src/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
"""MedGen"""
"""MedGen ingest """
2 changes: 1 addition & 1 deletion src/create_sssom.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def run(input_mappings: str = INPUT_MAPPINGS, input_sssom_config: str = INPUT_CO
df_hpo_mesh = df_hpo_mesh[['subject_id', 'predicate_id', 'object_id', 'umls_id', 'umls_label']].sort_values(
['subject_id', 'object_id'], na_position='first')
# -- add missing prefixes
df_hpo_mesh['object_id'] = df_hpo_mesh['object_id'].apply(lambda x: 'MESH:' + x)
df_hpo_mesh['object_id'] = df_hpo_mesh['object_id'].apply(lambda x: 'mesh:' + x)
# todo: temp; (1) remove later: saving dataset with no matches, for review (2) after remove, will need to
# move the col removals below (umls) to above
# - add mapping_justification
Expand Down
2 changes: 0 additions & 2 deletions src/mondo_mapping_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@
]
CURIE = str

# TODO: Mappings can be considered skos:exactMatch

def ids_prefixless(ids: Set[str]) -> Set[str]:
    """Strip the prefix from each CURIE, returning only the local identifiers.

    :param ids: CURIE-style identifiers of the form ``PREFIX:LOCAL_ID``.
    :return: The set of local identifiers, i.e. everything after the first ``:`` of each entry.
    :raises IndexError: If an entry contains no ``:`` separator.
    """
    # Split on the first colon only, so a local identifier that itself contains ':' is kept whole rather than
    # being truncated at its second colon.
    return {x.split(':', 1)[1] for x in ids}
Expand Down
71 changes: 53 additions & 18 deletions src/mondo_robot_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

import pandas as pd

from utils import get_mapping_set, add_prefixes_to_plain_id
from utils import add_prefixes_to_plain_id, get_mapping_set

SRC_DIR = Path(__file__).parent
PROJECT_DIR = SRC_DIR.parent
Expand All @@ -21,31 +21,66 @@
ROBOT_ROW_MAP = {
'mondo_id': 'ID',
'xref_id': 'A oboInOwl:hasDbXref',
'source_id': '>A oboInOwl:source'
'source_id': '>A oboInOwl:source',
'source_medgen_id': '>A oboInOwl:source',
'mapping_predicate': '>A oboInOwl:source',
}


def run(input_file: str = INPUT_FILE, output_file: str = OUTPUT_FILE):
"""Create robot template"""
# todo: refactor to use get_mapping_set(): (1) *maybe* use SSSOM as the intermediate standard (sssomify=True), and
# update column renames below. and (2) use filter_sources (already used by MeSH).
def run(input_file: str = INPUT_FILE, output_file: str = OUTPUT_FILE, filter_out_medgencui=True):
"""Create robot template
:param filter_out_medgencui: There should be no cases where Mondo or any other sources we care about will be
MEDGENCUI (CN) instances, but we know we do not want them, so this default filtration step helps with that possible
future edge case."""
# Read input
df: pd.DataFrame = get_mapping_set(input_file)
# Get explicit Medgen (CUI, CN) -> Mondo mappings
df_medgen_mondo = df[df['source'] == 'MONDO'][['source_id', 'xref_id']].rename(columns={'source_id': 'mondo_id'})
out_df_cui_cn = df_medgen_mondo.copy()
out_df_cui_cn['xref_id'] = out_df_cui_cn['xref_id'].apply(add_prefixes_to_plain_id)
df = get_mapping_set(input_file, add_prefixes=True, sssomify=False, filter_out_medgencui=filter_out_medgencui)

# Mondo->MEDGEN & Mondo->UMLS
# 1. Get explicit Medgen (CUI, CN) -> Mondo mappings
df_umls_mondo = df[df['source'] == 'MONDO'][['source_id', 'xref_id']].rename(
columns={'source_id': 'mondo_id', 'xref_id': 'umls_cui'})

# Get Medgen (UID) -> Mondo mappings
# 2. Get Medgen (UID) -> Mondo mappings
# - Done by proxy: UID <-> CUI <-> MONDO
df_medgen_medgenuid = df[df['source'] == 'MedGen'][['source_id', 'xref_id']].rename(
columns={'source_id': 'medgen_uid'})
# todo: should some of these steps be in _reformat_mapping_set()? to be utilized by SSSOM files?
out_df_uid = pd.merge(df_medgen_mondo, df_medgen_medgenuid, on='xref_id').rename(
columns={'xref_id': 'source_id', 'medgen_uid': 'xref_id'})[['mondo_id', 'xref_id', 'source_id']]
out_df_uid['xref_id'] = out_df_uid['xref_id'].apply(lambda x: f'MEDGEN:{x}')
out_df_uid['source_id'] = out_df_uid['source_id'].apply(lambda x: f'UMLS:{x}')
df_umls_medgenuid = df[df['source'] == 'MedGen'][['source_id', 'xref_id']].rename(
columns={'source_id': 'medgen_uid', 'xref_id': 'umls_cui'})
df_umls_medgenuid['medgen_uid'] = (
df_umls_medgenuid['medgen_uid'].apply(add_prefixes_to_plain_id)) # should/will all be MEDGEN
df_merged = pd.merge(df_umls_mondo, df_umls_medgenuid, on='umls_cui')
# - Split into (Mondo <-> Medgen UID) & (Mondo <-> UMLS CUI)
out_df_medgenuid = df_merged.rename(columns={'medgen_uid': 'xref_id', 'umls_cui': 'source_id'})[[
'mondo_id', 'xref_id', 'source_id']]
out_df_medgenuid['source_id'] = ''
out_df_umlscui = df_merged.rename(columns={'umls_cui': 'xref_id', 'medgen_uid': 'source_id'})

# Mondo->MESH
df_umls_mesh = get_mapping_set(input_file, filter_sources=['MeSH'], add_prefixes=True, sssomify=False)
df_umls_mesh['source_id'] = df_umls_mesh['source_id'].apply(lambda x: 'mesh:' + x)
out_df_mesh = pd.merge(df_umls_mesh, df_umls_mondo, left_on='xref_id', right_on='umls_cui').rename(
columns={'source_id': 'xref_id', 'xref_id': 'source_id'})[['mondo_id', 'xref_id', 'source_id']]

# Combine mappings
out_df = pd.concat([out_df_medgenuid, out_df_umlscui, out_df_mesh]).sort_values(['xref_id', 'mondo_id'])\
.drop_duplicates().fillna('')

# Add additional cols
out_df['source_medgen_id'] = 'MONDO:MEDGEN'
# todo: could optimize by doing apply just on the xref_id col
def set_mapping_pred(row):
"""Set mapping predicate"""
# M = Concept UI; D = Descriptor UI; C = SupplementalRecordUI; Q = Qualifier UI
pred = 'MONDO:equivalentTo' if row['xref_id'].startswith('mesh:M') \
else 'MONDO:relatedTo' if row['xref_id'].startswith('mesh:') \
else 'MONDO:equivalentTo'
return pred
# Context on MeSH IDs:
# https://docs.google.com/document/d/1ryu6isBmNEno8lyni70jBaw-D_I6tdEXpnnAK86ZWfs/edit#heading=h.3cho5esard3q
out_df['mapping_predicate'] = out_df.apply(set_mapping_pred, axis=1)

# Save
out_df = pd.concat([out_df_cui_cn, out_df_uid]).sort_values(['xref_id', 'mondo_id']).drop_duplicates().fillna('')
out_df = pd.concat([pd.DataFrame([ROBOT_ROW_MAP]), out_df])
out_df.to_csv(output_file, index=False, sep='\t')

Expand Down
13 changes: 10 additions & 3 deletions src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def add_prefixes_to_plain_id(x: str) -> str:
else f'MEDGEN:{x}'


# todo: Add to sssom-py. Shared between, at the least, ICD11 and MedGen repos
def write_sssom(df: pd.DataFrame, config_path: Union[Path, str], outpath: Union[Path, str]):
"""Writes a SSSOM file"""
with open(config_path, 'r') as yaml_file:
Expand All @@ -37,6 +38,7 @@ def write_sssom(df: pd.DataFrame, config_path: Union[Path, str], outpath: Union[
# common code for this and robot template, or add a param to not rename that col
def get_mapping_set(
inpath: Union[str, Path], filter_sources: List[str] = None, add_prefixes=False, sssomify=True,
filter_out_medgencui=True
) -> pd.DataFrame:
"""Load up MedGen mapping set (MedGenIDMappings.txt), with some modifications."""
# Read
Expand All @@ -45,9 +47,14 @@ def get_mapping_set(
empty_cols = [col for col in df.columns if df[col].isnull().all()] # caused by trailing | at end of each row
if empty_cols:
df = df.drop(columns=empty_cols)
# Add prefixes
if add_prefixes:
df['xref_id'] = df['xref_id'].apply(add_prefixes_to_plain_id)
# Filter MEDGENCUI & add prefixes
df['xref_id'] = df['xref_id'].apply(add_prefixes_to_plain_id)
# - Filter MEDGENCUI
if filter_out_medgencui:
df = df[~df['xref_id'].str.startswith('MEDGENCUI')]
# - Add prefixes
if not add_prefixes:
del df['xref_id']
# Sort
df = df.sort_values(['xref_id', 'source_id'])
if filter_sources:
Expand Down

0 comments on commit 2fd2336

Please sign in to comment.