Merge pull request #19 from monarch-initiative/robot

Finalise MedGen xref table
monarch-initiative · Apr 18, 2024 · 1006885 · 1006885
2 parents 0b8e063 + 2fd2336
commit 1006885
Show file tree

Hide file tree

Showing 6 changed files with 66 additions and 26 deletions.
diff --git a/config/medgen.sssom-metadata.yml b/config/medgen.sssom-metadata.yml
@@ -2,7 +2,7 @@ creator_id: orcid:0000-0002-2906-7319
 curie_map:
   GTR: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/GTR/
   HP: http://purl.obolibrary.org/obo/HP_
-  MESH: http://identifiers.org/mesh/
+  mesh: http://identifiers.org/mesh/
   MONDO: http://purl.obolibrary.org/obo/MONDO_
   MEDGEN: http://purl.obolibrary.org/obo/MEDGEN_
   MEDGENCUI: http://purl.obolibrary.org/obo/MEDGENCUI_

diff --git a/src/__init__.py b/src/__init__.py
@@ -1 +1 @@
-"""MedGen"""
+"""MedGen ingest"""
diff --git a/src/create_sssom.py b/src/create_sssom.py
@@ -49,7 +49,7 @@ def run(input_mappings: str = INPUT_MAPPINGS, input_sssom_config: str = INPUT_CO
     df_hpo_mesh = df_hpo_mesh[['subject_id', 'predicate_id', 'object_id', 'umls_id', 'umls_label']].sort_values(
         ['subject_id', 'object_id'], na_position='first')
     # -- add missing prefixes
-    df_hpo_mesh['object_id'] = df_hpo_mesh['object_id'].apply(lambda x: 'MESH:' + x)
+    df_hpo_mesh['object_id'] = df_hpo_mesh['object_id'].apply(lambda x: 'mesh:' + x)
     # todo: temp; (1) remove later: saving dataset with no matches, for review (2) after remove, will need to
     #  move the col removals below (umls) to above
     # - add mapping_justification

diff --git a/src/mondo_mapping_status.py b/src/mondo_mapping_status.py
@@ -19,8 +19,6 @@
 ]
 CURIE = str
 
-# TODO: Mappings can be considered skos:exactMatch
-
 def ids_prefixless(ids: Set[str]) -> Set[str]:
     """Remove the prefix from each ID, e.g. 'MONDO:0000001' -> '0000001'.
     Only the text up to the first ':' is treated as the prefix; an ID with no ':' is returned unchanged.
     :param ids: IDs, typically CURIEs of the form 'PREFIX:LOCAL_ID'.
     :return: The set of IDs with their prefixes stripped."""
     # maxsplit=1 + [-1]: keeps colons inside the local ID and avoids an IndexError on IDs that have no prefix
     return {x.split(':', 1)[-1] for x in ids}

diff --git a/src/mondo_robot_template.py b/src/mondo_robot_template.py
@@ -11,7 +11,7 @@
 
 import pandas as pd
 
-from utils import get_mapping_set, add_prefixes_to_plain_id
+from utils import add_prefixes_to_plain_id, get_mapping_set
 
 SRC_DIR = Path(__file__).parent
 PROJECT_DIR = SRC_DIR.parent
@@ -21,31 +21,66 @@
 ROBOT_ROW_MAP = {
     'mondo_id': 'ID',
     'xref_id': 'A oboInOwl:hasDbXref',
-    'source_id': '>A oboInOwl:source'
+    'source_id': '>A oboInOwl:source',
+    'source_medgen_id': '>A oboInOwl:source',
+    'mapping_predicate': '>A oboInOwl:source',
 }
 
 
-def run(input_file: str = INPUT_FILE, output_file: str = OUTPUT_FILE):
-    """Create robot template"""
+# todo: refactor to use get_mapping_set(): (1) *maybe* use SSSOM as the intermediate standard (sssomify=True), and
+#  update column renames below. and (2) use filter_sources (already used by MeSH).
+def run(input_file: str = INPUT_FILE, output_file: str = OUTPUT_FILE, filter_out_medgencui=True):
+    """Create robot template
+
+    :param filter_out_medgencui: If True, drop rows whose xref is a MEDGENCUI (CN) ID. Mondo and the other sources we
+    care about are not expected to be mapped to MEDGENCUI (CN) IDs, but since we never want such rows, filtering them
+    out by default guards against that possible future edge case."""
     # Read input
-    df: pd.DataFrame = get_mapping_set(input_file)
-    # Get explicit Medgen (CUI, CN) -> Mondo mappings
-    df_medgen_mondo = df[df['source'] == 'MONDO'][['source_id', 'xref_id']].rename(columns={'source_id': 'mondo_id'})
-    out_df_cui_cn = df_medgen_mondo.copy()
-    out_df_cui_cn['xref_id'] = out_df_cui_cn['xref_id'].apply(add_prefixes_to_plain_id)
+    df = get_mapping_set(input_file, add_prefixes=True, sssomify=False, filter_out_medgencui=filter_out_medgencui)
+
+    # Mondo->MEDGEN & Mondo->UMLS
+    # 1. Get explicit Medgen (CUI, CN) -> Mondo mappings
+    df_umls_mondo = df[df['source'] == 'MONDO'][['source_id', 'xref_id']].rename(
+        columns={'source_id': 'mondo_id', 'xref_id': 'umls_cui'})
 
-    # Get Medgen (UID) -> Mondo mappings
+    # 2. Get Medgen (UID) -> Mondo mappings
     # - Done by proxy: UID <-> CUI <-> MONDO
-    df_medgen_medgenuid = df[df['source'] == 'MedGen'][['source_id', 'xref_id']].rename(
-        columns={'source_id': 'medgen_uid'})
-    # todo: should some of these steps be in _reformat_mapping_set()? to be utilized by SSSOM files?
-    out_df_uid = pd.merge(df_medgen_mondo, df_medgen_medgenuid, on='xref_id').rename(
-        columns={'xref_id': 'source_id', 'medgen_uid': 'xref_id'})[['mondo_id', 'xref_id', 'source_id']]
-    out_df_uid['xref_id'] = out_df_uid['xref_id'].apply(lambda x: f'MEDGEN:{x}')
-    out_df_uid['source_id'] = out_df_uid['source_id'].apply(lambda x: f'UMLS:{x}')
+    df_umls_medgenuid = df[df['source'] == 'MedGen'][['source_id', 'xref_id']].rename(
+        columns={'source_id': 'medgen_uid', 'xref_id': 'umls_cui'})
+    df_umls_medgenuid['medgen_uid'] = (
+        df_umls_medgenuid['medgen_uid'].apply(add_prefixes_to_plain_id))  # should/will all be MEDGEN
+    df_merged = pd.merge(df_umls_mondo, df_umls_medgenuid, on='umls_cui')
+    # - Split into (Mondo <-> Medgen UID) & (Mondo <-> UMLS CUI)
+    out_df_medgenuid = df_merged.rename(columns={'medgen_uid': 'xref_id', 'umls_cui': 'source_id'})[[
+        'mondo_id', 'xref_id', 'source_id']]
+    out_df_medgenuid['source_id'] = ''
+    out_df_umlscui = df_merged.rename(columns={'umls_cui': 'xref_id', 'medgen_uid': 'source_id'})
+
+    # Mondo->MESH
+    df_umls_mesh = get_mapping_set(input_file, filter_sources=['MeSH'], add_prefixes=True, sssomify=False)
+    df_umls_mesh['source_id'] = df_umls_mesh['source_id'].apply(lambda x: 'mesh:' + x)
+    out_df_mesh = pd.merge(df_umls_mesh, df_umls_mondo, left_on='xref_id', right_on='umls_cui').rename(
+        columns={'source_id': 'xref_id', 'xref_id': 'source_id'})[['mondo_id', 'xref_id', 'source_id']]
+
+    # Combine mappings
+    out_df = pd.concat([out_df_medgenuid, out_df_umlscui, out_df_mesh]).sort_values(['xref_id', 'mondo_id'])\
+        .drop_duplicates().fillna('')
+
+    # Add additional cols
+    out_df['source_medgen_id'] = 'MONDO:MEDGEN'
+    # todo: could optimize by doing apply just on the xref_id col
+    def set_mapping_pred(row):
+        """Set mapping predicate"""
+        # M = Concept UI; D = Descriptor UI; C = SupplementalRecordUI; Q = Qualifier UI
+        pred = 'MONDO:equivalentTo' if row['xref_id'].startswith('mesh:M') \
+            else 'MONDO:relatedTo' if row['xref_id'].startswith('mesh:') \
+            else 'MONDO:equivalentTo'
+        return pred
+    # Context on MeSH IDs:
+    #  https://docs.google.com/document/d/1ryu6isBmNEno8lyni70jBaw-D_I6tdEXpnnAK86ZWfs/edit#heading=h.3cho5esard3q
+    out_df['mapping_predicate'] = out_df.apply(set_mapping_pred, axis=1)
 
     # Save
-    out_df = pd.concat([out_df_cui_cn, out_df_uid]).sort_values(['xref_id', 'mondo_id']).drop_duplicates().fillna('')
     out_df = pd.concat([pd.DataFrame([ROBOT_ROW_MAP]), out_df])
     out_df.to_csv(output_file, index=False, sep='\t')
 

diff --git a/src/utils.py b/src/utils.py
@@ -23,6 +23,7 @@ def add_prefixes_to_plain_id(x: str) -> str:
         else f'MEDGEN:{x}'
 
 
+# todo: Add to sssom-py. Shared between, at the least, ICD11 and MedGen repos
 def write_sssom(df: pd.DataFrame, config_path: Union[Path, str], outpath: Union[Path, str]):
     """Writes a SSSOM file"""
     with open(config_path, 'r') as yaml_file:
@@ -37,6 +38,7 @@ def write_sssom(df: pd.DataFrame, config_path: Union[Path, str], outpath: Union[
 #  common code for this and robot template, or add a param to not rename that col
 def get_mapping_set(
     inpath: Union[str, Path], filter_sources: List[str] = None, add_prefixes=False, sssomify=True,
+    filter_out_medgencui=True
 ) -> pd.DataFrame:
     """Load up MedGen mapping set (MedGenIDMappings.txt), with some modifications."""
     # Read
@@ -45,9 +47,14 @@ def get_mapping_set(
     empty_cols = [col for col in df.columns if df[col].isnull().all()]  # caused by trailing | at end of each row
     if empty_cols:
         df = df.drop(columns=empty_cols)
-    # Add prefixes
-    if add_prefixes:
-        df['xref_id'] = df['xref_id'].apply(add_prefixes_to_plain_id)
+    # Filter MEDGENCUI & add prefixes
+    df['xref_id'] = df['xref_id'].apply(add_prefixes_to_plain_id)
+    # - Filter MEDGENCUI
+    if filter_out_medgencui:
+        df = df[~df['xref_id'].str.startswith('MEDGENCUI')]
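+    # - Keep the prefixed column only when prefixes were requested
+    #   NOTE(review): deleting 'xref_id' here conflicts with the sort on 'xref_id' just below -- confirm intent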
+    if not add_prefixes:
+        del df['xref_id']
     # Sort
     df = df.sort_values(['xref_id', 'source_id'])
     if filter_sources: