diff --git a/config/medgen.sssom-metadata.yml b/config/medgen.sssom-metadata.yml index f93c10d..e5b4118 100644 --- a/config/medgen.sssom-metadata.yml +++ b/config/medgen.sssom-metadata.yml @@ -2,7 +2,7 @@ creator_id: orcid:0000-0002-2906-7319 curie_map: GTR: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/GTR/ HP: http://purl.obolibrary.org/obo/HP_ - MESH: http://identifiers.org/mesh/ + mesh: http://identifiers.org/mesh/ MONDO: http://purl.obolibrary.org/obo/MONDO_ MEDGEN: http://purl.obolibrary.org/obo/MEDGEN_ MEDGENCUI: http://purl.obolibrary.org/obo/MEDGENCUI_ diff --git a/src/__init__.py b/src/__init__.py index 98e378e..05e41d5 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -1 +1 @@ -"""MedGen""" +"""MedGen ingest """ diff --git a/src/create_sssom.py b/src/create_sssom.py index c4c4dcc..1df7afa 100644 --- a/src/create_sssom.py +++ b/src/create_sssom.py @@ -49,7 +49,7 @@ def run(input_mappings: str = INPUT_MAPPINGS, input_sssom_config: str = INPUT_CO df_hpo_mesh = df_hpo_mesh[['subject_id', 'predicate_id', 'object_id', 'umls_id', 'umls_label']].sort_values( ['subject_id', 'object_id'], na_position='first') # -- add missing prefixes - df_hpo_mesh['object_id'] = df_hpo_mesh['object_id'].apply(lambda x: 'MESH:' + x) + df_hpo_mesh['object_id'] = df_hpo_mesh['object_id'].apply(lambda x: 'mesh:' + x) # todo: temp; (1) remove later: saving dataset with no matches, for review (2) after remove, will need to # move the col removals below (umls) to above # - add mapping_justification diff --git a/src/mondo_mapping_status.py b/src/mondo_mapping_status.py index fed8e32..ea3a422 100644 --- a/src/mondo_mapping_status.py +++ b/src/mondo_mapping_status.py @@ -19,8 +19,6 @@ ] CURIE = str -# TODO: Mappings can be considered skos:exactMatch - def ids_prefixless(ids: Set[str]) -> Set[str]: """Remove prefix""" return set([x.split(':')[1] for x in ids]) diff --git a/src/mondo_robot_template.py b/src/mondo_robot_template.py index 32b5eba..1070386 100644 --- 
a/src/mondo_robot_template.py +++ b/src/mondo_robot_template.py @@ -11,7 +11,7 @@ import pandas as pd -from utils import get_mapping_set, add_prefixes_to_plain_id +from utils import add_prefixes_to_plain_id, get_mapping_set SRC_DIR = Path(__file__).parent PROJECT_DIR = SRC_DIR.parent @@ -21,31 +21,66 @@ ROBOT_ROW_MAP = { 'mondo_id': 'ID', 'xref_id': 'A oboInOwl:hasDbXref', - 'source_id': '>A oboInOwl:source' + 'source_id': '>A oboInOwl:source', + 'source_medgen_id': '>A oboInOwl:source', + 'mapping_predicate': '>A oboInOwl:source', } -def run(input_file: str = INPUT_FILE, output_file: str = OUTPUT_FILE): - """Create robot template""" +# todo: refactor to use get_mapping_set(): (1) *maybe* use SSSOM as the intermediate standard (sssomify=True), and +# update column renames below. and (2) use filter_sources (already used by MeSH). +def run(input_file: str = INPUT_FILE, output_file: str = OUTPUT_FILE, filter_out_medgencui=True): + """Create robot template + + :param filter_out_medgencui: There should be no cases where Mondo or any other sources we care about will be + MEDGENCUI (CN) instances, but we know we do not want them, so this default filtration step helps with that possible + future edge case.""" # Read input - df: pd.DataFrame = get_mapping_set(input_file) - # Get explicit Medgen (CUI, CN) -> Mondo mappings - df_medgen_mondo = df[df['source'] == 'MONDO'][['source_id', 'xref_id']].rename(columns={'source_id': 'mondo_id'}) - out_df_cui_cn = df_medgen_mondo.copy() - out_df_cui_cn['xref_id'] = out_df_cui_cn['xref_id'].apply(add_prefixes_to_plain_id) + df = get_mapping_set(input_file, add_prefixes=True, sssomify=False, filter_out_medgencui=filter_out_medgencui) + + # Mondo->MEDGEN & Mondo->UMLS + # 1. Get explicit Medgen (CUI, CN) -> Mondo mappings + df_umls_mondo = df[df['source'] == 'MONDO'][['source_id', 'xref_id']].rename( + columns={'source_id': 'mondo_id', 'xref_id': 'umls_cui'}) - # Get Medgen (UID) -> Mondo mappings + # 2. 
Get Medgen (UID) -> Mondo mappings # - Done by proxy: UID <-> CUI <-> MONDO - df_medgen_medgenuid = df[df['source'] == 'MedGen'][['source_id', 'xref_id']].rename( - columns={'source_id': 'medgen_uid'}) - # todo: should some of these steps be in _reformat_mapping_set()? to be utilized by SSSOM files? - out_df_uid = pd.merge(df_medgen_mondo, df_medgen_medgenuid, on='xref_id').rename( - columns={'xref_id': 'source_id', 'medgen_uid': 'xref_id'})[['mondo_id', 'xref_id', 'source_id']] - out_df_uid['xref_id'] = out_df_uid['xref_id'].apply(lambda x: f'MEDGEN:{x}') - out_df_uid['source_id'] = out_df_uid['source_id'].apply(lambda x: f'UMLS:{x}') + df_umls_medgenuid = df[df['source'] == 'MedGen'][['source_id', 'xref_id']].rename( + columns={'source_id': 'medgen_uid', 'xref_id': 'umls_cui'}) + df_umls_medgenuid['medgen_uid'] = ( + df_umls_medgenuid['medgen_uid'].apply(add_prefixes_to_plain_id)) # should/will all be MEDGEN + df_merged = pd.merge(df_umls_mondo, df_umls_medgenuid, on='umls_cui') + # - Split into (Mondo <-> Medgen UID) & (Mondo <-> UMLS CUI) + out_df_medgenuid = df_merged.rename(columns={'medgen_uid': 'xref_id', 'umls_cui': 'source_id'})[[ + 'mondo_id', 'xref_id', 'source_id']] + out_df_medgenuid['source_id'] = '' + out_df_umlscui = df_merged.rename(columns={'umls_cui': 'xref_id', 'medgen_uid': 'source_id'}) + + # Mondo->MESH + df_umls_mesh = get_mapping_set(input_file, filter_sources=['MeSH'], add_prefixes=True, sssomify=False) + df_umls_mesh['source_id'] = df_umls_mesh['source_id'].apply(lambda x: 'mesh:' + x) + out_df_mesh = pd.merge(df_umls_mesh, df_umls_mondo, left_on='xref_id', right_on='umls_cui').rename( + columns={'source_id': 'xref_id', 'xref_id': 'source_id'})[['mondo_id', 'xref_id', 'source_id']] + + # Combine mappings + out_df = pd.concat([out_df_medgenuid, out_df_umlscui, out_df_mesh]).sort_values(['xref_id', 'mondo_id'])\ + .drop_duplicates().fillna('') + + # Add additional cols + out_df['source_medgen_id'] = 'MONDO:MEDGEN' + # todo: could optimize 
by doing apply just on the xref_id col + def set_mapping_pred(row): + """Set mapping predicate""" + # M = Concept UI; D = Descriptor UI; C = SupplementalRecordUI; Q = Qualifier UI + pred = 'MONDO:equivalentTo' if row['xref_id'].startswith('mesh:M') \ + else 'MONDO:relatedTo' if row['xref_id'].startswith('mesh:') \ + else 'MONDO:equivalentTo' + return pred + # Context on MeSH IDs: + # https://docs.google.com/document/d/1ryu6isBmNEno8lyni70jBaw-D_I6tdEXpnnAK86ZWfs/edit#heading=h.3cho5esard3q + out_df['mapping_predicate'] = out_df.apply(set_mapping_pred, axis=1) # Save - out_df = pd.concat([out_df_cui_cn, out_df_uid]).sort_values(['xref_id', 'mondo_id']).drop_duplicates().fillna('') out_df = pd.concat([pd.DataFrame([ROBOT_ROW_MAP]), out_df]) out_df.to_csv(output_file, index=False, sep='\t') diff --git a/src/utils.py b/src/utils.py index fdf31b3..6d40cdc 100644 --- a/src/utils.py +++ b/src/utils.py @@ -23,6 +23,7 @@ def add_prefixes_to_plain_id(x: str) -> str: else f'MEDGEN:{x}' +# todo: Add to sssom-py. 
Shared between, at the least, ICD11 and MedGen repos def write_sssom(df: pd.DataFrame, config_path: Union[Path, str], outpath: Union[Path, str]): """Writes a SSSOM file""" with open(config_path, 'r') as yaml_file: @@ -37,6 +38,7 @@ def write_sssom(df: pd.DataFrame, config_path: Union[Path, str], outpath: Union[ # common code for this and robot template, or add a param to not rename that col def get_mapping_set( inpath: Union[str, Path], filter_sources: List[str] = None, add_prefixes=False, sssomify=True, + filter_out_medgencui=True ) -> pd.DataFrame: """Load up MedGen mapping set (MedGenIDMappings.txt), with some modifications.""" # Read @@ -45,9 +47,14 @@ def get_mapping_set( empty_cols = [col for col in df.columns if df[col].isnull().all()] # caused by trailing | at end of each row if empty_cols: df = df.drop(columns=empty_cols) - # Add prefixes - if add_prefixes: - df['xref_id'] = df['xref_id'].apply(add_prefixes_to_plain_id) + # Filter MEDGENCUI & add prefixes + df['xref_id'] = df['xref_id'].apply(add_prefixes_to_plain_id) + # - Filter MEDGENCUI + if filter_out_medgencui: + df = df[~df['xref_id'].str.startswith('MEDGENCUI')] + # - Prefixes not requested: xref_id is dropped. NOTE(review): the sort below still uses xref_id; confirm + if not add_prefixes: + del df['xref_id'] # Sort df = df.sort_values(['xref_id', 'source_id']) if filter_sources: