From a508bc9ca7cf7c7afe9c12b9efa394f2dae8c12b Mon Sep 17 00:00:00 2001 From: Joe Flack Date: Sun, 19 May 2024 15:53:30 -0400 Subject: [PATCH] Bug fix: Field rename - MedGen source made breaking data model change; a field was renamed. Updated code to match. General - Bug fixes: for 'make all', which we're not currently using. But the goal still exists and, if run, was bugging out because the dependency pipeline was broken. Certain necessary goals were commented out, and some references were incorrect. - Update: Some files in the release/ dir were not actually release files, so they were moved to output/. Code in various files updated for this. - Update: Added and removed some comments and todos. - Update: Changed some goal prereqs to make dependencies clearer - Rename: goal 'sssom' to 'sssom-validate' - Update: goal 'sssom' is now a different goal and there for convenience --- .github/workflows/buid_and_release.yml | 2 -- makefile | 39 ++++++++++++++++---------- src/create_sssom.py | 5 ++-- src/mondo_mapping_status.py | 2 +- src/utils.py | 5 +++- 5 files changed, 32 insertions(+), 21 deletions(-) diff --git a/.github/workflows/buid_and_release.yml b/.github/workflows/buid_and_release.yml index 4574510..58cb191 100644 --- a/.github/workflows/buid_and_release.yml +++ b/.github/workflows/buid_and_release.yml @@ -32,9 +32,7 @@ jobs: title: "${{ steps.current-time.outputs.formattedTime }}" prerelease: false # todo: add back `release/medgen-disease-extract.owl`, pending https://github.com/monarch-initiative/medgen/issues/11 - # todo: add `medgen.sssom.tsv`, pending https://github.com/monarch-initiative/medgen/issues/6 # output/release/medgen-disease-extract.owl -# output/release/medgen.sssom.tsv files: | output/release/medgen.obo output/release/medgen-disease-extract.obo diff --git a/makefile b/makefile index 392d586..42e7efb 100644 --- a/makefile +++ b/makefile @@ -3,7 +3,7 @@ # that part. In order to force re-download, run `make all -B`. 
# todo: remove parts of old make/perl pipeline no longer used .DEFAULT_GOAL := all -.PHONY: all build stage stage-% analyze clean deploy-release build-lite minimal sssom +.PHONY: all build stage stage-% analyze clean deploy-release build-lite minimal sssom sssom-validate OBO=http://purl.obolibrary.org/obo PRODUCTS=medgen-disease-extract.obo medgen-disease-extract.owl @@ -14,11 +14,10 @@ minimal: build-lite stage-lite clean # stage-lite: These commented out files are produced by `all` but not by `minimal`. Just left here for reference. See: https://github.com/monarch-initiative/medgen/issues/11 stage-lite: | output/release/ # mv medgen-disease-extract.owl output/release/ -# mv medgen.sssom.tsv output/release/ mv *.obo output/release/ mv *.robot.template.tsv output/release/ mv *.sssom.tsv output/release/ -build-lite: medgen-disease-extract.obo medgen-xrefs.robot.template.tsv sssom +build-lite: medgen-disease-extract.obo medgen-xrefs.robot.template.tsv umls-hpo.sssom.tsv sssom-validate all: build stage clean analyze # analyze: runs more than just this file; that goal creates multiple files @@ -51,6 +50,11 @@ ftp.ncbi.nlm.nih.gov/: uid2cui.tsv: ftp.ncbi.nlm.nih.gov/ ./src/make_uid2cui.pl > $@ +# todo: an issue can happen where the file exists but it tries to run the goal again: +# ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt already exists -- do you wish to overwrite (y or n)? +# This happens because the prerequisite `ftp.ncbi.nlm.nih.gov/` is newer than the goal +# `ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt`. When this happened, it was 5 hours newer. However, I don't +# know how this can possibly be the case, since the goal is unzipped within that folder after the folder is created. 
ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt: ftp.ncbi.nlm.nih.gov/ @if [ -f "ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt.gz" ]; then \ gzip -dk ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt.gz; \ @@ -75,21 +79,26 @@ medgen-disease-extract.obo: x-Disease-or-Syndrome.obo x-Neoplastic-Process.obo medgen-disease-extract.json: medgen-disease-extract.obo owltools $< -o -f json $@ -medgen-disease-extract.owl: medgen-disease-extract.obo +output/medgen-disease-extract.owl: medgen-disease-extract.obo | output/ owltools $< -o $@ # SSSOM ---------------------------------- -# todo: comemented out old pipeline: remove -#medgen.obographs.json: -# robot convert -i medgen-disease-extract.owl -o $@ -# -#medgen.sssom.tsv: medgen.obographs.json -# sssom parse medgen.obographs.json -I obographs-json -m config/medgen.sssom-metadata.yml -o $@ -sssom: umls-hpo.sssom.tsv +sssom: umls-hpo.sssom.tsv sssom-validate + +sssom-validate: umls-hpo.sssom.tsv sssom validate umls-hpo.sssom.tsv sssom validate hpo-mesh.sssom.tsv -umls-hpo.sssom.tsv hpo-mesh.sssom.tsv: ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt +# todo: Address GH action build heap space err: +# https://github.com/monarch-initiative/medgen/actions/runs/9150396559/job/25155114016 +# Don't need to fix until the case where we need to use `make all` or otherwise need this file. 
+output/medgen.obographs.json: output/medgen-disease-extract.owl | output/ + robot convert -i $< -o $@ + +output/medgen.sssom.tsv: output/medgen.obographs.json | output/ + sssom parse $< -I obographs-json -m config/medgen.sssom-metadata.yml -o $@ + +umls-hpo.sssom.tsv hpo-mesh.sssom.tsv output/hpo-mesh_non-matches-included.sssom.tsv: ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt python src/create_sssom.py --input-mappings $< --input-sssom-config config/medgen.sssom-metadata.yml # ---------------------------------------- @@ -113,15 +122,15 @@ tmp/input/mondo.sssom.tsv: | tmp/input/ wget http://purl.obolibrary.org/obo/mondo/mappings/mondo.sssom.tsv -O $@ # creates more than just this file; that goal creates multiple files -output/medgen_terms_mapping_status.tsv output/obsoleted_medgen_terms_in_mondo.txt: tmp/input/mondo.sssom.tsv | output/ +output/medgen_terms_mapping_status.tsv output/obsoleted_medgen_terms_in_mondo.txt: tmp/input/mondo.sssom.tsv output/medgen.sssom.tsv | output/ python src/mondo_mapping_status.py # ---------------------------------------- # Robot templates # ---------------------------------------- # todo: Ideally I wanted this done at the end of the ingest, permuting from medgen.sssom.tsv, but there were some -# problems with that file. Eventually changing to that feels like it makes more sense. Will have already been -# pre-curated by disease. And some of the logic in this Python script is duplicative. +# problems with that file. Eventually changing to that feels like it makes more sense. Will have already been +# pre-curated by disease. And some of the logic in this Python script is duplicative. 
medgen-xrefs.robot.template.tsv medgen-xrefs-mesh.robot.template.tsv: ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt python src/mondo_robot_template.py -i $< \ --outpath-general medgen-xrefs.robot.template.tsv \ diff --git a/src/create_sssom.py b/src/create_sssom.py index 1df7afa..4e52b70 100644 --- a/src/create_sssom.py +++ b/src/create_sssom.py @@ -10,10 +10,12 @@ PROJECT_DIR = SRC_DIR.parent FTP_DIR = PROJECT_DIR / "ftp.ncbi.nlm.nih.gov" / "pub" / "medgen" CONFIG_DIR = PROJECT_DIR / "config" +OUTDIR = PROJECT_DIR / "output" INPUT_MAPPINGS = str(FTP_DIR / "MedGenIDMappings.txt") INPUT_CONFIG = str(CONFIG_DIR / "medgen.sssom-metadata.yml") OUTPUT_FILE_HPO_UMLS = str(PROJECT_DIR / "umls-hpo.sssom.tsv") OUTPUT_FILE_HPO_MESH = str(PROJECT_DIR / "hpo-mesh.sssom.tsv") +OUTPUT_FILE_HPO_MESH_WITH_NON_MATCHES = str(OUTDIR / "hpo-mesh_non-matches-included.sssom.tsv") def _filter_and_format_cols(df: pd.DataFrame, source: str) -> pd.DataFrame: @@ -54,8 +56,7 @@ def run(input_mappings: str = INPUT_MAPPINGS, input_sssom_config: str = INPUT_CO # move the col removals below (umls) to above # - add mapping_justification df_hpo_mesh['mapping_justification'] = 'semapv:ManualMappingCuration' - write_sssom(df_hpo_mesh, input_sssom_config, - OUTPUT_FILE_HPO_MESH.replace('.sssom.tsv', '-non-matches-included.sssom.tsv')) + write_sssom(df_hpo_mesh, input_sssom_config, OUTPUT_FILE_HPO_MESH_WITH_NON_MATCHES) # -- filter non-matches & drop unneeded cols df_hpo_mesh = df_hpo_mesh[df_hpo_mesh['subject_id'].notna()][[ x for x in df_hpo_mesh.columns if not x.startswith('umls')]] diff --git a/src/mondo_mapping_status.py b/src/mondo_mapping_status.py index ea3a422..e996699 100644 --- a/src/mondo_mapping_status.py +++ b/src/mondo_mapping_status.py @@ -10,7 +10,7 @@ RELEASE_OUTDIR = OUTDIR / 'release' INPUT_DIR = PROJECT_DIR / 'tmp' / 'input' MONDO_SSSOM_TSV = INPUT_DIR / 'mondo.sssom.tsv' -MEDGEN_SSSOM_TSV = RELEASE_OUTDIR / 'medgen.sssom.tsv' +MEDGEN_SSSOM_TSV = OUTDIR / 'medgen.sssom.tsv' 
# MEDGEN_PREFIXES: Some of these are old, some are new, some may not be used. # todo: If I couldn't convert SSSOM properly with MedGen_CUI, souldn't UMLS_CUI have a problem? though i think it's just coming from previous work in mondo maybe. it's not being used in this ingest MEDGEN_PREFIXES = [ diff --git a/src/utils.py b/src/utils.py index 6d40cdc..e9cc7e6 100644 --- a/src/utils.py +++ b/src/utils.py @@ -42,7 +42,10 @@ def get_mapping_set( ) -> pd.DataFrame: """Load up MedGen mapping set (MedGenIDMappings.txt), with some modifications.""" # Read - df = pd.read_csv(inpath, sep='|').rename(columns={'#CUI': 'xref_id'}) + df = pd.read_csv(inpath, sep='|').rename(columns={ + '#CUI_or_CN_id': 'xref_id', + '#CUI': 'xref_id' # 2024/05/19: MedGen Renamed to "#CUI_or_CN_id". Leaving this in case they change back. + }) # Remove empty columns empty_cols = [col for col in df.columns if df[col].isnull().all()] # caused by trailing | at end of each row if empty_cols: