diff --git a/.github/workflows/buid_and_release.yml b/.github/workflows/buid_and_release.yml index 4574510..58cb191 100644 --- a/.github/workflows/buid_and_release.yml +++ b/.github/workflows/buid_and_release.yml @@ -32,9 +32,7 @@ jobs: title: "${{ steps.current-time.outputs.formattedTime }}" prerelease: false # todo: add back `release/medgen-disease-extract.owl`, pending https://github.com/monarch-initiative/medgen/issues/11 - # todo: add `medgen.sssom.tsv`, pending https://github.com/monarch-initiative/medgen/issues/6 # output/release/medgen-disease-extract.owl -# output/release/medgen.sssom.tsv files: | output/release/medgen.obo output/release/medgen-disease-extract.obo diff --git a/makefile b/makefile index 392d586..856fccd 100644 --- a/makefile +++ b/makefile @@ -3,7 +3,7 @@ # that part. In order to force re-download, run `make all -B`. # todo: remove parts of old make/perl pipeline no longer used .DEFAULT_GOAL := all -.PHONY: all build stage stage-% analyze clean deploy-release build-lite minimal sssom +.PHONY: all build stage stage-% analyze clean deploy-release build-lite minimal sssom sssom-validate OBO=http://purl.obolibrary.org/obo PRODUCTS=medgen-disease-extract.obo medgen-disease-extract.owl @@ -14,11 +14,10 @@ minimal: build-lite stage-lite clean # stage-lite: These commented out files are produced by `all` but not by `minimal`. Just left here for reference. 
See: https://github.com/monarch-initiative/medgen/issues/11 stage-lite: | output/release/ # mv medgen-disease-extract.owl output/release/ -# mv medgen.sssom.tsv output/release/ mv *.obo output/release/ mv *.robot.template.tsv output/release/ mv *.sssom.tsv output/release/ -build-lite: medgen-disease-extract.obo medgen-xrefs.robot.template.tsv sssom +build-lite: medgen-disease-extract.obo medgen-xrefs.robot.template.tsv umls-hpo.sssom.tsv sssom-validate all: build stage clean analyze # analyze: runs more than just this file; that goal creates multiple files @@ -51,6 +50,11 @@ ftp.ncbi.nlm.nih.gov/: uid2cui.tsv: ftp.ncbi.nlm.nih.gov/ ./src/make_uid2cui.pl > $@ +# todo: an issue can happen where the file exists but it tries to run the goal again: +# ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt already exists -- do you wish to overwrite (y or n)? +# This happens because the prerequisite `ftp.ncbi.nlm.nih.gov/` is newer than the goal +# `ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt`. When this happened, it was 5 hours newer. However, I don't +# know how this can possibly be the case, since the goal is unzipped within that folder after the folder is created. 
ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt: ftp.ncbi.nlm.nih.gov/ @if [ -f "ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt.gz" ]; then \ gzip -dk ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt.gz; \ @@ -75,21 +79,26 @@ medgen-disease-extract.obo: x-Disease-or-Syndrome.obo x-Neoplastic-Process.obo medgen-disease-extract.json: medgen-disease-extract.obo owltools $< -o -f json $@ -medgen-disease-extract.owl: medgen-disease-extract.obo +output/medgen-disease-extract.owl: medgen-disease-extract.obo | output/ owltools $< -o $@ # SSSOM ---------------------------------- -# todo: comemented out old pipeline: remove -#medgen.obographs.json: -# robot convert -i medgen-disease-extract.owl -o $@ -# -#medgen.sssom.tsv: medgen.obographs.json -# sssom parse medgen.obographs.json -I obographs-json -m config/medgen.sssom-metadata.yml -o $@ -sssom: umls-hpo.sssom.tsv +sssom: umls-hpo.sssom.tsv sssom-validate + +sssom-validate: umls-hpo.sssom.tsv sssom validate umls-hpo.sssom.tsv sssom validate hpo-mesh.sssom.tsv -umls-hpo.sssom.tsv hpo-mesh.sssom.tsv: ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt +# todo: Address GH action build heap space err: +# https://github.com/monarch-initiative/medgen/actions/runs/9150396559/job/25155114016 +# Don't need to fix until the case where we need to use `make all` or otherwise need this file. 
+output/medgen.obographs.json: output/medgen-disease-extract.owl | output/ + robot convert -i $< -o $@ + +output/medgen.sssom.tsv: output/medgen.obographs.json | output/ + sssom parse $< -I obographs-json -m config/medgen.sssom-metadata.yml -o $@ + +umls-hpo.sssom.tsv hpo-mesh.sssom.tsv output/hpo-mesh_non-matches-included.sssom.tsv: ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt | output/ python src/create_sssom.py --input-mappings $< --input-sssom-config config/medgen.sssom-metadata.yml # ---------------------------------------- @@ -113,15 +122,15 @@ tmp/input/mondo.sssom.tsv: | tmp/input/ wget http://purl.obolibrary.org/obo/mondo/mappings/mondo.sssom.tsv -O $@ # creates more than just this file; that goal creates multiple files -output/medgen_terms_mapping_status.tsv output/obsoleted_medgen_terms_in_mondo.txt: tmp/input/mondo.sssom.tsv | output/ +output/medgen_terms_mapping_status.tsv output/obsoleted_medgen_terms_in_mondo.txt: tmp/input/mondo.sssom.tsv output/medgen.sssom.tsv | output/ python src/mondo_mapping_status.py # ---------------------------------------- # Robot templates # ---------------------------------------- # todo: Ideally I wanted this done at the end of the ingest, permuting from medgen.sssom.tsv, but there were some -# problems with that file. Eventually changing to that feels like it makes more sense. Will have already been -# pre-curated by disease. And some of the logic in this Python script is duplicative. +# problems with that file. Eventually changing to that feels like it makes more sense. Will have already been +# pre-curated by disease. And some of the logic in this Python script is duplicative. 
medgen-xrefs.robot.template.tsv medgen-xrefs-mesh.robot.template.tsv: ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt python src/mondo_robot_template.py -i $< \ --outpath-general medgen-xrefs.robot.template.tsv \ diff --git a/src/create_sssom.py b/src/create_sssom.py index 1df7afa..4e52b70 100644 --- a/src/create_sssom.py +++ b/src/create_sssom.py @@ -10,10 +10,12 @@ PROJECT_DIR = SRC_DIR.parent FTP_DIR = PROJECT_DIR / "ftp.ncbi.nlm.nih.gov" / "pub" / "medgen" CONFIG_DIR = PROJECT_DIR / "config" +OUTDIR = PROJECT_DIR / "output" INPUT_MAPPINGS = str(FTP_DIR / "MedGenIDMappings.txt") INPUT_CONFIG = str(CONFIG_DIR / "medgen.sssom-metadata.yml") OUTPUT_FILE_HPO_UMLS = str(PROJECT_DIR / "umls-hpo.sssom.tsv") OUTPUT_FILE_HPO_MESH = str(PROJECT_DIR / "hpo-mesh.sssom.tsv") +OUTPUT_FILE_HPO_MESH_WITH_NON_MATCHES = str(OUTDIR / "hpo-mesh_non-matches-included.sssom.tsv") def _filter_and_format_cols(df: pd.DataFrame, source: str) -> pd.DataFrame: @@ -54,8 +56,7 @@ def run(input_mappings: str = INPUT_MAPPINGS, input_sssom_config: str = INPUT_CO # move the col removals below (umls) to above # - add mapping_justification df_hpo_mesh['mapping_justification'] = 'semapv:ManualMappingCuration' - write_sssom(df_hpo_mesh, input_sssom_config, - OUTPUT_FILE_HPO_MESH.replace('.sssom.tsv', '-non-matches-included.sssom.tsv')) + write_sssom(df_hpo_mesh, input_sssom_config, OUTPUT_FILE_HPO_MESH_WITH_NON_MATCHES) # -- filter non-matches & drop unneeded cols df_hpo_mesh = df_hpo_mesh[df_hpo_mesh['subject_id'].notna()][[ x for x in df_hpo_mesh.columns if not x.startswith('umls')]] diff --git a/src/mondo_mapping_status.py b/src/mondo_mapping_status.py index ea3a422..e996699 100644 --- a/src/mondo_mapping_status.py +++ b/src/mondo_mapping_status.py @@ -10,7 +10,7 @@ RELEASE_OUTDIR = OUTDIR / 'release' INPUT_DIR = PROJECT_DIR / 'tmp' / 'input' MONDO_SSSOM_TSV = INPUT_DIR / 'mondo.sssom.tsv' -MEDGEN_SSSOM_TSV = RELEASE_OUTDIR / 'medgen.sssom.tsv' +MEDGEN_SSSOM_TSV = OUTDIR / 'medgen.sssom.tsv' 
# MEDGEN_PREFIXES: Some of these are old, some are new, some may not be used. # todo: If I couldn't convert SSSOM properly with MedGen_CUI, souldn't UMLS_CUI have a problem? though i think it's just coming from previous work in mondo maybe. it's not being used in this ingest MEDGEN_PREFIXES = [ diff --git a/src/utils.py b/src/utils.py index 6d40cdc..b2807c5 100644 --- a/src/utils.py +++ b/src/utils.py @@ -42,7 +42,9 @@ def get_mapping_set( ) -> pd.DataFrame: """Load up MedGen mapping set (MedGenIDMappings.txt), with some modifications.""" # Read - df = pd.read_csv(inpath, sep='|').rename(columns={'#CUI': 'xref_id'}) + df = pd.read_csv(inpath, sep='|').rename(columns={ + '#CUI_or_CN_id': 'xref_id', + }) # Remove empty columns empty_cols = [col for col in df.columns if df[col].isnull().all()] # caused by trailing | at end of each row if empty_cols: