Skip to content

Commit

Permalink
Subclass Sync - Direct in source, indirect in Mondo (#714)
Browse files Browse the repository at this point in the history
* Subclass Sync - Direct in source, indirect in Mondo
- Add: New outputs: *confirmed-direct-source-indirect-mondo.robot.tsv
- Update: Python & makefile code to handle this case
  - Also: Filter out cases where subject or object in 'MONDO:0000001', 'MONDO:0700096' - 'disease' or 'human disease'
- More general updates:
  - Update: Includes some refactoring around common '-confirmed' cases and in general
  - Update more generally: Tweaked some variable names to be more clear. Corrected an erroneous comment.
  • Loading branch information
joeflack4 authored Dec 16, 2024
1 parent 287ef0b commit 8879acb
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 28 deletions.
8 changes: 6 additions & 2 deletions src/ontology/mondo-ingest.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -544,7 +544,7 @@ sync: sync-subclassof sync-synonyms

# Synchronization: SubclassOf
.PHONY: sync-subclassof
sync-subclassof: $(REPORTDIR)/sync-subClassOf.confirmed.tsv $(REPORTDIR)/sync-subClassOf.direct-in-mondo-only.tsv $(TMPDIR)/sync-subClassOf.added.self-parentage.tsv
sync-subclassof: $(REPORTDIR)/sync-subClassOf.confirmed.tsv $(REPORTDIR)/sync-subClassOf.confirmed-direct-source-indirect-mondo.tsv $(REPORTDIR)/sync-subClassOf.direct-in-mondo-only.tsv $(TMPDIR)/sync-subClassOf.added.self-parentage.tsv

# todo: drop this? This is really just an alias here for quality of life, but not used by anything.
.PHONY: sync-subclassof-%
Expand All @@ -561,11 +561,15 @@ $(REPORTDIR)/sync-subClassOf.direct-in-mondo-only.tsv: $(foreach n,$(ALL_COMPONE
$(REPORTDIR)/sync-subClassOf.confirmed.tsv: $(foreach n,$(ALL_COMPONENT_IDS), $(REPORTDIR)/$(n).subclass.confirmed.robot.tsv)
awk '(NR == 1) || (NR == 2) || (FNR > 2)' $(REPORTDIR)/*.subclass.confirmed.robot.tsv > $@

$(REPORTDIR)/%.subclass.confirmed.robot.tsv $(REPORTDIR)/%.subclass.added.robot.tsv $(REPORTDIR)/%.subclass.added-obsolete.robot.tsv $(REPORTDIR)/%.subclass.direct-in-mondo-only.tsv $(TMPDIR)/%.subclass.self-parentage.tsv: $(TMPDIR)/mondo-ingest.db $(TMPDIR)/mondo.db $(TMPDIR)/mondo.sssom.tsv
$(REPORTDIR)/sync-subClassOf.confirmed-direct-source-indirect-mondo.tsv: $(foreach n,$(ALL_COMPONENT_IDS), $(REPORTDIR)/$(n).subclass.confirmed-direct-source-indirect-mondo.robot.tsv)
awk '(NR == 1) || (NR == 2) || (FNR > 2)' $(REPORTDIR)/*.subclass.confirmed-direct-source-indirect-mondo.robot.tsv > $@

$(REPORTDIR)/%.subclass.confirmed.robot.tsv $(REPORTDIR)/%.subclass.confirmed-direct-source-indirect-mondo.robot.tsv $(REPORTDIR)/%.subclass.added.robot.tsv $(REPORTDIR)/%.subclass.added-obsolete.robot.tsv $(REPORTDIR)/%.subclass.direct-in-mondo-only.tsv $(TMPDIR)/%.subclass.self-parentage.tsv: $(TMPDIR)/mondo-ingest.db $(TMPDIR)/mondo.db $(TMPDIR)/mondo.sssom.tsv
python3 $(SCRIPTSDIR)/sync_subclassof.py \
--outpath-added $(REPORTDIR)/$*.subclass.added.robot.tsv \
--outpath-added-obsolete $(REPORTDIR)/$*.subclass.added-obsolete.robot.tsv \
--outpath-confirmed $(REPORTDIR)/$*.subclass.confirmed.robot.tsv \
--outpath-confirmed-direct-source-indirect-mondo $(REPORTDIR)/$*.subclass.confirmed-direct-source-indirect-mondo.robot.tsv \
--outpath-direct-in-mondo-only $(REPORTDIR)/$*.subclass.direct-in-mondo-only.tsv \
--outpath-self-parentage $(TMPDIR)/$*.subclass.self-parentage.tsv \
--mondo-db-path $(TMPDIR)/mondo.db \
Expand Down
84 changes: 58 additions & 26 deletions src/scripts/sync_subclassof.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@
from src.scripts.utils import CACHE_DIR, MONDO_PREFIX_MAP, PREFIX_MAP, get_owned_prefix_map


COMMON_SORT_COLS = ['subject_mondo_id', 'object_mondo_id', 'subject_source_id', 'object_source_id']


def _edges_with_metadata_from_plain_edges(edges: Set[RELATIONSHIP], ns_data_map: Dict[CURIE, Dict]) -> List[Dict]:
"""From simple (subject, predicate, object) edge tuples, create dictionaries with mondo mappings and labels
Expand Down Expand Up @@ -114,6 +117,20 @@ def _get_mondo_data_map(
return mondo_data_map


def _confirmed_df(rows: List[Dict], outpath: Union[str, Path], save=True):
"""Create -confirmed cases (exist in both Mondo and source) dataframe"""
subheader = deepcopy(ROBOT_SUBHEADER)
subheader[0]['object_mondo_id'] = 'SC %'
df = pd.DataFrame(rows)
if len(df) == 0:
df = pd.DataFrame(columns=list(subheader[0].keys()))
df = df.sort_values(COMMON_SORT_COLS)
df = pd.concat([pd.DataFrame(subheader), df])
if save:
df.to_csv(outpath, sep='\t', index=False)
return df


def _get_direct_scr_rels(
curies: List[CURIE], db: SqlImplementation, prefix_map: PREFIX_MAP = None, verbose=True
) -> Set[RELATIONSHIP]:
Expand Down Expand Up @@ -208,6 +225,7 @@ def _convert_edge_namespace(

def sync_subclassof(
outpath_added: str = EX_DEFAULTS['outpath_added'], outpath_confirmed: str = EX_DEFAULTS['outpath_confirmed'],
outpath_confirmed_direct_source_indirect_mondo: str = EX_DEFAULTS['outpath_confirmed_direct_source_indirect_mondo'],
outpath_added_obsolete: str = EX_DEFAULTS['outpath_added_obsolete'],
mondo_db_path: str = EX_DEFAULTS['mondo_db_path'], mondo_ingest_db_path: str = EX_DEFAULTS['mondo_ingest_db_path'],
mondo_mappings_path: str = EX_DEFAULTS['mondo_mappings_path'],
Expand Down Expand Up @@ -298,10 +316,12 @@ def sync_subclassof(
# - rels_indirect_source_source: Indirect relationships (source IDs) from source
# - rels_indirect_source_mondo: Relationships from rels_indirect_source_source that appear in Mondo
# - rels_indirect_source_mondo_and_1or2_ids_not_in_mondo: from rels_indirect_source_source not in Mondo (unused)
# - rels_indirect_mondo_source: Relationships from rels_indirect_mondo_mondo that appear in source
logging.info('- Collecting indirect subclass relations')
rels_indirect_mondo_mondo = ancestors_mondo_mondo.difference(rels_direct_mondo_mondo)
# todo: remove unused, commented out vars?
# rels_indirect_mondo_source = '' # not needed?
rels_indirect_mondo_source, rels_indirect_mondo_mondo_and_1or2_ids_not_in_source = _convert_edge_namespace(
rels_indirect_mondo_mondo, mondo_source_map)
# todo: remove unused, commented out vars? or leave for future cases?
# rels_indirect_source_source = ancestors_source_source.difference(rels_direct_source_source)
# rels_indirect_source_mondo, rels_indirect_source_mondo_and_1or2_ids_not_in_mondo = _convert_edge_namespace(
# rels_indirect_source_source, source_mondo_map)
Expand All @@ -323,13 +343,13 @@ def sync_subclassof(
'\n\nExiting.')

# Determine hierarchy differences -----------------------------------------------------------------------------------
# todo: remove unused, commented out vars? (they were created in anticipation of possible cases)
# todo: remove unused, commented out vars? or leave for future cases?
logging.info('Calculating various differences in hierarchies between source and Mondo')
# Find which edges appear in both Mondo and source, or only in one or the other
# - direct <--> direct
# - Direct SCRs exist in both Mondo and source
in_both_direct_source_edges: Set[RELATIONSHIP] = rels_direct_source_source.intersection(rels_direct_mondo_source)
in_both_direct: List[Dict] = _edges_with_metadata_from_plain_edges(in_both_direct_source_edges, source_data_map)
in_both_direct__source_edges: Set[RELATIONSHIP] = rels_direct_source_source.intersection(rels_direct_mondo_source)
in_both_direct: List[Dict] = _edges_with_metadata_from_plain_edges(in_both_direct__source_edges, source_data_map)
# - Direct SCRs in Mondo which are not direct in source (could be indirect or non-existent)
# in_mondo_only_direct_source_edges: Set[RELATIONSHIP] = (
# rels_direct_mondo_source.difference(rels_direct_source_source))
Expand All @@ -341,6 +361,19 @@ def sync_subclassof(
# in_source_only_direct: List[Dict] = _edges_with_metadata_from_plain_edges(
# in_source_only_direct_source_edges, source_data_map)

# - direct --> indirect
in_source_direct_mondo_indirect__source_edges: Set[RELATIONSHIP] = \
rels_direct_source_source.intersection(rels_indirect_mondo_source)
in_source_direct_mondo_indirect: List[Dict] = _edges_with_metadata_from_plain_edges(
in_source_direct_mondo_indirect__source_edges, source_data_map)

# filter edges with: "disease" or "human disease"
filter_terms = ('MONDO:0000001', 'MONDO:0700096')
in_source_direct_mondo_indirect = [
x for x in in_source_direct_mondo_indirect
if x['subject_mondo_id'] not in filter_terms and x['object_mondo_id'] not in filter_terms
]

# - indirect <--> indirect
# - Indirect SCRs that exist in both Mondo and source
# in_both_indirect_mondo_edges: Set[RELATIONSHIP] = (
Expand All @@ -357,13 +390,13 @@ def sync_subclassof(
# in_mondo_only_indirect: List[Dict] = _edges_with_metadata_from_plain_edges(
# in_mondo_only_indirect_mondo_edges, mondo_data_map)

# - direct <--> ancestor
# - direct <--> absent
# - Direct SCRs in source that don't exist at all in Mondo
in_source_direct_not_in_mondo_source_edges = rels_direct_source_source.difference(ancestors_mondo_source)
in_source_direct_not_in_mondo__source_edges = rels_direct_source_source.difference(ancestors_mondo_source)
in_source_direct_not_in_mondo: List[Dict] = _edges_with_metadata_from_plain_edges(
in_source_direct_not_in_mondo_source_edges, source_data_map)
in_source_direct_not_in_mondo__source_edges, source_data_map)
# - Direct SCRs in Mondo that don't exist at all in source
in_mondo_direct_not_in_source_mondo_edges: Set[RELATIONSHIP] = (
in_mondo_direct_not_in_source__mondo_edges: Set[RELATIONSHIP] = (
rels_direct_mondo_mondo.difference(ancestors_source_mondo))
# in_mondo_direct_not_in_source: List[Dict] = _edges_with_metadata_from_plain_edges(
# in_mondo_direct_not_in_source_mondo_edges, mondo_data_map)
Expand All @@ -372,20 +405,12 @@ def sync_subclassof(
# Google doc about cases:
# https://docs.google.com/document/d/1H8fJKiKD-L1tfS-2LJu8t0_2YXJ1PQJ_7Zkoj7Vf7xA/edit#heading=h.9hixairfgxa1
logging.info('Creating outputs for synchronization cases')
common_sort_cols = ['subject_mondo_id', 'object_mondo_id', 'subject_source_id', 'object_source_id']

# Case 1: SCR direct in source and Mondo
subheader = deepcopy(ROBOT_SUBHEADER)
subheader[0]['object_mondo_id'] = 'SC %'
df1 = pd.DataFrame(in_both_direct)
if len(df1) == 0:
df1 = pd.DataFrame(columns=list(subheader[0].keys()))
df1 = df1.sort_values(common_sort_cols)
df1 = pd.concat([pd.DataFrame(subheader), df1])
df1.to_csv(outpath_confirmed, sep='\t', index=False)
_confirmed_df(in_both_direct, outpath_confirmed)

# Case 2: SCR is direct in source, but indirect Mondo
pass # no output for this case
_confirmed_df(in_source_direct_mondo_indirect, outpath_confirmed_direct_source_indirect_mondo)

# Case 3: SCR is direct in the source, but not at all in Mondo
subheader = deepcopy(ROBOT_SUBHEADER)
Expand All @@ -402,14 +427,14 @@ def sync_subclassof(
f'higher number than expected.')
# - format
# only object_mondo_id should ever be missing, but checking both to be safe
df3 = df3[~df3['subject_mondo_id'].isna() & ~df3['object_mondo_id'].isna()].sort_values(common_sort_cols)
df3 = df3[~df3['subject_mondo_id'].isna() & ~df3['object_mondo_id'].isna()].sort_values(COMMON_SORT_COLS)
# - obsolete cases
obsoletes = (df3['subject_mondo_label'].str.startswith('obsolete') |
df3['object_mondo_label'].str.startswith('obsolete'))
# todo: sort_values: from here and below for this section, should remove; should not be necessary
df3_obs = df3[obsoletes].sort_values(common_sort_cols)
df3_obs = df3[obsoletes].sort_values(COMMON_SORT_COLS)
# - filter: non-obsolete cases
df3 = df3[~obsoletes].sort_values(common_sort_cols)
df3 = df3[~obsoletes].sort_values(COMMON_SORT_COLS)
# - self-parentage / proxy merges
self_parentage_cases = df3['subject_mondo_id'] == df3['object_mondo_id']
df3_self_parentage = df3[self_parentage_cases]
Expand All @@ -419,7 +444,7 @@ def sync_subclassof(
logging.warning(sp_err)
df3_self_parentage.to_csv(outpath_self_parentage, sep='\t', index=False)
# - filter: non-self parentage cases
df3 = df3[~self_parentage_cases].sort_values(common_sort_cols)
df3 = df3[~self_parentage_cases].sort_values(COMMON_SORT_COLS)
# - save
df3_obs = pd.concat([pd.DataFrame(subheader), df3_obs])
df3_obs.to_csv(outpath_added_obsolete, sep='\t', index=False)
Expand All @@ -441,7 +466,7 @@ def sync_subclassof(
'UNSUPPORTED-MISSING' if any(x not in mondo_source_map for x in [row['subject_id'], row['object_id']])
# Supported / Unsupported: Whether edge has a direct or indirect SCR in source
else 'UNSUPPORTED-SUBCLASS'
if (row['subject_id'], 'rdfs:subClassOf', row['object_id']) in in_mondo_direct_not_in_source_mondo_edges
if (row['subject_id'], 'rdfs:subClassOf', row['object_id']) in in_mondo_direct_not_in_source__mondo_edges
else 'SUPPORTED',
axis=1)
df5 = df5.sort_values([src_field, 'subject_id', 'object_id'])
Expand All @@ -468,8 +493,13 @@ def cli(): # todo: #remove-temp-defaults
'into Mondo, except for that these terms are obsolete in Mondo.')
parser.add_argument(
'-c', '--outpath-confirmed', required=False, default=EX_DEFAULTS['outpath_confirmed'],
help='Path to output robot template containing subclass relations for given ontology that exist in Mondo and '
'are confirmed to also exist in the source.')
help='Path to output robot template containing direct subclass relations for given ontology that exist in '
'Mondo and are confirmed to also exist in the source.')
parser.add_argument(
'-C', '--outpath-confirmed-direct-source-indirect-mondo', required=False,
default=EX_DEFAULTS['outpath_confirmed_direct_source_indirect_mondo'],
help='Path to output robot template containing subclass relations for given ontology that exist in Mondo as '
'indirect relations and are confirmed to also exist in the source as direct relations.')
parser.add_argument(
'-M', '--outpath-direct-in-mondo-only', required=False,
default=EX_DEFAULTS['outpath_direct_in_mondo_only'],
Expand Down Expand Up @@ -508,6 +538,8 @@ def run_defaults(use_cache=True): # todo: #remove-temp-defaults
sync_subclassof(**{
'outpath_added': str(REPORTS_DIR / f'{name}.subclass.added.robot.tsv'),
'outpath_confirmed': str(REPORTS_DIR / f'{name}.subclass.confirmed.robot.tsv'),
'outpath_confirmed_direct_source_indirect_mondo': \
str(REPORTS_DIR / f'{name}.subclass.confirmed-direct-source-indirect-mondo.robot.tsv'),
'onto_config_path': str(METADATA_DIR / f'{name}.yml'),
'mondo_db_path': str(TMP_DIR / 'mondo.db'),
'mondo_ingest_db_path': str(TMP_DIR / 'mondo-ingest.db'),
Expand Down
2 changes: 2 additions & 0 deletions src/scripts/sync_subclassof_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
'outpath_added': str(REPORTS_DIR / f'{EX_ONTO_NAME}.subclass.added.robot.tsv'),
'outpath_added_obsolete': str(REPORTS_DIR / f'{EX_ONTO_NAME}.subclass.added-obsolete.robot.tsv'),
'outpath_confirmed': str(REPORTS_DIR / f'{EX_ONTO_NAME}.subclass.confirmed.robot.tsv'),
'outpath_confirmed_direct_source_indirect_mondo': \
str(REPORTS_DIR / f'{EX_ONTO_NAME}.subclass.confirmed-direct-source-indirect-mondo.robot.tsv'),
'onto_config_path': str(METADATA_DIR / f'{EX_ONTO_NAME}.yml'),
'mondo_db_path': str(TMP_DIR / 'mondo.db'),
'mondo_ingest_db_path': str(TMP_DIR / 'mondo-ingest.db'),
Expand Down

0 comments on commit 8879acb

Please sign in to comment.