import re


def generate_iri(label: str) -> str:
    """Generate a placeholder IRI for a label not found in the crosswalk tables.

    The label is lower-cased and stripped of surrounding whitespace, every run
    of non-word characters is collapsed into a single hyphen, and any character
    still outside ``[a-z0-9-]`` (for example ``_``) is dropped. The resulting
    slug is appended to the ``ASCTB-TEMP:`` prefix.

    In the patch this function comes from, ``crosswalk()`` calls it with
    ``row[annotation_column]`` to fill rows whose
    ``crosswalk_table_clid_column`` is missing after the left merge; rows whose
    ``crosswalk_table_match_column`` is missing are set to
    ``"skos:exactMatch"``.

    Args:
        label: Annotation label to convert. Must be a ``str``; other types
            raise ``AttributeError`` on ``.lower()``.

    Returns:
        ``"ASCTB-TEMP:"`` followed by the slug (which may be empty).
    """
    suffix = label.lower().strip()
    # The pattern was previously written as r"/\W+/g" (JavaScript literal
    # syntax). In Python the slashes and the "g" are literal characters, so it
    # never matched ordinary separators and "T Cell" became "tcell".
    # re.ASCII keeps \W limited to non-[A-Za-z0-9_], which is what \W means in
    # JavaScript -- presumably where the pattern was ported from; confirm if
    # IRIs must agree with another implementation.
    suffix = re.sub(r"\W+", "-", suffix, flags=re.ASCII)
    suffix = re.sub(r"[^a-z0-9-]+", "", suffix)
    return "ASCTB-TEMP:" + suffix