Skip to content

Commit

Permalink
support new format for providing concept and categories codes
Browse files Browse the repository at this point in the history
  • Loading branch information
vemonet committed Apr 2, 2024
1 parent 885aa98 commit 0fb6b6a
Showing 1 changed file with 21 additions and 12 deletions.
33 changes: 21 additions & 12 deletions backend/src/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,19 +158,15 @@ def parse_categorical_string(s: str) -> list[dict[str, str]]:
"MAX",
"Definition",
"Formula",
"ICD-10",
"ATC-DDD",
"LOINC",
"SNOMED-CT",
"OMOP",
"Visits",
]
ACCEPTED_DATATYPES = ["STR", "FLOAT", "INT", "DATETIME"]
# Old multiple column format for concept codes:
ID_COLUMNS_NAMESPACES = {"ICD-10": "icd10", "SNOMED-CT": "snomedct", "ATC-DDD": "atc", "LOINC": "loinc"}


def create_uri_from_id(row):
"""Build concepts URIs from the ID provided in the various columns of the data dictionary"""
def get_id_from_multi_columns(row):
"""Old format to pass concepts URIs from the ID provided in the various columns of the data dictionary (ICD-10, ATC, SNOMED...)"""
uris_list = []

for column in ID_COLUMNS_NAMESPACES:
Expand Down Expand Up @@ -211,7 +207,11 @@ def load_cohort_dict_file(dict_path: str, cohort_id: str, user_email: str) -> Da
detail=f"Missing column `{column}`",
)
df["categories"] = df["CATEGORICAL"].apply(parse_categorical_string)
df["concept_id"] = df.apply(lambda row: create_uri_from_id(row), axis=1)
if "Label Concept Code" in df.columns:
df["concept_id"] = df["Label Concept Code"]
else:
# Try to get IDs from old format multiple columns
df["concept_id"] = df.apply(lambda row: get_id_from_multi_columns(row), axis=1)

cohort_uri = get_cohort_uri(cohort_id)
g = init_graph()
Expand All @@ -226,7 +226,7 @@ def load_cohort_dict_file(dict_path: str, cohort_id: str, user_email: str) -> Da
errors.append(f"Row {i+2} is missing required data: variable_name, variable_label, or var_type")
if row["VAR TYPE"] not in ACCEPTED_DATATYPES:
errors.append(
f"Row {i+2} for variable {row['VARIABLE NAME']} is using a wrong datatype: {row['VAR TYPE']}. It should be one of: {', '.join(ACCEPTED_DATATYPES)}"
f"Row {i+2} for variable `{row['VARIABLE NAME']}` is using a wrong datatype: `{row['VAR TYPE']}`. It should be one of: {', '.join(ACCEPTED_DATATYPES)}"
)
# TODO: raise error when duplicate value for VARIABLE LABEL?

Expand All @@ -240,6 +240,10 @@ def load_cohort_dict_file(dict_path: str, cohort_id: str, user_email: str) -> Da
g.add((variable_uri, RDFS.label, Literal(row["VARIABLE LABEL"]), cohort_uri))
g.add((variable_uri, ICARE["index"], Literal(i, datatype=XSD.integer), cohort_uri))

# Get categories code if provided
categories_codes = []
if "Categorical Value Concept Code" in row and row["Categorical Value Concept Code"]:
categories_codes = row["Categorical Value Concept Code"].split(",")
# Add properties
for column, value in row.items():
# if value and column not in ["categories"]:
Expand All @@ -258,7 +262,7 @@ def load_cohort_dict_file(dict_path: str, cohort_id: str, user_email: str) -> Da
if column in ["categories"]:
if len(value) == 1:
errors.append(
f"Row {i+2} for variable {row['VARIABLE NAME']} has only one category `{row['categories'][0]['value']}`. It should have at least two."
f"Row {i+2} for variable `{row['VARIABLE NAME']}` has only one category `{row['categories'][0]['value']}`. It should have at least two."
)
continue
for index, category in enumerate(value):
Expand All @@ -267,7 +271,12 @@ def load_cohort_dict_file(dict_path: str, cohort_id: str, user_email: str) -> Da
g.add((cat_uri, RDF.type, ICARE.VariableCategory, cohort_uri))
g.add((cat_uri, RDF.value, Literal(category["value"]), cohort_uri))
g.add((cat_uri, RDFS.label, Literal(category["label"]), cohort_uri))
# TODO: add categories
if categories_codes:
cat_code_uri = converter.expand(categories_codes[index])
if not cat_code_uri:
errors.append(f"Row {i+2} for variable `{row['VARIABLE NAME']}` the category concept code provided for `{categories_codes[index]}` is not valid. Use one of snomedct:, icd10:, atc: or loinc: prefixes.")
else:
g.add((cat_uri, ICARE.conceptId, URIRef(cat_code_uri), cohort_uri))
# print(g.serialize(format="turtle"))
# Print all errors at once
if len(errors) > 0:
Expand All @@ -278,7 +287,7 @@ def load_cohort_dict_file(dict_path: str, cohort_id: str, user_email: str) -> Da
except Exception as e:
raise HTTPException(
status_code=422,
detail=str(e),
detail=str(e)[5:],
)
return g

Expand Down

0 comments on commit 0fb6b6a

Please sign in to comment.