fix concept categories

MaastrichtU-IDS · Apr 12, 2024 · 9c3d26c · 9c3d26c
1 parent cc3baa8
commit 9c3d26c
Show file tree

Hide file tree

Showing 6 changed files with 35 additions and 26 deletions.
diff --git a/README.md b/README.md
@@ -21,10 +21,6 @@ It aims to enable *data custodians* and *data scientists* to:
     *   The DCR will be automatically created with a data schema corresponding to the selected cohorts, generated from the metadata provided by the data custodians.
     *   The data scientist can then access their DCR in Decentriq, write the code for their analysis, and request computation of this code on the provisioned cohorts.
 
-> [!WARNING]
->
-> If you logged in with a Decentriq user that does not have access to the Cohort Explorer, and need to re-login with another user: you will need to clear cache and cookies. Because Auth0 will keep your login in mind for some time, and it can be quite tricky to reset (they don't give the tools for managing that properly).
-
 > [!IMPORTANT]
 >
 > Only the owner of the cohort (as described in the spreadsheet holding all cohorts' generic metadata) and the platform admins can upload the data dictionary or edit mappings for a cohort.
@@ -33,6 +29,13 @@ It aims to enable *data custodians* and *data scientists* to:
 >
 > You can reupload a cohort dictionary that has already been uploaded (in case you need to fix something). The mappings defined via the Cohort Explorer will be kept, as long as the variable names do not change.
 
+## ⚠️ Known issues
+
+Here are some known "issues" with the Cohort Explorer, and how to fix them:
+
+- [ ] If you logged in with a Decentriq user that does not have access to the Cohort Explorer, and need to re-login with another user: you will need to clear cache and cookies. Because Auth0 will keep your login in mind for some time, and it can be quite tricky to reset (they don't give the tools for managing that properly).
+- [ ] After a period of inactivity you might see a black screen with an error message; in this case, just reload the page.
+
 ## 🗺️ Technical overview
 
 This platform is composed of 3 main components:
@@ -48,15 +51,17 @@ This platform is composed of 3 main components:
 
 🔐 Authentication is done through the Decentriq OAuth provider, but it could easily be replaced by any other OAuth provider. Once the user has logged in through the external OAuth provider, the backend generates an encrypted JWT token, which is passed to the frontend using HTTP-only cookies.
 
-> \[!NOTE]
+> [!NOTE] 
 >
 > All metadata about cohorts and variables are retrieved by one mighty SPARQL query, and passed to the frontend as one big dictionary. Filtering and searching is then done in TypeScript on this cohorts dictionary.
 >
 > We expect the amount of metadata for all cohorts will stay small enough to be handled directly on the client. If it becomes too big, it can be replaced by performing search and applying filters using SPARQL queries, to only retrieve metadata about relevant cohorts.
 
 ## ☑️ To do
 
-*   [ ] Integrate LUCE blockchain component. Should it be deployed separately, or as a service in the `docker-compose.yml`?
+*   [ ] Integrate the LUCE blockchain component for data sharing consent: 
+    *   [ ] We will store blockchain addresses, handle authentication, and add the UI elements directly in the Cohort Explorer (we can even store private keys or do wallet stuff there too if needed)
+    *   [ ] But we need to be able to query the blockchain easily through an API from our system (a basic HTTP OpenAPI would suffice, e.g. built with [FastAPI](https://fastapi.tiangolo.com))
 
 ## 🧑‍💻 Development
 

diff --git a/backend/src/models.py b/backend/src/models.py
@@ -8,6 +8,7 @@ class VariableCategory:
 
     value: str
     label: str
+    concept_id: Optional[str] = None
     mapped_id: Optional[str] = None
     mapped_label: Optional[str] = None
 

diff --git a/backend/src/upload.py b/backend/src/upload.py
@@ -211,7 +211,7 @@ def load_cohort_dict_file(dict_path: str, cohort_id: str) -> Dataset:
                 )
         df["categories"] = df["CATEGORICAL"].apply(parse_categorical_string)
         if "Label Concept Code" in df.columns:
-            df["concept_id"] = str(df["Label Concept Code"]).strip()
+            df["concept_id"] = df.apply(lambda row: str(row["Label Concept Code"]).strip(), axis=1)
         else:
             # Try to get IDs from old format multiple columns
             df["concept_id"] = df.apply(lambda row: get_id_from_multi_columns(row), axis=1)
@@ -245,35 +245,34 @@ def load_cohort_dict_file(dict_path: str, cohort_id: str) -> Dataset:
             categories_codes = []
             if row.get("Categorical Value Concept Code"):
                 categories_codes = row["Categorical Value Concept Code"].split(",")
-            # Add properties
-            for column, value in row.items():
-                # if value and column not in ["categories"]:
-                if column not in ["categories"] and value:
+            for column, col_value in row.items():
+                if column not in ["categories"] and col_value:
+                    # NOTE: we literally use the column name as the property URI in camelcase (that's what I call lazy loading!)
                     property_uri = ICARE[to_camelcase(column)]
                     if (
-                        isinstance(value, str)
-                        and (value.startswith("http://") or value.startswith("https://"))
-                        and " " not in value
+                        isinstance(col_value, str)
+                        and (col_value.startswith("http://") or col_value.startswith("https://"))
+                        and " " not in col_value
                     ):
-                        g.add((variable_uri, property_uri, URIRef(value), cohort_uri))
+                        g.add((variable_uri, property_uri, URIRef(col_value), cohort_uri))
                     else:
-                        g.add((variable_uri, property_uri, Literal(value), cohort_uri))
+                        g.add((variable_uri, property_uri, Literal(col_value), cohort_uri))
 
                 # Handle Category
                 if column in ["categories"]:
-                    if len(value) == 1:
+                    if len(col_value) == 1:
                         errors.append(
                             f"Row {i+2} for variable `{row['VARIABLE NAME']}` has only one category `{row['categories'][0]['value']}`. It should have at least two."
                         )
                         continue
-                    for index, category in enumerate(value):
+                    for index, category in enumerate(col_value):
                         cat_uri = get_category_uri(variable_uri, index)
                         g.add((variable_uri, ICARE.categories, cat_uri, cohort_uri))
                         g.add((cat_uri, RDF.type, ICARE.VariableCategory, cohort_uri))
                         g.add((cat_uri, RDF.value, Literal(category["value"]), cohort_uri))
                         g.add((cat_uri, RDFS.label, Literal(category["label"]), cohort_uri))
                         try:
-                            if categories_codes:
+                            if categories_codes and str(categories_codes[index]).strip() != "na":
                                 cat_code_uri = converter.expand(str(categories_codes[index]).strip())
                                 if not cat_code_uri:
                                     errors.append(

diff --git a/backend/src/utils.py b/backend/src/utils.py
@@ -41,7 +41,7 @@ def run_query(query: str) -> dict[str, Any]:
 SELECT DISTINCT ?cohortId ?cohortInstitution ?cohortType ?cohortEmail ?study_type ?study_participants
     ?study_duration ?study_ongoing ?study_population ?study_objective ?airlock
     ?variable ?varName ?varLabel ?varType ?index ?count ?na ?max ?min ?units ?formula ?definition
-    ?omopDomain ?conceptId ?mappedId ?mappedLabel ?visits ?categoryValue ?categoryLabel ?categoryMappedId ?categoryMappedLabel
+    ?omopDomain ?conceptId ?mappedId ?mappedLabel ?visits ?categoryValue ?categoryLabel ?categoryConceptId ?categoryMappedId ?categoryMappedLabel
 WHERE {
     GRAPH ?cohortMetadataGraph {
         ?cohort a icare:Cohort ;
@@ -79,6 +79,7 @@ def run_query(query: str) -> dict[str, Any]:
                 ?variable icare:categories ?category.
                 ?category rdfs:label ?categoryLabel ;
                     rdf:value ?categoryValue .
+                OPTIONAL { ?category icare:conceptId ?categoryConceptId }
             }
         }
     }
@@ -180,6 +181,7 @@ def retrieve_cohorts_metadata(user_email: str) -> dict[str, Cohort]:
                 VariableCategory(
                     value=str(row["categoryValue"]["value"]),
                     label=str(row["categoryLabel"]["value"]),
+                    concept_id=get_curie_value("categoryConceptId", row),
                     mapped_id=get_curie_value("categoryMappedId", row),
                     mapped_label=get_value("categoryMappedLabel", row),
                 )

diff --git a/frontend/src/components/VariablesList.tsx b/frontend/src/components/VariablesList.tsx
@@ -314,8 +314,8 @@ const VariablesList = ({cohortId, searchFilters = {searchQuery: ''}}: any) => {
                                   <AutocompleteConcept
                                     query={option.label}
                                     index={`${cohortId}_${variable.index}_category_${index}`}
-                                    value={option.mapped_id}
-                                    tooltip={option.mapped_label || option.mapped_id}
+                                    value={option.mapped_id || option.concept_id}
+                                    tooltip={option.mapped_label || option.mapped_id || option.concept_id}
                                     onSelect={concept => handleConceptSelect(variable.var_name, concept, index)}
                                     canEdit={cohortsData[cohortId].can_edit}
                                   />

diff --git a/frontend/src/types.ts b/frontend/src/types.ts
@@ -25,19 +25,21 @@ export interface Variable {
   visits: string;
   formula: string;
   definition: string;
-  concept_id: string;
   omop_domain: string;
   index: number;
+  concept_id: string;
+  mapped_id: string | null;
+  mapped_label: string | null;
   categories: Category[];
-  mapped_concept: string | null;
   [key: string]: any;
 }
 
 export interface Category {
   value: string;
   label: string;
-  concept_id: string;
-  mapped_concept: string | null;
+  concept_id: string | null;
+  mapped_id: string | null;
+  mapped_label: string | null;
 }
 
 export interface Concept {