Skip to content

Commit

Permalink
fix concept categories
Browse files Browse the repository at this point in the history
  • Loading branch information
vemonet committed Apr 12, 2024
1 parent cc3baa8 commit 9c3d26c
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 26 deletions.
17 changes: 11 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,6 @@ It aims to enable *data custodians* and *data scientists* to:
* The DCR will be automatically created with a data schema corresponding to the selected cohorts, generated from the metadata provided by the data custodians.
* The data scientist can then access their DCR in Decentriq, write the code for their analysis, and request computation of this code on the provisioned cohorts.

> [!WARNING]
>
> If you logged in with a Decentriq user that does not have access to the Cohort Explorer, and need to re-login with another user: you will need to clear cache and cookies. Because Auth0 will keep your login in mind for some time, and it can be quite tricky to reset (they don't give the tools for managing that properly).
> [!IMPORTANT]
>
> Only the owner of the cohort (as described in the spreadsheet holding all cohorts generic metadata), and the platform admins, can upload the data dictionary or edit mappings for a cohort.
Expand All @@ -33,6 +29,13 @@ It aims to enable *data custodians* and *data scientists* to:
>
> You can reupload a cohort dictionary that has already been uploaded (in case you need to fix something). The mappings defined via the Cohort Explorer will be kept, as long as the variable names do not change.
## ⚠️ Known issues

Here are the known issues with the Cohort Explorer, and how to fix them:

- [ ] If you logged in with a Decentriq user that does not have access to the Cohort Explorer and need to log in again with another user, you will need to clear your browser cache and cookies. Auth0 remembers your login for some time, and it can be quite tricky to reset (they do not provide proper tools for managing this).
- [ ] After a period of inactivity you might see a black screen with an error message; in this case, just reload the page.

## 🗺️ Technical overview

This platform is composed of 3 main components:
Expand All @@ -48,15 +51,17 @@ This platform is composed of 3 main components:

🔐 Authentication is done through the Decentriq OAuth provider, but it could easily be replaced by any other OAuth provider. Once the user has logged in through the external OAuth provider, the backend generates an encrypted JWT token, which is passed to the frontend using HTTP-only cookies.

> \[!NOTE]
> [!NOTE]
>
> All metadata about cohorts and variables is retrieved by one mighty SPARQL query, and passed to the frontend as one big dictionary. Filtering and searching are then done in TypeScript on this cohorts dictionary.
>
> We expect the amount of metadata for all cohorts will stay small enough to be handled directly on the client. If it becomes too big, it can be replaced by performing search and applying filters using SPARQL queries, to only retrieve metadata about relevant cohorts.
## ☑️ To do

* [ ] Integrate LUCE blockchain component. Should it be deployed separately, or as a service in the `docker-compose.yml`?
* [ ] Integrate the LUCE blockchain component for data sharing consent:
* [ ] We will store blockchain addresses, handle authentication, and add the UI elements directly in the Cohort Explorer (we can even store private keys or do wallet stuff there too if needed)
* [ ] But we need to be able to query the blockchain easily through an API from our system (a basic HTTP OpenAPI would suffice, e.g. built with [FastAPI](https://fastapi.tiangolo.com))

## 🧑‍💻 Development

Expand Down
1 change: 1 addition & 0 deletions backend/src/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ class VariableCategory:

value: str
label: str
concept_id: Optional[str] = None
mapped_id: Optional[str] = None
mapped_label: Optional[str] = None

Expand Down
25 changes: 12 additions & 13 deletions backend/src/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ def load_cohort_dict_file(dict_path: str, cohort_id: str) -> Dataset:
)
df["categories"] = df["CATEGORICAL"].apply(parse_categorical_string)
if "Label Concept Code" in df.columns:
df["concept_id"] = str(df["Label Concept Code"]).strip()
df["concept_id"] = df.apply(lambda row: str(row["Label Concept Code"]).strip(), axis=1)
else:
# Try to get IDs from old format multiple columns
df["concept_id"] = df.apply(lambda row: get_id_from_multi_columns(row), axis=1)
Expand Down Expand Up @@ -245,35 +245,34 @@ def load_cohort_dict_file(dict_path: str, cohort_id: str) -> Dataset:
categories_codes = []
if row.get("Categorical Value Concept Code"):
categories_codes = row["Categorical Value Concept Code"].split(",")
# Add properties
for column, value in row.items():
# if value and column not in ["categories"]:
if column not in ["categories"] and value:
for column, col_value in row.items():
if column not in ["categories"] and col_value:
# NOTE: we literally use the column name as the property URI in camelcase (that's what I call lazy loading!)
property_uri = ICARE[to_camelcase(column)]
if (
isinstance(value, str)
and (value.startswith("http://") or value.startswith("https://"))
and " " not in value
isinstance(col_value, str)
and (col_value.startswith("http://") or col_value.startswith("https://"))
and " " not in col_value
):
g.add((variable_uri, property_uri, URIRef(value), cohort_uri))
g.add((variable_uri, property_uri, URIRef(col_value), cohort_uri))
else:
g.add((variable_uri, property_uri, Literal(value), cohort_uri))
g.add((variable_uri, property_uri, Literal(col_value), cohort_uri))

# Handle Category
if column in ["categories"]:
if len(value) == 1:
if len(col_value) == 1:
errors.append(
f"Row {i+2} for variable `{row['VARIABLE NAME']}` has only one category `{row['categories'][0]['value']}`. It should have at least two."
)
continue
for index, category in enumerate(value):
for index, category in enumerate(col_value):
cat_uri = get_category_uri(variable_uri, index)
g.add((variable_uri, ICARE.categories, cat_uri, cohort_uri))
g.add((cat_uri, RDF.type, ICARE.VariableCategory, cohort_uri))
g.add((cat_uri, RDF.value, Literal(category["value"]), cohort_uri))
g.add((cat_uri, RDFS.label, Literal(category["label"]), cohort_uri))
try:
if categories_codes:
if categories_codes and str(categories_codes[index]).strip() != "na":
cat_code_uri = converter.expand(str(categories_codes[index]).strip())
if not cat_code_uri:
errors.append(
Expand Down
4 changes: 3 additions & 1 deletion backend/src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def run_query(query: str) -> dict[str, Any]:
SELECT DISTINCT ?cohortId ?cohortInstitution ?cohortType ?cohortEmail ?study_type ?study_participants
?study_duration ?study_ongoing ?study_population ?study_objective ?airlock
?variable ?varName ?varLabel ?varType ?index ?count ?na ?max ?min ?units ?formula ?definition
?omopDomain ?conceptId ?mappedId ?mappedLabel ?visits ?categoryValue ?categoryLabel ?categoryMappedId ?categoryMappedLabel
?omopDomain ?conceptId ?mappedId ?mappedLabel ?visits ?categoryValue ?categoryLabel ?categoryConceptId ?categoryMappedId ?categoryMappedLabel
WHERE {
GRAPH ?cohortMetadataGraph {
?cohort a icare:Cohort ;
Expand Down Expand Up @@ -79,6 +79,7 @@ def run_query(query: str) -> dict[str, Any]:
?variable icare:categories ?category.
?category rdfs:label ?categoryLabel ;
rdf:value ?categoryValue .
OPTIONAL { ?category icare:conceptId ?categoryConceptId }
}
}
}
Expand Down Expand Up @@ -180,6 +181,7 @@ def retrieve_cohorts_metadata(user_email: str) -> dict[str, Cohort]:
VariableCategory(
value=str(row["categoryValue"]["value"]),
label=str(row["categoryLabel"]["value"]),
concept_id=get_curie_value("categoryConceptId", row),
mapped_id=get_curie_value("categoryMappedId", row),
mapped_label=get_value("categoryMappedLabel", row),
)
Expand Down
4 changes: 2 additions & 2 deletions frontend/src/components/VariablesList.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -314,8 +314,8 @@ const VariablesList = ({cohortId, searchFilters = {searchQuery: ''}}: any) => {
<AutocompleteConcept
query={option.label}
index={`${cohortId}_${variable.index}_category_${index}`}
value={option.mapped_id}
tooltip={option.mapped_label || option.mapped_id}
value={option.mapped_id || option.concept_id}
tooltip={option.mapped_label || option.mapped_id || option.concept_id}
onSelect={concept => handleConceptSelect(variable.var_name, concept, index)}
canEdit={cohortsData[cohortId].can_edit}
/>
Expand Down
10 changes: 6 additions & 4 deletions frontend/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,19 +25,21 @@ export interface Variable {
visits: string;
formula: string;
definition: string;
concept_id: string;
omop_domain: string;
index: number;
concept_id: string;
mapped_id: string | null;
mapped_label: string | null;
categories: Category[];
mapped_concept: string | null;
[key: string]: any;
}

export interface Category {
value: string;
label: string;
concept_id: string;
mapped_concept: string | null;
concept_id: string | null;
mapped_id: string | null;
mapped_label: string | null;
}

export interface Concept {
Expand Down

0 comments on commit 9c3d26c

Please sign in to comment.