Skip to content

Commit

Permalink
individual pandas scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
vemonet committed Apr 9, 2024
1 parent 65c8cb4 commit cdfb382
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 29 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ It aims to enable data owners and data scientists to:
* Filter variables per OMOP domain, data type, categorical or not
* 🔗 Data owners can map each variable of their cohorts to standard concepts, sourced from [OHDSI Athena](https://athena.ohdsi.org/search-terms/terms?query=) API (SNOMEDCT, LOINC...) through the web app.
* Mapping variables will help with data processing and exploration (⚠️ work in progress)
* We use namespaces from the [Bioregistry](https://bioregistry.io) to convert concept CURIEs to URIs.
* 🛒 Data scientists can add the cohorts they need to perform their analysis to a Data Clean Room (DCR)
* Once complete, the data scientists can publish their DCR to Decentriq in one click.
* The DCR will be automatically created with a data schema corresponding to the selected cohorts, generated from the metadata provided by the data owners.
Expand Down
2 changes: 1 addition & 1 deletion backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ dependencies = [
"oxrdflib",
"SPARQLWrapper",
"python-dotenv",
"decentriq_platform >=0.26.0", # TODO: conflict with pydantic 2
"decentriq_platform >=0.26.2rc1", # TODO: conflict with pydantic 2
"curies",
# "pydantic >=2.0.0",
# "pydantic-settings",
Expand Down
57 changes: 29 additions & 28 deletions backend/src/decentriq.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,40 +127,23 @@ async def create_compute_dcr(

# Get metadata for selected cohorts and variables
selected_cohorts = {}
# We generate a pandas script to automatically prepare the data from the cohort based on known metadata
pandas_script = "import pandas as pd\nimport decentriq_util\n\n"

# TODO: DONT FILTER COLUMNS IN SCHEMA
# 1 prepare script per data node
for cohort_id, requested_vars in cohorts_request["cohorts"].items():
cohort_meta = deepcopy(all_cohorts[cohort_id])
df_var = f"df_{cohort_id.replace(' ', '_').replace('-', '_').replace('(', '').replace(')', '')}"
if isinstance(requested_vars, list):
# Direct cohort variables list
# pandas_script += f"{df_var} = pd.read_csv('{cohort_id}.csv')\n"
pandas_script += f'{df_var} = decentriq_util.read_tabular_data("/input/{cohort_id}")\n'

if len(requested_vars) <= len(cohort_meta.variables):
# Add filter variables to pandas script
pandas_script += f"{df_var} = {df_var}[{requested_vars}]\n"
# NOTE: this block would filter variables only selected by user.
# We don't want this anymore.
# Get all cohort and variables metadata for selected variables
# for var in all_cohorts[cohort_id].variables:
# if var not in requested_vars:
# del cohort_meta.variables[var]
selected_cohorts[cohort_id] = cohort_meta
elif isinstance(requested_vars, dict):
# Merge operation, need to be implemented on the frontend
pandas_script += pandas_script_merge_cohorts(requested_vars, all_cohorts)
# TODO: add merged cohorts schema to selected_cohorts
# elif isinstance(requested_vars, dict):
# # Merge operation, need to be implemented on the frontend
# pandas_script += pandas_script_merge_cohorts(requested_vars, all_cohorts)
# # TODO: add merged cohorts schema to selected_cohorts
else:
raise HTTPException(status_code=400, detail=f"Invalid structure for cohort {cohort_id}")
pandas_script += f'{df_var}.to_csv("/output/{cohort_id}.csv", index=False, header=True)\n\n'


# TODO: Add pandas_script to the DCR?
# print(pandas_script)

# Establish connection to Decentriq
client = dq.create_client(settings.decentriq_email, settings.decentriq_token)
Expand All @@ -172,13 +155,11 @@ async def create_compute_dcr(
builder = (
AnalyticsDcrBuilder(client=client)
.with_name(dcr_title)
# .with_owner(user["email"])
.with_owner(settings.decentriq_email)
.with_description("A data clean room to run computations on cohorts for the iCARE4CVD project")
.with_airlock()
)


preview_nodes = []
# Convert cohort variables to decentriq schema
for cohort_id, cohort in selected_cohorts.items():
Expand All @@ -196,14 +177,34 @@ async def create_compute_dcr(
))
preview_nodes.append(preview_node_id)

# Add data owners to provision the data
for owner in cohort.cohort_email:
builder.add_participant(owner, data_owner_of=[data_node_id])

# Add python data preparation script
# builder.add_node_definition(
# PythonComputeNodeDefinition(name="prepare-data", script=pandas_script, dependencies=data_nodes)
# )
# builder.add_participant(user["email"], analyst_of=["prepare-data", *preview_nodes])
# Add pandas preparation script
pandas_script = "import pandas as pd\nimport decentriq_util\n\n"
df_var = f"df_{cohort_id.replace('-', '_')}"
requested_vars = cohorts_request["cohorts"][cohort_id]
if isinstance(requested_vars, list):
# Direct cohort variables list
pandas_script += f'{df_var} = decentriq_util.read_tabular_data("/input/{cohort_id}")\n'

if len(requested_vars) <= len(cohort.variables):
# Add filter variables to pandas script
pandas_script += f"{df_var} = {df_var}[{requested_vars}]\n"
elif isinstance(requested_vars, dict):
# Merge operation, need to be implemented on the frontend
pandas_script += pandas_script_merge_cohorts(requested_vars, all_cohorts)
# TODO: add merged cohorts schema to selected_cohorts
else:
raise HTTPException(status_code=400, detail=f"Invalid structure for cohort {cohort_id}")
pandas_script += f'{df_var}.to_csv("/output/{cohort_id}.csv", index=False, header=True)\n\n'

# Add python data preparation script
builder.add_node_definition(
PythonComputeNodeDefinition(name=f"prepare-{cohort_id}", script=pandas_script, dependencies=[data_node_id])
)
builder.add_participant(user["email"], analyst_of=[f"prepare-{cohort_id}"])

# Add users permissions
builder.add_participant(user["email"], analyst_of=preview_nodes)
Expand Down

0 comments on commit cdfb382

Please sign in to comment.