From 12791b5297ec5cd27fb40d9e71f098b1415c9f50 Mon Sep 17 00:00:00 2001
From: "houhan@gmail.com"
Date: Fri, 9 Aug 2024 00:33:46 +0000
Subject: [PATCH] feat: put David's app in

---
 code/pages/3_AIND data access playground.py |  35 +++++++
 code/util/fetch_data_docDB.py               | 111 +++++++++++++++++++++
 requirements.txt                            |   2 +
 3 files changed, 148 insertions(+)
 create mode 100644 code/pages/3_AIND data access playground.py
 create mode 100644 code/util/fetch_data_docDB.py

diff --git a/code/pages/3_AIND data access playground.py b/code/pages/3_AIND data access playground.py
new file mode 100644
index 0000000..78293a8
--- /dev/null
+++ b/code/pages/3_AIND data access playground.py
@@ -0,0 +1,35 @@
+'''Migrated from David's toy app https://codeocean.allenneuraldynamics.org/capsule/9532498/tree
+'''
+
+import logging
+from aind_data_access_api.document_db import MetadataDbClient
+from util.fetch_data_docDB import fetch_fip_data
+
+import streamlit as st
+from streamlit_dynamic_filters import DynamicFilters
+
+
+@st.cache_resource
+def load_client():
+    """Create the DocDB metadata client once and share it across reruns."""
+    return MetadataDbClient(
+        host="api.allenneuraldynamics.org",
+        database="metadata_index",
+        collection="data_assets",
+    )
+
+
+@st.cache_data
+def load_data():
+    """Fetch the FIP session table; cached so reruns skip the DocDB queries."""
+    # Ask for the cached client here instead of reading a module-level global
+    # that is only assigned further down the script.
+    return fetch_fip_data(load_client())
+
+
+client = load_client()
+df = load_data()
+
+dynamic_filters = DynamicFilters(df=df, filters=['subject_id', 'subject_genotype'])
+dynamic_filters.display_filters()
+dynamic_filters.display_df()
diff --git a/code/util/fetch_data_docDB.py b/code/util/fetch_data_docDB.py
new file mode 100644
index 0000000..d456db5
--- /dev/null
+++ b/code/util/fetch_data_docDB.py
@@ -0,0 +1,111 @@
+import pandas as pd
+import logging
+import time
+
+logger = logging.getLogger(__name__)
+
+# Columns produced by map_record_to_dict(); passing them explicitly keeps the
+# DataFrame schema stable even when a query returns no records.
+RECORD_COLUMNS = ['location', 'session_name', 'creation_time', 'subject_id', 'subject_genotype']
+
+
+def fetch_fip_data(client):
+    """Build one table of fiber photometry (FIP) sessions and their processed results.
+
+    Args:
+        client: object exposing ``retrieve_docdb_records(filter_query=...)``;
+            the Streamlit page passes a ``MetadataDbClient``.
+
+    Returns:
+        pandas.DataFrame with the ``RECORD_COLUMNS`` fields plus ``results`` and
+        ``results_location`` (both ``None`` when no processed asset matches).
+    """
+    # NOTE(review): progress is logged at WARNING, presumably so it shows up
+    # under the default log level - confirm before lowering it to INFO.
+
+    # search for records that have the "fib" (for fiber photometry) modality in data_description
+    logger.warning("fetching 'fib' records...")
+    modality_results = client.retrieve_docdb_records(filter_query={
+        "data_description.modality.abbreviation": "fib"
+    })
+
+    # there are more from the past that didn't specify modality correctly.
+    # until this is fixed, need to guess by asset name
+    logger.warning("fetching FIP records by name...")
+    name_results = client.retrieve_docdb_records(filter_query={
+        "name": {"$regex": "^FIP.*"}
+    })
+
+    # make some dataframes from these two queries
+    records_by_modality_df = _records_to_df(modality_results)
+    records_by_name_df = _records_to_df(name_results)
+
+    # currently there are some sessions uploaded twice in two different locations.
+    # let's filter out the ones in aind-ophys-data, a deprecated location
+    dup_df = records_by_name_df[records_by_name_df.duplicated('session_name', keep=False)]
+    # plain substring match; na=False keeps a missing location from breaking the mask
+    dup_df = dup_df[dup_df['location'].str.contains("aind-ophys-data", regex=False, na=False)]
+    records_by_name_df = records_by_name_df.drop(dup_df.index)
+
+    # now we have a master data frame; reset the index so row labels are unique
+    # again after stacking the two frames
+    combined_df = (
+        pd.concat([records_by_modality_df, records_by_name_df], axis=0)
+        .drop_duplicates()
+        .reset_index(drop=True)
+    )
+
+    # let's get processed results too
+    logger.warning("fetching processed results...")
+    processed_results = client.retrieve_docdb_records(filter_query={
+        "name": {"$regex": "^behavior_.*processed_.*"}
+    })
+
+    # converting to a dictionary
+    processed_results_by_name = { r['name']: r for r in processed_results }
+
+    # adding two columns to our master dataframe - result name and result s3 location.
+    # each session is matched once and both columns are read from that match.
+    matches = [find_result(name, processed_results_by_name) for name in combined_df['session_name']]
+    combined_df['results'] = [match.get('name') for match in matches]
+    combined_df['results_location'] = [match.get('location') for match in matches]
+
+    return combined_df
+
+
+def _records_to_df(records):
+    """Flatten DocDB records into a DataFrame that always has RECORD_COLUMNS."""
+    return pd.DataFrame([map_record_to_dict(r) for r in records], columns=RECORD_COLUMNS)
+
+
+def map_record_to_dict(record):
+    """Map a metadata record to a flat dict with only the fields we care about.
+
+    ``location`` and ``name`` are required (KeyError if absent); the other
+    fields fall back to '' when the key is missing or its value is falsy.
+    """
+    dd = record.get('data_description', {}) or {}
+    creation_time = dd.get('creation_time', '') or ''
+    subject = record.get('subject', {}) or {}
+    subject_id = subject.get('subject_id') or ''
+    subject_genotype = subject.get('genotype') or ''
+
+    return {
+        'location': record['location'],
+        'session_name': record['name'],
+        'creation_time': creation_time,
+        'subject_id': subject_id,
+        'subject_genotype': subject_genotype,
+    }
+
+
+def find_result(x, lookup):
+    """Return the first record in ``lookup`` whose name starts with ``x``, else {}.
+
+    lazy shortcut: a result is matched to a session when the session name is a
+    prefix of the result's name.
+    """
+    for result_name, result in lookup.items():
+        if result_name.startswith(x):
+            return result
+    return {}
diff --git a/requirements.txt b/requirements.txt
index 3bbe467..5ca0c0d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -95,3 +95,5 @@ yarl==1.8.1
 zipp==3.10.0
 git+https://github.com/AllenNeuralDynamics/aind-foraging-behavior-bonsai-automatic-training.git@main
 pygwalker==0.4.7
+aind-data-access-api[docdb]==0.13.0
+streamlit-dynamic-filters==0.1.9
\ No newline at end of file