Merge pull request #82 from AllenNeuralDynamics/merge_in_df_docDB
feat: Merge in dataframe from mongo_db
hanhou authored Aug 21, 2024
2 parents 8de2877 + 4bf1fee commit 2f796f8
Showing 7 changed files with 115 additions and 48 deletions.
8 changes: 7 additions & 1 deletion .streamlit/config.toml
@@ -1,2 +1,8 @@
[theme]
-base="dark"
+base="dark"
+
+[server]
+fileWatcherType = "poll"
+
+[global]
+disableWidgetStateDuplicationWarning = true
26 changes: 23 additions & 3 deletions code/Home.py
@@ -12,7 +12,7 @@
"""

-__ver__ = 'v2.4.1'
+__ver__ = 'v2.5.0'

import pandas as pd
import streamlit as st
@@ -42,6 +42,8 @@
multiselect_wrapper_for_url_query, number_input_wrapper_for_url_query,
)

+from util.fetch_data_docDB import load_data_from_docDB
+
from aind_auto_train.curriculum_manager import CurriculumManager
from aind_auto_train.auto_train_manager import DynamicForagingAutoTrainManager
from aind_auto_train import __version__ as auto_train_version
@@ -309,7 +311,7 @@ def init():

# For historical reasons, the suffix of df['sessions_bonsai'] just means the data of the Home.py page
df['sessions_bonsai'] = pd.concat([df['sessions_bonsai'], df_bpod['sessions_bonsai']], axis=0)

st.session_state.df = df
st.session_state.df_selected_from_plotly = pd.DataFrame(columns=['h2o', 'session'])
st.session_state.df_selected_from_dataframe = pd.DataFrame(columns=['h2o', 'session'])
@@ -450,9 +452,12 @@ def _get_data_source(rig):
'task', 'notes']
new_order = first_several_cols + [col for col in _df.columns if col not in first_several_cols]
_df = _df[new_order]


+# --- Load data from docDB ---
+_df = merge_in_df_docDB(_df)

st.session_state.df['sessions_bonsai'] = _df # Somehow _df loses the reference to the original dataframe

st.session_state.session_stats_names = [keys for keys in _df.keys()]

# Set session state from URL
@@ -463,6 +468,21 @@ def _get_data_source(rig):

return True

+def merge_in_df_docDB(_df):
+    # Fetch df_docDB
+    df = load_data_from_docDB()
+
+    # Parse session_date from session_name
+    df['session_date'] = pd.to_datetime(df['session_name'].str.split('_').str[2])
+    # Extract the session_time, remove the '-', and strip the leading zeros
+    df['session_time'] = df['session_name'].str.split('_').str[-1]
+    df['nwb_suffix'] = df['session_time'].str.replace('-', '').str.lstrip('0').astype('int64')
+
+    # Merge with _df. Left merge to keep everything on the Home page side
+    left_merged = pd.merge(_df, df, how='left', on=['subject_id', 'session_date', 'nwb_suffix'])
+
+    return left_merged

def app():

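A note on the parsing added above: AIND asset names follow a `<platform>_<subject_id>_<date>_<time>` pattern, so splitting on `_` yields the date at index 2 and the time as the last field, and `nwb_suffix` is that time collapsed to a bare integer. A minimal sketch of the transformation with an invented session name (the name and values are illustrative, not from the commit):

```python
import pandas as pd

# Hypothetical asset name following the <platform>_<subject_id>_<date>_<time> pattern
df = pd.DataFrame({"session_name": ["behavior_713557_2024-08-21_08-05-32"]})

# Same chain as merge_in_df_docDB: index 2 is the date, the last field is the time
df["session_date"] = pd.to_datetime(df["session_name"].str.split("_").str[2])
df["session_time"] = df["session_name"].str.split("_").str[-1]

# '08-05-32' -> '080532' -> '80532' -> 80532; note an all-zero time would strip
# to an empty string, so this relies on real (nonzero) session times
df["nwb_suffix"] = df["session_time"].str.replace("-", "").str.lstrip("0").astype("int64")

print(df[["session_date", "nwb_suffix"]])
#   session_date  nwb_suffix
# 0   2024-08-21       80532
```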
25 changes: 7 additions & 18 deletions code/pages/3_AIND data access playground.py
@@ -2,12 +2,11 @@
'''

import logging
-from aind_data_access_api.document_db import MetadataDbClient
-from util.fetch_data_docDB import fetch_fip_data

import streamlit as st
from streamlit_dynamic_filters import DynamicFilters

+from util.fetch_data_docDB import load_data_from_docDB

try:
st.set_page_config(layout="wide",
page_title='Foraging behavior browser',
@@ -20,22 +19,12 @@
except:
pass

-@st.cache_data
-def load_data():
-    df = fetch_fip_data(client)
-    return df
-
-@st.cache_resource
-def load_client():
-    return MetadataDbClient(
-        host="api.allenneuraldynamics.org",
-        database="metadata_index",
-        collection="data_assets"
-    )
+df = load_data_from_docDB()

-client = load_client()
-df = load_data()
+st.markdown(f'### Note: the dataframe shown here has been merged into the master table on the Home page!')

-dynamic_filters = DynamicFilters(df=df, filters=['subject_id', 'subject_genotype'])
+dynamic_filters = DynamicFilters(
+    df=df,
+    filters=['subject_id', 'subject_genotype'])
dynamic_filters.display_filters()
dynamic_filters.display_df()
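With this change the page no longer builds its own client or cache; both now live in `util/fetch_data_docDB.py` (next file below), which splits caching across Streamlit's two primitives: `st.cache_resource` for the long-lived client connection and `st.cache_data` with a TTL for the fetched dataframe. A minimal sketch of that pattern with stand-in names (`get_client`, `load_df`, and the toy data are hypothetical, not from the commit):

```python
import pandas as pd
import streamlit as st

@st.cache_resource  # one shared, unserialized object reused across reruns and sessions
def get_client():
    # stand-in for MetadataDbClient(host=..., database=..., collection=...)
    return {"host": "api.allenneuraldynamics.org"}

@st.cache_data(ttl=3600 * 12)  # serialized result, re-fetched at most every 12 hours
def load_df():
    client = get_client()
    # stand-in for fetch_fip_data(client)
    return pd.DataFrame({"subject_id": ["000000"], "subject_genotype": ["wt"]})

df = load_df()  # any page can call this; they all share one cache entry
```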
91 changes: 72 additions & 19 deletions code/util/fetch_data_docDB.py
@@ -1,35 +1,84 @@
"""Code to fetch data from docDB by David Feng
"""

import pandas as pd
import logging
import time

+import semver
+import streamlit as st
logger = logging.getLogger(__name__)

+from aind_data_access_api.document_db import MetadataDbClient

+@st.cache_data(ttl=3600*12)  # Cache the df_docDB up to 12 hours
+def load_data_from_docDB():
+    client = load_client()
+    df = fetch_fip_data(client)
+    return df
+
+@st.cache_resource
+def load_client():
+    return MetadataDbClient(
+        host="api.allenneuraldynamics.org",
+        database="metadata_index",
+        collection="data_assets"
+    )


+def find_probes(r):
+    version = semver.Version.parse((r.get('procedures') or {}).get('schema_version', '0.0.0'))
+
+    probes = []
+    if version >= "0.8.1":  # post introduction of Surgery concept
+        sub_procs = (r.get('procedures') or {}).get('subject_procedures') or {}
+        for sub_proc in sub_procs:
+            if sub_proc.get('procedure_type') == 'Surgery':
+                for sp in sub_proc['procedures']:
+                    if sp['procedure_type'] == 'Fiber implant':
+                        probes += sp['probes']
+    else:  # pre Surgery
+        sub_procs = (r.get('procedures') or {}).get('subject_procedures') or {}
+        for sp in sub_procs:
+            if sp['procedure_type'] == 'Fiber implant':
+                probes += sp['probes']
+
+    return probes


def fetch_fip_data(client):
# search for records that have the "fib" (for fiber photometry) modality in data_description
logger.warning("fetching 'fib' records...")
-modality_results = client.retrieve_docdb_records(filter_query={
-    "data_description.modality.abbreviation": "fib"
-})
+modality_results = client.retrieve_docdb_records(
+    filter_query={"data_description.modality.abbreviation": "fib"},
+    paginate_batch_size=500
+)
logger.warning(f"found {len(modality_results)} results")

# there are more from the past that didn't specify modality correctly.
# until this is fixed, need to guess by asset name
logger.warning("fetching FIP records by name...")
-name_results = client.retrieve_docdb_records(filter_query={
-    "name": {"$regex": "^FIP.*"}
-})
-
-# make some dataframes from these two queries
-records_by_modality_df = pd.DataFrame.from_records([ map_record_to_dict(d) for d in modality_results ])
-records_by_name_df = pd.DataFrame.from_records([ map_record_to_dict(d) for d in name_results ])
+name_results = client.retrieve_docdb_records(
+    filter_query={"name": {"$regex": "^FIP.*"}},
+    paginate_batch_size=500
+)
+logger.warning(f"found {len(name_results)} results")

+# in case there is overlap between these two queries, filter down to a single list with unique IDs
+unique_results_by_id = { r['_id']: r for r in modality_results } | { r['_id']: r for r in name_results }
+results = list(unique_results_by_id.values())
+logger.warning(f"found {len(results)} unique results")
+
+# filter out results with 'processed' in the name because I can't rely on data_description.data_level :(
+results = [ r for r in results if not 'processed' in r['name'] ]
+
+# make a dataframe
+records_df = pd.DataFrame.from_records([ map_record_to_dict(d) for d in results ])

# currently there are some sessions uploaded twice in two different locations.
# let's filter out the ones in aind-ophys-data, a deprecated location
-dup_df = records_by_name_df[records_by_name_df.duplicated('session_name',keep=False)]
+dup_df = records_df[records_df.duplicated('session_name',keep=False)]
dup_df = dup_df[dup_df.location.str.contains("aind-ophys-data")]
-records_by_name_df = records_by_name_df.drop(dup_df.index.values)

-# now we have a master data frame
-combined_df = pd.concat([records_by_modality_df, records_by_name_df], axis=0).drop_duplicates()
+records_df = records_df.drop(dup_df.index.values)

# let's get processed results too
logger.warning("fetching processed results...")
@@ -41,10 +90,11 @@ def fetch_fip_data(client):
processed_results_by_name = { r['name']: r for r in processed_results }

# adding two columns to our master dataframe - result name and result s3 location
-combined_df['results'] = combined_df.session_name.apply(lambda x: find_result(x, processed_results_by_name).get('name'))
-combined_df['results_location'] = combined_df.session_name.apply(lambda x: find_result(x, processed_results_by_name).get('location'))
+records_df['results'] = records_df.session_name.apply(lambda x: find_result(x, processed_results_by_name).get('name'))
+records_df['results_location'] = records_df.session_name.apply(lambda x: find_result(x, processed_results_by_name).get('location'))

-return combined_df
+return records_df



def map_record_to_dict(record):
Expand All @@ -55,12 +105,15 @@ def map_record_to_dict(record):
subject_id = subject.get('subject_id') or ''
subject_genotype = subject.get('genotype') or ''


return {
'location': record['location'],
'session_name': record['name'],
'creation_time': creation_time,
'subject_id': subject_id,
'subject_genotype': subject_genotype,
+    'probes': str(find_probes(record))
+
}


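The new dedup step above is worth a second look: two `_id`-keyed dict comprehensions merged with the `|` operator (Python 3.9+) collapse the overlapping query results to one record per ID, with the right-hand dict winning any collision. A self-contained sketch with toy records (the IDs and names are invented for illustration):

```python
# Toy stand-ins for the two docDB query results
modality_results = [
    {"_id": "a1", "name": "FIP_000001_2024-01-01", "location": "s3://bucket/a1"},
    {"_id": "b2", "name": "FIP_000002_2024-01-02", "location": "s3://bucket/b2"},
]
name_results = [
    {"_id": "b2", "name": "FIP_000002_2024-01-02", "location": "s3://bucket/b2"},  # overlaps
    {"_id": "c3", "name": "FIP_000003_2024-01-03_processed", "location": "s3://bucket/c3"},
]

# dict | dict keeps one value per key; later (right-hand) entries win on collision
unique_results_by_id = {r["_id"]: r for r in modality_results} | {r["_id"]: r for r in name_results}
results = list(unique_results_by_id.values())
assert len(results) == 3

# same 'processed' name filter as fetch_fip_data
results = [r for r in results if "processed" not in r["name"]]
assert len(results) == 2
```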
5 changes: 2 additions & 3 deletions code/util/streamlit.py
@@ -1004,9 +1004,8 @@ def _add_agg(df_this, x_name, y_name, group, aggr_method, if_use_x_quantile, q_q
    else:
        df['dot_size'] = dot_size_base

-    # Turn column of group_by to string if it's not
-    if not is_string_dtype(df[group_by]):
-        df[group_by] = df[group_by].astype(str)
+    # Always turn group_by column to str
+    df[group_by] = df[group_by].astype(str)

    # Add a diagonal line first
    if if_show_diagonal:
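The likely rationale for dropping the guard: `is_string_dtype` can report True for any object-dtype column (at least on the pandas 1.x line), so a column mixing strings and numbers skipped the cast and reached Plotly with inconsistent group labels; casting unconditionally makes every label a string. A toy illustration (the data is invented, and the version-dependent `is_string_dtype` behavior is the assumption here):

```python
import pandas as pd
from pandas.api.types import is_string_dtype

df = pd.DataFrame({"group_by": ["ctrl", 1, 2]})  # mixed object-dtype column

# On pandas 1.x this prints True even though not every value is a string,
# so the old conditional skipped the cast for exactly the problematic case
print(is_string_dtype(df["group_by"]))

df["group_by"] = df["group_by"].astype(str)  # the new unconditional cast
print(df["group_by"].tolist())  # ['ctrl', '1', '2'] -- uniform discrete labels
```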
4 changes: 2 additions & 2 deletions environment/Dockerfile
@@ -79,7 +79,6 @@ RUN pip install -U --no-cache-dir \
scipy==1.10.0 \
scikit-learn==1.3.2 \
seaborn==0.11.2 \
-semver==2.13.0 \
six==1.16.0 \
smmap==5.0.0 \
statannotations==0.5.0 \
@@ -107,7 +106,8 @@ RUN pip install -U --no-cache-dir \
git+https://github.com/AllenNeuralDynamics/aind-foraging-behavior-bonsai-automatic-training.git@main \
pygwalker==0.4.7 \
aind-data-access-api[docdb]==0.13.0 \
-streamlit-dynamic-filters==0.1.9
+streamlit-dynamic-filters==0.1.9 \
+semver==3.0.2


ADD "https://github.com/coder/code-server/releases/download/v4.21.1/code-server-4.21.1-linux-amd64.tar.gz" /.code-server/code-server.tar.gz
4 changes: 2 additions & 2 deletions requirements.txt
@@ -68,7 +68,6 @@ s3fs==2022.11.0
scipy==1.10.0
scikit-learn==1.3.2
seaborn==0.11.2
-semver==2.13.0
six==1.16.0
smmap==5.0.0
statannotations==0.5.0
@@ -96,4 +95,5 @@ zipp==3.10.0
git+https://github.com/AllenNeuralDynamics/aind-foraging-behavior-bonsai-automatic-training.git@main
pygwalker==0.4.7
aind-data-access-api[docdb]==0.13.0
-streamlit-dynamic-filters==0.1.9
+streamlit-dynamic-filters==0.1.9
+semver==3.0.2
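Both dependency lists move semver from 2.13.0 to 3.0.2, which matters because the new `find_probes` relies on the 3.x API: `semver.Version` (2.x exposed `VersionInfo`) and rich comparisons against plain version strings, as in the `version >= "0.8.1"` check above. A quick sanity check of those calls (assuming semver 3.x is installed):

```python
import semver

# semver 3.x: Version is the public class; parse() accepts a version string
version = semver.Version.parse("0.8.1")

# Rich comparison against a plain string, as used in find_probes
print(version >= "0.8.1")  # True
print(version >= "0.9.0")  # False

# The same default applies when procedures lack a schema_version
print(semver.Version.parse("0.0.0") >= "0.8.1")  # False -> takes the pre-Surgery branch
```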
