diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ffc4e16e..64b31133 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -17,6 +17,10 @@ jobs: uses: actions/setup-python@v2 with: python-version: 3.8 + - name: Install Forest dependencies + # required by librosa + if: ${{ matrix.os == 'ubuntu-20.04' }} + run: sudo apt-get install -y ffmpeg libsndfile1 - name: Install Forest run: pip install -e . - name: Install dev dependecies diff --git a/docs/source/sycamore.md b/docs/source/sycamore.md index 2e7094fb..db721b29 100644 --- a/docs/source/sycamore.md +++ b/docs/source/sycamore.md @@ -4,6 +4,15 @@ Use `sycamore` to process and analyze Beiwe survey data. +## Installation + +Before using sycamore, the dependencies for librosa (ffmpeg and libsndfile1) must be installed to enable processing of audio survey files. + +To install these dependencies on Ubuntu, run: +`sudo apt-get install -y ffmpeg libsndfile1` + +For more information, see the [librosa documentation](https://librosa.org/doc/latest/install.html). + ## Import User-facing functions can be imported directly from sycamore: @@ -14,8 +23,8 @@ User-facing functions can be imported directly from sycamore: `from forest.sycamore import survey_submits_no_config` `from forest.sycamore import agg_changed_answers_summary` -## Usage: -Download raw data from your Beiwe server and use this package to process the data in the `survey_timings` data stream, using `survey_answers` as a backup for possible missing `survey_timings` files. Summary data provides metrics around survey submissions and survey question completion. Additional outputs are generated if a config file is provided. +## Usage: +Download raw data from your Beiwe server and use this package to process the data in the `survey_timings`, `survey_answers`, and `audio_recordings` data streams, using `survey_answers` as a backup for possible missing `survey_timings` files. 
Summary data provides metrics around survey submissions and survey question completion. Sycamore takes various auxiliary files which can be downloaded from the Beiwe website to ensure accurate output. ## Data: Methods are designed for use on the `survey_timings` and `survey_answers` data from the Beiwe app. @@ -41,13 +50,13 @@ from forest.sycamore import compute_survey_stats study_dir = path/to/data output_dir = path/to/output beiwe_ids = list of ids in study_dir -time_start = start time -time_end = end time +start_date = "2022-01-01" +end_date = "2022-06-04" study_tz = Timezone of study (if not defined, defaults to 'UTC') compute_survey_stats( - study_dir, output_dir, study_tz, beiwe_ids, time_start=time_start, - time_end = time_end + study_dir, output_dir, study_tz, beiwe_ids, start_date=start_date, + end_date=end_date ) ``` @@ -55,20 +64,25 @@ compute_survey_stats( ``` config_path = path/to/config file interventions_path = path/to/interventions file +history_path = path/to/history/file study_dir = path/to/data output_dir = path/to/output beiwe_ids = list of ids in study_dir -time_start = start time -time_end = end time +start_date = "2022-01-01" +end_date = "2022-06-04" study_tz = Timezone of study (if not defined, defaults to 'UTC') + compute_survey_stats( - study_dir, output_dir, study_tz, beiwe_ids, time_start=time_start, - time_end=time_end, config_path, interventions_path + study_dir, output_dir, study_tz, beiwe_ids, start_date, + end_date, config_path, interventions_path, + history_path=history_path ) ``` +Most users should be able to use `compute_survey_stats` for all of their survey processing needs. However, if a study has collected a very large number of surveys, subprocesses are also exposed to reduce processing time. + ___ ## 2. 
`sycamore.common.aggregate_surveys_config` @@ -78,7 +92,7 @@ Aggregate all survey information from a study, using the config file to infer in ``` from forest.sycamore import aggregate_surveys_config -agg_data = aggregate_surveys_config(study_dir, config_path, study_tz) +agg_data = aggregate_surveys_config(study_dir, config_path, study_tz, history_path=history_path) ``` ___ @@ -92,6 +106,7 @@ from forest.sycamore.submits import survey_submits config_path = path/to/config file interventions_path = path/to/interventions file +history_path = path/to/history/file study_dir = path/to/data output_dir = path/to/output beiwe_ids = list of ids in study_dir @@ -101,11 +116,9 @@ study_tz = Timezone of study (if not defined, defaults to 'UTC') agg_data = aggregate_surveys_config(study_dir, config_path, study_tz) -all_interventions_dict = get_all_interventions_dict(interventions_path) - submits_detail, submits_summary = survey_submits( - config_path, time_start, time_end, beiwe_ids, agg_data, - all_interventions_dict + config_path, time_start, time_end, beiwe_ids, interventions_path, agg_data, + history_path ) ``` @@ -118,9 +131,8 @@ Used to extract an alternative survey submits table that does not include delive from forest.sycamore import survey_submits_no_config study_dir = path/to/data -study_tz = Timezone of study (if not defined, defaults to 'UTC') -submits_tbl = survey_submits_no_config(study_dir, study_tz) +submits_tbl = survey_submits_no_config(study_dir) ``` @@ -133,6 +145,7 @@ Used to extract data summarizing user responses from forest.sycamore import agg_changed_answers_summary config_path = path/to/config file +history_path = path/to/history/file study_dir = path/to/data output_dir = path/to/output beiwe_ids = list of ids in study_dir @@ -140,7 +153,7 @@ time_start = start time time_end = end time study_tz = Timezone of study (if not defined, defaults to 'UTC') -agg_data = aggregate_surveys_config(study_dir, config_path, study_tz) +agg_data = 
aggregate_surveys_config(study_dir, config_path, study_tz, history_path=history_path) ca_detail, ca_summary = agg_changed_answers_summary(config_path, agg_data) diff --git a/forest/sycamore/read_audio.py b/forest/sycamore/read_audio.py index 54d1b1d4..478e7ab2 100644 --- a/forest/sycamore/read_audio.py +++ b/forest/sycamore/read_audio.py @@ -4,6 +4,7 @@ import os from typing import Dict +import librosa import numpy as np import pandas as pd @@ -12,7 +13,6 @@ filename_to_timestamp) from forest.utils import get_ids - logger = logging.getLogger(__name__) @@ -118,15 +118,18 @@ def read_user_audio_recordings_stream( for survey in survey_ids: # get all audio files in the survey subdirectory all_files = [] - for filepath in os.listdir(os.path.join(audio_dir, survey)): - filename = os.path.basename(filepath) - valid_file = (filepath.endswith(".wav") - or filepath.endswith(".mp4") + all_durations = [] + for filename in os.listdir(os.path.join(audio_dir, survey)): + valid_file = (filename.endswith(".wav") + or filename.endswith(".mp4") and (timestamp_start < filename_to_timestamp(filename, tz_str) < timestamp_end)) if valid_file: - all_files.append(filepath) + all_files.append(filename) + all_durations.append(librosa.get_duration( + filename=os.path.join(audio_dir, survey, filename) + )) if len(all_files) == 0: logger.warning("No audio_recordings for user %s in given time " @@ -146,17 +149,23 @@ def read_user_audio_recordings_stream( # We need to enumerate to tell different survey occasions apart for i, file in enumerate(all_files): filename = os.path.basename(file) + submit_time = filename_to_timestamp(filename, "UTC") + start_time = submit_time - pd.Timedelta(all_durations[i], unit="s") + # Later on, we will delete all rows with blank responses. So, we + # want two rows with the timings and an additional row to be + # deleted later. 
+ current_df = pd.DataFrame({ - "UTC time": [filename_to_timestamp(filename, "UTC")] * 2, - "survey id": [survey] * 2, - "question_id": [survey] * 2, - "answer": ["audio recording", ""], - "question type": ["audio recording", ""], - "question text": [survey_prompt] * 2, - "question answer options": ["audio recording", ""], - "submit_line": [0, 1], # one of the lines will be a submit + "UTC time": [start_time, submit_time, submit_time], + "survey id": [survey] * 3, + "question id": [survey] * 3, + "answer": ["audio recording"]*2 + [""], + "question type": ["audio recording"]*2 + [""], + "question text": [survey_prompt] * 3, + "question answer options": ["audio recording"]*2 + [""], + "submit_line": [0, 0, 1], # one of the lines will be a submit # line - "surv_inst_flg": [i] * 2 + "surv_inst_flg": [i] * 3 }) survey_dfs.append(current_df) if len(survey_dfs) == 0: diff --git a/forest/sycamore/tests/sample_dir/audioid2/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-11 09_12_10+00_00.mp4 b/forest/sycamore/tests/sample_dir/audioid2/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-11 09_12_10+00_00.mp4 index e69de29b..e7f18616 100644 Binary files a/forest/sycamore/tests/sample_dir/audioid2/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-11 09_12_10+00_00.mp4 and b/forest/sycamore/tests/sample_dir/audioid2/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-11 09_12_10+00_00.mp4 differ diff --git a/forest/sycamore/tests/sample_dir/audioid2/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-21 18_48_38+00_00.mp4 b/forest/sycamore/tests/sample_dir/audioid2/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-21 18_48_38+00_00.mp4 index e69de29b..e7f18616 100644 Binary files a/forest/sycamore/tests/sample_dir/audioid2/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-21 18_48_38+00_00.mp4 and b/forest/sycamore/tests/sample_dir/audioid2/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-21 18_48_38+00_00.mp4 differ diff --git 
a/forest/sycamore/tests/sample_dir/audioid2/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-08 01_13_12+00_00.mp4 b/forest/sycamore/tests/sample_dir/audioid2/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-08 01_13_12+00_00.mp4 index e69de29b..e7f18616 100644 Binary files a/forest/sycamore/tests/sample_dir/audioid2/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-08 01_13_12+00_00.mp4 and b/forest/sycamore/tests/sample_dir/audioid2/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-08 01_13_12+00_00.mp4 differ diff --git a/forest/sycamore/tests/sample_dir/audioid2/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-18 09_12_10+00_00.mp4 b/forest/sycamore/tests/sample_dir/audioid2/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-18 09_12_10+00_00.mp4 index e69de29b..e7f18616 100644 Binary files a/forest/sycamore/tests/sample_dir/audioid2/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-18 09_12_10+00_00.mp4 and b/forest/sycamore/tests/sample_dir/audioid2/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-18 09_12_10+00_00.mp4 differ diff --git a/forest/sycamore/tests/sample_dir/audioid2/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-20 18_48_38+00_00.mp4 b/forest/sycamore/tests/sample_dir/audioid2/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-20 18_48_38+00_00.mp4 index e69de29b..e7f18616 100644 Binary files a/forest/sycamore/tests/sample_dir/audioid2/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-20 18_48_38+00_00.mp4 and b/forest/sycamore/tests/sample_dir/audioid2/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-20 18_48_38+00_00.mp4 differ diff --git a/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-11 09_12_10+00_00.mp4 b/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-11 09_12_10+00_00.mp4 index e69de29b..e7f18616 100644 Binary files a/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-11 09_12_10+00_00.mp4 and 
b/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-11 09_12_10+00_00.mp4 differ diff --git a/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-14 19_02_10+00_00.mp4 b/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-14 19_02_10+00_00.mp4 index e69de29b..e7f18616 100644 Binary files a/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-14 19_02_10+00_00.mp4 and b/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-14 19_02_10+00_00.mp4 differ diff --git a/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-18 01_13_12+00_00.mp4 b/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-18 01_13_12+00_00.mp4 index e69de29b..e7f18616 100644 Binary files a/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-18 01_13_12+00_00.mp4 and b/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-18 01_13_12+00_00.mp4 differ diff --git a/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-21 18_48_38+00_00.mp4 b/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-21 18_48_38+00_00.mp4 index e69de29b..e7f18616 100644 Binary files a/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-21 18_48_38+00_00.mp4 and b/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-21 18_48_38+00_00.mp4 differ diff --git a/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-08 01_13_12+00_00.mp4 b/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-08 01_13_12+00_00.mp4 index e69de29b..e7f18616 100644 Binary 
files a/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-08 01_13_12+00_00.mp4 and b/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-08 01_13_12+00_00.mp4 differ diff --git a/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-14 19_12_10+00_00.mp4 b/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-14 19_12_10+00_00.mp4 index e69de29b..e7f18616 100644 Binary files a/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-14 19_12_10+00_00.mp4 and b/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-14 19_12_10+00_00.mp4 differ diff --git a/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-18 09_12_10+00_00.mp4 b/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-18 09_12_10+00_00.mp4 index e69de29b..e7f18616 100644 Binary files a/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-18 09_12_10+00_00.mp4 and b/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-18 09_12_10+00_00.mp4 differ diff --git a/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-20 18_48_38+00_00.mp4 b/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-20 18_48_38+00_00.mp4 index e69de29b..e7f18616 100644 Binary files a/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-20 18_48_38+00_00.mp4 and b/forest/sycamore/tests/sample_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-20 18_48_38+00_00.mp4 differ diff --git a/forest/sycamore/tests/test_functions.py b/forest/sycamore/tests/test_functions.py index 25d6ced8..79c55266 100644 --- 
a/forest/sycamore/tests/test_functions.py +++ b/forest/sycamore/tests/test_functions.py @@ -384,8 +384,8 @@ def test_read_user_audio_recordings_stream(): df = read_user_audio_recordings_stream( SAMPLE_DIR, "audioqdz", history_path=AUDIO_SURVEY_HISTORY ) - assert df.shape[0] == 16 - assert df["UTC time"].nunique() == 8 + assert df.shape[0] == 24 # 8 surveys, 3 lines per survey + assert df["UTC time"].nunique() == 16 # 8 surveys, 2 times per survey assert df["survey id"].nunique() == 2 assert df["question text"].nunique() == 2 @@ -394,8 +394,8 @@ def test_read_user_audio_recordings_stream_no_history(): df = read_user_audio_recordings_stream( SAMPLE_DIR, "audioqdz" ) - assert df.shape[0] == 16 - assert df["UTC time"].nunique() == 8 + assert df.shape[0] == 24 # 8 surveys, 3 lines per survey + assert df["UTC time"].nunique() == 16 # 8 surveys, 2 times per survey assert df["question text"].nunique() == 1 assert df["survey id"].nunique() == 2 @@ -404,8 +404,8 @@ def test_read_aggregate_audio_recordings_stream(): df = read_aggregate_audio_recordings_stream( SAMPLE_DIR, history_path=AUDIO_SURVEY_HISTORY ) - assert df.shape[0] == 26 - assert df["UTC time"].nunique() == 8 + assert df.shape[0] == 39 # 13 surveys, with 3 lines each + assert df["UTC time"].nunique() == 16 # 8 unique submits, 2 times each assert df["survey id"].nunique() == 2 assert df["question text"].nunique() == 2 assert df["beiwe_id"].nunique() == 2 @@ -413,8 +413,8 @@ def test_read_aggregate_audio_recordings_stream(): def test_read_aggregate_audio_recordings_stream_no_history(): df = read_aggregate_audio_recordings_stream(SAMPLE_DIR) - assert df.shape[0] == 26 - assert df["UTC time"].nunique() == 8 + assert df.shape[0] == 39 + assert df["UTC time"].nunique() == 16 assert df["survey id"].nunique() == 2 assert df["question text"].nunique() == 1 # should only have "UNKNOWN" assert df["question text"].unique().tolist() == ["UNKNOWN"] @@ -423,7 +423,7 @@ def test_read_aggregate_audio_recordings_stream_no_history(): def 
test_aggregate_surveys_no_config_with_audio(): agg_data = aggregate_surveys_no_config(SAMPLE_DIR, study_tz="UTC",) - assert agg_data.shape[0] == 76 + assert agg_data.shape[0] == 89 assert len(agg_data.DOW.unique()) == 4 diff --git a/mypy.ini b/mypy.ini index 71466e05..712a36bc 100644 --- a/mypy.ini +++ b/mypy.ini @@ -4,6 +4,9 @@ python_version = 3.8 [mypy-holidays] ignore_missing_imports = True +[mypy-librosa] +ignore_missing_imports = True + [mypy-openrouteservice] ignore_missing_imports = True diff --git a/setup.py b/setup.py index 8b4284f4..c1ea26d8 100644 --- a/setup.py +++ b/setup.py @@ -2,6 +2,7 @@ requires = [ 'holidays', # poplar + 'librosa', # for audio file durations in sycamore 'numpy', 'openrouteservice', 'pandas',