Add duration of audio surveys to sycamore (#139)

* get durations for audio surveys * add librosa to requirements * add real audio to tests * ignore typing for librosa * Add librosa dependencies * update documentation to match current function calls; add instructions for installing dependencies
onnela-lab · Jan 10, 2023 · 8fde57e · 8fde57e
1 parent 5cb9408
commit 8fde57e
Show file tree

Hide file tree

Showing 19 changed files with 72 additions and 42 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -17,6 +17,10 @@ jobs:
         uses: actions/setup-python@v2
         with:
           python-version: 3.8
+      - name: Install Forest dependencies
+        # required by librosa
+        if: ${{ matrix.os == 'ubuntu-20.04' }}
+        run: sudo apt-get install -y ffmpeg libsndfile1
       - name: Install Forest
         run: pip install -e .
       - name: Install dev dependecies

diff --git a/docs/source/sycamore.md b/docs/source/sycamore.md
@@ -4,6 +4,15 @@
 
 Use `sycamore` to process and analyze Beiwe survey data.
 
+## Installation
+
+Before using sycamore, install the system libraries that librosa depends on (ffmpeg and libsndfile1); they are required for processing audio survey files.  
+
+To install these dependencies on Ubuntu, run:  
+`sudo apt-get install -y ffmpeg libsndfile1`  
+
+For more information, see the [librosa documentation](https://librosa.org/doc/latest/install.html).
+
 ## Import
 
 User-facing functions can be imported directly from sycamore:
@@ -14,8 +23,8 @@ User-facing functions can be imported directly from sycamore:
 `from forest.sycamore import survey_submits_no_config` 
 `from forest.sycamore import agg_changed_answers_summary` 
 
-## Usage:  
-Download raw data from your Beiwe server and use this package to process the data in the `survey_timings` data stream, using `survey_answers` as a backup for possible missing `survey_timings` files. Summary data provides metrics around survey submissions and survey question completion. Additional outputs are generated if a config file is provided.
+## Usage:   
+Download raw data from your Beiwe server and use this package to process the data in the `survey_timings`, `survey_answers`, and `audio_recordings` data streams, using `survey_answers` as a backup for possible missing `survey_timings` files. Summary data provides metrics around survey submissions and survey question completion. Sycamore takes various auxiliary files which can be downloaded from the Beiwe website to ensure accurate output.  
 
 ## Data:   
 Methods are designed for use on the `survey_timings` and `survey_answers` data from the Beiwe app.
@@ -41,34 +50,39 @@ from forest.sycamore import compute_survey_stats
 study_dir = path/to/data  
 output_dir = path/to/output
 beiwe_ids = list of ids in study_dir
-time_start = start time
-time_end = end time  
+start_date = "2022-01-01"
+end_date = "2022-06-04"
 study_tz = Timezone of study (if not defined, defaults to 'UTC')
 
 compute_survey_stats(
-    study_dir, output_dir, study_tz, beiwe_ids, time_start=time_start, 
-    time_end = time_end
+    study_dir, output_dir, study_tz, beiwe_ids, start_date=start_date, 
+    end_date=end_date
 )
 ```
 
 *Example (with config file)* 
 ```
 config_path = path/to/config file
 interventions_path = path/to/interventions file
+history_path = path/to/history/file
 study_dir = path/to/data  
 output_dir = path/to/output
 beiwe_ids = list of ids in study_dir
-time_start = start time
-time_end = end time  
+start_date = "2022-01-01"
+end_date = "2022-06-04"
 study_tz = Timezone of study (if not defined, defaults to 'UTC')
 
+
 compute_survey_stats(
-    study_dir, output_dir, study_tz, beiwe_ids, time_start=time_start, 
-    time_end=time_end, config_path, interventions_path
+    study_dir, output_dir, study_tz, beiwe_ids, start_date, end_date,
+    config_path, interventions_path, history_path=history_path
 )
 
 ```
 
+Most users should be able to use `compute_survey_stats` for all of their survey processing needs. However, if a study has collected a very large number of surveys, the individual processing functions described below are also exposed so that only the required steps need to be run, reducing processing time. 
+
 ___
 ## 2. `sycamore.common.aggregate_surveys_config`
 
@@ -78,7 +92,7 @@ Aggregate all survey information from a study, using the config file to infer in
 ```
 from forest.sycamore import aggregate_surveys_config
 
-agg_data = aggregate_surveys_config(study_dir, config_path, study_tz)
+agg_data = aggregate_surveys_config(study_dir, config_path, study_tz, history_path=history_path)
 ```
 
 ___
@@ -92,6 +106,7 @@ from forest.sycamore.submits import survey_submits
 
 config_path = path/to/config file
 interventions_path = path/to/interventions file
+history_path = path/to/history/file
 study_dir = path/to/data  
 output_dir = path/to/output
 beiwe_ids = list of ids in study_dir
@@ -101,11 +116,9 @@ study_tz = Timezone of study (if not defined, defaults to 'UTC')
 
 agg_data = aggregate_surveys_config(study_dir, config_path, study_tz)
 
-all_interventions_dict = get_all_interventions_dict(interventions_path)
-
 submits_detail, submits_summary = survey_submits(
-    config_path, time_start, time_end, beiwe_ids, agg_data, 
-    all_interventions_dict
+    config_path, time_start, time_end, beiwe_ids, interventions_path, agg_data, 
+    history_path
 )
 ```
 
@@ -118,9 +131,8 @@ Used to extract an alternative survey submits table that does not include delive
 from forest.sycamore import survey_submits_no_config
 
 study_dir = path/to/data  
-study_tz = Timezone of study (if not defined, defaults to 'UTC')
 
-submits_tbl = survey_submits_no_config(study_dir, study_tz)
+submits_tbl = survey_submits_no_config(study_dir)
 
 ```
 
@@ -133,14 +145,15 @@ Used to extract data summarizing user responses
 from forest.sycamore import agg_changed_answers_summary
 
 config_path = path/to/config file
+history_path = path/to/history/file
 study_dir = path/to/data  
 output_dir = path/to/output
 beiwe_ids = list of ids in study_dir
 time_start = start time
 time_end = end time  
 study_tz = Timezone of study (if not defined, defaults to 'UTC')
 
-agg_data = aggregate_surveys_config(study_dir, config_path, study_tz)
+agg_data = aggregate_surveys_config(study_dir, config_path, study_tz, history_path=history_path)
 
 ca_detail, ca_summary = agg_changed_answers_summary(config_path, agg_data)
  

diff --git a/forest/sycamore/read_audio.py b/forest/sycamore/read_audio.py
@@ -4,6 +4,7 @@
 import os
 from typing import Dict
 
+import librosa
 import numpy as np
 import pandas as pd
 
@@ -12,7 +13,6 @@
                                    filename_to_timestamp)
 from forest.utils import get_ids
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -118,15 +118,18 @@ def read_user_audio_recordings_stream(
     for survey in survey_ids:
         # get all audio files in the survey subdirectory
         all_files = []
-        for filepath in os.listdir(os.path.join(audio_dir, survey)):
-            filename = os.path.basename(filepath)
-            valid_file = (filepath.endswith(".wav")
-                          or filepath.endswith(".mp4")
+        all_durations = []
+        for filename in os.listdir(os.path.join(audio_dir, survey)):
+            valid_file = (filename.endswith(".wav")
+                          or filename.endswith(".mp4")
                           and (timestamp_start
                                < filename_to_timestamp(filename, tz_str)
                                < timestamp_end))
             if valid_file:
-                all_files.append(filepath)
+                all_files.append(filename)
+                all_durations.append(librosa.get_duration(
+                    filename=os.path.join(audio_dir, survey, filename)
+                ))
 
         if len(all_files) == 0:
             logger.warning("No audio_recordings for user %s in given time "
@@ -146,17 +149,23 @@ def read_user_audio_recordings_stream(
         # We need to enumerate to tell different survey occasions apart
         for i, file in enumerate(all_files):
             filename = os.path.basename(file)
+            submit_time = filename_to_timestamp(filename, "UTC")
+            start_time = submit_time - pd.Timedelta(all_durations[i], unit="s")
+            # Later on, we will delete all rows with blank responses. So, we
+            # want two rows with the timings and an additional row to be
+            # deleted later.
+
             current_df = pd.DataFrame({
-                "UTC time": [filename_to_timestamp(filename, "UTC")] * 2,
-                "survey id": [survey] * 2,
-                "question_id": [survey] * 2,
-                "answer": ["audio recording", ""],
-                "question type": ["audio recording", ""],
-                "question text": [survey_prompt] * 2,
-                "question answer options": ["audio recording", ""],
-                "submit_line": [0, 1],  # one of the lines will be a submit
+                "UTC time": [start_time, submit_time, submit_time],
+                "survey id": [survey] * 3,
+                "question id": [survey] * 3,
+                "answer": ["audio recording"]*2 + [""],
+                "question type": ["audio recording"]*2 + [""],
+                "question text": [survey_prompt] * 3,
+                "question answer options": ["audio recording"]*2 + [""],
+                "submit_line": [0, 0, 1],  # one of the lines will be a submit
                 # line
-                "surv_inst_flg": [i] * 2
+                "surv_inst_flg": [i] * 3
             })
             survey_dfs.append(current_df)
         if len(survey_dfs) == 0:

diff --git a/...mple_dir/audioid2/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-11 09_12_10+00_00.mp4 b/...mple_dir/audioid2/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-11 09_12_10+00_00.mp4
diff --git a/...mple_dir/audioid2/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-21 18_48_38+00_00.mp4 b/...mple_dir/audioid2/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-21 18_48_38+00_00.mp4
diff --git a/...mple_dir/audioid2/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-08 01_13_12+00_00.mp4 b/...mple_dir/audioid2/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-08 01_13_12+00_00.mp4
diff --git a/...mple_dir/audioid2/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-18 09_12_10+00_00.mp4 b/...mple_dir/audioid2/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-18 09_12_10+00_00.mp4
diff --git a/...mple_dir/audioid2/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-20 18_48_38+00_00.mp4 b/...mple_dir/audioid2/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-20 18_48_38+00_00.mp4
diff --git a/...mple_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-11 09_12_10+00_00.mp4 b/...mple_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-11 09_12_10+00_00.mp4
diff --git a/...mple_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-14 19_02_10+00_00.mp4 b/...mple_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-14 19_02_10+00_00.mp4
diff --git a/...mple_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-18 01_13_12+00_00.mp4 b/...mple_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-18 01_13_12+00_00.mp4
diff --git a/...mple_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-21 18_48_38+00_00.mp4 b/...mple_dir/audioqdz/audio_recordings/6iWVNrsd1RE2zAeIPegZDrCc/2021-12-21 18_48_38+00_00.mp4
diff --git a/...mple_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-08 01_13_12+00_00.mp4 b/...mple_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-08 01_13_12+00_00.mp4
diff --git a/...mple_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-14 19_12_10+00_00.mp4 b/...mple_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-14 19_12_10+00_00.mp4
diff --git a/...mple_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-18 09_12_10+00_00.mp4 b/...mple_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-18 09_12_10+00_00.mp4
diff --git a/...mple_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-20 18_48_38+00_00.mp4 b/...mple_dir/audioqdz/audio_recordings/tO1GFjGJjMnaDRThUQK6l4dv/2021-12-20 18_48_38+00_00.mp4
diff --git a/forest/sycamore/tests/test_functions.py b/forest/sycamore/tests/test_functions.py
@@ -384,8 +384,8 @@ def test_read_user_audio_recordings_stream():
     df = read_user_audio_recordings_stream(
         SAMPLE_DIR, "audioqdz", history_path=AUDIO_SURVEY_HISTORY
     )
-    assert df.shape[0] == 16
-    assert df["UTC time"].nunique() == 8
+    assert df.shape[0] == 24  # 8 surveys, 3 per survey
+    assert df["UTC time"].nunique() == 16  # 2 times per survey
     assert df["survey id"].nunique() == 2
     assert df["question text"].nunique() == 2
 
@@ -394,8 +394,8 @@ def test_read_user_audio_recordings_stream_no_history():
     df = read_user_audio_recordings_stream(
         SAMPLE_DIR, "audioqdz"
     )
-    assert df.shape[0] == 16
-    assert df["UTC time"].nunique() == 8
+    assert df.shape[0] == 24  # 8 surveys, 3 lines per survey
+    assert df["UTC time"].nunique() == 16   # 8 surveys, 2 times per survey
     assert df["question text"].nunique() == 1
     assert df["survey id"].nunique() == 2
 
@@ -404,17 +404,17 @@ def test_read_aggregate_audio_recordings_stream():
     df = read_aggregate_audio_recordings_stream(
         SAMPLE_DIR, history_path=AUDIO_SURVEY_HISTORY
     )
-    assert df.shape[0] == 26
-    assert df["UTC time"].nunique() == 8
+    assert df.shape[0] == 39  # 13 surveys, with 3 lines each
+    assert df["UTC time"].nunique() == 16  # 8 submit times + 8 start times
     assert df["survey id"].nunique() == 2
     assert df["question text"].nunique() == 2
     assert df["beiwe_id"].nunique() == 2
 
 
 def test_read_aggregate_audio_recordings_stream_no_history():
     df = read_aggregate_audio_recordings_stream(SAMPLE_DIR)
-    assert df.shape[0] == 26
-    assert df["UTC time"].nunique() == 8
+    assert df.shape[0] == 39
+    assert df["UTC time"].nunique() == 16
     assert df["survey id"].nunique() == 2
     assert df["question text"].nunique() == 1  # should only have "UNKNOWN"
     assert df["question text"].unique().tolist() == ["UNKNOWN"]
@@ -423,7 +423,7 @@ def test_read_aggregate_audio_recordings_stream_no_history():
 
 def test_aggregate_surveys_no_config_with_audio():
     agg_data = aggregate_surveys_no_config(SAMPLE_DIR, study_tz="UTC",)
-    assert agg_data.shape[0] == 76
+    assert agg_data.shape[0] == 89
     assert len(agg_data.DOW.unique()) == 4
 
 

diff --git a/mypy.ini b/mypy.ini
@@ -4,6 +4,9 @@ python_version = 3.8
 [mypy-holidays]
 ignore_missing_imports = True
 
+[mypy-librosa]
+ignore_missing_imports = True
+
 [mypy-openrouteservice]
 ignore_missing_imports = True
 

diff --git a/setup.py b/setup.py
@@ -2,6 +2,7 @@
 
 requires = [
     'holidays',  # poplar
+    'librosa',  # for audio file durations in sycamore
     'numpy',
     'openrouteservice',
     'pandas',