Sycamore clean (#194)

onnela-lab · Aug 11, 2023 · 121a90c · 121a90c
1 parent 3042848
commit 121a90c
Show file tree

Hide file tree

Showing 7 changed files with 251 additions and 170 deletions.
diff --git a/forest/sycamore/base.py b/forest/sycamore/base.py
@@ -1,3 +1,4 @@
+"""Base functions for computing survey statistics"""
 import logging
 import os
 from typing import Optional, List
@@ -32,68 +33,82 @@ def compute_survey_stats(
     """Compute statistics on surveys
 
     Args:
-    output_folder:
-        File path to output summaries and details
-    study_folder:
-        File path to study data
-    config_path:
-        File path to study configuration file. Study configuration files can be
-        downloaded by clicking "Edit this Study" on the Beiwe website, then
-        clicking "Export study settings JSON file" under "Export/Import study
-        settings".
-    start_date:
-        The earliest date of survey data to read in, in YYYY-MM-DD format
-    end_date:
-        The latest survey data to read in, in YYYY-MM-DD format
-    users:
-        List of users in study for which we are generating a survey schedule
-    tz_str:
-        Timezone of study. This defaults to "UTC"
-    interventions_filepath:
-        filepath where interventions json file is. The interventions json file
-        can be downloaded by clicking "Edit this Study" on the Beiwe
-        website, then clicking clicking "Download Interventions" next to
-        "Intervention Data".
-    augment_with_answers:
-        Whether to use the survey_answers stream to fill in missing surveys
-        from survey_timings
-    submits_timeframe:
-        The timeframe to summarize survey submissions over of class
-        forest.constants.Frequency
-        An overall summary for each user is
-        always generated ("submits_summary_overall.csv"), and submissions can
-        also be generated across days ("submits_summary_daily.csv"), hours
-        ("submits_summary_hourly.csv") or both.
-    submits_by_survey_id:
-        Whether to summarize survey submits with separate lines for different
-        surveys in submits_summary.csv. By default, this is True, so a
-        different line for each survey will be generated.
-    history_path: Path to survey history file. If this is included, the survey
-        history file is used to find instances of commas or semicolons in
-        answer choices to determine the correct choice for Android radio
-        questions. In addition, this is used to generate timings for audio
-        surveys. The survey history json file can be downloaded by clicking
-        "Edit this Study" on the Beiwe website, then clicking clicking
-        "Download Surveys" next to "Survey History".
-    include_audio_surveys:
-        Whether to include submissions of audio surveys in addition to text
-        surveys
+        output_folder:
+            File path to output summaries and details
+        study_folder:
+            File path to study data
+        config_path:
+            File path to study configuration file. Study configuration files
+            can be downloaded by clicking "Edit this Study" on the
+            Beiwe website, then clicking "Export study settings JSON file"
+            under "Export/Import study settings".
+        start_date:
+            The earliest date of survey data to read in, in YYYY-MM-DD format
+        end_date:
+            The latest survey data to read in, in YYYY-MM-DD format
+        users:
+            List of users in study for which
+            we are generating a survey schedule
+        tz_str:
+            Timezone of study. This defaults to "UTC"
+        interventions_filepath:
+            filepath where interventions json file is.
+            The interventions json file
+            can be downloaded by clicking "Edit this Study" on the Beiwe
+            website, then clicking clicking "Download Interventions" next to
+            "Intervention Data".
+        augment_with_answers:
+            Whether to use the survey_answers
+            stream to fill in missing surveys
+            from survey_timings
+        submits_timeframe:
+            The timeframe to summarize survey submissions over of class
+            forest.constants.Frequency
+            An overall summary for each user is
+            always generated ("submits_summary_overall.csv"),
+            and submissions can
+            also be generated across days ("submits_summary_daily.csv"), hours
+            ("submits_summary_hourly.csv") or both.
+        submits_by_survey_id:
+            Whether to summarize survey submits
+            with separate lines for different
+            surveys in submits_summary.csv. By default, this is True, so a
+            different line for each survey will be generated.
+        history_path: Path to survey history file.
+            If this is included, the survey
+            history file is used to find instances of commas or semicolons in
+            answer choices to determine the correct choice for Android radio
+            questions. In addition, this is used to generate timings for audio
+            surveys. The survey history json file
+            can be downloaded by clicking
+            "Edit this Study" on the Beiwe website, then clicking clicking
+            "Download Surveys" next to "Survey History".
+        include_audio_surveys:
+            Whether to include submissions of audio surveys in addition to text
+            surveys
+
+    Returns:
+        True if successful, False otherwise
     """
     os.makedirs(output_folder, exist_ok=True)
     os.makedirs(os.path.join(output_folder, "summaries"), exist_ok=True)
     os.makedirs(os.path.join(output_folder, "by_survey"), exist_ok=True)
+
     if users is None:
         users = get_ids(study_folder)
     if end_date is None:
         end_date = get_month_from_today()
+
     # Read, aggregate and clean data
     if config_path is None:
         logger.warning("No config file provided. "
                        "Skipping some summary outputs.")
+
         agg_data = aggregate_surveys_no_config(
             study_folder, tz_str, users, start_date, end_date,
             augment_with_answers, include_audio_surveys
         )
+
         if agg_data.shape[0] == 0:
             logger.error("Error: No survey data found in %s", study_folder)
             return True
@@ -102,9 +117,11 @@ def compute_survey_stats(
             study_folder, config_path, tz_str, users, start_date,
             end_date, augment_with_answers, history_path, include_audio_surveys
         )
+
         if agg_data.shape[0] == 0:
             logger.error("Error: No survey data found in %s", study_folder)
             return True
+
         # Create changed answers detail and summary
         ca_detail, ca_summary = agg_changed_answers_summary(config_path,
                                                             agg_data)
@@ -116,14 +133,17 @@ def compute_survey_stats(
             os.path.join(output_folder, "summaries", "answers_summary.csv"),
             index=False
         )
+
         if start_date is not None and end_date is not None:
             # Create survey submits detail and summary
             ss_detail = survey_submits(
                 config_path, start_date, end_date,
                 users, agg_data, interventions_filepath, history_path
             )
+
             ss_summary = summarize_submits(ss_detail, None,
                                            submits_by_survey_id)
+
             if ss_summary.shape[0] > 0:
                 ss_detail.to_csv(os.path.join(output_folder, "summaries",
                                               "submits_and_deliveries.csv"),
@@ -132,6 +152,7 @@ def compute_survey_stats(
                     os.path.join(output_folder, "summaries",
                                  "submits_summary.csv"), index=False
                 )
+
                 if submits_timeframe == Frequency.HOURLY_AND_DAILY:
                     ss_summary_h = summarize_submits(
                         ss_detail, Frequency.HOURLY, submits_by_survey_id
@@ -147,6 +168,7 @@ def compute_survey_stats(
                         os.path.join(output_folder, "summaries",
                                      "submits_summary_hourly.csv"), index=False
                     )
+
                 elif submits_timeframe == Frequency.HOURLY:
                     ss_summary_h = summarize_submits(
                         ss_detail, Frequency.HOURLY, submits_by_survey_id
@@ -155,6 +177,7 @@ def compute_survey_stats(
                         os.path.join(output_folder, "summaries",
                                      "submits_summary_hourly.csv"), index=False
                     )
+
                 elif submits_timeframe == Frequency.DAILY:
                     ss_summary_d = summarize_submits(
                         ss_detail, Frequency.DAILY, submits_by_survey_id
@@ -163,13 +186,16 @@ def compute_survey_stats(
                         os.path.join(output_folder, "summaries",
                                      "submits_summary_daily.csv"), index=False
                     )
+
             else:
                 logger.error("An Error occurred when "
                              "getting survey submit summaries")
+
     surveys_dict = format_responses_by_submission(agg_data)
-    for survey_id in surveys_dict.keys():
+
+    for survey_id in surveys_dict:
         surveys_dict[survey_id].to_csv(
-            os.path.join(output_folder, "by_survey", survey_id + ".csv"),
+            os.path.join(output_folder, "by_survey", f"{survey_id}.csv"),
             index=False
         )
 
@@ -178,12 +204,14 @@ def compute_survey_stats(
         os.path.join(output_folder, "summaries", "agg_survey_data.csv"),
         index=False
     )
+
     # Add alternative survey submits table
     submits_tbl = survey_submits_no_config(agg_data)
     submits_tbl.to_csv(
         os.path.join(output_folder, "summaries", "submits_only.csv"),
         index=False
     )
+
     return True
 
 
@@ -194,74 +222,83 @@ def get_submits_for_tableau(
         interventions_filepath: str = None,
         submits_timeframe: Frequency = Frequency.DAILY,
         history_path: str = None
-):
+) -> None:
     """Get survey submissions per day for integration into Tableau WDC
-    Args:
-    study_folder:
-        File path to study data
-    output_folder:
-        File path to output submission summaries
-    config_path:
-        File path to study configuration file
-    tz_str:
-        Timezone of study. This defaults to "UTC"
-    start_date:
-        The earliest date of survey data to read in, in YYYY-MM-DD format
-    end_date:
-        The latest survey data to read in, in YYYY-MM-DD format
-    users:
-        List of users in study for which we are generating a survey schedule
-    interventions_filepath:
-        filepath where interventions json file is.
-    submits_timeframe:
-        The timeframe to summarize survey submissions over, of class
-        forest.constants.Frequency.
-    history_path: Filepath to the survey history file. If this is not
-            included, audio survey timings cannot be estimated.
 
-    Returns:
-    Writes a csv file for each user in the output folder with survey summary
-        statistics"""
+    Args:
+        study_folder:
+            File path to study data
+        output_folder:
+            File path to output submission summaries
+        config_path:
+            File path to study configuration file
+        tz_str:
+            Timezone of study. This defaults to "UTC"
+        start_date:
+            The earliest date of survey data to read in, in YYYY-MM-DD format
+        end_date:
+            The latest survey data to read in, in YYYY-MM-DD format
+        users:
+            List of users in study for which we
+            are generating a survey schedule
+        interventions_filepath:
+            filepath where interventions json file is.
+        submits_timeframe:
+            The timeframe to summarize survey submissions over, of class
+            forest.constants.Frequency.
+        history_path: Filepath to the survey history file. If this is not
+                included, audio survey timings cannot be estimated.
+    """
     os.makedirs(output_folder, exist_ok=True)
+
     if users is None:
         users = get_ids(study_folder)
+
     if end_date is None:
         end_date = get_month_from_today()
+
     # Read, aggregate and clean data
     else:
         agg_data = aggregate_surveys_config(
             study_folder, config_path, tz_str, users, start_date,
             end_date, augment_with_answers=True, include_audio_surveys=True
         )
+
         if agg_data.shape[0] == 0:
             logger.error("Error: No survey data found in %s", study_folder)
             return
+
         # Create survey submits detail and summary
         ss_detail = survey_submits(
             config_path, start_date, end_date,
             users, agg_data, interventions_filepath, history_path
         )
+
         if ss_detail.shape[0] == 0:
             logger.error("Error: no submission data found")
             return
+
         if submits_timeframe == Frequency.HOURLY_AND_DAILY:
             ss_summary_h = summarize_submits(
                 ss_detail, Frequency.HOURLY, False
             )
             ss_summary_d = summarize_submits(
                 ss_detail, Frequency.DAILY, False
             )
+
             write_data_by_user(ss_summary_d,
                                os.path.join(output_folder, "both", "daily"),
                                users)
             write_data_by_user(ss_summary_h,
                                os.path.join(output_folder, "both", "hourly"),
                                users)
+
         elif submits_timeframe == Frequency.HOURLY:
             ss_summary_h = summarize_submits(
                 ss_detail, Frequency.HOURLY, False
             )
             write_data_by_user(ss_summary_h, output_folder, users)
+
         elif submits_timeframe == Frequency.DAILY:
             ss_summary_d = summarize_submits(
                 ss_detail, Frequency.DAILY, False