From 0c9449e0041f60150ea4840d8e7f755b23001519 Mon Sep 17 00:00:00 2001 From: GeorgiosEfstathiadis <54844705+GeorgeEfstathiadis@users.noreply.github.com> Date: Fri, 15 Dec 2023 16:48:54 -0500 Subject: [PATCH 1/4] Raise error if no data with lower accuracy than limit (#237) --- forest/jasmine/data2mobmat.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/forest/jasmine/data2mobmat.py b/forest/jasmine/data2mobmat.py index 31ceaf95..cadcd0bf 100644 --- a/forest/jasmine/data2mobmat.py +++ b/forest/jasmine/data2mobmat.py @@ -179,6 +179,10 @@ def collapse_data( # Filter out rows where the GPS accuracy is beyond # the provided accuracy_limit data = data[data.accuracy < accuracy_limit] + if data.shape[0] == 0: + raise ValueError( + f"No GPS record with accuracy less than {accuracy_limit}." + ) # Get the start and end timestamps in seconds t_start = sorted(np.array(data.timestamp))[0] / 1000 From f230edd5a94568bfea80d8019b646a224ee18f50 Mon Sep 17 00:00:00 2001 From: GeorgiosEfstathiadis <54844705+GeorgeEfstathiadis@users.noreply.github.com> Date: Fri, 15 Dec 2023 16:59:12 -0500 Subject: [PATCH 2/4] Add checks for empty elements in imputation (#236) --- forest/jasmine/mobmat2traj.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/forest/jasmine/mobmat2traj.py b/forest/jasmine/mobmat2traj.py index e4579ed8..1ac1cbf2 100644 --- a/forest/jasmine/mobmat2traj.py +++ b/forest/jasmine/mobmat2traj.py @@ -3,7 +3,6 @@ """ import logging import math -import sys from typing import Optional, Tuple import numpy as np @@ -278,7 +277,7 @@ def indicate_flight( # Calculate k1 using the specified method k1 = calculate_k1(method, current_t, current_x, current_y, bv_subset, pars) if k1 is None: - sys.exit("Invalid method for calculate_k1.") + raise ValueError("Invalid method for calculate_k1.") # Select flight and pause indicators from the bv_subset flight_k = k1[bv_subset[:, 0] == 1] @@ -662,8 +661,8 @@ def forward_impute( method, start_t, start_x, 
start_y, flight_table, pars ) - if weight is None: - sys.exit("Invalid method for calculate_k1.") + if weight is None or len(weight) == 0: + raise ValueError("Invalid method for calculate_k1.") normalize_w = (weight + 1e-5) / float(sum(weight + 1e-5)) flight_index = np.random.choice(flight_table.shape[0], p=normalize_w) @@ -743,7 +742,7 @@ def forward_impute( pause_table, pars ) if weight is None: - sys.exit("Invalid method for calculate_k1.") + raise ValueError("Invalid method for calculate_k1.") normalize_w = (weight + 1e-5) / float(sum(weight + 1e-5)) pause_index = np.random.choice(pause_table.shape[0], p=normalize_w) @@ -832,7 +831,7 @@ def backward_impute( flight_table, pars ) if weight is None: - sys.exit("Invalid method for calculate_k1.") + raise ValueError("Invalid method for calculate_k1.") normalize_w = (weight + 1e-5) / float(sum(weight + 1e-5)) flight_index = np.random.choice(flight_table.shape[0], p=normalize_w) @@ -907,8 +906,8 @@ def backward_impute( method, end_t, end_x, end_y, pause_table, pars ) - if weight is None: - sys.exit("Invalid method for calculate_k1.") + if weight is None or len(weight) == 0: + raise ValueError("Invalid method for calculate_k1.") normalize_w = (weight + 1e-5) / float(sum(weight + 1e-5)) pause_index = np.random.choice(pause_table.shape[0], p=normalize_w) @@ -972,6 +971,11 @@ def impute_gps( # for observed flights, observed pauses, and missing intervals flight_table, pause_table, mis_table = create_tables(mob_mat, bv_subset) + if len(flight_table) == 0: + raise ValueError("No flight observed in the data.") + if len(pause_table) == 0: + raise ValueError("No pause observed in the data.") + # initialize the imputed trajectory table imp_table = np.zeros((1, 7)) From 6355d5188fe3f7251b874c0645fce383b18b78e0 Mon Sep 17 00:00:00 2001 From: Eli Jones Date: Tue, 16 Jan 2024 14:04:03 -0500 Subject: [PATCH 3/4] Beiwe taskrunner (#228) * refactor of parts of gps_stats_main in jasmine to new file structure output * Refactor on willow 
for the beiwe forest taskrunner. Dedents a lot of code. * Sycamore refactor/changes to have output in canonical beiwe forest ruvnner locations, and definitely a big bug fix. * fixes incorrect type annotation on gps_stats_generate_summary --------- Co-authored-by: Ilya Sytchev --- .gitignore | 6 +- forest/jasmine/traj2stats.py | 139 +++++++++++++----------- forest/oak/base.py | 2 +- forest/sycamore/base.py | 91 +++++++--------- forest/sycamore/utils.py | 3 +- forest/willow/log_stats.py | 205 ++++++++++++++++------------------- 6 files changed, 213 insertions(+), 233 deletions(-) diff --git a/.gitignore b/.gitignore index f5a3955b..904de1e2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,9 @@ __pycache__/ .DS_Store -# IntelliJ project files +# IntelliJ, VsCode project files .idea +.vscode # for installing Forest in editable mode when developing /forest.egg-info/ @@ -18,3 +19,6 @@ __pycache__/ #sphinx build docs/_build/ + +# any python environment files +.python-version \ No newline at end of file diff --git a/forest/jasmine/traj2stats.py b/forest/jasmine/traj2stats.py index e25f653c..a67388af 100644 --- a/forest/jasmine/traj2stats.py +++ b/forest/jasmine/traj2stats.py @@ -1561,7 +1561,8 @@ def gps_stats_main( Args: study_folder: str, the path of the study folder output_folder: str, the path of the folder - where you want to save results + where you want to save results. A folder named jasmine + will be created containing all output. 
tz_str: str, timezone frequency: Frequency, the frequency of the summary stats (resolution for summary statistics) @@ -1598,16 +1599,30 @@ def gps_stats_main( Raises: ValueError: Frequency is not valid """ - # no minutely analysis on GPS data if frequency == Frequency.MINUTE: raise ValueError("Frequency cannot be minutely.") - os.makedirs(output_folder, exist_ok=True) - if parameters is None: parameters = Hyperparameters() + if frequency == Frequency.HOURLY_AND_DAILY: + frequencies = [Frequency.HOURLY, Frequency.DAILY] + else: + frequencies = [frequency] + + # Ensure that the correct output folder structures exist, centralize folder + # names. Note that frequencies + trajectory_folder = f"{output_folder}/trajectory" + logs_folder = f"{output_folder}/logs" + os.makedirs(output_folder, exist_ok=True) + os.makedirs(logs_folder, exist_ok=True) + for freq in frequencies: + os.makedirs(f"{output_folder}/{freq.name.lower()}", exist_ok=True) + if save_traj: + os.makedirs(trajectory_folder, exist_ok=True) + + # pars0 is passed to bv_select, pars1 to impute_gps pars0 = [ parameters.l1, parameters.l2, parameters.l3, parameters.a1, parameters.a2, parameters.b1, parameters.b2, parameters.b3 @@ -1620,24 +1635,20 @@ def gps_stats_main( # participant_ids should be a list of str if participant_ids is None: participant_ids = get_ids(study_folder) - # create a record of processed user participant_id and starting/ending time + # Create a record of processed participant_id and starting/ending time. + # These are updated and saved to disk after each participant is processed. 
+ all_memory_dict_file = f"{output_folder}/all_memory_dict.pkl" + all_bv_set_file = f"{output_folder}/all_bv_set.pkl" if all_memory_dict is None: all_memory_dict = {} for participant_id in participant_ids: all_memory_dict[str(participant_id)] = None - if all_bv_set is None: all_bv_set = {} for participant_id in participant_ids: all_bv_set[str(participant_id)] = None - if frequency == Frequency.HOURLY_AND_DAILY: - os.makedirs(f"{output_folder}/hourly", exist_ok=True) - os.makedirs(f"{output_folder}/daily", exist_ok=True) - if save_traj: - os.makedirs(f"{output_folder}/trajectory", exist_ok=True) - for participant_id in participant_ids: logger.info("User: %s", participant_id) # data quality check @@ -1664,6 +1675,7 @@ def gps_stats_main( params_w = np.mean(data.accuracy) else: params_w = parameters.w + # process data mobmat1 = gps_to_mobmat( data, parameters.itrvl, parameters.accuracylim, @@ -1681,6 +1693,8 @@ def gps_stats_main( ) all_bv_set[str(participant_id)] = bv_set = out_dict["BV_set"] all_memory_dict[str(participant_id)] = out_dict["memory_dict"] + + # impute_gps can fail, if so we skip this participant. try: imp_table = impute_gps( mobmat2, bv_set, parameters.method, @@ -1690,6 +1704,7 @@ def gps_stats_main( except RuntimeError as e: logger.error("Error: %s", e) continue + traj = imp_to_traj(imp_table, mobmat2, params_w) # raise error if traj coordinates are not in the range of # [-90, 90] and [-180, 180] @@ -1709,72 +1724,64 @@ def gps_stats_main( "[-90, 90] and [-180, 180]." 
) # save all_memory_dict and all_bv_set - with open(f"{output_folder}/all_memory_dict.pkl", "wb") as f: + with open(all_memory_dict_file, "wb") as f: pickle.dump(all_memory_dict, f) - with open(f"{output_folder}/all_bv_set.pkl", "wb") as f: + with open(all_bv_set_file, "wb") as f: pickle.dump(all_bv_set, f) if save_traj is True: pd_traj = pd.DataFrame(traj) pd_traj.columns = ["status", "x0", "y0", "t0", "x1", "y1", "t1", "obs"] pd_traj.to_csv( - f"{output_folder}/trajectory/{participant_id}.csv", + f"{trajectory_folder}/{participant_id}.csv", index=False ) - if frequency == Frequency.HOURLY_AND_DAILY: - summary_stats1, logs1 = gps_summaries( - traj, - tz_str, - Frequency.HOURLY, - parameters, - places_of_interest, - osm_tags, - ) - write_all_summaries(participant_id, summary_stats1, - f"{output_folder}/hourly") - summary_stats2, logs2 = gps_summaries( - traj, - tz_str, - Frequency.DAILY, - parameters, - places_of_interest, - osm_tags, - ) - write_all_summaries(participant_id, summary_stats2, - f"{output_folder}/daily") - if parameters.save_osm_log: - os.makedirs(f"{output_folder}/logs", exist_ok=True) - with open( - f"{output_folder}/logs/locations_logs_hourly.json", - "w", - ) as hourly: - json.dump(logs1, hourly, indent=4) - with open( - f"{output_folder}/logs/locations_logs_daily.json", - "w", - ) as daily: - json.dump(logs2, daily, indent=4) - else: - summary_stats, logs = gps_summaries( - traj, - tz_str, - frequency, - parameters, - places_of_interest, - osm_tags, - ) - write_all_summaries( - participant_id, summary_stats, output_folder + + # generate summary stats. 
+ # (variable "frequency" is already declared in signature) + for freq in frequencies: + gps_stats_generate_summary( + traj=traj, + tz_str=tz_str, + frequency=freq, + participant_id=participant_id, + output_folder=f"{output_folder}/{freq.name.lower()}", + logs_folder=logs_folder, + parameters=parameters, + places_of_interest=places_of_interest, + osm_tags=osm_tags, ) - if parameters.save_osm_log: - os.makedirs(f"{output_folder}/logs", exist_ok=True) - with open( - f"{output_folder}/logs/locations_logs.json", - "w", - ) as loc: - json.dump(logs, loc, indent=4) else: logger.info( "GPS data are not collected" " or the data quality is too low" ) + + +def gps_stats_generate_summary( + traj: np.ndarray, + tz_str: str, + frequency: Frequency, + participant_id: str, + output_folder: str, + logs_folder: str, + parameters: Hyperparameters, + places_of_interest: Optional[list] = None, + osm_tags: Optional[List[OSMTags]] = None): + """This is simply the inner functionality of gps_stats_main. + Runs summaries code, writes to disk, saves logs if required. 
""" + summary_stats, logs = gps_summaries( + traj, + tz_str, + frequency, + parameters, + places_of_interest, + osm_tags, + ) + write_all_summaries(participant_id, summary_stats, output_folder) + if parameters.save_osm_log: + with open( + f"{logs_folder}/locations_logs_{frequency.name.lower()}.json", + "wa", + ) as loc: + json.dump(logs, loc, indent=4) diff --git a/forest/oak/base.py b/forest/oak/base.py index 3fccb03e..f21ef82a 100644 --- a/forest/oak/base.py +++ b/forest/oak/base.py @@ -691,7 +691,7 @@ def run(study_folder: str, output_folder: str, tz_str: Optional[str] = None, 'walking_time': walkingtime_daily[:, -1], 'steps': steps_daily[:, -1], 'cadence': cadence_daily[:, -1]}) - output_file = user + "_gait_daily.csv" + output_file = user + ".csv" dest_path = os.path.join(output_folder, "daily", output_file) summary_stats.to_csv(dest_path, index=False) if frequency != Frequency.DAILY: diff --git a/forest/sycamore/base.py b/forest/sycamore/base.py index 67da6024..86649926 100644 --- a/forest/sycamore/base.py +++ b/forest/sycamore/base.py @@ -224,12 +224,16 @@ def compute_survey_stats( def get_submits_for_tableau( - study_folder: str, output_folder: str, config_path: str, - tz_str: str = "UTC", start_date: str = EARLIEST_DATE, - end_date: Optional[str] = None, users: Optional[List] = None, - interventions_filepath: Optional[str] = None, - submits_timeframe: Frequency = Frequency.DAILY, - history_path: Optional[str] = None + study_folder: str, + output_folder: str, + config_path: str, + tz_str: str = "UTC", + start_date: str = EARLIEST_DATE, + end_date: Optional[str] = None, + users: Optional[List] = None, + interventions_filepath: Optional[str] = None, + submits_timeframe: Frequency = Frequency.DAILY, + history_path: Optional[str] = None ) -> None: """Get survey submissions per day for integration into Tableau WDC @@ -247,8 +251,7 @@ def get_submits_for_tableau( end_date: The latest survey data to read in, in YYYY-MM-DD format users: - List of users in study for 
which we - are generating a survey schedule + List of users in study that we are generating a survey schedule for interventions_filepath: filepath where interventions json file is. submits_timeframe: @@ -257,65 +260,49 @@ def get_submits_for_tableau( history_path: Filepath to the survey history file. If this is not included, audio survey timings cannot be estimated. """ - if submits_timeframe not in [ Frequency.HOURLY, Frequency.DAILY, Frequency.HOURLY_AND_DAILY ]: logger.error("Error: Invalid submits timeframe") return + if submits_timeframe == Frequency.HOURLY_AND_DAILY: + submits_timeframes = [Frequency.HOURLY, Frequency.DAILY] + else: + submits_timeframes = [submits_timeframe] + os.makedirs(output_folder, exist_ok=True) + for freq in submits_timeframes: + os.makedirs(f"{output_folder}/{freq.name.lower()}", exist_ok=True) if users is None: users = get_ids(study_folder) - if end_date is None: end_date = get_month_from_today() # Read, aggregate and clean data - else: - agg_data = aggregate_surveys_config( - study_folder, config_path, tz_str, users, start_date, - end_date, augment_with_answers=True, include_audio_surveys=True - ) - - if agg_data.shape[0] == 0: - logger.error("Error: No survey data found in %s", study_folder) - return - - # Create survey submits detail and summary - ss_detail = survey_submits( - config_path, start_date, end_date, - users, agg_data, interventions_filepath, history_path - ) - - if ss_detail.shape[0] == 0: - logger.error("Error: no submission data found") - return + agg_data = aggregate_surveys_config( + study_folder, config_path, tz_str, users, start_date, + end_date, augment_with_answers=True, include_audio_surveys=True + ) - if submits_timeframe == Frequency.HOURLY_AND_DAILY: - ss_summary_h = summarize_submits( - ss_detail, Frequency.HOURLY, False - ) - ss_summary_d = summarize_submits( - ss_detail, Frequency.DAILY, False - ) + if agg_data.shape[0] == 0: + logger.error("Error: No survey data found in %s", study_folder) + return - 
write_data_by_user(ss_summary_d, - os.path.join(output_folder, "both", "daily"), - users) - write_data_by_user(ss_summary_h, - os.path.join(output_folder, "both", "hourly"), - users) + # Create survey submits detail and summary + ss_detail = survey_submits( + config_path, start_date, end_date, + users, agg_data, interventions_filepath, history_path + ) - elif submits_timeframe == Frequency.HOURLY: - ss_summary_h = summarize_submits( - ss_detail, Frequency.HOURLY, False - ) - write_data_by_user(ss_summary_h, output_folder, users) + if ss_detail.shape[0] == 0: + logger.error("Error: no submission data found") + return - elif submits_timeframe == Frequency.DAILY: - ss_summary_d = summarize_submits( - ss_detail, Frequency.DAILY, False - ) - write_data_by_user(ss_summary_d, output_folder, users) + # run once for every submits_timeframe, per-user is handled internally + for freq in submits_timeframes: + ss_summary = summarize_submits(ss_detail, freq, False) + write_data_by_user( + ss_summary, f"{output_folder}/{freq.name.lower()}", users + ) diff --git a/forest/sycamore/utils.py b/forest/sycamore/utils.py index a4f365cd..9b69b70f 100644 --- a/forest/sycamore/utils.py +++ b/forest/sycamore/utils.py @@ -31,8 +31,7 @@ def get_month_from_today(): datetime.timedelta(31)).strftime("%Y-%m-%d") -def filename_to_timestamp(filename: str, tz_str: str = "UTC" - ) -> pd.Timestamp: +def filename_to_timestamp(filename: str, tz_str: str = "UTC") -> pd.Timestamp: """Extract a datetime from a filepath. Args: diff --git a/forest/willow/log_stats.py b/forest/willow/log_stats.py index 3eee8a25..b797e981 100644 --- a/forest/willow/log_stats.py +++ b/forest/willow/log_stats.py @@ -413,7 +413,7 @@ def log_stats_main( frequency: Frequency, time_start: Optional[List] = None, time_end: Optional[List] = None, - beiwe_id: Optional[List[str]] = None, + beiwe_ids: Optional[List[str]] = None, ) -> None: """Main function for calculating the summary statistics for the communication logs. 
@@ -426,7 +426,7 @@ def log_stats_main( determining resolution of the summary stats time_start: starting timestamp of the study time_end: ending timestamp of the study - beiwe_id: list of Beiwe IDs to be processed + beiwe_ids: list of Beiwe IDs to be processed """ if frequency not in [ @@ -437,121 +437,104 @@ def log_stats_main( "HOURLY_AND_DAILY, DAILY, HOURLY" ) - os.makedirs(output_folder, exist_ok=True) - if frequency == Frequency.HOURLY_AND_DAILY: - os.makedirs(output_folder + "/hourly", exist_ok=True) - os.makedirs(output_folder + "/daily", exist_ok=True) + frequencies = [Frequency.HOURLY, Frequency.DAILY] + else: + frequencies = [frequency] + + os.makedirs(output_folder, exist_ok=True) + for freq in frequencies: + os.makedirs(f"{output_folder}/{freq.name.lower()}", exist_ok=True) # beiwe_id should be a list of str - if beiwe_id is None: - beiwe_id = [ - i for i in os.listdir(study_folder) - if os.path.isdir(f"{study_folder}/{i}") + if beiwe_ids is None: + beiwe_ids = [ + participant_id for participant_id in os.listdir(study_folder) + if os.path.isdir(f"{study_folder}/{participant_id}") ] - if len(beiwe_id) > 0: - for bid in beiwe_id: - logger.info("User: %s", bid) + # process the data for each participant in each frequency into a folder of + # the corresponding frequency. 
+ for beiwe_id in beiwe_ids: + for freq in frequencies: + logger.info("(%s) Participant: %s", freq.name.lower(), beiwe_id) try: - # read data - text_data, text_stamp_start, text_stamp_end = read_data( - bid, study_folder, "texts", tz_str, time_start, time_end - ) - call_data, call_stamp_start, call_stamp_end = read_data( - bid, study_folder, "calls", tz_str, time_start, time_end + log_stats_inner( + beiwe_id, + f"{output_folder}/{freq.name.lower()}", + study_folder, + frequency, + tz_str, + time_start, + time_end ) + except Exception as err: + logger.error("An error occurred when processing data: %s", err) - if text_data.shape[0] > 0 or call_data.shape[0] > 0: - # stamps from call and text should be the stamp_end - logger.info("Data imported ...") - stamp_start = min(text_stamp_start, call_stamp_start) - stamp_end = max(text_stamp_end, call_stamp_end) - - # process data - if frequency == Frequency.HOURLY_AND_DAILY: - stats_pdframe1 = comm_logs_summaries( - text_data, - call_data, - stamp_start, - stamp_end, - tz_str, - Frequency.HOURLY, - ) - stats_pdframe2 = comm_logs_summaries( - text_data, - call_data, - stamp_start, - stamp_end, - tz_str, - Frequency.DAILY, - ) - - write_all_summaries( - bid, stats_pdframe1, output_folder + "/hourly" - ) - write_all_summaries( - bid, stats_pdframe2, output_folder + "/daily" - ) - else: - stats_pdframe = comm_logs_summaries( - text_data, - call_data, - stamp_start, - stamp_end, - tz_str, - frequency, - ) - # num_uniq_individuals_call_or_text is the cardinality - # of the union of several sets. 
It should should always - # be at least as large as the cardinality of any one of - # the sets, and it should never be larger than the sum - # of the cardinalities of all of the sets - # (it may be equal if all the sets are disjoint) - sum_all_set_cols = pd.Series( - [0]*stats_pdframe.shape[0] - ) - for col in [ - "num_s_tel", "num_r_tel", "num_in_caller", - "num_out_caller", "num_mis_caller" - ]: - sum_all_set_cols += stats_pdframe[col] - if ( - stats_pdframe[ - "num_uniq_individuals_call_or_text" - ] < stats_pdframe[col] - ).any(): - logger.error( - "Error: " - "num_uniq_individuals_call_or_text " - "was found to be less than %s for at " - "least one time interval. This error " - "comes from an issue with the code," - " not an issue with the input data", - col - ) - if ( - stats_pdframe[ - "num_uniq_individuals_call_or_text" - ] > sum_all_set_cols - ).any(): - logger.error( - "Error: " - "num_uniq_individuals_call_or_text " - "was found to be larger than the sum " - "of individual cardinalities for at " - "least one time interval. This error " - "comes from an issue with the code," - " not an issue with the input data" - ) - - write_all_summaries(bid, stats_pdframe, output_folder) - - logger.info( - "Summary statistics obtained. Finished." - ) + logger.info("Summary statistics obtained. 
Finished.") - except Exception as err: - logger.error( - "An error occurred when processing the data: %s", err - ) + +def log_stats_inner( + beiwe_id: str, + output_folder: str, + study_folder: str, + frequency: Frequency, + tz_str: str, + time_start: Optional[List] = None, + time_end: Optional[List] = None, +): + """ Inner functionality of log_stats_main """ + # read data + text_data, text_stamp_start, text_stamp_end = read_data( + beiwe_id, study_folder, "texts", tz_str, time_start, time_end + ) + call_data, call_stamp_start, call_stamp_end = read_data( + beiwe_id, study_folder, "calls", tz_str, time_start, time_end + ) + + # give up early if there is no data + if text_data.shape[0] <= 0 and call_data.shape[0] <= 0: + logger.info("There was no data for participant %s", beiwe_id) + return + + # stamps from call and text should be the stamp_end + logger.info("Data imported ...") + stamp_start = min(text_stamp_start, call_stamp_start) + stamp_end = max(text_stamp_end, call_stamp_end) + + # process the data + stats_pdframe = comm_logs_summaries( + text_data, call_data, stamp_start, stamp_end, tz_str, frequency + ) + + # num_uniq_individuals_call_or_text is the cardinality of the union of + # several sets. It should should always be at least as large as the + # cardinality of any one of the sets, and it should never be larger than + # the sum of the cardinalities of all of the sets. (it may be equal if all + # the sets are disjoint) + num_uniq_column = "num_uniq_individuals_call_or_text" # legibility hax. + sum_all_set_cols = pd.Series([0]*stats_pdframe.shape[0]) + for column in [ + "num_s_tel", "num_r_tel", "num_in_caller", + "num_out_caller", "num_mis_caller" + ]: + sum_all_set_cols += stats_pdframe[column] + if (stats_pdframe[num_uniq_column] < stats_pdframe[column]).any(): + logger.error( + "Error: " + "num_uniq_individuals_call_or_text was found to be less than " + "%s for at least one time interval. 
This error comes from an " + "issue with the code, not an issue with the input data.", + column + ) + + if (stats_pdframe[num_uniq_column] > sum_all_set_cols).any(): + logger.error( + "Error: " + "num_uniq_individuals_call_or_text was found to be larger than the " + "sum of individual cardinalities for at least one time interval. " + "This error comes from an issue with the code, not an issue with " + "the input data." + ) + + write_all_summaries(beiwe_id, stats_pdframe, output_folder) From 1c0e7bcc9bed574b0392db7004921c51a440b4e0 Mon Sep 17 00:00:00 2001 From: Zachary Clement Date: Mon, 12 Feb 2024 19:22:26 -0500 Subject: [PATCH 4/4] Jasmine updates (#241) * pep8 fixes * add warning to users requesting OSM summaries, make the transformation keep values the same if within valid range * force compute_flight_positions and compute_future_flight_positions to return valid coordinates * pep8 fixes * add type hint * Ensure longitude is in data columns before type checking * Add comment explaining logic behind wrapping * Explain better what the force_valid_coordinate function does * PEP8 fixes --- forest/jasmine/data2mobmat.py | 23 +++++++++++++++++++---- forest/jasmine/traj2stats.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/forest/jasmine/data2mobmat.py b/forest/jasmine/data2mobmat.py index cadcd0bf..9b3f5c30 100644 --- a/forest/jasmine/data2mobmat.py +++ b/forest/jasmine/data2mobmat.py @@ -619,6 +619,21 @@ def gps_to_mobmat( return mobmat +def force_valid_longitude(longitude: float) -> float: + """Forces a longitude coordinate to be within -180 and 180 + + In some cases, the imputation code seems to yield out-of-range + GPS coordinates. This function wraps longitude coordinates to be back + in the correct range so an error isn't thrown. + + For example, 190 would get transformed into -170. + + Args: + longitude: float. 
The longitude to be coerced + """ + return (longitude + 180) % 360 - 180 + + def compute_flight_positions( index: int, mobmat: np.ndarray, interval: float ) -> np.ndarray: @@ -660,8 +675,8 @@ def compute_flight_positions( # Update the mobility matrix with the new start and end positions mobmat[index, 1] = start_x mobmat[index, 4] = end_x - mobmat[index, 2] = start_y - mobmat[index, 5] = end_y + mobmat[index, 2] = force_valid_longitude(start_y) + mobmat[index, 5] = force_valid_longitude(end_y) return mobmat @@ -708,8 +723,8 @@ def compute_future_flight_positions( # Update the mobility matrix with the new start and end positions mobmat[index, 1] = start_x mobmat[index, 4] = end_x - mobmat[index, 2] = start_y - mobmat[index, 5] = end_y + mobmat[index, 2] = force_valid_longitude(start_y) + mobmat[index, 5] = force_valid_longitude(end_y) return mobmat diff --git a/forest/jasmine/traj2stats.py b/forest/jasmine/traj2stats.py index a67388af..43852c59 100644 --- a/forest/jasmine/traj2stats.py +++ b/forest/jasmine/traj2stats.py @@ -1660,6 +1660,34 @@ def gps_stats_main( participant_id, study_folder, "gps", tz_str, time_start, time_end, ) + # If the data comes from a study that had GPS fuzzing, + # and the study was prior to March 2023, the longitude + # coordinates may be outside of the required range of + # (-180, 180). This chunk of code wraps out of range + # coordinates to be in that range + if ( + ("longitude" in data.columns) + and ( + (data["longitude"].max() > 180) + or (data["longitude"].min() < -180) + ) + ): + logger.info("Reconciled bad longitude data for user %s", + participant_id) + data["longitude"] = (data["longitude"] + 180) % 360 - 180 + if ((places_of_interest is not None) + or (osm_tags is not None)): + logger.warning("Warning: user %s had longitude values " + "outside the valid range [-180, 180] " + "but OSM location summaries were " + "requested. 
Longitude values outside " + "the valid range may signify that GPS " + "fuzzing was directed to be used in " + "the study setup file. If GPS " + "coordinates were fuzzed, OSM " + "location summaries are meaningless", + participant_id) + if data.shape == (0, 0): logger.info("No data available.") continue