From dcc8715848ed96dce06c86019e97473047e33c63 Mon Sep 17 00:00:00 2001
From: Eli Jones
Date: Wed, 15 Nov 2023 13:47:25 -0500
Subject: [PATCH 01/10] refactor parts of gps_stats_main in jasmine to the new
 file structure output.

---
 .gitignore                   |   6 +-
 forest/jasmine/traj2stats.py | 139 ++++++++++++++++++-----------------
 2 files changed, 78 insertions(+), 67 deletions(-)

diff --git a/.gitignore b/.gitignore
index f5a3955b..904de1e2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,9 @@
 __pycache__/
 .DS_Store
 
-# IntelliJ project files
+# IntelliJ, VS Code project files
 .idea
+.vscode
 
 # for installing Forest in editable mode when developing
 /forest.egg-info/
@@ -18,3 +19,6 @@ __pycache__/
 
 #sphinx build
 docs/_build/
+
+# any python environment files
+.python-version
\ No newline at end of file
diff --git a/forest/jasmine/traj2stats.py b/forest/jasmine/traj2stats.py
index ed2d8a92..ac7bca76 100644
--- a/forest/jasmine/traj2stats.py
+++ b/forest/jasmine/traj2stats.py
@@ -1561,7 +1561,8 @@ def gps_stats_main(
     Args:
         study_folder: str, the path of the study folder
         output_folder: str, the path of the folder
-            where you want to save results
+            where you want to save results. A folder named jasmine
+            will be created containing all output.
         tz_str: str, timezone
         frequency: Frequency, the frequency of the summary stats
            (resolution for summary statistics)
@@ -1596,12 +1597,26 @@ def gps_stats_main(
         and a record csv file to show which users are processed
         and logger csv file to show warnings and bugs during the run
     """
-
-    os.makedirs(output_folder, exist_ok=True)
-
     if parameters is None:
         parameters = Hyperparameters()
 
+    if frequency == Frequency.HOURLY_AND_DAILY:
+        frequencies = [Frequency.HOURLY, Frequency.DAILY]
+    else:
+        frequencies = [frequency]
+
+    # Ensure that the correct output folder structures exist, centralize folder names.
+    # Note that frequencies
+    trajectory_folder = f"{output_folder}/trajectory"
+    logs_folder = f"{output_folder}/logs"
+    os.makedirs(output_folder, exist_ok=True)
+    os.makedirs(logs_folder, exist_ok=True)
+    for freq in frequencies:
+        os.makedirs(f"{output_folder}/{freq.name.lower()}", exist_ok=True)
+    if save_traj:
+        os.makedirs(trajectory_folder, exist_ok=True)
+
+    # pars0 is passed to bv_select, pars1 to impute_gps
     pars0 = [
         parameters.l1, parameters.l2, parameters.l3, parameters.a1,
         parameters.a2, parameters.b1, parameters.b2, parameters.b3
     ]
@@ -1614,24 +1629,20 @@ def gps_stats_main(
     # participant_ids should be a list of str
     if participant_ids is None:
         participant_ids = get_ids(study_folder)
-    # create a record of processed user participant_id and starting/ending time
+    # Create a record of processed user participant_id and starting/ending time.
+    # These are updated and saved to disk after each participant is processed.
+ all_memory_dict_file = f"{output_folder}/all_memory_dict.pkl" + all_bv_set_file = f"{output_folder}/all_bv_set.pkl" if all_memory_dict is None: all_memory_dict = {} for participant_id in participant_ids: all_memory_dict[str(participant_id)] = None - if all_bv_set is None: all_bv_set = {} for participant_id in participant_ids: all_bv_set[str(participant_id)] = None - if frequency == Frequency.HOURLY_AND_DAILY: - os.makedirs(f"{output_folder}/hourly", exist_ok=True) - os.makedirs(f"{output_folder}/daily", exist_ok=True) - if save_traj: - os.makedirs(f"{output_folder}/trajectory", exist_ok=True) - for participant_id in participant_ids: logger.info("User: %s", participant_id) # data quality check @@ -1658,6 +1669,7 @@ def gps_stats_main( params_w = np.mean(data.accuracy) else: params_w = parameters.w + # process data mobmat1 = gps_to_mobmat( data, parameters.itrvl, parameters.accuracylim, @@ -1675,6 +1687,8 @@ def gps_stats_main( ) all_bv_set[str(participant_id)] = bv_set = out_dict["BV_set"] all_memory_dict[str(participant_id)] = out_dict["memory_dict"] + + # impute_gps can fail, if so we skip this participant. try: imp_table = impute_gps( mobmat2, bv_set, parameters.method, @@ -1684,6 +1698,7 @@ def gps_stats_main( except RuntimeError as e: logger.error("Error: %s", e) continue + traj = imp_to_traj(imp_table, mobmat2, params_w) # raise error if traj coordinates are not in the range of # [-90, 90] and [-180, 180] @@ -1703,72 +1718,64 @@ def gps_stats_main( "[-90, 90] and [-180, 180]." ) # save all_memory_dict and all_bv_set - with open(f"{output_folder}/all_memory_dict.pkl", "wb") as f: + with open(all_memory_dict_file, "wb") as f: pickle.dump(all_memory_dict, f) - with open(f"{output_folder}/all_bv_set.pkl", "wb") as f: + with open(all_bv_set_file, "wb") as f: pickle.dump(all_bv_set, f) if save_traj is True: pd_traj = pd.DataFrame(traj) pd_traj.columns = ["status", "x0", "y0", "t0", "x1", "y1", "t1", "obs"] pd_traj.to_csv( - f"{output_folder}/trajectory/{participant_id}.csv", + f"{trajectory_folder}/{participant_id}.csv", index=False ) - if frequency == Frequency.HOURLY_AND_DAILY: - summary_stats1, logs1 = gps_summaries( - traj, - tz_str, - Frequency.HOURLY, - parameters, - places_of_interest, - osm_tags, - ) - write_all_summaries(participant_id, summary_stats1, - f"{output_folder}/hourly") - summary_stats2, logs2 = gps_summaries( - traj, - tz_str, - Frequency.DAILY, - parameters, - places_of_interest, - osm_tags, - ) - write_all_summaries(participant_id, summary_stats2, - f"{output_folder}/daily") - if parameters.save_osm_log: - os.makedirs(f"{output_folder}/logs", exist_ok=True) - with open( - f"{output_folder}/logs/locations_logs_hourly.json", - "w", - ) as hourly: - json.dump(logs1, hourly, indent=4) - with open( - f"{output_folder}/logs/locations_logs_daily.json", - "w", - ) as daily: - json.dump(logs2, daily, indent=4) - else: - summary_stats, logs = gps_summaries( - traj, - tz_str, - frequency, - parameters, - places_of_interest, - osm_tags, - ) - write_all_summaries( - participant_id, summary_stats, output_folder + + # generate summary stats. 
(variable "frequency" is already declared in signature)
+        for freq in frequencies:
+            gps_stats_generate_summary(
+                traj=traj,
+                tz_str=tz_str,
+                frequency=freq,
+                participant_id=participant_id,
+                output_folder=f"{output_folder}/{freq.name.lower()}",
+                logs_folder=logs_folder,
+                parameters=parameters,
+                places_of_interest=places_of_interest,
+                osm_tags=osm_tags,
             )
-            if parameters.save_osm_log:
-                os.makedirs(f"{output_folder}/logs", exist_ok=True)
-                with open(
-                    f"{output_folder}/logs/locations_logs.json",
-                    "w",
-                ) as loc:
-                    json.dump(logs, loc, indent=4)
         else:
             logger.info(
                 "GPS data are not collected"
                 " or the data quality is too low"
             )
+
+
+def gps_stats_generate_summary(
+    traj: np.ndarray,
+    tz_str: str,
+    frequency: Frequency,
+    participant_id: str,
+    output_folder: str,
+    logs_folder: str,
+    parameters: Optional[Hyperparameters] = None,
+    places_of_interest: Optional[list] = None,
+    osm_tags: Optional[List[OSMTags]] = None,
+    ):
+    """This is simply the inner functionality of gps_stats_main.
+    Runs summaries code, writes to disk, saves logs if required. """
+    summary_stats, logs = gps_summaries(
+        traj,
+        tz_str,
+        frequency,
+        parameters,
+        places_of_interest,
+        osm_tags,
+    )
+    write_all_summaries(participant_id, summary_stats, output_folder)
+    if parameters.save_osm_log:
+        with open(
+            f"{logs_folder}/locations_logs_{frequency.name.lower()}.json",
+            "w",
+        ) as loc:
+            json.dump(logs, loc, indent=4)

From d68ed158dda243295aac85e5f3c78ad8d9a69cb2 Mon Sep 17 00:00:00 2001
From: Eli Jones
Date: Wed, 15 Nov 2023 14:24:13 -0500
Subject: [PATCH 02/10] oak only needed the final output file name changed.

---
 forest/oak/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/forest/oak/base.py b/forest/oak/base.py
index 14ac0e49..dc942cdb 100644
--- a/forest/oak/base.py
+++ b/forest/oak/base.py
@@ -655,7 +655,7 @@ def run(study_folder: str, output_folder: str, tz_str: Optional[str] = None,
         'walking_time': walkingtime_daily[:, -1],
         'steps': steps_daily[:, -1],
         'cadence': cadence_daily[:, -1]})
-    output_file = user + "_gait_daily.csv"
+    output_file = user + ".csv"
     dest_path = os.path.join(output_folder, "daily", output_file)
     summary_stats.to_csv(dest_path, index=False)
     if frequency != Frequency.DAILY:

From fd210a91523338c075fd4fa50dfa9e6acacbe009 Mon Sep 17 00:00:00 2001
From: Eli Jones
Date: Wed, 15 Nov 2023 14:54:30 -0500
Subject: [PATCH 03/10] mostly reduces indentation, and also removes some
 unnecessary code.

---
 forest/willow/log_stats.py | 207 ++++++++++++++++++-------------------
 1 file changed, 103 insertions(+), 104 deletions(-)

diff --git a/forest/willow/log_stats.py b/forest/willow/log_stats.py
index a40e1be0..e309401d 100644
--- a/forest/willow/log_stats.py
+++ b/forest/willow/log_stats.py
@@ -414,7 +414,7 @@ def log_stats_main(
     frequency: Frequency,
     time_start: Optional[List] = None,
     time_end: Optional[List] = None,
-    beiwe_id: Optional[List[str]] = None,
+    beiwe_ids: Optional[List[str]] = None,
 ) -> None:
     """Main function for calculating the summary statistics
         for the communication logs.
@@ -427,7 +427,7 @@ def log_stats_main( determining resolution of the summary stats time_start: starting timestamp of the study time_end: ending timestamp of the study - beiwe_id: list of Beiwe IDs to be processed + beiwe_ids: list of Beiwe IDs to be processed """ os.makedirs(output_folder, exist_ok=True) @@ -436,114 +436,113 @@ def log_stats_main( os.makedirs(output_folder + "/daily", exist_ok=True) # beiwe_id should be a list of str - if beiwe_id is None: - beiwe_id = [ - i for i in os.listdir(study_folder) - if os.path.isdir(f"{study_folder}/{i}") + if beiwe_ids is None: + beiwe_ids = [ + participant_id for participant_id in os.listdir(study_folder) + if os.path.isdir(f"{study_folder}/{participant_id}") ] - if len(beiwe_id) > 0: - for bid in beiwe_id: - logger.info("User: %s", bid) - try: - # read data - text_data, text_stamp_start, text_stamp_end = read_data( - bid, study_folder, "texts", tz_str, time_start, time_end - ) - call_data, call_stamp_start, call_stamp_end = read_data( - bid, study_folder, "calls", tz_str, time_start, time_end - ) + for bid in beiwe_ids: + logger.info("User: %s", bid) + try: + # read data + text_data, text_stamp_start, text_stamp_end = read_data( + bid, study_folder, "texts", tz_str, time_start, time_end + ) + call_data, call_stamp_start, call_stamp_end = read_data( + bid, study_folder, "calls", tz_str, time_start, time_end + ) - if text_data.shape[0] > 0 or call_data.shape[0] > 0: - # stamps from call and text should be the stamp_end - logger.info("Data imported ...") - stamp_start = min(text_stamp_start, call_stamp_start) - stamp_end = max(text_stamp_end, call_stamp_end) - - # process data - if frequency == Frequency.HOURLY_AND_DAILY: - stats_pdframe1 = comm_logs_summaries( - text_data, - call_data, - stamp_start, - stamp_end, - tz_str, - Frequency.HOURLY, - ) - stats_pdframe2 = comm_logs_summaries( - text_data, - call_data, - stamp_start, - stamp_end, - tz_str, - Frequency.DAILY, - ) - - write_all_summaries( - bid, stats_pdframe1, output_folder + "/hourly" - ) - write_all_summaries( - bid, stats_pdframe2, output_folder + "/daily" - ) - else: - stats_pdframe = comm_logs_summaries( - text_data, - call_data, - stamp_start, - stamp_end, - tz_str, - frequency, - ) - # num_uniq_individuals_call_or_text is the cardinality - # of the union of several sets. It should should always - # be at least as large as the cardinality of any one of - # the sets, and it should never be larger than the sum - # of the cardinalities of all of the sets - # (it may be equal if all the sets are disjoint) - sum_all_set_cols = pd.Series( - [0]*stats_pdframe.shape[0] - ) - for col in [ - "num_s_tel", "num_r_tel", "num_in_caller", - "num_out_caller", "num_mis_caller" - ]: - sum_all_set_cols += stats_pdframe[col] - if ( - stats_pdframe[ - "num_uniq_individuals_call_or_text" - ] < stats_pdframe[col] - ).any(): - logger.error( - "Error: " - "num_uniq_individuals_call_or_text " - "was found to be less than %s for at " - "least one time interval. 
This error " - "comes from an issue with the code," - " not an issue with the input data", - col - ) + if text_data.shape[0] > 0 or call_data.shape[0] > 0: + # stamps from call and text should be the stamp_end + logger.info("Data imported ...") + stamp_start = min(text_stamp_start, call_stamp_start) + stamp_end = max(text_stamp_end, call_stamp_end) + + # process data + if frequency == Frequency.HOURLY_AND_DAILY: + stats_pdframe1 = comm_logs_summaries( + text_data, + call_data, + stamp_start, + stamp_end, + tz_str, + Frequency.HOURLY, + ) + stats_pdframe2 = comm_logs_summaries( + text_data, + call_data, + stamp_start, + stamp_end, + tz_str, + Frequency.DAILY, + ) + + write_all_summaries( + bid, stats_pdframe1, output_folder + "/hourly" + ) + write_all_summaries( + bid, stats_pdframe2, output_folder + "/daily" + ) + else: + stats_pdframe = comm_logs_summaries( + text_data, + call_data, + stamp_start, + stamp_end, + tz_str, + frequency, + ) + # num_uniq_individuals_call_or_text is the cardinality + # of the union of several sets. It should should always + # be at least as large as the cardinality of any one of + # the sets, and it should never be larger than the sum + # of the cardinalities of all of the sets + # (it may be equal if all the sets are disjoint) + sum_all_set_cols = pd.Series( + [0]*stats_pdframe.shape[0] + ) + for col in [ + "num_s_tel", "num_r_tel", "num_in_caller", + "num_out_caller", "num_mis_caller" + ]: + sum_all_set_cols += stats_pdframe[col] if ( stats_pdframe[ "num_uniq_individuals_call_or_text" - ] > sum_all_set_cols + ] < stats_pdframe[col] ).any(): logger.error( - "Error: " - "num_uniq_individuals_call_or_text " - "was found to be larger than the sum " - "of individual cardinalities for at " - "least one time interval. This error " - "comes from an issue with the code," - " not an issue with the input data" - ) - - write_all_summaries(bid, stats_pdframe, output_folder) - - logger.info( - "Summary statistics obtained. Finished." - ) - - except Exception as err: - logger.error( - "An error occurred when processing the data: %s", err + "Error: " + "num_uniq_individuals_call_or_text " + "was found to be less than %s for at " + "least one time interval. This error " + "comes from an issue with the code," + " not an issue with the input data", + col + ) + if ( + stats_pdframe[ + "num_uniq_individuals_call_or_text" + ] > sum_all_set_cols + ).any(): + logger.error( + "Error: " + "num_uniq_individuals_call_or_text " + "was found to be larger than the sum " + "of individual cardinalities for at " + "least one time interval. This error " + "comes from an issue with the code," + " not an issue with the input data" + ) + + write_all_summaries(bid, stats_pdframe, output_folder) + + logger.info( + "Summary statistics obtained. Finished." ) + + except Exception as err: + logger.error( + "An error occurred when processing the data: %s", err + ) From b0d57941e33bf345b9347c5224da2ec18f86ed30 Mon Sep 17 00:00:00 2001 From: Eli Jones Date: Wed, 15 Nov 2023 16:23:46 -0500 Subject: [PATCH 04/10] Refactor on willow for the beiwe forest taskrunner. Dedents a lot of code. 
--- forest/willow/log_stats.py | 192 +++++++++++++++++-------------------- 1 file changed, 88 insertions(+), 104 deletions(-) diff --git a/forest/willow/log_stats.py b/forest/willow/log_stats.py index e309401d..bb32a451 100644 --- a/forest/willow/log_stats.py +++ b/forest/willow/log_stats.py @@ -429,11 +429,14 @@ def log_stats_main( time_end: ending timestamp of the study beiwe_ids: list of Beiwe IDs to be processed """ - os.makedirs(output_folder, exist_ok=True) - if frequency == Frequency.HOURLY_AND_DAILY: - os.makedirs(output_folder + "/hourly", exist_ok=True) - os.makedirs(output_folder + "/daily", exist_ok=True) + frequencies = [Frequency.HOURLY, Frequency.DAILY] + else: + frequencies = [frequency] + + os.makedirs(output_folder, exist_ok=True) + for freq in frequencies: + os.makedirs(f"{output_folder}/{freq.name.lower()}", exist_ok=True) # beiwe_id should be a list of str if beiwe_ids is None: @@ -442,107 +445,88 @@ def log_stats_main( if os.path.isdir(f"{study_folder}/{participant_id}") ] - for bid in beiwe_ids: - logger.info("User: %s", bid) - try: - # read data - text_data, text_stamp_start, text_stamp_end = read_data( - bid, study_folder, "texts", tz_str, time_start, time_end - ) - call_data, call_stamp_start, call_stamp_end = read_data( - bid, study_folder, "calls", tz_str, time_start, time_end - ) - - if text_data.shape[0] > 0 or call_data.shape[0] > 0: - # stamps from call and text should be the stamp_end - logger.info("Data imported ...") - stamp_start = min(text_stamp_start, call_stamp_start) - stamp_end = max(text_stamp_end, call_stamp_end) - - # process data - if frequency == Frequency.HOURLY_AND_DAILY: - stats_pdframe1 = comm_logs_summaries( - text_data, - call_data, - stamp_start, - stamp_end, - tz_str, - Frequency.HOURLY, - ) - stats_pdframe2 = comm_logs_summaries( - text_data, - call_data, - stamp_start, - stamp_end, - tz_str, - Frequency.DAILY, - ) - - write_all_summaries( - bid, stats_pdframe1, output_folder + "/hourly" - ) - write_all_summaries( - bid, stats_pdframe2, output_folder + "/daily" - ) - else: - stats_pdframe = comm_logs_summaries( - text_data, - call_data, - stamp_start, - stamp_end, - tz_str, - frequency, - ) - # num_uniq_individuals_call_or_text is the cardinality - # of the union of several sets. It should should always - # be at least as large as the cardinality of any one of - # the sets, and it should never be larger than the sum - # of the cardinalities of all of the sets - # (it may be equal if all the sets are disjoint) - sum_all_set_cols = pd.Series( - [0]*stats_pdframe.shape[0] - ) - for col in [ - "num_s_tel", "num_r_tel", "num_in_caller", - "num_out_caller", "num_mis_caller" - ]: - sum_all_set_cols += stats_pdframe[col] - if ( - stats_pdframe[ - "num_uniq_individuals_call_or_text" - ] < stats_pdframe[col] - ).any(): - logger.error( - "Error: " - "num_uniq_individuals_call_or_text " - "was found to be less than %s for at " - "least one time interval. This error " - "comes from an issue with the code," - " not an issue with the input data", - col - ) - if ( - stats_pdframe[ - "num_uniq_individuals_call_or_text" - ] > sum_all_set_cols - ).any(): - logger.error( - "Error: " - "num_uniq_individuals_call_or_text " - "was found to be larger than the sum " - "of individual cardinalities for at " - "least one time interval. This error " - "comes from an issue with the code," - " not an issue with the input data" - ) - - write_all_summaries(bid, stats_pdframe, output_folder) - - logger.info( - "Summary statistics obtained. Finished." 
+    # process the data for each participant in each frequency into a folder of
+    # the corresponding frequency.
+    for beiwe_id in beiwe_ids:
+        for freq in frequencies:
+            logger.info(f"({freq.name.lower()}) Participant: {beiwe_id}")
+            try:
+                log_stats_inner(
+                    beiwe_id,
+                    f"{output_folder}/{freq.name.lower()}",
+                    study_folder,
+                    freq,
+                    tz_str,
+                    time_start,
+                    time_end
+                )
+            except Exception as err:
+                logger.error(f"An error occurred when processing data: {err}")
+
+    logger.info("Summary statistics obtained. Finished.")
+
+
+def log_stats_inner(
+    beiwe_id: str,
+    output_folder: str,
+    study_folder: str,
+    frequency: Frequency,
+    tz_str: str,
+    time_start: Optional[List] = None,
+    time_end: Optional[List] = None,
+):
+    """Inner functionality of log_stats_main."""
+    # read data
+    text_data, text_stamp_start, text_stamp_end = read_data(
+        beiwe_id, study_folder, "texts", tz_str, time_start, time_end
+    )
+    call_data, call_stamp_start, call_stamp_end = read_data(
+        beiwe_id, study_folder, "calls", tz_str, time_start, time_end
+    )
+
+    # give up early if there is no data
+    if text_data.shape[0] <= 0 and call_data.shape[0] <= 0:
+        logger.info(f"There was no data for participant {beiwe_id}")
+        return
 
-        except Exception as err:
+    # stamps from call and text should be the stamp_end
+    logger.info("Data imported ...")
+    stamp_start = min(text_stamp_start, call_stamp_start)
+    stamp_end = max(text_stamp_end, call_stamp_end)
+
+    # process the data
+    stats_pdframe = comm_logs_summaries(
+        text_data, call_data, stamp_start, stamp_end, tz_str, frequency
+    )
+
+    # num_uniq_individuals_call_or_text is the cardinality of the union of
+    # several sets. It should always be at least as large as the
+    # cardinality of any one of the sets, and it should never be larger than the
+    # sum of the cardinalities of all of the sets. (it may be equal if all the
+    # sets are disjoint)
+    num_uniq_column = "num_uniq_individuals_call_or_text"  # legibility hax.
+    sum_all_set_cols = pd.Series([0]*stats_pdframe.shape[0])
+    for column in [
+        "num_s_tel", "num_r_tel", "num_in_caller",
+        "num_out_caller", "num_mis_caller"
+    ]:
+        sum_all_set_cols += stats_pdframe[column]
+        if (stats_pdframe[num_uniq_column] < stats_pdframe[column]).any():
             logger.error(
-                "An error occurred when processing the data: %s", err
+                "Error: "
+                "num_uniq_individuals_call_or_text was found to be less than "
+                "%s for at least one time interval. This error comes from an "
+                "issue with the code, not an issue with the input data." %
+                column
             )
+
+    if (stats_pdframe[num_uniq_column] > sum_all_set_cols).any():
+        logger.error(
+            "Error: "
+            "num_uniq_individuals_call_or_text was found to be larger than the "
+            "sum of individual cardinalities for at least one time interval. "
+            "This error comes from an issue with the code, not an issue with "
+            "the input data."
+        )
+
+    write_all_summaries(beiwe_id, stats_pdframe, output_folder)

From 810ef6c1f2779c46be402819fd807402b6769387 Mon Sep 17 00:00:00 2001
From: Eli Jones
Date: Wed, 15 Nov 2023 17:37:03 -0500
Subject: [PATCH 05/10] Sycamore refactor/changes to have output in canonical
 beiwe forest runner locations, and a definite big bug fix.
--- forest/sycamore/base.py | 90 +++++++++++++++++----------------------- forest/sycamore/utils.py | 3 +- 2 files changed, 40 insertions(+), 53 deletions(-) diff --git a/forest/sycamore/base.py b/forest/sycamore/base.py index 853ab2d8..a5d4dde6 100644 --- a/forest/sycamore/base.py +++ b/forest/sycamore/base.py @@ -217,12 +217,16 @@ def compute_survey_stats( def get_submits_for_tableau( - study_folder: str, output_folder: str, config_path: str, - tz_str: str = "UTC", start_date: str = EARLIEST_DATE, - end_date: Optional[str] = None, users: Optional[List] = None, - interventions_filepath: Optional[str] = None, - submits_timeframe: Frequency = Frequency.DAILY, - history_path: Optional[str] = None + study_folder: str, + output_folder: str, + config_path: str, + tz_str: str = "UTC", + start_date: str = EARLIEST_DATE, + end_date: Optional[str] = None, + users: Optional[List] = None, + interventions_filepath: Optional[str] = None, + submits_timeframe: Frequency = Frequency.DAILY, + history_path: Optional[str] = None ) -> None: """Get survey submissions per day for integration into Tableau WDC @@ -240,8 +244,7 @@ def get_submits_for_tableau( end_date: The latest survey data to read in, in YYYY-MM-DD format users: - List of users in study for which we - are generating a survey schedule + List of users in study for which we are generating a survey schedule interventions_filepath: filepath where interventions json file is. submits_timeframe: @@ -250,58 +253,43 @@ def get_submits_for_tableau( history_path: Filepath to the survey history file. If this is not included, audio survey timings cannot be estimated. """ + + if submits_timeframe == Frequency.HOURLY_AND_DAILY: + submits_timeframes = [Frequency.HOURLY, Frequency.DAILY] + else: + submits_timeframes = [submits_timeframe] os.makedirs(output_folder, exist_ok=True) + for freq in submits_timeframes: + os.makedirs(f"{output_folder}/{freq.name.lower()}", exist_ok=True) if users is None: users = get_ids(study_folder) - if end_date is None: end_date = get_month_from_today() # Read, aggregate and clean data - else: - agg_data = aggregate_surveys_config( - study_folder, config_path, tz_str, users, start_date, - end_date, augment_with_answers=True, include_audio_surveys=True - ) - - if agg_data.shape[0] == 0: - logger.error("Error: No survey data found in %s", study_folder) - return - - # Create survey submits detail and summary - ss_detail = survey_submits( - config_path, start_date, end_date, - users, agg_data, interventions_filepath, history_path - ) - - if ss_detail.shape[0] == 0: - logger.error("Error: no submission data found") - return + agg_data = aggregate_surveys_config( + study_folder, config_path, tz_str, users, start_date, + end_date, augment_with_answers=True, include_audio_surveys=True + ) - if submits_timeframe == Frequency.HOURLY_AND_DAILY: - ss_summary_h = summarize_submits( - ss_detail, Frequency.HOURLY, False - ) - ss_summary_d = summarize_submits( - ss_detail, Frequency.DAILY, False - ) + if agg_data.shape[0] == 0: + logger.error("Error: No survey data found in %s", study_folder) + return - write_data_by_user(ss_summary_d, - os.path.join(output_folder, "both", "daily"), - users) - write_data_by_user(ss_summary_h, - os.path.join(output_folder, "both", "hourly"), - users) + # Create survey submits detail and summary + ss_detail = survey_submits( + config_path, start_date, end_date, + users, agg_data, interventions_filepath, history_path + ) - elif submits_timeframe == Frequency.HOURLY: - ss_summary_h = summarize_submits( - ss_detail, 
Frequency.HOURLY, False - ) - write_data_by_user(ss_summary_h, output_folder, users) + if ss_detail.shape[0] == 0: + logger.error("Error: no submission data found") + return - elif submits_timeframe == Frequency.DAILY: - ss_summary_d = summarize_submits( - ss_detail, Frequency.DAILY, False - ) - write_data_by_user(ss_summary_d, output_folder, users) + # run once for every submits_timeframe, per-user is handled internally + for freq in submits_timeframes: + ss_summary = summarize_submits(ss_detail, freq, False) + write_data_by_user( + ss_summary, f"{output_folder}/{freq.name.lower()}", users + ) \ No newline at end of file diff --git a/forest/sycamore/utils.py b/forest/sycamore/utils.py index a4f365cd..9b69b70f 100644 --- a/forest/sycamore/utils.py +++ b/forest/sycamore/utils.py @@ -31,8 +31,7 @@ def get_month_from_today(): datetime.timedelta(31)).strftime("%Y-%m-%d") -def filename_to_timestamp(filename: str, tz_str: str = "UTC" - ) -> pd.Timestamp: +def filename_to_timestamp(filename: str, tz_str: str = "UTC") -> pd.Timestamp: """Extract a datetime from a filepath. Args: From 3db2547d380768a74b8bf410de5f08ee19c8e418 Mon Sep 17 00:00:00 2001 From: Ilya Sytchev Date: Wed, 29 Nov 2023 12:40:54 -0500 Subject: [PATCH 06/10] Avoid using the root logger --- docs/source/logging.md | 7 ------- forest/bonsai/simulate_gps_data.py | 4 ++-- forest/jasmine/data2mobmat.py | 4 ++-- forest/jasmine/mobmat2traj.py | 4 ++-- forest/jasmine/sogp_gps.py | 4 ++-- forest/jasmine/traj2stats.py | 4 ++-- forest/willow/log_stats.py | 4 ++-- 7 files changed, 12 insertions(+), 19 deletions(-) diff --git a/docs/source/logging.md b/docs/source/logging.md index 581f345b..a47d9e87 100644 --- a/docs/source/logging.md +++ b/docs/source/logging.md @@ -37,13 +37,6 @@ import logging logger = logging.getLogger(__name__) ``` -Or like this: - -``` -from logging import getLogger -logger = getLogger(__name__) -``` - ## 3. 
How to insert log messages into definitions Basic `logging` messages: diff --git a/forest/bonsai/simulate_gps_data.py b/forest/bonsai/simulate_gps_data.py index 1f0c6249..2e3b77a3 100644 --- a/forest/bonsai/simulate_gps_data.py +++ b/forest/bonsai/simulate_gps_data.py @@ -27,8 +27,8 @@ TRAVELLING_STATUS_LIST = range(11) -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger() +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) class PossibleExits(Enum): diff --git a/forest/jasmine/data2mobmat.py b/forest/jasmine/data2mobmat.py index 2d8a78a5..da282397 100644 --- a/forest/jasmine/data2mobmat.py +++ b/forest/jasmine/data2mobmat.py @@ -15,8 +15,8 @@ TOLERANCE = 1e-6 -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger() +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) def cartesian( diff --git a/forest/jasmine/mobmat2traj.py b/forest/jasmine/mobmat2traj.py index 1527a982..e4579ed8 100644 --- a/forest/jasmine/mobmat2traj.py +++ b/forest/jasmine/mobmat2traj.py @@ -13,8 +13,8 @@ from .data2mobmat import great_circle_dist, exist_knot -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger() +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) # the details of the functions are in paper [Liu and Onnela (2020)] diff --git a/forest/jasmine/sogp_gps.py b/forest/jasmine/sogp_gps.py index 86931e72..e5532e95 100644 --- a/forest/jasmine/sogp_gps.py +++ b/forest/jasmine/sogp_gps.py @@ -14,8 +14,8 @@ import numpy as np -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger() +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) def calculate_k0(x1: np.ndarray, x2: np.ndarray, pars: list) -> float: diff --git a/forest/jasmine/traj2stats.py b/forest/jasmine/traj2stats.py index da6ca15c..ca13b384 100644 --- a/forest/jasmine/traj2stats.py +++ b/forest/jasmine/traj2stats.py @@ -32,8 +32,8 @@ from forest.utils import get_ids -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger() +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) @dataclass diff --git a/forest/willow/log_stats.py b/forest/willow/log_stats.py index 6bf12ae5..3eee8a25 100644 --- a/forest/willow/log_stats.py +++ b/forest/willow/log_stats.py @@ -17,8 +17,8 @@ ) -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger() +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) def text_analysis( From de77f9b7292d5a8318349372df344737dbe6e330 Mon Sep 17 00:00:00 2001 From: Eli Jones Date: Thu, 30 Nov 2023 17:31:57 -0500 Subject: [PATCH 07/10] fixes incorrect type annotation on gps_stats_generate_summary --- forest/jasmine/traj2stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/forest/jasmine/traj2stats.py b/forest/jasmine/traj2stats.py index 08f39cf9..4cbc990a 100644 --- a/forest/jasmine/traj2stats.py +++ b/forest/jasmine/traj2stats.py @@ -1764,7 +1764,7 @@ def gps_stats_generate_summary( participant_id: str, output_folder: str, logs_folder: str, - parameters: Optional[Hyperparameters] = None, + parameters: Hyperparameters, places_of_interest: Optional[list] = None, osm_tags: Optional[List[OSMTags]] = None, ): From 985a4d18d65c037a6773829499c05b9b43f54278 Mon Sep 17 00:00:00 2001 From: Eli Jones Date: Thu, 30 Nov 2023 18:00:36 -0500 Subject: [PATCH 08/10] addresses flake8 errors. 
---
 forest/jasmine/traj2stats.py | 12 ++++++------
 forest/sycamore/base.py      |  4 ++--
 forest/willow/log_stats.py   |  6 +++---
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/forest/jasmine/traj2stats.py b/forest/jasmine/traj2stats.py
index 4cbc990a..97ee20ad 100644
--- a/forest/jasmine/traj2stats.py
+++ b/forest/jasmine/traj2stats.py
@@ -1611,8 +1611,8 @@ def gps_stats_main(
     else:
         frequencies = [frequency]
 
-    # Ensure that the correct output folder structures exist, centralize folder names.
-    # Note that frequencies
+    # Ensure that the correct output folder structures exist, centralize folder
+    # names. Note that frequencies
     trajectory_folder = f"{output_folder}/trajectory"
     logs_folder = f"{output_folder}/logs"
     os.makedirs(output_folder, exist_ok=True)
@@ -1636,7 +1636,7 @@ def gps_stats_main(
     if participant_ids is None:
         participant_ids = get_ids(study_folder)
 
-    # Create a record of processed user participant_id and starting/ending time.
+    # Create a record of processed participant_id and starting/ending time.
     # These are updated and saved to disk after each participant is processed.
     all_memory_dict_file = f"{output_folder}/all_memory_dict.pkl"
     all_bv_set_file = f"{output_folder}/all_bv_set.pkl"
@@ -1737,7 +1737,8 @@ def gps_stats_main(
             index=False
         )
 
-        # generate summary stats. (variable "frequency" is already declared in signature)
+        # generate summary stats.
+        # (variable "frequency" is already declared in signature)
         for freq in frequencies:
             gps_stats_generate_summary(
                 traj=traj,
@@ -1766,8 +1767,7 @@ def gps_stats_generate_summary(
     logs_folder: str,
     parameters: Hyperparameters,
     places_of_interest: Optional[list] = None,
-    osm_tags: Optional[List[OSMTags]] = None,
-    ):
+    osm_tags: Optional[List[OSMTags]] = None):
     """This is simply the inner functionality of gps_stats_main.
     Runs summaries code, writes to disk, saves logs if required. """
     summary_stats, logs = gps_summaries(
diff --git a/forest/sycamore/base.py b/forest/sycamore/base.py
index ff9a03b5..86649926 100644
--- a/forest/sycamore/base.py
+++ b/forest/sycamore/base.py
@@ -251,7 +251,7 @@ def get_submits_for_tableau(
         end_date:
            The latest survey data to read in, in YYYY-MM-DD format
         users:
-            List of users in study for which we are generating a survey schedule
+            List of users in study that we are generating a survey schedule for
         interventions_filepath:
            filepath where interventions json file is.
         submits_timeframe:
@@ -305,4 +305,4 @@ def get_submits_for_tableau(
         ss_summary = summarize_submits(ss_detail, freq, False)
         write_data_by_user(
             ss_summary, f"{output_folder}/{freq.name.lower()}", users
-        )
\ No newline at end of file
+        )
diff --git a/forest/willow/log_stats.py b/forest/willow/log_stats.py
index d162ba9a..12c192b8 100644
--- a/forest/willow/log_stats.py
+++ b/forest/willow/log_stats.py
@@ -509,9 +509,9 @@ def log_stats_inner(
     # num_uniq_individuals_call_or_text is the cardinality of the union of
     # several sets. It should always be at least as large as the
-    # cardinality of any one of the sets, and it should never be larger than the
-    # sum of the cardinalities of all of the sets. (it may be equal if all the
-    # sets are disjoint)
+    # cardinality of any one of the sets, and it should never be larger than
+    # the sum of the cardinalities of all of the sets. (it may be equal if all
+    # the sets are disjoint)
     num_uniq_column = "num_uniq_individuals_call_or_text"  # legibility hax.
sum_all_set_cols = pd.Series([0]*stats_pdframe.shape[0]) for column in [ From f2a8205e8eedcd44589128bc3351f7ccb8ef9376 Mon Sep 17 00:00:00 2001 From: Ilya Sytchev Date: Thu, 14 Dec 2023 14:42:13 -0500 Subject: [PATCH 09/10] Add ffmpeg for Windows builds --- .github/workflows/build.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6bbc304a..90b8374b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,12 +19,17 @@ jobs: uses: actions/setup-python@v4 with: python-version: 3.8 - - name: Install Forest dependencies + - name: Install Forest dependencies for Linux # required by librosa if: ${{ startsWith(matrix.os, 'ubuntu') }} run: | sudo apt-get update sudo apt-get install -y ffmpeg libsndfile1 + - name: Install Forest dependencies for Windows + # required by librosa + if: ${{ startsWith(matrix.os, 'windows') }} + uses: FedericoCarboni/setup-ffmpeg@v2 + id: setup-ffmpeg - name: Install Forest run: pip install -e . - name: Install dev dependencies From 1dc532b591ff3ac05a0289829c34ffe1d69c3bfe Mon Sep 17 00:00:00 2001 From: Eli Jones Date: Tue, 16 Jan 2024 12:59:06 -0500 Subject: [PATCH 10/10] should resolve remaining logger formatting issues on https://github.com/onnela-lab/forest/pull/228 --- forest/willow/log_stats.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/forest/willow/log_stats.py b/forest/willow/log_stats.py index 12c192b8..b797e981 100644 --- a/forest/willow/log_stats.py +++ b/forest/willow/log_stats.py @@ -457,7 +457,7 @@ def log_stats_main( # the corresponding frequency. for beiwe_id in beiwe_ids: for freq in frequencies: - logger.info(f"({freq.name.lower()}) Participant: {beiwe_id}") + logger.info("(%s) Participant: %s", freq.name.lower(), beiwe_id) try: log_stats_inner( beiwe_id, @@ -469,7 +469,7 @@ def log_stats_main( time_end ) except Exception as err: - logger.error(f"An error occurred when processing data: {err}") + logger.error("An error occurred when processing data: %s", err) logger.info("Summary statistics obtained. Finished.") @@ -494,7 +494,7 @@ def log_stats_inner( # give up early if there is no data if text_data.shape[0] <= 0 and call_data.shape[0] <= 0: - logger.info(f"There was no data for participant {beiwe_id}") + logger.info("There was no data for participant %s", beiwe_id) return # stamps from call and text should be the stamp_end @@ -524,7 +524,7 @@ def log_stats_inner( "Error: " "num_uniq_individuals_call_or_text was found to be less than " "%s for at least one time interval. This error comes from an " - "issue with the code, not an issue with the input data." % + "issue with the code, not an issue with the input data.", column )
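
Two conventions this series converges on are worth seeing in isolation.
First, the output-folder layout: patches 01, 04, and 05 each expand
Frequency.HOURLY_AND_DAILY into [HOURLY, DAILY] and create one output
subfolder per effective frequency, named freq.name.lower(). The sketch below
is illustrative only: this Frequency enum is a minimal stand-in for
forest.constants.Frequency, and expand_frequency / make_frequency_folders are
hypothetical helper names, not functions that exist in Forest.

    import os
    from enum import Enum
    from typing import List

    class Frequency(Enum):
        # stand-in for forest.constants.Frequency; only the members used here
        HOURLY = 1
        DAILY = 2
        HOURLY_AND_DAILY = 3

    def expand_frequency(frequency: Frequency) -> List[Frequency]:
        # HOURLY_AND_DAILY fans out to [HOURLY, DAILY]; any other value maps
        # to a one-element list, so callers can always just iterate
        if frequency == Frequency.HOURLY_AND_DAILY:
            return [Frequency.HOURLY, Frequency.DAILY]
        return [frequency]

    def make_frequency_folders(output_folder: str,
                               frequency: Frequency) -> List[str]:
        # one subfolder per effective frequency, named freq.name.lower(),
        # mirroring gps_stats_main, log_stats_main and get_submits_for_tableau
        folders = []
        for freq in expand_frequency(frequency):
            folder = f"{output_folder}/{freq.name.lower()}"
            os.makedirs(folder, exist_ok=True)
            folders.append(folder)
        return folders

    # example: make_frequency_folders("output", Frequency.HOURLY_AND_DAILY)
    # creates output/hourly and output/daily and returns both paths

Expanding the frequency once lets each summary writer loop over the list
instead of branching on HOURLY_AND_DAILY at every call site, which is the
dedent that patches 01, 04, and 05 apply.

Second, the logging convention from patches 06 and 10: a module-level logger
rather than the root logger, and lazy %-style arguments rather than
f-strings, so interpolation only happens for records that are actually
handled:

    import logging

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    # argument values here are placeholders; a real caller would pass e.g.
    # freq.name.lower() and a Beiwe ID, as log_stats_main does
    logger.info("(%s) Participant: %s", "hourly", "someParticipantId")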