diff --git a/README.md b/README.md index 1dc9d610..1a2df8f7 100644 --- a/README.md +++ b/README.md @@ -116,7 +116,7 @@ sample_gps_data = sim_gps_data(n_persons, location, start_date, end_date, cycle, gps_to_csv(sample_gps_data, path_to_synthetic_gps_data, start_date, end_date) # 2. Specify parameters for imputation -# See https://github.com/onnela-lab/forest/wiki/Jasmine-documentation#input for details +# See https://forest.beiwe.org/en/latest/jasmine.html for details # time zone where the study took place (assumes that all participants were always in this time zone) tz_str = "Etc/GMT-1" # Generate summary metrics e.g. Frequency.HOURLY, Frequency.DAILY or Frequency.HOURLY_AND_DAILY (see Frequency class in constants.py) @@ -127,12 +127,8 @@ save_traj = False parameters = None # list of locations to track if visited, leave None if don't want these summary statistics places_of_interest = ['cafe', 'bar', 'hospital'] -# True if want to save a log of all locations and attributes of those locations visited -save_osm_log = True # list of OpenStreetMap tags to use for identifying locations, leave None to default to amenity and leisure tagged locations or if you don't want to use OSM (see OSMTags class in constants.py) osm_tags = None -# threshold of time spent in a location to count as being in that location, in minutes -threshold = 15 # 3. Impute location data and generate mobility summary metrics using the simulated data above gps_stats_main( @@ -143,9 +139,7 @@ gps_stats_main( save_traj = save_traj, parameters = parameters, places_of_interest = places_of_interest, - save_osm_log = save_osm_log, - osm_tags = None, - threshold = threshold, + osm_tags = osm_tags, ) # 4. 
Generate daily summary metrics for call/text logs diff --git a/docs/source/index.md b/docs/source/index.md index 0c8f29da..25d0c46b 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -170,15 +170,15 @@ The summary statistics that are generated are listed below: - Entropy measure based on the proportion of time spent at significant locations over the course of a day - Letting p_i be the proportion of the day spent at significant location I, significant location entropy is calculated as -\sum_{i} p_i*log(p_i), where the sum occurs over all non-zero p_i for that day. * - mis_duration - - Float + - Not Available - Number of hours of GPS data missing over the course of a day - * - Physical circadian rhythm - - Not Available + - Float - A continuous measurement of routine in the interval [0,1] that scores a day with 0 if there was a complete break from routine and 1 if the person followed the exact same routine as have in every other day of follow up - For a detailed description of how this measure is calculated, see Canzian and Musolesi's 2015 paper in the Proceedings of the 2015 ACM International Joint Conference on Pervasive and Ubiquitous Computing, titled "Trajectories of depression: unobtrusive monitoring of depressive states by means of smartphone mobility traces analysis." Their procedure was followed using 30-min increments as a bin size. * - Physical circadian rhythm stratified - - Not Available + - Float - A continuous measurement of routine in the interval [0,1] that scores a day with 0 if there was a complete break from routine and 1 if the person followed the exact same routine as have in every other day of follow up - Calculated in the same way as Physical circadian rhythm, except the procedure is repeated separately for weekends and weekdays. 
``` diff --git a/docs/source/jasmine.md b/docs/source/jasmine.md index 4e3381cd..6e4f0517 100644 --- a/docs/source/jasmine.md +++ b/docs/source/jasmine.md @@ -11,7 +11,7 @@ For instructions on how to install forest, please visit [here](https://github.co ### Input -When using jasmine, you should call function `gps_stats_main(study_folder, output_folder, tz_str, frequency, save_traj, parameters = None, save_osm_log = None, osm_tags = None, threshold, split_day_night, person_point_radius = 2, place_point_radius = 7.5, time_start = None, time_end = None, participant_ids = None, all_memory_dict = None, all_BV_set = None, quality_threshold = 0.05)` in the `traj2stats` module and specify: +When using jasmine, you should call function `gps_stats_main(study_folder, output_folder, tz_str, frequency, save_traj, places_of_interest = None, osm_tags = None, time_start = None, time_end = None, participant_ids = None, parameters = None, all_memory_dict = None, all_bv_set = None)` in the `traj2stats` module and specify: - `study_folder`, string, the path of the study folder. The study folder should contain individual participant folder with a subfolder `gps` inside - `output_folder`, string, the path of the folder where you want to save results @@ -27,17 +27,13 @@ In addition, the main function takes four arguments that provide further flexibi - `tz_str`, string, the timezone where the study is/was conducted. Please use "`pytz.all_timezones`" to check all options. For example, "America/New_York". - `frequency`, Frequency class, the frequency of the summary stats (resolution for summary statistics) e.g. Frequency.HOURLY, Frequency.DAILY, etc. - `save_traj`, bool, True if you want to save the trajectories as a csv file, False if you don't (default: False). - - `parameters`, a list of parameters, by default it is set to None. The details are as below. - `places_of_interest`, a list of places of interest, by default it is set to None. 
The details are as used in openstreetmaps - - `save_osm_log`, bool, True if you want to output a log of locations visited and their tags(default: False). - `osm_tags`, list of OSMTags class, a list of tags to filter the places of interest, by default it is set to None. The details are as used in openstreetmaps. Avoid using a lot of them if large area is covered. - - `threshold`, int, time spent in a pause needs to exceed the threshold to be placed in the log - - `split_day_night`, bool, True if you want to split all metrics to datetime and nighttime patterns (only for Frequency.DAILY) - - person_point_radius, float, radius of the person's circle when discovering places near him in pauses (default: 2) - - `place_point_radius`, float, radius of place's circle when place is returned as centre coordinates from osm (default: 7.5) - - `all_memory_dict` and `all_BV_set` are dictionaries from previous run (none if it's the first time). + - `parameters`, a list of parameters, by default it is set to None. The details are as below. + - `all_memory_dict` and `all_bv_set` are dictionaries from previous run (none if it's the first time). + +You can also tweak the parameters that change the assumptions of the imputation and summary statistics. The parameters are -You can also tweak the parameters that change the assumptions of the imputation and summary statistics. The parameters are (1) `l1`: the scale parameter in the abs function in the daily kernel; (2) `l2`: the scale parameter in the abs function in the weekly kernel; (3) `l3`: the scale parameter in the geographical kernel if only latitude or longitude is used; @@ -58,7 +54,17 @@ You can also tweak the parameters that change the assumptions of the imputation (18) `accuracylim`: we filter out GPS record with accuracy higher than this threshold. 
(19) `r`: the maximum radius of a pause; (20) `w`: a threshold for distance, if the distance to the great circle is greater than this threshold, we consider there is a knot; -(21) `h`: a threshold of distance, if the movement between two timestamps is less than h, consider it as a pause and a knot +(21) `h`: a threshold of distance, if the movement between two timestamps is less than h, consider it as a pause and a knot +(22) `save_osm_log`: bool, True if you want to output a log of locations visited and their tags (default: False). +(23) `log_threshold`: int, time spent in a pause needs to exceed the threshold to be placed in the log +(24) `split_day_night`: bool, True if you want to split all metrics to datetime and nighttime patterns (only for Frequency.DAILY) +(25) `person_point_radius`: float, radius of the person's circle when discovering places near him in pauses (default: 2) +(26) `place_point_radius`: float, radius of place's circle when place is returned as centre coordinates from osm (default: 7.5) +(27) `pcr_bool`: bool, True if you want to calculate the physical circadian rhythm (default: False) +(28) `pcr_window`: int, number of days to look back and forward for calculating the physical circadian rhythm (default: 14) +(29) `pcr_sample_rate`: int, number of seconds between each sample for calculating the physical circadian rhythm (default: 30) + + ### Output (1) summary statistics for all specified participants (.csv) @@ -70,8 +76,8 @@ You can also tweak the parameters that change the assumptions of the imputation - Contains start date/time and end date/time for each participant.\ - Is useful for tracking whose data during which time range have been processed, especially for the online algorithm. -(4) all_BV_set (.pkl)\ - - It is a dictionary, with the key as user ID and the value as a numpy array with size, where each column represents [start_timestamp, start_latitude, start_longitude, end_timestamp, end_latitude, end_longitude]. 
If it is your first time run the code, it is set to NULL by default. If you want to continue your analysis from here in the future, all_BV_set is expected to be an input in your new analysis and it will be updated in that run. The size of the file should be fixed overtime. +(4) all_bv_set (.pkl)\ + - It is a dictionary, with the key as user ID and the value as a numpy array with size, where each column represents [start_timestamp, start_latitude, start_longitude, end_timestamp, end_latitude, end_longitude]. If it is your first time run the code, it is set to NULL by default. If you want to continue your analysis from here in the future, all_bv_set is expected to be an input in your new analysis and it will be updated in that run. The size of the file should be fixed overtime. (5) all_memory_dict (.pkl)\ - It is also a dictionary, with the key as user ID and the value as a numpy array of other parameters for the user. If it is your first time run the code, it is set to NULL by default. If you want to continue your analysis from here in the future, all_memory_dict is expected to be an input in your new analysis and it will be updated in that run. The size of the file should be fixed overtime. @@ -115,8 +121,7 @@ This file imputes the missing trajectories based on the observed trajectory matr `traj2stats.py` This file converts the imputed trajectory matrix to summary statistics. - -- `Hyperparameters`: @dataclass to store the hyperparameters for the imputation process. +- `Hyperparameters`: dataclass to store the hyperparameters for the imputation and summary statistics. - `transform_point_to_circle`: transform a transforms a set of cooordinates to a shapely circle with a provided radius. - `get_nearby_locations`: return a dictionary of nearby locations, a dictionary of nearby locations' names, and a dictionary of nearby locations' coordinates. - `gps_summaries`: converts the imputed trajectory matrix to summary statistics. 
@@ -147,9 +152,9 @@ The summary statistics that are generated are listed below: | Average pause duration | Float | Average of the duration of all pauses that took place over the course of a day (in hour) | We consider that a participant has a pause if the distance that he has moved during a 30-s period is less than `r` m. By default, `r`=10.| | Standard deviation of flight duration | Float | Standard deviation of the duration of all pauses that took place over the course of a day (in hour) | GPS is converted into a sequence of flights (straight line movement) and pauses (time spent stationary). The standard deviation of duration of pauses over the course of a day is reported. | | Significant location entropy | Float | Entropy measure based on the proportion of time spent at significant locations over the course of a day | Letting p_i be the proportion of the day spent at significant location I, significant location entropy is calculated as -\sum_{i} p_i*log(p_i), where the sum occurs over all non-zero p_i for that day. | -| Minutes of GPS data missing | Float | Number of minutes of GPS data missing over the course of a day | | -| Physical circadian rhythm | Not Available | A continuous measurement of routine in the interval [0,1] that scores a day with 0 if there was a complete break from routine and 1 if the person followed the exact same routine as have in every other day of follow up | For a detailed description of how this measure is calculated, see Canzian and Musolesi's 2015 paper in the Proceedings of the 2015 ACM International Joint Conference on Pervasive and Ubiquitous Computing, titled "Trajectories of depression: unobtrusive monitoring of depressive states by means of smartphone mobility traces analysis." 
Their procedure was followed using 30-min increments as a bin size.| -| Physical circadian rhythm stratified | Not Available | A continuous measurement of routine in the interval [0,1] that scores a day with 0 if there was a complete break from routine and 1 if the person followed the exact same routine as have in every other day of follow up | Calculated in the same way as Physical circadian rhythm, except the procedure is repeated separately for weekends and weekdays. | +| Minutes of GPS data missing | Not Available | Number of minutes of GPS data missing over the course of a day | | +| Physical circadian rhythm | Float | A continuous measurement of routine in the interval [0,1] that scores a day with 0 if there was a complete break from routine and 1 if the person followed the exact same routine as have in every other day of follow up | For a detailed description of how this measure is calculated, see Canzian and Musolesi's 2015 paper in the Proceedings of the 2015 ACM International Joint Conference on Pervasive and Ubiquitous Computing, titled "Trajectories of depression: unobtrusive monitoring of depressive states by means of smartphone mobility traces analysis." Their procedure was followed using 30-min increments as a bin size.| +| Physical circadian rhythm stratified | Float | A continuous measurement of routine in the interval [0,1] that scores a day with 0 if there was a complete break from routine and 1 if the person followed the exact same routine as have in every other day of follow up | Calculated in the same way as Physical circadian rhythm, except the procedure is repeated separately for weekends and weekdays. 
| ### Other technical details diff --git a/forest/jasmine/tests/test_traj2stats.py b/forest/jasmine/tests/test_traj2stats.py index 31925f13..48988f13 100644 --- a/forest/jasmine/tests/test_traj2stats.py +++ b/forest/jasmine/tests/test_traj2stats.py @@ -5,8 +5,11 @@ from shapely.geometry import Point from forest.jasmine.data2mobmat import great_circle_dist -from forest.jasmine.traj2stats import (Frequency, transform_point_to_circle, - gps_summaries) +from forest.jasmine.traj2stats import ( + Frequency, gps_summaries, Hyperparameters, transform_point_to_circle, + avg_mobility_trace_difference, create_mobility_trace, get_pause_array, + extract_pause_from_row, compute_window_and_count +) @pytest.fixture() @@ -48,7 +51,7 @@ def test_transform_point_to_circle_radius(coords1): ] distance = great_circle_dist(*coords1, *point_in_edge)[0] - assert distance >= 4 and distance <= 5 + assert 4 <= distance <= 5 @pytest.fixture() @@ -273,16 +276,16 @@ def test_gps_summaries_shape( return_value=sample_nearby_locations, ) mocker.patch("forest.jasmine.traj2stats.locate_home", return_value=coords1) + + parameters = Hyperparameters() + parameters.save_osm_log = True + summary, _ = gps_summaries( traj=sample_trajectory, tz_str="Europe/London", frequency=Frequency.HOURLY, + parameters=parameters, places_of_interest=["pub", "fast_food"], - save_osm_log=True, - threshold=None, - split_day_night=False, - person_point_radius=2, - place_point_radius=7.5, ) assert summary.shape == (24, 21) @@ -296,16 +299,16 @@ def test_gps_summaries_places_of_interest( return_value=sample_nearby_locations, ) mocker.patch("forest.jasmine.traj2stats.locate_home", return_value=coords1) + + parameters = Hyperparameters() + parameters.save_osm_log = True + summary, _ = gps_summaries( traj=sample_trajectory, tz_str="Europe/London", frequency=Frequency.HOURLY, + parameters=parameters, places_of_interest=["pub", "fast_food"], - save_osm_log=True, - threshold=None, - split_day_night=False, - person_point_radius=2, - 
place_point_radius=7.5, ) time_in_places_of_interest = ( summary["pub"] + summary["fast_food"] + summary["other"] @@ -324,16 +327,16 @@ def test_gps_summaries_obs_day_night( return_value=sample_nearby_locations, ) mocker.patch("forest.jasmine.traj2stats.locate_home", return_value=coords1) + + parameters = Hyperparameters() + parameters.save_osm_log = True + summary, _ = gps_summaries( traj=sample_trajectory, tz_str="Europe/London", frequency=Frequency.DAILY, + parameters=parameters, places_of_interest=["pub", "fast_food"], - save_osm_log=True, - threshold=None, - split_day_night=False, - person_point_radius=2, - place_point_radius=7.5, ) total_obs = summary["obs_day"] + summary["obs_night"] assert np.all(round(total_obs, 4) == round(summary["obs_duration"], 4)) @@ -348,16 +351,17 @@ def test_gps_summaries_datetime_nighttime_shape( return_value=sample_nearby_locations, ) mocker.patch("forest.jasmine.traj2stats.locate_home", return_value=coords1) + + parameters = Hyperparameters() + parameters.save_osm_log = True + parameters.split_day_night = True + summary, _ = gps_summaries( traj=sample_trajectory, tz_str="Europe/London", frequency=Frequency.DAILY, + parameters=parameters, places_of_interest=["pub", "fast_food"], - save_osm_log=True, - threshold=None, - split_day_night=True, - person_point_radius=2, - place_point_radius=7.5, ) assert summary.shape == (2, 46) @@ -373,16 +377,16 @@ def test_gps_summaries_log_format( return_value=sample_nearby_locations, ) mocker.patch("forest.jasmine.traj2stats.locate_home", return_value=coords1) + + parameters = Hyperparameters() + parameters.save_osm_log = True + summary, log = gps_summaries( traj=sample_trajectory, tz_str="Europe/London", frequency=Frequency.DAILY, + parameters=parameters, places_of_interest=["pub", "fast_food"], - save_osm_log=True, - threshold=None, - split_day_night=False, - person_point_radius=2, - place_point_radius=7.5, ) dates_stats = ( summary["day"].astype(int).astype(str) @@ -393,3 +397,213 @@ def 
test_gps_summaries_log_format( ) dates_log = np.array(list(log.keys())) assert np.all(dates_stats == dates_log) + + +@pytest.fixture() +def mobmat1(): + """mobility matrix 1""" + return np.array( + [ + [16.49835, -142.72462, 1], + [16.49521, -142.72461, 2], + [51.45435654, -2.58555554, 3], + [51.45435621, -2.58555524, 4], + [51.45435632, -2.58555544, 5] + ] + ) + + +@pytest.fixture() +def mobmat2(): + """mobility matrix 2""" + return np.array( + [ + [51.45435654, -2.58555554, 1], + [51.45435654, -2.58555554, 2], + [51.45435654, -2.58555554, 3], + [51.45435654, -2.58555554, 4], + [51.45435654, -2.58555554, 5] + ] + ) + + +@pytest.fixture() +def mobmat3(): + """mobility matrix 3""" + return np.array( + [ + [51.45435654, -2.58555554, 7], + [51.45435654, -2.58555554, 8], + [51.45435654, -2.58555554, 9], + [51.45435654, -2.58555554, 10], + [51.45435654, -2.58555554, 11] + ] + ) + + +def test_avg_mobility_trace_difference_common_timestamps( + mobmat1, mobmat2 +): + """Testing avg mobility trace difference + when there are common timestamps and all points are close + """ + + time_range = (3, 5) + res = avg_mobility_trace_difference( + time_range, mobmat1, mobmat2 + ) + + assert res == 1 + + +def test_avg_mobility_trace_difference_common_timestamps2( + mobmat1, mobmat2 +): + """Testing avg mobility trace difference + when there are common timestamps and some points are close + """ + + time_range = (1, 5) + res = avg_mobility_trace_difference( + time_range, mobmat1, mobmat2 + ) + + assert res == 0.6 + + +def test_avg_mobility_trace_difference_no_common_timestamps( + mobmat1, mobmat3 +): + """Testing avg mobility trace difference + when there are no common timestamps + """ + + time_range = (1, 5) + res = avg_mobility_trace_difference( + time_range, mobmat1, mobmat3 + ) + + assert res == 0 + + +def test_create_mobility_trace_shape(sample_trajectory): + """Testing shape of mobility trace""" + + res = create_mobility_trace(sample_trajectory) + + assert res.shape == (81200, 3) + 
+ +def test_create_mobility_trace_start_end_times(sample_trajectory): + """Testing start and end times of mobility trace""" + + res = create_mobility_trace(sample_trajectory) + + assert res[0, 2] == 1633042800.0 + assert res[-1, 2] == 1633129499.0 + + +def test_get_pause_array_shape(sample_trajectory, coords2): + """Testing shape of pause array""" + + parameters = Hyperparameters() + + pause_array = get_pause_array( + sample_trajectory[sample_trajectory[:, 0] == 2, :], + *coords2, + parameters + ) + + assert pause_array.shape == (3, 3) + + +def test_get_pause_array_times(sample_trajectory, coords2): + """Testing times spent in places of pause array""" + + parameters = Hyperparameters() + + pause_array = get_pause_array( + sample_trajectory[sample_trajectory[:, 0] == 2, :], + *coords2, + parameters + ) + + assert pause_array[0, 2] == 1113.3333333333333 + assert pause_array[-1, 2] == 180 + + +def test_get_pause_array_house(sample_trajectory): + """Testing case where house is in pause array""" + + house_coords = (51.45435654, -2.58555554) + parameters = Hyperparameters() + + pause_array = get_pause_array( + sample_trajectory[sample_trajectory[:, 0] == 2, :], + *house_coords, + parameters + ) + + assert pause_array.shape == (2, 3) + + +def test_extract_pause_from_row_shape(sample_trajectory): + """Testing shape of pause array""" + + pause_list = extract_pause_from_row( + sample_trajectory[0, :] + ) + + assert len(pause_list) == 3 + + +def test_extract_pause_from_row_time(sample_trajectory): + """Testing pause time of row""" + + pause_list = extract_pause_from_row( + sample_trajectory[0, :] + ) + + true_val = sample_trajectory[0, 6] - sample_trajectory[0, 3] + + assert pause_list[2] == true_val / 60 + + +def test_compute_window_size(sample_trajectory): + """Testing window size is correct""" + + window, _ = compute_window_and_count( + sample_trajectory[0, 3], sample_trajectory[-1, 6], 1 + ) + + assert window == 3600 + + +def test_compute_window_count(sample_trajectory): 
+ """Testing number of windows is correct""" + + _, num_windows = compute_window_and_count( + sample_trajectory[0, 3], sample_trajectory[-1, 6], 1 + ) + + assert num_windows == 24 + + +def test_compute_window_size_6_hour(sample_trajectory): + """Testing window size is correct 6 hour window""" + + window, _ = compute_window_and_count( + sample_trajectory[0, 3], sample_trajectory[-1, 6], 6 + ) + + assert window == 3600 * 6 + + +def test_compute_window_count_6_hour(sample_trajectory): + """Testing number of windows is correct 6 hour window""" + + _, num_windows = compute_window_and_count( + sample_trajectory[0, 3], sample_trajectory[-1, 6], 6 + ) + + assert num_windows == 4 diff --git a/forest/jasmine/traj2stats.py b/forest/jasmine/traj2stats.py index 67c0b70c..ed2d8a92 100644 --- a/forest/jasmine/traj2stats.py +++ b/forest/jasmine/traj2stats.py @@ -3,6 +3,7 @@ """ from dataclasses import dataclass +from datetime import datetime import json import logging import os @@ -37,7 +38,8 @@ @dataclass class Hyperparameters: - """Class containing hyperparemeters for imputation of trajectories. + """Class containing hyperparemeters for gps imputation and trajectory + summary statistics calculation. Args: itrvl, accuracylim, r, w, h: hyperparameters for the @@ -48,7 +50,28 @@ class Hyperparameters: l1, l2, a1, a2, b1, b2, b3, g, method, switch, num, linearity: hyperparameters for the impute_gps function. itrvl, r, w, h: hyperparameters for the imp_to_traj function. 
+ log_threshold: int, time spent in a pause needs to exceed the + log_threshold to be placed in the log + only if save_osm_log True, in minutes + split_day_night: bool, True if you want to split all metrics to + datetime and nighttime patterns + only for daily frequency + person_point_radius: float, radius of the person's circle when + discovering places near him in pauses + place_point_radius: float, radius of place's circle + when place is returned as centre coordinates from osm + save_osm_log: bool, True if you want to output a log of locations + visited and their tags + quality_threshold: float, a percentage value of the fraction of data + required for a summary to be created + pcr_bool: bool, True if you want to calculate the physical + circadian rhythm + pcr_window: int, number of days to look back and forward + for calculating the physical circadian rhythm + pcr_sample_rate: int, number of seconds between each sample + for calculating the physical circadian rhythm """ + # imputation hyperparameters l1: int = 60 * 60 * 24 * 10 l2: int = 60 * 60 * 24 * 30 l3: float = 0.002 @@ -71,6 +94,17 @@ class Hyperparameters: w: Optional[float] = None h: Optional[float] = None + # summary statistics hyperparameters + save_osm_log: bool = False + log_threshold: int = 60 + split_day_night: bool = False + person_point_radius: float = 2 + place_point_radius: float = 7.5 + quality_threshold: float = 0.05 + pcr_bool: bool = False + pcr_window: int = 14 + pcr_sample_rate: int = 30 + def transform_point_to_circle(lat: float, lon: float, radius: float ) -> Polygon: @@ -116,9 +150,10 @@ def get_nearby_locations( types of nearby locations supported by Overpass API defaults to [OSMTags.AMENITY, OSMTags.LEISURE] Returns: - ids: dictionary, contains nearby locations' ids - locations: dictionary, contains nearby locations' coordinates - tags: dictionary, contains nearby locations' tags + A tuple of: + dictionary, contains nearby locations' ids + dictionary, contains nearby locations' 
coordinates + dictionary, contains nearby locations' tags Raises: RuntimeError: if the query to Overpass API fails """ @@ -223,17 +258,835 @@ def get_nearby_locations( return ids, locations, tags +def avg_mobility_trace_difference( + time_range: Tuple[int, int], mobility_trace1: np.ndarray, + mobility_trace2: np.ndarray +) -> float: + """This function calculates the average mobility trace difference + + Args: + time_range: tuple of two ints, time range of mobility_trace + mobility_trace1: numpy array, mobility trace 1 + contains 3 columns: [x, y, t] + mobility_trace2: numpy array, mobility trace 2 + contains 3 columns: [x, y, t] + Returns: + float, average mobility trace difference + Raises: + ValueError: if the calculation fails + """ + + # Create masks for timestamps that lie within the specified time range + mask1 = ( + (mobility_trace1[:, 2] >= time_range[0]) + & (mobility_trace1[:, 2] <= time_range[1]) + ) + mask2 = ( + (mobility_trace2[:, 2] >= time_range[0]) + & (mobility_trace2[:, 2] <= time_range[1]) + ) + + # Create a set of common timestamps for efficient lookup + common_times = ( + set(mobility_trace1[mask1, 2]) & set(mobility_trace2[mask2, 2]) + ) + + # Create masks for the common timestamps + mask1_common = np.isin(mobility_trace1[:, 2], list(common_times)) + mask2_common = np.isin(mobility_trace2[:, 2], list(common_times)) + + if not any(mask1_common) or not any(mask2_common): + return 0 + + # Calculate distances using the common timestamp masks + dists = great_circle_dist( + mobility_trace1[mask1_common, 0], mobility_trace1[mask1_common, 1], + mobility_trace2[mask2_common, 0], mobility_trace2[mask2_common, 1] + ) + + dist_flag = dists <= 10 + res = np.mean(dist_flag) + if np.isnan(res): + raise ValueError("PCR calculation failed") + + return float(res) + + +def routine_index( + time_range: Tuple[int, int], mobility_trace: np.ndarray, + pcr_window: int = 14, pcr_sample_rate: int = 30, + stratified: bool = False, timezone: str = "US/Eastern", +) -> 
float: + """This function calculates the routine index of a trajectory + + Description of routine index can be found in the paper: + Canzian and Musolesi's 2015 paper in the Proceedings of the 2015 + ACM International Joint Conference on Pervasive and Ubiquitous Computing, + titled “Trajectories of depression: unobtrusive monitoring of depressive + states by means of smartphone mobility traces analysis.” + + Args: + time_range: tuple of two ints, time range of mobility_trace + mobility_trace: numpy array, trajectory + contains 3 columns: [x, y, t] + pcr_window: int, number of days to look back and forward + for calculating the physical circadian rhythm + pcr_sample_rate: int, number of seconds between each sample + for calculating the physical circadian rhythm + stratified: bool, True if you want to calculate the routine index + for weekdays and weekends separately + timezone: str, timezone of the mobility trace + Returns: + float, routine index + """ + + t_1, t_2 = time_range + + t_init = mobility_trace[:, 2].min() + t_fin = mobility_trace[:, 2].max() + + t_1 = max(t_1, t_init) + t_2 = min(t_2, t_fin) + + # n1, n2 are the number of days before and after the time range + n1 = int(round((t_1 - t_init) / (24 * 60 * 60))) + n2 = int(round((t_fin - t_2) / (24 * 60 * 60))) + + # to avoid long computational times + # only look at the last window days and next window days + n1 = min(n1, pcr_window) + n2 = min(n2, pcr_window) + + if max(n1, n2) == 0: + return 0 + + shifts = list(range(1, n1 + 1)) + list(range(-n2, 0)) + if stratified: + time_mid = int((t_1 + t_2) / 2) + weekend_today = datetime( + *stamp2datetime(time_mid, timezone) + ).weekday() >= 5 + if weekend_today: + shifts = [ + s for s in shifts + if datetime( + *stamp2datetime( + time_mid - s * 24 * 60 * 60, timezone + ) + ).weekday() >= 5 + ] + else: + shifts = [ + s for s in shifts + if datetime( + *stamp2datetime( + time_mid - s * 24 * 60 * 60, timezone + ) + ).weekday() < 5 + ] + + res = sum( + 
avg_mobility_trace_difference( + time_range, mobility_trace[::pcr_sample_rate], + np.column_stack( + [ + mobility_trace[:, :2], + mobility_trace[:, 2] + i * 24 * 60 * 60 + ] + ) + ) + for i in shifts + ) + + return res / (n1 + n2) + + +def create_mobility_trace(traj: np.ndarray) -> np.ndarray: + """This function creates a mobility trace from a trajectory + + Args: + traj: numpy array, trajectory + contains 8 columns: [s,x0,y0,t0,x1,y1,t1,obs] + Returns: + numpy array, mobility trace + contains 3 columns: [x, y, t] + """ + + pause_vec = traj[traj[:, 0] == 2] + + # Calculate the time ranges for all pauses + start_times = pause_vec[:, 3].astype(int) + end_times = pause_vec[:, 6].astype(int) + time_ranges = [np.arange(s, e) for s, e in zip(start_times, end_times)] + + # Flatten time_ranges and get the corresponding locations + flat_time_ranges = np.concatenate(time_ranges) + repeats = [len(r) for r in time_ranges] + locs = np.repeat(pause_vec[:, 1:3], repeats, axis=0) + + # Stack locations and time_ranges to get the mobility trace + mobility_trace = np.column_stack([locs, flat_time_ranges]) + + # check if duplicate timestamps exist + _, unique_indices = np.unique(mobility_trace[:, 2], return_index=True) + + return mobility_trace[unique_indices] + + +def get_day_night_indices( + traj: np.ndarray, tz_str: str, index: int, start_time: int, end_time: int, + current_time_list: List[int] +) -> Tuple[np.ndarray, int, int, int, int]: + """This function returns the indices of the rows in the trajectory + if the trajectory is split into day and night. 
+ + Args: + traj: numpy array, trajectory + contains 8 columns: [s,x0,y0,t0,x1,y1,t1,obs] + tz_str: str, timezone + index: int, index of the window + start_time: int, starting time of the window + end_time: int, ending time of the window + current_time_list: list of int, current time + Returns: + A tuple of: + numpy array, indices of the rows in the trajectory + if the trajectory is split into day and night + int, index of the row in the trajectory + where the first part of the trajectory ends + int, index of the row in the trajectory + where the second part of the trajectory starts + int, starting time of the second part of the trajectory + int, ending time of the second part of the trajectory + """ + + current_time_list2 = current_time_list.copy() + current_time_list3 = current_time_list.copy() + current_time_list2[3] = 8 + current_time_list3[3] = 20 + start_time2 = datetime2stamp(current_time_list2, tz_str) + end_time2 = datetime2stamp(current_time_list3, tz_str) + if index % 2 == 0: + # daytime + index_rows = (traj[:, 3] <= end_time2) * (traj[:, 6] >= start_time2) + + return index_rows, 0, 0, start_time2, end_time2 + + # nighttime + index1 = ( + (traj[:, 6] < start_time2) + * (traj[:, 3] < end_time) + * (traj[:, 6] > start_time) + ) + index2 = ( + (traj[:, 3] > end_time2) + * (traj[:, 3] < end_time) + * (traj[:, 6] > start_time) + ) + stop1 = sum(index1) - 1 + stop2 = sum(index1) + index_rows = index1 + index2 + + return index_rows, stop1, stop2, start_time2, end_time2 + + +def smooth_temp_ends( + temp: np.ndarray, index_rows: np.ndarray, t0_temp: float, + t1_temp: float, parameters: Hyperparameters, i: int, start_time: int, + end_time2: int, start_time2: int, end_time: int, stop1: int, stop2: int +) -> np.ndarray: + """This function smooths the starting and ending points of the + trajectory. 
+ + Args: + temp: numpy array, trajectory + contains 8 columns: [s,x0,y0,t0,x1,y1,t1,obs] + index_rows: numpy array, indices of the rows in the trajectory + if the trajectory is split into day and night + t0_temp: float, starting time of the trajectory + t1_temp: float, ending time of the trajectory + parameters: Hyperparameters, hyperparameters in functions + recommend to set it to default + i: int, index of the window + start_time: int, starting time of the window + end_time2: int, ending time of the second part of the trajectory + start_time2: int, starting time of the second part of the trajectory + end_time: int, ending time of the window + stop1: int, index of the row in the trajectory + where the first part of the trajectory ends + stop2: int, index of the row in the trajectory + where the second part of the trajectory starts + Returns: + temp: numpy array, trajectory + contains 8 columns: [s,x0,y0,t0,x1,y1,t1,obs] + """ + if sum(index_rows) == 1: + p0 = (t0_temp - temp[0, 3]) / (temp[0, 6] - temp[0, 3]) + p1 = (t1_temp - temp[0, 3]) / (temp[0, 6] - temp[0, 3]) + x0, y0 = temp[0, [1, 2]] + x1, y1 = temp[0, [4, 5]] + temp[0, 1] = (1 - p0) * x0 + p0 * x1 + temp[0, 2] = (1 - p0) * y0 + p0 * y1 + temp[0, 3] = t0_temp + temp[0, 4] = (1 - p1) * x0 + p1 * x1 + temp[0, 5] = (1 - p1) * y0 + p1 * y1 + temp[0, 6] = t1_temp + else: + if parameters.split_day_night and i % 2 != 0: + t0_temp_l = [start_time, end_time2] + t1_temp_l = [start_time2, end_time] + start_temp = [0, stop2] + end_temp = [stop1, -1] + for j in range(2): + p0 = (temp[start_temp[j], 6] - t0_temp_l[j]) / ( + temp[start_temp[j], 6] - temp[start_temp[j], 3] + ) + p1 = (t1_temp_l[j] - temp[end_temp[j], 3]) / ( + temp[end_temp[j], 6] - temp[end_temp[j], 3] + ) + temp[start_temp[j], 1] = (1 - p0) * temp[ + start_temp[j], 4 + ] + p0 * temp[start_temp[j], 1] + temp[start_temp[j], 2] = (1 - p0) * temp[ + start_temp[j], 5 + ] + p0 * temp[start_temp[j], 2] + temp[start_temp[j], 3] = t0_temp_l[j] + 
temp[end_temp[j], 4] = (1 - p1) * temp[ + end_temp[j], 1 + ] + p1 * temp[end_temp[j], 4] + temp[end_temp[j], 5] = (1 - p1) * temp[ + end_temp[j], 2 + ] + p1 * temp[end_temp[j], 5] + temp[end_temp[j], 6] = t1_temp_l[j] + else: + p0 = (temp[0, 6] - t0_temp) / (temp[0, 6] - temp[0, 3]) + p1 = ( + (t1_temp - temp[-1, 3]) + / (temp[-1, 6] - temp[-1, 3]) + ) + temp[0, 1] = (1 - p0) * temp[0, 4] + p0 * temp[0, 1] + temp[0, 2] = (1 - p0) * temp[0, 5] + p0 * temp[0, 2] + temp[0, 3] = t0_temp + temp[-1, 4] = (1 - p1) * temp[-1, 1] + p1 * temp[-1, 4] + temp[-1, 5] = (1 - p1) * temp[-1, 2] + p1 * temp[-1, 5] + temp[-1, 6] = t1_temp + + return temp + + +def get_pause_array(pause_vec: np.ndarray, home_lat: float, home_lon: float, + parameters: Hyperparameters) -> np.ndarray: + """This function returns a numpy array of pauses. + + Args: + pause_vec: numpy array, contains 8 columns: [s,x0,y0,t0,x1,y1,t1,obs] + home_lat: float, latitude of the home + home_lon: float, longitude of the home + parameters: Hyperparameters, hyperparameters in functions + Returns: + pause_array: numpy array, contains 3 columns: [x, y, t] + """ + pause_array: np.ndarray = np.array([]) + for row in pause_vec: + if ( + great_circle_dist(row[1], row[2], home_lat, home_lon)[0] + > 2*parameters.place_point_radius + ): + if len(pause_array) == 0: + pause_array = np.array( + [extract_pause_from_row(row)] + ) + elif ( + np.min( + great_circle_dist( + row[1], row[2], + pause_array[:, 0], pause_array[:, 1], + ) + ) + > 2*parameters.place_point_radius + ): + pause_array = np.append( + pause_array, + [extract_pause_from_row(row)], + axis=0, + ) + else: + pause_array[ + np.argmin( + great_circle_dist( + row[1], row[2], + pause_array[:, 0], pause_array[:, 1], + ) + ), + -1, + ] += (row[6] - row[3]) / 60 + + return pause_array + + +def extract_pause_from_row(row: np.ndarray) -> list: + """This function extracts the pause from a row in a trajectory. 
+ + Args: + row: numpy array, contains 8 columns: [s,x0,y0,t0,x1,y1,t1,obs] + Returns: + list, pause + """ + return [row[1], row[2], (row[6] - row[3]) / 60] + + +def get_polygon(saved_polygons: dict, lat: float, lon: float, label: str, + radius: float) -> Tuple[Polygon, dict]: + """This function returns a saved polygon if it exists, + otherwise it computes a polygon and saves it. + + Args: + saved_polygons: dict, contains saved polygons + lat: float, latitude of the center of the circle + lon: float, longitude of the center of the circle + label: str, label of the location + radius: float, radius of the circle + Returns: + A tuple with the following elements: + shapely polygon + dict, contains saved polygons + """ + loc_str = f"{lat}, {lon} - {label}" + if loc_str in saved_polygons.keys(): + return saved_polygons[loc_str], saved_polygons + + circle = transform_point_to_circle(lat, lon, radius) + saved_polygons[loc_str] = circle + return circle, saved_polygons + + +def intersect_with_places_of_interest( + pause: list, places_of_interest: list, saved_polygons: dict, + parameters: Hyperparameters, ids: dict, locations: dict, + ids_keys_list: list +) -> Tuple[list, bool]: + """This function computes the intersection between a pause and + places of interest. 
+ + Args: + pause: list, pause + places_of_interest: list of str, places of interest + saved_polygons: dict, contains saved polygons + parameters: Hyperparameters, hyperparameters in functions + ids: dict, contains nearby locations' ids + locations: dict, contains nearby locations' coordinates + ids_keys_list: list of str, keys of ids + Returns: + A tuple with the following elements: + list of float, intersection between a pause and + places of interest + bool, True if the pause is not intersected with + any place of interest + """ + all_place_probs = [0] * len(places_of_interest) + pause_circle, saved_polygons = get_polygon( + saved_polygons, pause[0], pause[1], "person", + parameters.person_point_radius + ) + add_to_other = True + for j, place in enumerate(places_of_interest): + if place not in ids_keys_list: + continue + for element_id in ids[place]: + intersection_area = 0 + + if len(locations[element_id]) == 1: + loc_lat, loc_lon = locations[element_id][0] + + loc_circle, saved_polygons = get_polygon( + saved_polygons, loc_lat, loc_lon, "place", + parameters.place_point_radius + ) + + intersection_area = pause_circle.intersection( + loc_circle + ).area + elif len(locations[element_id]) >= 3: + polygon = Polygon(locations[element_id]) + + intersection_area = pause_circle.intersection( + polygon + ).area + + if intersection_area > 0: + all_place_probs[j] += intersection_area + add_to_other = False + + return all_place_probs, add_to_other + + +def compute_flight_pause_stats( + flight_d_vec: np.ndarray, flight_t_vec: np.ndarray, + pause_t_vec: np.ndarray, +) -> list: + """This function computes the flight and pause statistics.
+ + Args: + flight_d_vec: numpy array, contains flight distances + flight_t_vec: numpy array, contains flight durations + pause_t_vec: numpy array, contains pause durations + Returns: + list with the following elements: + av_f_len: float, average flight length + sd_f_len: float, standard deviation of flight length + av_f_dur: float, average flight duration + sd_f_dur: float, standard deviation of flight duration + av_p_dur: float, average pause duration + sd_p_dur: float, standard deviation of pause duration + """ + if len(flight_d_vec) > 0: + av_f_len = np.mean(flight_d_vec) + sd_f_len = np.std(flight_d_vec) + av_f_dur = np.mean(flight_t_vec) + sd_f_dur = np.std(flight_t_vec) + else: + av_f_len = 0 + sd_f_len = 0 + av_f_dur = 0 + sd_f_dur = 0 + + if len(pause_t_vec) > 0: + av_p_dur = np.mean(pause_t_vec) + sd_p_dur = np.std(pause_t_vec) + else: + av_p_dur = 0 + sd_p_dur = 0 + + return [av_f_len, sd_f_len, av_f_dur, sd_f_dur, av_p_dur, sd_p_dur] + + +def final_hourly_prep( + obs_dur: float, time_at_home: float, dist_traveled: float, + max_dist_home: float, total_flight_time: float, total_pause_time: float, + flight_pause_stats: list, all_place_times: list, + all_place_times_adjusted: list, summary_stats: list, log_tags: dict, + log_tags_temp: list, datetime_list: List[int], + places_of_interest: Optional[List[str]] +) -> Tuple[list, dict]: + """This function prepares the final hourly summary statistics. 
+ + Args: + obs_dur: float, observed duration + time_at_home: float, time at home + dist_traveled: float, distance traveled + max_dist_home: float, maximum distance from home + total_flight_time: float, total flight time + total_pause_time: float, total pause time + flight_pause_stats: list, flight and pause statistics + all_place_times: list of float, time spent at places of interest + all_place_times_adjusted: list of float, adjusted time spent at + places of interest + summary_stats: list, summary statistics + log_tags: dict, contains log of tags of all locations visited + from openstreetmap + log_tags_temp: list, log of tags of all locations visited + from openstreetmap + datetime_list: list of int, current time + places_of_interest: list of str, places of interest + Returns: + A tuple of: + a list, summary statistics + a dict, contains log of tags of all locations visited + from openstreetmap + """ + + year, month, day, hour = datetime_list[:4] + ( + av_f_len, sd_f_len, av_f_dur, sd_f_dur, av_p_dur, sd_p_dur + ) = flight_pause_stats + + if obs_dur == 0: + res = [ + year, + month, + day, + hour, + 0, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + ] + if places_of_interest is not None: + for place_int in range(2 * len(places_of_interest) + 1): + res.append(pd.NA) + summary_stats.append(res) + log_tags[f"{day}/{month}/{year} {hour}:00"] = [] + else: + res = [ + year, + month, + day, + hour, + obs_dur / 60, + time_at_home / 60, + dist_traveled / 1000, + max_dist_home / 1000, + total_flight_time / 60, + av_f_len, + sd_f_len, + av_f_dur / 60, + sd_f_dur / 60, + total_pause_time / 60, + av_p_dur / 60, + sd_p_dur / 60, + ] + if places_of_interest is not None: + res += all_place_times + res += all_place_times_adjusted + log_tags[f"{day}/{month}/{year} {hour}:00"] = log_tags_temp + + summary_stats.append(res) + + return summary_stats, log_tags + + +def final_daily_prep( + obs_dur: float, obs_day: float, obs_night: 
float, time_at_home: float, + dist_traveled: float, max_dist_home: float, radius: float, + diameter: float, num_sig: int, entropy: float, total_flight_time: float, + total_pause_time: float, flight_pause_stats: list, + all_place_times: list, all_place_times_adjusted: list, + summary_stats: list, log_tags: dict, log_tags_temp: list, + datetime_list: List[int], places_of_interest: Optional[List[str]], + parameters: Hyperparameters, pcr: float, pcr_stratified: float, i: int +) -> Tuple[list, dict]: + """This function prepares the final daily summary statistics. + + Args: + obs_dur: float, observed duration + obs_day: float, observed duration during the day + obs_night: float, observed duration during the night + time_at_home: float, time at home + dist_traveled: float, distance traveled + max_dist_home: float, maximum distance from home + radius: float, radius of gyration + diameter: float, diameter of gyration + num_sig: int, number of significant places + entropy: float, entropy of the trajectory + total_flight_time: float, total flight time + total_pause_time: float, total pause time + flight_pause_stats: list, flight and pause statistics + all_place_times: list of float, time spent at places of interest + all_place_times_adjusted: list of float, adjusted time spent at + places of interest + summary_stats: list, summary statistics + log_tags: dict, contains log of tags of all locations visited + from openstreetmap + log_tags_temp: list, log of tags of all locations visited + from openstreetmap + datetime_list: list of int, current time + places_of_interest: list of str, places of interest + parameters: Hyperparameters, hyperparameters in functions + pcr: float, physical circadian rhythm + pcr_stratified: float, physical circadian rhythm stratified + i: int, index of the window + Returns: + A tuple of: + a list, summary statistics + a dict, contains log of tags of all locations visited + from openstreetmap + """ + + year, month, day = datetime_list[:3] + ( + 
av_f_len, sd_f_len, av_f_dur, sd_f_dur, av_p_dur, sd_p_dur + ) = flight_pause_stats + + if obs_dur == 0: + res = [ + year, + month, + day, + 0, + 0, + 0, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + ] + if parameters.pcr_bool: + res += [pcr, pcr_stratified] + if places_of_interest is not None: + for place_int in range(2 * len(places_of_interest) + 1): + res.append(pd.NA) + summary_stats.append(res) + log_tags[f"{day}/{month}/{year}"] = [] + else: + res = [ + year, + month, + day, + obs_dur / 3600, + obs_day / 3600, + obs_night / 3600, + time_at_home / 3600, + dist_traveled / 1000, + max_dist_home / 1000, + radius / 1000, + diameter / 1000, + num_sig, + entropy, + total_flight_time / 3600, + av_f_len / 1000, + sd_f_len / 1000, + av_f_dur / 3600, + sd_f_dur / 3600, + total_pause_time / 3600, + av_p_dur / 3600, + sd_p_dur / 3600, + ] + if parameters.pcr_bool: + res += [pcr, pcr_stratified] + if places_of_interest is not None: + res += all_place_times + res += all_place_times_adjusted + summary_stats.append(res) + if parameters.split_day_night: + if i % 2 == 0: + time_cat = "daytime" + else: + time_cat = "nighttime" + log_tags[f"{day}/{month}/{year}, {time_cat}"] = ( + log_tags_temp + ) + else: + log_tags[f"{day}/{month}/{year}"] = log_tags_temp + + return summary_stats, log_tags + + +def format_summary_stats( + summary_stats: list, log_tags: dict, frequency: Frequency, + parameters: Hyperparameters, places_of_interest: Optional[List[str]] +) -> Tuple[pd.DataFrame, dict]: + """This function formats the summary statistics. 
+ + Args: + summary_stats: list, summary statistics + log_tags: dict, contains log of tags of all locations visited + from openstreetmap + frequency: Frequency, the time windows of the summary statistics + parameters: Hyperparameters, hyperparameters in functions + recommend to set it to default + places_of_interest: list of str, places of interest + Returns: + A tuple of: + a pd dataframe, summary statistics + a dict, contains log of tags of all locations visited + from openstreetmap + """ + + summary_stats_df = pd.DataFrame(summary_stats) + + if places_of_interest is None: + places_of_interest2 = [] + places_of_interest3 = [] + else: + places_of_interest2 = places_of_interest.copy() + places_of_interest2.append("other") + places_of_interest3 = [f"{pl}_adjusted" for pl in places_of_interest] + + if parameters.pcr_bool: + pcr_cols = [ + "physical_circadian_rhythm", + "physical_circadian_rhythm_stratified", + ] + else: + pcr_cols = [] + + if frequency != Frequency.DAILY: + summary_stats_df.columns = ( + [ + "year", + "month", + "day", + "hour", + "obs_duration", + "home_time", + "dist_traveled", + "max_dist_home", + "total_flight_time", + "av_flight_length", + "sd_flight_length", + "av_flight_duration", + "sd_flight_duration", + "total_pause_time", + "av_pause_duration", + "sd_pause_duration", + ] + + places_of_interest2 + + places_of_interest3 + ) + else: + summary_stats_df.columns = ( + [ + "year", + "month", + "day", + "obs_duration", + "obs_day", + "obs_night", + "home_time", + "dist_traveled", + "max_dist_home", + "radius", + "diameter", + "num_sig_places", + "entropy", + "total_flight_time", + "av_flight_length", + "sd_flight_length", + "av_flight_duration", + "sd_flight_duration", + "total_pause_time", + "av_pause_duration", + "sd_pause_duration", + ] + + pcr_cols + + places_of_interest2 + + places_of_interest3 + ) + + if parameters.split_day_night: + summary_stats_df2 = split_day_night_cols(summary_stats_df) + else: + summary_stats_df2 = summary_stats_df + + 
return summary_stats_df2, log_tags + + def gps_summaries( traj: np.ndarray, tz_str: str, frequency: Frequency, + parameters: Hyperparameters, places_of_interest: Optional[List[str]] = None, - save_osm_log: bool = False, osm_tags: Optional[List[OSMTags]] = None, - threshold: Optional[int] = None, - split_day_night: bool = False, - person_point_radius: float = 2, - place_point_radius: float = 7.5, ) -> Tuple[pd.DataFrame, dict]: """This function derives summary statistics from the imputed trajectories @@ -242,7 +1095,8 @@ def gps_summaries( "max_dist_home", "dist_traveled","av_flight_length","sd_flight_length", "av_flight_duration","sd_flight_duration"] if the frequency is daily, it additionally returns - ["obs_day","obs_night","radius","diameter","num_sig_places","entropy"] + ["obs_day","obs_night","radius","diameter","num_sig_places","entropy", + "physical_circadian_rhythm","physical_circadian_rhythm_stratified"] Args: traj: 2d array, output from imp_to_traj(), which is a n by 8 mat, @@ -253,27 +1107,18 @@ def gps_summaries( obs (1 as observed and 0 as imputed) tz_str: timezone frequency: Frequency, the time windows of the summary statistics + parameters: Hyperparameters, hyperparameters in functions + recommend to set it to default places_of_interest: list of "osm_tags" places to watch, keywords as used in openstreetmaps e.g.
["cafe", "hospital", "restaurant"] - save_osm_log: bool, True if you want to output a log of locations - visited and their tags osm_tags: list of tags to search for in openstreetmaps avoid using a lot of them if large area is covered - threshold: int, time spent in a pause needs to exceed the threshold - to be placed in the log - only if save_osm_log True, in minutes - split_day_night: bool, True if you want to split all metrics to - daytime and nighttime patterns - only for daily frequency - person_point_radius: float, radius of the person's circle when - discovering places near him in pauses - place_point_radius: float, radius of place's circle - when place is returned as centre coordinates from osm Returns: - a pd dataframe, with each row as an hour/day, + A tuple of: + a pd dataframe, with each row as an hour/day, and each col as a feature/stat - a dictionary, contains log of tags of all locations visited + a dictionary, contains log of tags of all locations visited from openstreetmap Raises: RuntimeError: if the query to Overpass API fails @@ -284,14 +1129,16 @@ def gps_summaries( raise ValueError("Frequency must be 'hourly' or 'daily'") if frequency != Frequency.DAILY: - split_day_night = False + parameters.split_day_night = False ids: Dict[str, List[int]] = {} locations: Dict[int, List[List[float]]] = {} tags: Dict[int, Dict[str, str]] = {} - if places_of_interest is not None or save_osm_log: + if places_of_interest is not None or parameters.save_osm_log: ids, locations, tags = get_nearby_locations(traj, osm_tags) ids_keys_list = list(ids.keys()) + else: + ids_keys_list = [] obs_traj = traj[traj[:, 7] == 1, :] home_lat, home_lon = locate_home(obs_traj, tz_str) @@ -301,38 +1148,27 @@ def gps_summaries( if frequency != Frequency.DAILY: # find starting and ending time logger.info("Calculating the hourly summary stats...") - time_list = stamp2datetime(traj[0, 3], tz_str) - time_list[4:6] = [0, 0] - start_stamp = datetime2stamp(time_list, tz_str) - time_list = 
stamp2datetime(traj[-1, 6], tz_str) - time_list[4:6] = [0, 0] - end_stamp = datetime2stamp(time_list, tz_str) - # start_time, end_time are exact points - # (if it ends at 2019-3-8 11 o'clock, then 11 shouldn't be included) - window = frequency.value * 60 * 60 - no_windows = (end_stamp - start_stamp) // window + start_stamp, end_stamp = get_time_range( + traj, [4, 5], tz_str + ) + window, num_windows = compute_window_and_count( + start_stamp, end_stamp, frequency.value + ) else: # find starting and ending time logger.info("Calculating the daily summary stats...") - time_list = stamp2datetime(traj[0, 3], tz_str) - time_list[3:6] = [0, 0, 0] - start_stamp = datetime2stamp(time_list, tz_str) - time_list = stamp2datetime(traj[-1, 6], tz_str) - time_list[3:6] = [0, 0, 0] - end_stamp = datetime2stamp(time_list, tz_str) + 3600 * 24 - # if it starts from 2019-3-8 11 o'clock, - # then our daily summary starts from 2019-3-9) - window = 60 * 60 * 24 - no_windows = (end_stamp - start_stamp) // window - if split_day_night: - no_windows *= 2 - - if no_windows <= 0: + start_stamp, end_stamp = get_time_range( + traj, [3, 4, 5], tz_str, 3600*24 + ) + window, num_windows = compute_window_and_count( + start_stamp, end_stamp, 24, parameters.split_day_night + ) + + if num_windows <= 0: raise ValueError("start time and end time are not correct") - summary_stats_df = pd.DataFrame([]) - for i in range(no_windows): - if split_day_night: + for i in range(num_windows): + if parameters.split_day_night: i2 = i // 2 else: i2 = i @@ -349,36 +1185,14 @@ def gps_summaries( stop1 = 0 stop2 = 0 - if split_day_night: - current_time_list2 = current_time_list.copy() - current_time_list3 = current_time_list.copy() - current_time_list2[3] = 8 - current_time_list3[3] = 20 - start_time2 = datetime2stamp(current_time_list2, tz_str) - end_time2 = datetime2stamp(current_time_list3, tz_str) - if i % 2 == 0: - # daytime - index_rows = ( - (traj[:, 3] <= end_time2) - * (traj[:, 6] >= start_time2) - ) - else: - # 
nighttime - index1 = ( - (traj[:, 6] < start_time2) - * (traj[:, 3] < end_time) - * (traj[:, 6] > start_time) - ) - index2 = ( - (traj[:, 3] > end_time2) - * (traj[:, 3] < end_time) - * (traj[:, 6] > start_time) + if parameters.split_day_night: + index_rows, stop1, stop2, start_time2, end_time2 = ( + get_day_night_indices( + traj, tz_str, i, start_time, end_time, current_time_list ) - stop1 = sum(index1) - 1 - stop2 = sum(index1) - index_rows = index1 + index2 + ) - if sum(index_rows) == 0 and split_day_night: + if sum(index_rows) == 0 and parameters.split_day_night: # if there is no data in the day, then we need to # to add empty rows to the dataframe with 21 columns res = [year, month, day] + [0] * 18 @@ -388,7 +1202,7 @@ def gps_summaries( res += [0] * (2 * len(places_of_interest) + 1) summary_stats.append(res) continue - elif sum(index_rows) == 0 and not split_day_night: + elif sum(index_rows) == 0 and not parameters.split_day_night: # There is no data and it is daily data, so we need to add empty # rows res = [year, month, day] + [0] * 3 + [pd.NA] * 15 @@ -403,63 +1217,17 @@ def gps_summaries( temp = traj[index_rows, :] # take a subset which is exactly one hour/day, # cut the trajs at two ends proportionally - if split_day_night and i % 2 == 0: + if parameters.split_day_night and i % 2 == 0: t0_temp = start_time2 t1_temp = end_time2 else: t0_temp = start_time t1_temp = end_time - if sum(index_rows) == 1: - p0 = (t0_temp - temp[0, 3]) / (temp[0, 6] - temp[0, 3]) - p1 = (t1_temp - temp[0, 3]) / (temp[0, 6] - temp[0, 3]) - x0, y0 = temp[0, [1, 2]] - x1, y1 = temp[0, [4, 5]] - temp[0, 1] = (1 - p0) * x0 + p0 * x1 - temp[0, 2] = (1 - p0) * y0 + p0 * y1 - temp[0, 3] = t0_temp - temp[0, 4] = (1 - p1) * x0 + p1 * x1 - temp[0, 5] = (1 - p1) * y0 + p1 * y1 - temp[0, 6] = t1_temp - else: - if split_day_night and i % 2 != 0: - t0_temp_l = [start_time, end_time2] - t1_temp_l = [start_time2, end_time] - start_temp = [0, stop2] - end_temp = [stop1, -1] - for j in range(2): - 
p0 = (temp[start_temp[j], 6] - t0_temp_l[j]) / ( - temp[start_temp[j], 6] - temp[start_temp[j], 3] - ) - p1 = (t1_temp_l[j] - temp[end_temp[j], 3]) / ( - temp[end_temp[j], 6] - temp[end_temp[j], 3] - ) - temp[start_temp[j], 1] = (1 - p0) * temp[ - start_temp[j], 4 - ] + p0 * temp[start_temp[j], 1] - temp[start_temp[j], 2] = (1 - p0) * temp[ - start_temp[j], 5 - ] + p0 * temp[start_temp[j], 2] - temp[start_temp[j], 3] = t0_temp_l[j] - temp[end_temp[j], 4] = (1 - p1) * temp[ - end_temp[j], 1 - ] + p1 * temp[end_temp[j], 4] - temp[end_temp[j], 5] = (1 - p1) * temp[ - end_temp[j], 2 - ] + p1 * temp[end_temp[j], 5] - temp[end_temp[j], 6] = t1_temp_l[j] - else: - p0 = (temp[0, 6] - t0_temp) / (temp[0, 6] - temp[0, 3]) - p1 = ( - (t1_temp - temp[-1, 3]) - / (temp[-1, 6] - temp[-1, 3]) - ) - temp[0, 1] = (1 - p0) * temp[0, 4] + p0 * temp[0, 1] - temp[0, 2] = (1 - p0) * temp[0, 5] + p0 * temp[0, 2] - temp[0, 3] = t0_temp - temp[-1, 4] = (1 - p1) * temp[-1, 1] + p1 * temp[-1, 4] - temp[-1, 5] = (1 - p1) * temp[-1, 2] + p1 * temp[-1, 5] - temp[-1, 6] = t1_temp + temp = smooth_temp_ends( + temp, index_rows, t0_temp, t1_temp, parameters, i, start_time, + end_time2, start_time2, end_time, stop1, stop2 + ) obs_dur = sum((temp[:, 6] - temp[:, 3])[temp[:, 7] == 1]) d_home_1 = great_circle_dist( @@ -487,42 +1255,11 @@ def gps_summaries( all_place_times = [] all_place_times_adjusted = [] log_tags_temp = [] - if places_of_interest is not None or save_osm_log: + if places_of_interest is not None or parameters.save_osm_log: pause_vec = temp[temp[:, 0] == 2] - pause_array: np.ndarray = np.array([]) - for row in pause_vec: - if ( - great_circle_dist(row[1], row[2], home_lat, home_lon)[0] - > 2*place_point_radius - ): - if len(pause_array) == 0: - pause_array = np.array( - [[row[1], row[2], (row[6] - row[3]) / 60]] - ) - elif ( - np.min( - great_circle_dist( - row[1], row[2], - pause_array[:, 0], pause_array[:, 1], - ) - ) - > 2*place_point_radius - ): - pause_array = np.append( - 
pause_array, - [[row[1], row[2], (row[6] - row[3]) / 60]], - axis=0, - ) - else: - pause_array[ - np.argmin( - great_circle_dist( - row[1], row[2], - pause_array[:, 0], pause_array[:, 1], - ) - ), - -1, - ] += (row[6] - row[3]) / 60 + pause_array = get_pause_array( + pause_vec, home_lat, home_lon, parameters + ) if places_of_interest is not None: all_place_times = [0] * (len(places_of_interest) + 1) @@ -530,52 +1267,12 @@ def gps_summaries( for pause in pause_array: if places_of_interest is not None: - all_place_probs = [0] * len(places_of_interest) - pause_str = f"{pause[0]}, {pause[1]} - person" - if pause_str in saved_polygons.keys(): - pause_circle = saved_polygons[pause_str] - else: - pause_circle = transform_point_to_circle( - pause[0], pause[1], person_point_radius + all_place_probs, add_to_other = ( + intersect_with_places_of_interest( + pause, places_of_interest, saved_polygons, + parameters, ids, locations, ids_keys_list ) - saved_polygons[pause_str] = pause_circle - add_to_other = True - for j, place in enumerate(places_of_interest): - # if place of interest not in nearby locations of - # the current pause, skip - if place not in ids_keys_list: - continue - for element_id in ids[place]: - if len(locations[element_id]) == 1: - loc_lat = locations[element_id][0][0] - loc_lon = locations[element_id][0][1] - loc_str = f"{loc_lat}, {loc_lon} - place" - if loc_str in saved_polygons.keys(): - loc_circle = saved_polygons[loc_str] - else: - loc_circle = transform_point_to_circle( - loc_lat, - loc_lon, - place_point_radius, - ) - saved_polygons[loc_str] = loc_circle - - intersection_area = pause_circle.intersection( - loc_circle - ).area - if intersection_area > 0: - all_place_probs[j] += intersection_area - add_to_other = False - - elif len(locations[element_id]) >= 3: - polygon = Polygon(locations[element_id]) - - intersection_area = pause_circle.intersection( - polygon - ).area - if intersection_area > 0: - all_place_probs[j] += intersection_area - add_to_other 
= False + ) # in case of pause not in places of interest if add_to_other: @@ -591,14 +1288,8 @@ def gps_summaries( prob * pause[2] / 60 ) - if save_osm_log: - if threshold is None: - threshold = 60 - logger.info( - "threshold parameter set to None," - " automatically converted to 60min." - ) - if pause[2] >= threshold: + if parameters.save_osm_log: + if pause[2] >= parameters.log_threshold: for place_id, place_coordinates in locations.items(): if len(place_coordinates) == 1: if ( @@ -607,7 +1298,7 @@ def gps_summaries( place_coordinates[0][0], place_coordinates[0][1], )[0] - < place_point_radius + < parameters.place_point_radius ): log_tags_temp.append(tags[place_id]) elif len(place_coordinates) >= 3: @@ -616,72 +1307,18 @@ def gps_summaries( if polygon.contains(point): log_tags_temp.append(tags[place_id]) - if len(flight_d_vec) > 0: - av_f_len = np.mean(flight_d_vec) - sd_f_len = np.std(flight_d_vec) - av_f_dur = np.mean(flight_t_vec) - sd_f_dur = np.std(flight_t_vec) - else: - av_f_len = 0 - sd_f_len = 0 - av_f_dur = 0 - sd_f_dur = 0 - if len(pause_t_vec) > 0: - av_p_dur = np.mean(pause_t_vec) - sd_p_dur = np.std(pause_t_vec) - else: - av_p_dur = 0 - sd_p_dur = 0 - if frequency != Frequency.DAILY: - if obs_dur == 0: - res = [ - year, - month, - day, - hour, - 0, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - ] - if places_of_interest is not None: - for place_int in range(2 * len(places_of_interest) + 1): - res.append(pd.NA) - summary_stats.append(res) - log_tags[f"{day}/{month}/{year} {hour}:00"] = [] - else: - res = [ - year, - month, - day, - hour, - obs_dur / 60, - time_at_home / 60, - dist_traveled / 1000, - max_dist_home / 1000, - total_flight_time / 60, - av_f_len, - sd_f_len, - av_f_dur / 60, - sd_f_dur / 60, - total_pause_time / 60, - av_p_dur / 60, - sd_p_dur / 60, - ] - if places_of_interest is not None: - res += all_place_times - res += all_place_times_adjusted - log_tags[f"{day}/{month}/{year} 
{hour}:00"] = log_tags_temp + flight_pause_stats = compute_flight_pause_stats( + flight_d_vec, flight_t_vec, pause_t_vec + ) + datetime_list = [year, month, day, hour, 0, 0] - summary_stats.append(res) + if frequency != Frequency.DAILY: + summary_stats, log_tags = final_hourly_prep( + obs_dur, time_at_home, dist_traveled, max_dist_home, + total_flight_time, total_pause_time, flight_pause_stats, + all_place_times, all_place_times_adjusted, summary_stats, + log_tags, log_tags_temp, datetime_list, places_of_interest + ) else: hours = [] for j in range(temp.shape[0]): @@ -721,6 +1358,22 @@ def gps_summaries( t_sig = np.array(t_xy)[np.array(t_xy) / 60 > 15] p = t_sig / sum(t_sig) entropy = -sum(p * np.log(p + 0.00001)) + # physical circadian rhythm + if obs_dur != 0 and parameters.pcr_bool: + mobility_trace = create_mobility_trace(traj) + pcr = routine_index( + (start_time, end_time), mobility_trace, + parameters.pcr_window, parameters.pcr_sample_rate + ) + pcr_stratified = routine_index( + (start_time, end_time), mobility_trace, + parameters.pcr_window, parameters.pcr_sample_rate, + True, tz_str + ) + else: + pcr = pd.NA + pcr_stratified = pd.NA + # if there is only one significant place, the entropy is zero # but here it is -log(1.00001) < 0 # but the small value is added to avoid log(0) @@ -732,182 +1385,128 @@ def gps_summaries( else: diameters = pairwise_great_circle_dist(temp[:, [1, 2]]) diameter = max(diameters) - if obs_dur == 0: - res = [ - year, - month, - day, - 0, - 0, - 0, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - pd.NA, - ] - if places_of_interest is not None: - for place_int in range(2 * len(places_of_interest) + 1): - res.append(pd.NA) - summary_stats.append(res) - log_tags[f"{day}/{month}/{year}"] = [] - else: - res = [ - year, - month, - day, - obs_dur / 3600, - obs_day / 3600, - obs_night / 3600, - time_at_home / 3600, - dist_traveled / 1000, - max_dist_home / 1000, 
- radius / 1000, - diameter / 1000, - num_sig, - entropy, - total_flight_time / 3600, - av_f_len / 1000, - sd_f_len / 1000, - av_f_dur / 3600, - sd_f_dur / 3600, - total_pause_time / 3600, - av_p_dur / 3600, - sd_p_dur / 3600, - ] - if places_of_interest is not None: - res += all_place_times - res += all_place_times_adjusted - summary_stats.append(res) - if split_day_night: - if i % 2 == 0: - time_cat = "daytime" - else: - time_cat = "nighttime" - log_tags[f"{day}/{month}/{year}, {time_cat}"] = ( - log_tags_temp - ) - else: - log_tags[f"{day}/{month}/{year}"] = log_tags_temp - summary_stats_df = pd.DataFrame(summary_stats) - if places_of_interest is None: - places_of_interest2 = [] - places_of_interest3 = [] - else: - places_of_interest2 = places_of_interest.copy() - places_of_interest2.append("other") - places_of_interest3 = [ - f"{pl}_adjusted" for pl in places_of_interest - ] - if frequency != Frequency.DAILY: - summary_stats_df.columns = ( - [ - "year", - "month", - "day", - "hour", - "obs_duration", - "home_time", - "dist_traveled", - "max_dist_home", - "total_flight_time", - "av_flight_length", - "sd_flight_length", - "av_flight_duration", - "sd_flight_duration", - "total_pause_time", - "av_pause_duration", - "sd_pause_duration", - ] - + places_of_interest2 - + places_of_interest3 - ) - else: - summary_stats_df.columns = ( - [ - "year", - "month", - "day", - "obs_duration", - "obs_day", - "obs_night", - "home_time", - "dist_traveled", - "max_dist_home", - "radius", - "diameter", - "num_sig_places", - "entropy", - "total_flight_time", - "av_flight_length", - "sd_flight_length", - "av_flight_duration", - "sd_flight_duration", - "total_pause_time", - "av_pause_duration", - "sd_pause_duration", - ] - + places_of_interest2 - + places_of_interest3 - ) - if split_day_night: - summary_stats_df_daytime = summary_stats_df[::2].reset_index( - drop=True - ) - summary_stats_df_nighttime = summary_stats_df[1::2].reset_index( - drop=True + summary_stats, log_tags = 
final_daily_prep( + obs_dur, obs_day, obs_night, time_at_home, dist_traveled, + max_dist_home, radius, diameter, num_sig, entropy, + total_flight_time, total_pause_time, flight_pause_stats, + all_place_times, all_place_times_adjusted, summary_stats, + log_tags, log_tags_temp, datetime_list, places_of_interest, + parameters, pcr, pcr_stratified, i ) - summary_stats_df2 = pd.concat( - [ - summary_stats_df_daytime, - summary_stats_df_nighttime.iloc[:, 3:], - ], - axis=1, - ) - summary_stats_df2.columns = ( - list(summary_stats_df.columns)[:3] - + [ - f"{cname}_daytime" - for cname in list(summary_stats_df.columns)[3:] - ] - + [ - f"{cname}_nighttime" - for cname in list(summary_stats_df.columns)[3:] - ] - ) - summary_stats_df2 = summary_stats_df2.drop( - [ - "obs_day_daytime", - "obs_night_daytime", - "obs_day_nighttime", - "obs_night_nighttime", - ], - axis=1, - ) - summary_stats_df2.insert( - 3, - "obs_duration", - summary_stats_df2["obs_duration_daytime"] - + summary_stats_df2["obs_duration_nighttime"], - ) - else: - summary_stats_df2 = summary_stats_df + summary_stats_df2, log_tags = format_summary_stats( + summary_stats, log_tags, frequency, parameters, places_of_interest + ) return summary_stats_df2, log_tags +def split_day_night_cols(summary_stats_df: pd.DataFrame) -> pd.DataFrame: + """This function splits the summary statistics dataframe + into daytime and nighttime columns. 
+ + Args: + summary_stats_df: pandas dataframe with summary statistics + Returns: + pandas dataframe with summary statistics + split into daytime and nighttime columns + """ + + summary_stats_df_daytime = summary_stats_df[::2].reset_index(drop=True) + summary_stats_df_nighttime = summary_stats_df[1::2].reset_index(drop=True) + + summary_stats_df2 = pd.concat( + [ + summary_stats_df_daytime, + summary_stats_df_nighttime.iloc[:, 3:], + ], + axis=1, + ) + summary_stats_df2.columns = ( + list(summary_stats_df.columns)[:3] + + [ + f"{cname}_daytime" + for cname in list(summary_stats_df.columns)[3:] + ] + + [ + f"{cname}_nighttime" + for cname in list(summary_stats_df.columns)[3:] + ] + ) + summary_stats_df2 = summary_stats_df2.drop( + [ + "obs_day_daytime", + "obs_night_daytime", + "obs_day_nighttime", + "obs_night_nighttime", + ], + axis=1, + ) + summary_stats_df2.insert( + 3, + "obs_duration", + summary_stats_df2["obs_duration_daytime"] + + summary_stats_df2["obs_duration_nighttime"], + ) + + return summary_stats_df2 + + +def get_time_range( + traj: np.ndarray, time_reset_indices: list, + tz_str: str, offset_seconds: int = 0, +) -> Tuple[int, int]: + """Computes the starting and ending time stamps + based on given trajectory and indices. 
+
+    Args:
+        traj: numpy array of trajectory
+        time_reset_indices: list of indices to reset time
+        tz_str: str, timezone
+        offset_seconds: int, offset in seconds
+    Returns:
+        A tuple of two integers (start_stamp, end_stamp):
+            start_stamp: int, starting time stamp
+            end_stamp: int, ending time stamp
+    """
+    time_list = stamp2datetime(traj[0, 3], tz_str)
+    for idx in time_reset_indices:
+        time_list[idx] = 0
+    start_stamp = datetime2stamp(time_list, tz_str)
+
+    time_list = stamp2datetime(traj[-1, 6], tz_str)
+    for idx in time_reset_indices:
+        time_list[idx] = 0
+    end_stamp = datetime2stamp(time_list, tz_str) + offset_seconds
+
+    return start_stamp, end_stamp
+
+
+def compute_window_and_count(
+    start_stamp: int, end_stamp: int, window_hours: int,
+    split_day_night: bool = False
+) -> Tuple[int, int]:
+    """Computes the window and number of windows based on given time stamps.
+
+    Args:
+        start_stamp: int, starting time stamp
+        end_stamp: int, ending time stamp
+        window_hours: int, window in hours
+        split_day_night: bool, True if split day and night
+    Returns:
+        A tuple of two integers (window, num_windows):
+            window: int, window in seconds
+            num_windows: int, number of windows
+    """
+
+    window = window_hours * 60 * 60
+    num_windows = (end_stamp - start_stamp) // window
+    if split_day_night:
+        num_windows *= 2
+    return window, num_windows
+
+
 def gps_quality_check(study_folder: str, study_id: str) -> float:
     """The function checks the gps data quality.
@@ -947,20 +1546,14 @@ def gps_stats_main( tz_str: str, frequency: Frequency, save_traj: bool, - parameters: Optional[Hyperparameters] = None, places_of_interest: Optional[list] = None, - save_osm_log: bool = False, osm_tags: Optional[List[OSMTags]] = None, - threshold: Optional[int] = None, - split_day_night: bool = False, - person_point_radius: float = 2, - place_point_radius: float = 7.5, time_start: Optional[list] = None, time_end: Optional[list] = None, participant_ids: Optional[list] = None, + parameters: Optional[Hyperparameters] = None, all_memory_dict: Optional[dict] = None, all_bv_set: Optional[dict] = None, - quality_threshold: float = 0.05, ): """This the main function to do the GPS imputation. It calls every function defined before. @@ -976,20 +1569,8 @@ def gps_stats_main( csv file, False if you don't places_of_interest: list of places to watch, keywords as used in openstreetmaps - save_osm_log: bool, True if you want to output a log of locations - visited and their tags osm_tags: list of tags to search for in openstreetmaps avoid using a lot of them if large area is covered - threshold: int, time spent in a pause needs to exceed the - threshold to be placed in the log - only if save_osm_log True, in minutes - split_day_night: bool, True if you want to split all metrics to - datetime and nighttime patterns - only for daily frequency - person_point_radius: float, radius of the person's circle when - discovering places near him in pauses - place_point_radius: float, radius of place's circle - when place is returned as centre coordinates from osm time_start: list, starting time of window of interest time_end: list ending time of the window of interest time should be a list of integers with format @@ -1005,8 +1586,6 @@ def gps_stats_main( recommend to set it to default all_memory_dict: dict, from previous run (none if it's the first time) all_bv_set: dict, from previous run (none if it's the first time) - quality_threshold: float, a percentage value of 
the fraction of data - required for a summary to be created. Returns: write summary stats as csv for each user during the specified period @@ -1057,7 +1636,7 @@ def gps_stats_main( logger.info("User: %s", participant_id) # data quality check quality = gps_quality_check(study_folder, participant_id) - if quality > quality_threshold: + if quality > parameters.quality_threshold: # read data logger.info("Read in the csv files ...") data, _, _ = read_data( @@ -1141,11 +1720,9 @@ def gps_stats_main( traj, tz_str, Frequency.HOURLY, + parameters, places_of_interest, - save_osm_log, osm_tags, - threshold, - split_day_night, ) write_all_summaries(participant_id, summary_stats1, f"{output_folder}/hourly") @@ -1153,17 +1730,13 @@ def gps_stats_main( traj, tz_str, Frequency.DAILY, + parameters, places_of_interest, - save_osm_log, osm_tags, - threshold, - split_day_night, - person_point_radius, - place_point_radius, ) write_all_summaries(participant_id, summary_stats2, f"{output_folder}/daily") - if save_osm_log: + if parameters.save_osm_log: os.makedirs(f"{output_folder}/logs", exist_ok=True) with open( f"{output_folder}/logs/locations_logs_hourly.json", @@ -1180,16 +1753,14 @@ def gps_stats_main( traj, tz_str, frequency, + parameters, places_of_interest, - save_osm_log, osm_tags, - threshold, - split_day_night, ) write_all_summaries( participant_id, summary_stats, output_folder ) - if save_osm_log: + if parameters.save_osm_log: os.makedirs(f"{output_folder}/logs", exist_ok=True) with open( f"{output_folder}/logs/locations_logs.json",