
Commit

Merge branch 'develop' into jasmine-empty_row_size
GeorgeEfstathiadis authored Feb 13, 2024
2 parents 671d363 + 1c0e7bc commit 5e90f91
Showing 8 changed files with 276 additions and 245 deletions.
6 changes: 5 additions & 1 deletion .gitignore
@@ -1,8 +1,9 @@
__pycache__/
.DS_Store

# IntelliJ project files
# IntelliJ, VS Code project files
.idea
.vscode

# for installing Forest in editable mode when developing
/forest.egg-info/
@@ -18,3 +19,6 @@ __pycache__/

#sphinx build
docs/_build/

# any python environment files
.python-version
27 changes: 23 additions & 4 deletions forest/jasmine/data2mobmat.py
@@ -179,6 +179,10 @@ def collapse_data(
# Filter out rows where the GPS accuracy is beyond
# the provided accuracy_limit
data = data[data.accuracy < accuracy_limit]
if data.shape[0] == 0:
raise ValueError(
f"No GPS record with accuracy less than {accuracy_limit}."
)

# Get the start and end timestamps in seconds
t_start = sorted(np.array(data.timestamp))[0] / 1000
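
As a quick illustration of the new guard: when the accuracy limit is stricter than every reported accuracy, the filter leaves an empty frame and the added ValueError fires. A minimal sketch with hypothetical values (not data from the library):

import pandas as pd

# Hypothetical GPS sample; "accuracy" is reported in meters (lower is better).
data = pd.DataFrame({
    "timestamp": [1_700_000_000_000, 1_700_000_060_000],
    "latitude": [42.36, 42.37],
    "longitude": [-71.06, -71.05],
    "accuracy": [120.0, 95.0],
})
accuracy_limit = 51  # hypothetical threshold in meters

# Keep only rows whose accuracy beats the limit; fail loudly if none do.
data = data[data.accuracy < accuracy_limit]
if data.shape[0] == 0:
    raise ValueError(
        f"No GPS record with accuracy less than {accuracy_limit}."
    )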
@@ -615,6 +619,21 @@ def gps_to_mobmat(
return mobmat


def force_valid_longitude(longitude: float) -> float:
"""Forces a longitude coordinate to be within -180 and 180
In some cases, the imputation code seems to yield out-of-range
GPS coordinates. This function wrps longitude coordinates to be back
in the correct range so an error isn't thrown.
For example, 190 would get transformed into -170.
Args:
longitude: float. The longitude to be coerced
"""
return (longitude + 180) % 360 - 180
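
A short, self-contained sketch of what this helper computes (the function is repeated here only so the snippet runs on its own):

def force_valid_longitude(longitude: float) -> float:
    """Wrap a longitude into the range [-180, 180)."""
    return (longitude + 180) % 360 - 180

# (190 + 180) % 360 - 180 = 370 % 360 - 180 = 10 - 180 = -170
assert force_valid_longitude(190.0) == -170.0
# Values already in range pass through unchanged.
assert force_valid_longitude(-71.0) == -71.0
# 180 wraps to -180, so the output range is half-open.
assert force_valid_longitude(180.0) == -180.0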


def compute_flight_positions(
index: int, mobmat: np.ndarray, interval: float
) -> np.ndarray:
@@ -656,8 +675,8 @@ def compute_flight_positions(
# Update the mobility matrix with the new start and end positions
mobmat[index, 1] = start_x
mobmat[index, 4] = end_x
mobmat[index, 2] = start_y
mobmat[index, 5] = end_y
mobmat[index, 2] = force_valid_longitude(start_y)
mobmat[index, 5] = force_valid_longitude(end_y)

return mobmat

@@ -704,8 +723,8 @@ def compute_future_flight_positions(
# Update the mobility matrix with the new start and end positions
mobmat[index, 1] = start_x
mobmat[index, 4] = end_x
mobmat[index, 2] = start_y
mobmat[index, 5] = end_y
mobmat[index, 2] = force_valid_longitude(start_y)
mobmat[index, 5] = force_valid_longitude(end_y)

return mobmat

20 changes: 12 additions & 8 deletions forest/jasmine/mobmat2traj.py
@@ -3,7 +3,6 @@
"""
import logging
import math
import sys
from typing import Optional, Tuple

import numpy as np
@@ -278,7 +277,7 @@ def indicate_flight(
# Calculate k1 using the specified method
k1 = calculate_k1(method, current_t, current_x, current_y, bv_subset, pars)
if k1 is None:
sys.exit("Invalid method for calculate_k1.")
raise ValueError("Invalid method for calculate_k1.")

# Select flight and pause indicators from the bv_subset
flight_k = k1[bv_subset[:, 0] == 1]
@@ -662,8 +661,8 @@ def forward_impute(
method, start_t, start_x, start_y,
flight_table, pars
)
if weight is None:
sys.exit("Invalid method for calculate_k1.")
if weight is None or len(weight) == 0:
raise ValueError("Invalid method for calculate_k1.")

normalize_w = (weight + 1e-5) / float(sum(weight + 1e-5))
flight_index = np.random.choice(flight_table.shape[0], p=normalize_w)
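
The added emptiness check matters because the next two lines normalize the weights and hand them to np.random.choice, which needs a non-empty probability vector. A minimal sketch with hypothetical weights:

import numpy as np

# Hypothetical per-flight similarity weights returned by calculate_k1.
weight = np.array([0.2, 0.0, 0.5])

if weight is None or len(weight) == 0:
    raise ValueError("Invalid method for calculate_k1.")

# The small epsilon keeps zero weights selectable and avoids dividing by zero.
normalize_w = (weight + 1e-5) / float(sum(weight + 1e-5))
flight_index = np.random.choice(weight.shape[0], p=normalize_w)
print(flight_index)  # row index of the sampled flight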
@@ -743,7 +742,7 @@ def forward_impute(
pause_table, pars
)
if weight is None:
sys.exit("Invalid method for calculate_k1.")
raise ValueError("Invalid method for calculate_k1.")

normalize_w = (weight + 1e-5) / float(sum(weight + 1e-5))
pause_index = np.random.choice(pause_table.shape[0], p=normalize_w)
@@ -832,7 +831,7 @@ def backward_impute(
flight_table, pars
)
if weight is None:
sys.exit("Invalid method for calculate_k1.")
raise ValueError("Invalid method for calculate_k1.")

normalize_w = (weight + 1e-5) / float(sum(weight + 1e-5))
flight_index = np.random.choice(flight_table.shape[0], p=normalize_w)
@@ -907,8 +906,8 @@ def backward_impute(
method, end_t, end_x, end_y,
pause_table, pars
)
if weight is None:
sys.exit("Invalid method for calculate_k1.")
if weight is None or len(weight) == 0:
raise ValueError("Invalid method for calculate_k1.")

normalize_w = (weight + 1e-5) / float(sum(weight + 1e-5))
pause_index = np.random.choice(pause_table.shape[0], p=normalize_w)
@@ -972,6 +971,11 @@ def impute_gps(
# for observed flights, observed pauses, and missing intervals
flight_table, pause_table, mis_table = create_tables(mob_mat, bv_subset)

if len(flight_table) == 0:
raise ValueError("No flight observed in the data.")
if len(pause_table) == 0:
raise ValueError("No pause observed in the data.")

# initialize the imputed trajectory table
imp_table = np.zeros((1, 7))

167 changes: 101 additions & 66 deletions forest/jasmine/traj2stats.py
@@ -1576,7 +1576,8 @@ def gps_stats_main(
Args:
study_folder: str, the path of the study folder
output_folder: str, the path of the folder
where you want to save results
where you want to save results. A folder named jasmine
will be created containing all output.
tz_str: str, timezone
frequency: Frequency, the frequency of the summary stats
(resolution for summary statistics)
@@ -1613,16 +1614,30 @@ def gps_stats_main(
Raises:
ValueError: Frequency is not valid
"""

# no minutely analysis on GPS data
if frequency == Frequency.MINUTE:
raise ValueError("Frequency cannot be minutely.")

os.makedirs(output_folder, exist_ok=True)

if parameters is None:
parameters = Hyperparameters()

if frequency == Frequency.HOURLY_AND_DAILY:
frequencies = [Frequency.HOURLY, Frequency.DAILY]
else:
frequencies = [frequency]

# Ensure that the correct output folder structures exist and centralize
# folder names. Note that frequencies may contain more than one entry
# when HOURLY_AND_DAILY is requested.
trajectory_folder = f"{output_folder}/trajectory"
logs_folder = f"{output_folder}/logs"
os.makedirs(output_folder, exist_ok=True)
os.makedirs(logs_folder, exist_ok=True)
for freq in frequencies:
os.makedirs(f"{output_folder}/{freq.name.lower()}", exist_ok=True)
if save_traj:
os.makedirs(trajectory_folder, exist_ok=True)
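
A sketch of the folder layout this produces when both resolutions are requested, using a simplified stand-in for the library's Frequency enum and a hypothetical output path:

from enum import Enum

# Simplified stand-in for the library's Frequency enum, for illustration only.
class Frequency(Enum):
    HOURLY = 1
    DAILY = 2
    HOURLY_AND_DAILY = 3

frequency = Frequency.HOURLY_AND_DAILY
output_folder = "results"  # hypothetical path

if frequency == Frequency.HOURLY_AND_DAILY:
    frequencies = [Frequency.HOURLY, Frequency.DAILY]
else:
    frequencies = [frequency]

# One summary sub-folder per requested frequency.
print([f"{output_folder}/{freq.name.lower()}" for freq in frequencies])
# ['results/hourly', 'results/daily']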

# pars0 is passed to bv_select, pars1 to impute_gps
pars0 = [
parameters.l1, parameters.l2, parameters.l3, parameters.a1,
parameters.a2, parameters.b1, parameters.b2, parameters.b3
@@ -1635,24 +1650,20 @@
# participant_ids should be a list of str
if participant_ids is None:
participant_ids = get_ids(study_folder)
# create a record of processed user participant_id and starting/ending time

# Create a record of processed participant_id and starting/ending time.
# These are updated and saved to disk after each participant is processed.
all_memory_dict_file = f"{output_folder}/all_memory_dict.pkl"
all_bv_set_file = f"{output_folder}/all_bv_set.pkl"
if all_memory_dict is None:
all_memory_dict = {}
for participant_id in participant_ids:
all_memory_dict[str(participant_id)] = None

if all_bv_set is None:
all_bv_set = {}
for participant_id in participant_ids:
all_bv_set[str(participant_id)] = None

if frequency == Frequency.HOURLY_AND_DAILY:
os.makedirs(f"{output_folder}/hourly", exist_ok=True)
os.makedirs(f"{output_folder}/daily", exist_ok=True)
if save_traj:
os.makedirs(f"{output_folder}/trajectory", exist_ok=True)

for participant_id in participant_ids:
logger.info("User: %s", participant_id)
# data quality check
@@ -1664,6 +1675,34 @@
participant_id, study_folder, "gps",
tz_str, time_start, time_end,
)
# If the data comes from a study that had GPS fuzzing,
# and the study was prior to March 2023, the longitude
# coordinates may be outside of the required range of
# (-180, 180). This chunk of code wraps out-of-range
# coordinates back into that range.
if (
("longitude" in data.columns)
and (
(data["longitude"].max() > 180)
or (data["longitude"].min() < -180)
)
):
logger.info("Reconciled bad longitude data for user %s",
participant_id)
data["longitude"] = (data["longitude"] + 180) % 360 - 180
if ((places_of_interest is not None)
or (osm_tags is not None)):
logger.warning("Warning: user %s had longitude values "
"outside the valid range [-180, 180] "
"but OSM location summaries were "
"requested. Longitude values outside "
"the valid range may signify that GPS "
"fuzzing was directed to be used in "
"the study setup file. If GPS "
"coordinates were fuzzed, OSM "
"location summaries are meaningless",
participant_id)

if data.shape == (0, 0):
logger.info("No data available.")
continue
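
The longitude reconciliation above is just a vectorized form of force_valid_longitude applied to the whole column; with a few hypothetical fuzzed values:

import pandas as pd

# Hypothetical fuzzed longitudes, some pushed outside [-180, 180].
data = pd.DataFrame({"longitude": [190.0, -185.0, 45.0]})

if data["longitude"].max() > 180 or data["longitude"].min() < -180:
    data["longitude"] = (data["longitude"] + 180) % 360 - 180

print(data["longitude"].tolist())  # [-170.0, 175.0, 45.0]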
@@ -1679,6 +1718,7 @@
params_w = np.mean(data.accuracy)
else:
params_w = parameters.w

# process data
mobmat1 = gps_to_mobmat(
data, parameters.itrvl, parameters.accuracylim,
@@ -1696,6 +1736,8 @@
)
all_bv_set[str(participant_id)] = bv_set = out_dict["BV_set"]
all_memory_dict[str(participant_id)] = out_dict["memory_dict"]

# impute_gps can fail, if so we skip this participant.
try:
imp_table = impute_gps(
mobmat2, bv_set, parameters.method,
@@ -1705,6 +1747,7 @@
except RuntimeError as e:
logger.error("Error: %s", e)
continue

traj = imp_to_traj(imp_table, mobmat2, params_w)
# raise error if traj coordinates are not in the range of
# [-90, 90] and [-180, 180]
@@ -1724,72 +1767,64 @@
"[-90, 90] and [-180, 180]."
)
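
The validation itself is collapsed in this view; what follows is only a hedged sketch of such a bounds check, assuming the x* columns hold latitudes and the y* columns longitudes (consistent with force_valid_longitude being applied to the y columns earlier):

import numpy as np

# Hypothetical trajectory rows: [status, x0, y0, t0, x1, y1, t1, obs].
traj = np.array([
    [1.0, 42.36, -71.06, 0.0, 42.37, -71.05, 60.0, 1.0],
])

lat_ok = np.all((traj[:, [1, 4]] >= -90) & (traj[:, [1, 4]] <= 90))
lon_ok = np.all((traj[:, [2, 5]] >= -180) & (traj[:, [2, 5]] <= 180))
if not (lat_ok and lon_ok):
    raise ValueError(
        "Trajectory coordinates are not in the range of "
        "[-90, 90] and [-180, 180]."
    )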
# save all_memory_dict and all_bv_set
with open(f"{output_folder}/all_memory_dict.pkl", "wb") as f:
with open(all_memory_dict_file, "wb") as f:
pickle.dump(all_memory_dict, f)
with open(f"{output_folder}/all_bv_set.pkl", "wb") as f:
with open(all_bv_set_file, "wb") as f:
pickle.dump(all_bv_set, f)
if save_traj is True:
pd_traj = pd.DataFrame(traj)
pd_traj.columns = ["status", "x0", "y0", "t0", "x1", "y1",
"t1", "obs"]
pd_traj.to_csv(
f"{output_folder}/trajectory/{participant_id}.csv",
f"{trajectory_folder}/{participant_id}.csv",
index=False
)
if frequency == Frequency.HOURLY_AND_DAILY:
summary_stats1, logs1 = gps_summaries(
traj,
tz_str,
Frequency.HOURLY,
parameters,
places_of_interest,
osm_tags,
)
write_all_summaries(participant_id, summary_stats1,
f"{output_folder}/hourly")
summary_stats2, logs2 = gps_summaries(
traj,
tz_str,
Frequency.DAILY,
parameters,
places_of_interest,
osm_tags,
)
write_all_summaries(participant_id, summary_stats2,
f"{output_folder}/daily")
if parameters.save_osm_log:
os.makedirs(f"{output_folder}/logs", exist_ok=True)
with open(
f"{output_folder}/logs/locations_logs_hourly.json",
"w",
) as hourly:
json.dump(logs1, hourly, indent=4)
with open(
f"{output_folder}/logs/locations_logs_daily.json",
"w",
) as daily:
json.dump(logs2, daily, indent=4)
else:
summary_stats, logs = gps_summaries(
traj,
tz_str,
frequency,
parameters,
places_of_interest,
osm_tags,
)
write_all_summaries(
participant_id, summary_stats, output_folder

# Generate summary stats for each requested frequency (the loop variable is
# named freq because "frequency" is already a parameter of this function).
for freq in frequencies:
gps_stats_generate_summary(
traj=traj,
tz_str=tz_str,
frequency=freq,
participant_id=participant_id,
output_folder=f"{output_folder}/{freq.name.lower()}",
logs_folder=logs_folder,
parameters=parameters,
places_of_interest=places_of_interest,
osm_tags=osm_tags,
)
if parameters.save_osm_log:
os.makedirs(f"{output_folder}/logs", exist_ok=True)
with open(
f"{output_folder}/logs/locations_logs.json",
"w",
) as loc:
json.dump(logs, loc, indent=4)
else:
logger.info(
"GPS data are not collected"
" or the data quality is too low"
)


def gps_stats_generate_summary(
traj: np.ndarray,
tz_str: str,
frequency: Frequency,
participant_id: str,
output_folder: str,
logs_folder: str,
parameters: Hyperparameters,
places_of_interest: Optional[list] = None,
osm_tags: Optional[List[OSMTags]] = None):
"""This is simply the inner functionality of gps_stats_main.
Runs summaries code, writes to disk, saves logs if required. """
summary_stats, logs = gps_summaries(
traj,
tz_str,
frequency,
parameters,
places_of_interest,
osm_tags,
)
write_all_summaries(participant_id, summary_stats, output_folder)
if parameters.save_osm_log:
with open(
f"{logs_folder}/locations_logs_{frequency.name.lower()}.json",
"wa",
) as loc:
json.dump(logs, loc, indent=4)
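
For reference, a self-contained sketch of the per-frequency log write performed here, with a hypothetical logs folder and payload:

import json
import os

logs_folder = "results/logs"      # hypothetical output location
frequency_name = "hourly"         # e.g. Frequency.HOURLY.name.lower()
logs = {"participant_1": ["hypothetical OSM location entry"]}

os.makedirs(logs_folder, exist_ok=True)
with open(f"{logs_folder}/locations_logs_{frequency_name}.json", "w") as loc:
    json.dump(logs, loc, indent=4)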
