diff --git a/docs/source/index.md b/docs/source/index.md index 2bc7c2f2..872ae6b1 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -216,6 +216,9 @@ The summary statistics that are generated are listed below: * - total_mins_out_call - float - The duration (minute) of all outgoing calls. + * - num_uniq_individuals_call_or_text + - int + - The total number of unique individuals who called or texted the subject, or who the subject called or texted. In other words, the total number of individuals with whom the subject had any kind of communication. * - num_s - int - The total number of sent SMS. diff --git a/docs/source/willow.md b/docs/source/willow.md index ccf79fcb..e0ce225c 100644 --- a/docs/source/willow.md +++ b/docs/source/willow.md @@ -36,11 +36,12 @@ ___ | num_in_call | int | Total number of incoming calls | | num_out_call | int | Total number of outgoing calls | | num_mis_call | int | Total number of missed calls -| num_uniq_in_call | float | Total number of unique incoming callers | -| num_uniq_out_call | int | Total number of unique outgoing calls | -| num_uniq_mis_call | float | Total number of unique callers missed | +| num_in_caller | float | Total number of unique incoming callers | +| num_out_caller | int | Total number of unique individuals called (outgoing calls) | +| num_mis_caller | float | Total number of unique callers missed | | total_time_in_call | int | Total amount of minutes spent on incoming calls | | total_time_out_call | int | Total amount of minutes spent on outgoing calls | +| num_uniq_individuals_call_or_text | float | Total number of unique individuals who called or texted the Beiwe user, or who the Beiwe user called or texted. 
The total number of individuals with any communication contact with the Beiwe user | | num_s | float | Total number of sent SMS texts | | num_r | int | Total number of received SMS texts | | num_mms_s | int | Total number of sent MMS texts | @@ -52,6 +53,7 @@ ___ | text_reciprocity_incoming | int | The total number of times a text is sent to a unique person without response | | text_reciprocity_outgoing | int | The total number of times a text is received by a unique person without response | + ## References ## Contact information for questions: diff --git a/forest/jasmine/traj2stats.py b/forest/jasmine/traj2stats.py index 0e7fdfd1..67c0b70c 100644 --- a/forest/jasmine/traj2stats.py +++ b/forest/jasmine/traj2stats.py @@ -388,6 +388,17 @@ def gps_summaries( res += [0] * (2 * len(places_of_interest) + 1) summary_stats.append(res) continue + elif sum(index_rows) == 0 and not split_day_night: + # There is no data and it is daily data, so we need to add empty + # rows + res = [year, month, day] + [0] * 3 + [pd.NA] * 15 + + if places_of_interest is not None: + # add empty data for places of interest + # for daytime/nighttime + other + res += [0] * (2 * len(places_of_interest) + 1) + summary_stats.append(res) + continue temp = traj[index_rows, :] # take a subset which is exactly one hour/day, diff --git a/forest/willow/log_stats.py b/forest/willow/log_stats.py index 1418871b..a40e1be0 100644 --- a/forest/willow/log_stats.py +++ b/forest/willow/log_stats.py @@ -140,6 +140,68 @@ def text_analysis( ) +def text_and_call_analysis( + df_call: pd.DataFrame, df_text: pd.DataFrame, stamp: int, step_size: int +) -> tuple: + """Calculate the summary statistics for anything requiring both call and + text data in the given time interval. 
+ Args: + df_call: pd.DataFrame + dataframe of the call data + df_text: pd.DataFrame + dataframe of the text data + stamp: int + starting timestamp of the interval + step_size: int + length of the interval; it covers [stamp, stamp + step_size) + + Returns: + tuple of summary statistics containing: + num_uniq_individuals_call_or_text: int + number of people making incoming calls or texts to the Beiwe + user or who the Beiwe user made outgoing calls or texts to + + + """ + # filter the data based on the timestamp + if df_call.shape[0] > 0: + temp_call = df_call[ + (df_call["timestamp"] / 1000 >= stamp) + & (df_call["timestamp"] / 1000 < stamp + step_size) + ] + index_in_call = np.array(temp_call["call type"]) == "Incoming Call" + index_out_call = np.array(temp_call["call type"]) == "Outgoing Call" + index_mis_call = np.array(temp_call["call type"]) == "Missed Call" + calls_in = np.array(temp_call["hashed phone number"])[index_in_call] + calls_out = np.array(temp_call["hashed phone number"])[index_out_call] + calls_mis = np.array(temp_call["hashed phone number"])[index_mis_call] + else: # no calls were received, so no unique numbers will be used + calls_in = np.array([]) + calls_out = np.array([]) + calls_mis = np.array([]) + + if df_text.shape[0] > 0: + temp_text = df_text[ + (df_text["timestamp"] / 1000 >= stamp) + & (df_text["timestamp"] / 1000 < stamp + step_size) + ] + + index_s = np.array(temp_text["sent vs received"]) == "sent SMS" + index_r = np.array(temp_text["sent vs received"]) == "received SMS" + texts_in = np.array(temp_text["hashed phone number"])[index_r] + texts_out = np.array(temp_text["hashed phone number"])[index_s] + else: # no texts were received, so no unique numbers will be used + texts_in = np.array([]) + texts_out = np.array([]) + + num_uniq_individuals_call_or_text = len(np.unique(np.hstack( + [calls_in, texts_in, texts_out, calls_out, calls_mis] + ))) + return ( + num_uniq_individuals_call_or_text, + ) + + def call_analysis(df_call: pd.DataFrame, stamp: int, step_size: int) -> tuple: 
"""Calculate the summary statistics for the call data in the given time interval. @@ -148,9 +210,9 @@ def call_analysis(df_call: pd.DataFrame, stamp: int, step_size: int) -> tuple: df_call: pd.DataFrame dataframe of the call data stamp: int - starting timestamp of the study + starting timestamp of the interval step_size: int - ending timestamp of the study + ending timestamp of the interval Returns: tuple of summary statistics containing: @@ -232,9 +294,9 @@ def comm_logs_summaries( df_call: pd.DataFrame dataframe of the call data stamp_start: int - starting timestamp of the study + starting timestamp of the interval stamp_end: int - ending timestamp of the study + ending timestamp of the interval tz_str: str timezone where the study was/is conducted frequency: Frequency class, @@ -288,13 +350,19 @@ def comm_logs_summaries( newline += list(call_stats) else: newline += [pd.NA] * 8 + if df_text.shape[0] > 0 or df_call.shape[0] > 0: + text_and_call_stats = text_and_call_analysis( + df_call, df_text, stamp, step_size + ) + newline += list(text_and_call_stats) + else: + newline += [pd.NA] if df_text.shape[0] > 0: text_stats = text_analysis(df_text, stamp, step_size, frequency) newline += list(text_stats) else: newline += [pd.NA] * 10 - if frequency == Frequency.DAILY: newline = [year, month, day] + newline else: @@ -311,6 +379,7 @@ def comm_logs_summaries( "num_mis_caller", "total_mins_in_call", "total_mins_out_call", + "num_uniq_individuals_call_or_text", "num_s", "num_r", "num_mms_s", @@ -425,6 +494,48 @@ def log_stats_main( tz_str, frequency, ) + # num_uniq_individuals_call_or_text is the cardinality + # of the union of several sets. 
It should always + # be at least as large as the cardinality of any one of + # the sets, and it should never be larger than the sum + # of the cardinalities of all of the sets + # (it may be equal if all the sets are disjoint) + sum_all_set_cols = pd.Series( + [0]*stats_pdframe.shape[0] + ) + for col in [ + "num_s_tel", "num_r_tel", "num_in_caller", + "num_out_caller", "num_mis_caller" + ]: + sum_all_set_cols += stats_pdframe[col] + if ( + stats_pdframe[ + "num_uniq_individuals_call_or_text" + ] < stats_pdframe[col] + ).any(): + logger.error( + "Error: " + "num_uniq_individuals_call_or_text " + "was found to be less than %s for at " + "least one time interval. This error " + "comes from an issue with the code," + " not an issue with the input data", + col + ) + if ( + stats_pdframe[ + "num_uniq_individuals_call_or_text" + ] > sum_all_set_cols + ).any(): + logger.error( + "Error: " + "num_uniq_individuals_call_or_text " + "was found to be larger than the sum " + "of individual cardinalities for at " + "least one time interval. This error " + "comes from an issue with the code," + " not an issue with the input data" + ) write_all_summaries(bid, stats_pdframe, output_folder)