From f95e7875e8f30f539b2853739c16a82f2700e8e3 Mon Sep 17 00:00:00 2001 From: Zachary Clement Date: Fri, 15 Sep 2023 11:21:54 -0400 Subject: [PATCH 1/7] add indegree and outdegree for calls OR texts --- forest/willow/log_stats.py | 69 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/forest/willow/log_stats.py b/forest/willow/log_stats.py index 1418871b..2ec4a502 100644 --- a/forest/willow/log_stats.py +++ b/forest/willow/log_stats.py @@ -140,6 +140,67 @@ def text_analysis( ) +def text_and_call_analysis(df_call: pd.DataFrame, df_text: pd.DataFrame, stamp: int, step_size: int) -> tuple: + """Calculate the summary statistics for the call data + in the given time interval. + + Args: + df_call: pd.DataFrame + dataframe of the call data + df_text: pd.DataFrame + dataframe of the text data + stamp: int + starting timestamp of the study + step_size: int + ending timestamp of the study + + Returns: + tuple of summary statistics containing: + num_uniq_in_call_or_text: int + number of people making incoming calls or texts to the Beiwe + user + num_uniq_out_call_or_text: int + number of people receiving outgoing calls or texts from the + Beiwe user + + """ + # filter the data based on the timestamp + if df_call.shape > 0 + temp_call = df_call[ + (df_call["timestamp"] / 1000 >= stamp) + & (df_call["timestamp"] / 1000 < stamp + step_size) + ] + index_in_call = np.array(temp_call["call type"]) == "Incoming Call" + index_out_call = np.array(temp_call["call type"]) == "Outgoing Call" + calls_in = np.array(temp_call["hashed phone number"])[index_in_call] + calls_out = np.array(temp_call["hashed phone number"])[index_out_call] + else: ## no calls were received, so no unique numbers will be used + calls_in = np.array([]) + calls_out = np.array([]) + + if df_text.shape > 0: + temp_text = df_text[ + (df_text["timestamp"] / 1000 >= stamp) + & (df_text["timestamp"] / 1000 < stamp + step_size) + ] + + index_s = np.array(temp_text["sent vs received"]) == 
"sent SMS" + index_r = np.array(temp_text["sent vs received"]) == "received SMS" + texts_in = np.array(temp_text["hashed phone number"])[index_r] + texts_out = np.array(temp_text["hashed phone number"])[index_s] + else: ## no texts were received, so no unique numbers will be used + texts_in = np.array([]) + texts_out = np.array([]) + + num_uniq_in_call_or_text = len(np.unique(np.hstack(calls_in, texts_in))) + num_uniq_out_call_or_text = len(np.unique(np.hstack(texts_out, calls_out))) + + return ( + num_uniq_in_call_or_text, + num_uniq_out_call_or_text + ) + + def call_analysis(df_call: pd.DataFrame, stamp: int, step_size: int) -> tuple: """Calculate the summary statistics for the call data in the given time interval. @@ -288,6 +349,11 @@ def comm_logs_summaries( newline += list(call_stats) else: newline += [pd.NA] * 8 + if df_text.shape[0] > 0 or df_call.shape[0] > 0: + text_and_call_stats = text_and_call_analysis(df_text, df_call, stamp, step_size, frequency) + newline += list(text_and_call_stats) + else: + newline += [pd.NA] * 2 if df_text.shape[0] > 0: text_stats = text_analysis(df_text, stamp, step_size, frequency) @@ -295,6 +361,7 @@ def comm_logs_summaries( else: newline += [pd.NA] * 10 + if frequency == Frequency.DAILY: newline = [year, month, day] + newline else: @@ -311,6 +378,8 @@ def comm_logs_summaries( "num_mis_caller", "total_mins_in_call", "total_mins_out_call", + "num_uniq_in_call_or_text", + "num_uniq_out_call_or_text", "num_s", "num_r", "num_mms_s", From e0b6cd8aeb5c548fa54fde752d561f8fdbbe755d Mon Sep 17 00:00:00 2001 From: Zachary Clement Date: Fri, 15 Sep 2023 11:22:33 -0400 Subject: [PATCH 2/7] add summary statistics for willow --- docs/source/index.md | 6 ++++++ docs/source/willow.md | 3 +++ 2 files changed, 9 insertions(+) diff --git a/docs/source/index.md b/docs/source/index.md index 2bc7c2f2..5ed87a46 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -216,6 +216,12 @@ The summary statistics that are generated are listed 
below: * - total_mins_out_call - float - The duration (minute) of all outgoing calls. + * - num_uniq_in_call_or_text + - int + - The total number of unique individuals who called or texted the subject. +* - num_uniq_out_call_or_text + - int + - The total number of unique individuals called or texted by the subject. * - num_s - int - The total number of sent SMS. diff --git a/docs/source/willow.md b/docs/source/willow.md index ccf79fcb..c2647148 100644 --- a/docs/source/willow.md +++ b/docs/source/willow.md @@ -41,6 +41,8 @@ ___ | num_uniq_mis_call | float | Total number of unique callers missed | | total_time_in_call | int | Total amount of minutes spent on incoming calls | | total_time_out_call | int | Total amount of minutes spent on outgoing calls | +| num_uniq_in_call_or_text | float | Total number of unique individuals who called or texted the Beiwe user | +| num_uniq_out_call_or_text | int | Total number of unique individuals who the Beiwe user called or texted | | num_s | float | Total number of sent SMS texts | | num_r | int | Total number of received SMS texts | | num_mms_s | int | Total number of sent MMS texts | @@ -52,6 +54,7 @@ ___ | text_reciprocity_incoming | int | The total number of times a text is sent to a unique person without response | | text_reciprocity_outgoing | int | The total number of times a text is received by a unique person without response | + ## References ## Contact information for questions: From 0f552f5047aff23c105dc9328b9342d02f9649c5 Mon Sep 17 00:00:00 2001 From: Zachary Clement Date: Fri, 15 Sep 2023 11:29:29 -0400 Subject: [PATCH 3/7] don't distinguish between incoming and outgoing calls/texts --- docs/source/index.md | 7 ++----- docs/source/willow.md | 9 ++++----- forest/willow/log_stats.py | 22 ++++++++++------------ 3 files changed, 16 insertions(+), 22 deletions(-) diff --git a/docs/source/index.md b/docs/source/index.md index 5ed87a46..872ae6b1 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -216,12 
+216,9 @@ The summary statistics that are generated are listed below: * - total_mins_out_call - float - The duration (minute) of all outgoing calls. - * - num_uniq_in_call_or_text + * - num_uniq_individuals_call_or_text - int - - The total number of unique individuals who called or texted the subject. -* - num_uniq_out_call_or_text - - int - - The total number of unique individuals called or texted by the subject. + - The total number of unique individuals who called or texted the subject, or who the subject called or texted. The total number of individuals who the subject had any kind of communication with. * - num_s - int - The total number of sent SMS. diff --git a/docs/source/willow.md b/docs/source/willow.md index c2647148..e0ce225c 100644 --- a/docs/source/willow.md +++ b/docs/source/willow.md @@ -36,13 +36,12 @@ ___ | num_in_call | int | Total number of incoming calls | | num_out_call | int | Total number of outgoing calls | | num_mis_call | int | Total number of missed calls -| num_uniq_in_call | float | Total number of unique incoming callers | -| num_uniq_out_call | int | Total number of unique outgoing calls | -| num_uniq_mis_call | float | Total number of unique callers missed | +| num_in_caller | float | Total number of unique incoming callers | +| num_out_caller | int | Total number of unique outgoing calls | +| num_mis_caller | float | Total number of unique callers missed | | total_time_in_call | int | Total amount of minutes spent on incoming calls | | total_time_out_call | int | Total amount of minutes spent on outgoing calls | -| num_uniq_in_call_or_text | float | Total number of unique individuals who called or texted the Beiwe user | -| num_uniq_out_call_or_text | int | Total number of unique individuals who the Beiwe user called or texted | +| num_uniq_individuals_call_or_text | float | Total number of unique individuals who called or texted the Beiwe user, or who the Beiwe user called or texted. 
The total number of individuals with any communication contact with the Beiwe user | | num_s | float | Total number of sent SMS texts | | num_r | int | Total number of received SMS texts | | num_mms_s | int | Total number of sent MMS texts | diff --git a/forest/willow/log_stats.py b/forest/willow/log_stats.py index 2ec4a502..9c42a86a 100644 --- a/forest/willow/log_stats.py +++ b/forest/willow/log_stats.py @@ -156,12 +156,10 @@ def text_and_call_analysis(df_call: pd.DataFrame, df_text: pd.DataFrame, stamp: Returns: tuple of summary statistics containing: - num_uniq_in_call_or_text: int + num_uniq_individuals_call_or_text: int number of people making incoming calls or texts to the Beiwe - user - num_uniq_out_call_or_text: int - number of people receiving outgoing calls or texts from the - Beiwe user + user or who the Beiwe user made outgoing calls or texts to + """ # filter the data based on the timestamp @@ -192,12 +190,13 @@ def text_and_call_analysis(df_call: pd.DataFrame, df_text: pd.DataFrame, stamp: texts_in = np.array([]) texts_out = np.array([]) - num_uniq_in_call_or_text = len(np.unique(np.hstack(calls_in, texts_in))) - num_uniq_out_call_or_text = len(np.unique(np.hstack(texts_out, calls_out))) + num_uniq_individuals_call_or_text = len(np.unique(np.hstack( + calls_in, texts_in, texts_out, calls_out + ))) + return ( - num_uniq_in_call_or_text, - num_uniq_out_call_or_text + num_uniq_individuals_call_or_text, ) @@ -353,7 +352,7 @@ def comm_logs_summaries( text_and_call_stats = text_and_call_analysis(df_text, df_call, stamp, step_size, frequency) newline += list(text_and_call_stats) else: - newline += [pd.NA] * 2 + newline += [pd.NA] if df_text.shape[0] > 0: text_stats = text_analysis(df_text, stamp, step_size, frequency) @@ -378,8 +377,7 @@ def comm_logs_summaries( "num_mis_caller", "total_mins_in_call", "total_mins_out_call", - "num_uniq_in_call_or_text", - "num_uniq_out_call_or_text", + "num_uniq_individuals_call_or_text", "num_s", "num_r", "num_mms_s", From 
f86e20fa73fcce1b01a9bf011eee0a3fed43d284 Mon Sep 17 00:00:00 2001 From: Zachary Clement Date: Mon, 18 Sep 2023 20:27:59 -0400 Subject: [PATCH 4/7] mypy fixes, data validation checks --- forest/willow/log_stats.py | 68 +++++++++++++++++++++++++++++++------- 1 file changed, 57 insertions(+), 11 deletions(-) diff --git a/forest/willow/log_stats.py b/forest/willow/log_stats.py index 9c42a86a..d931bf30 100644 --- a/forest/willow/log_stats.py +++ b/forest/willow/log_stats.py @@ -140,7 +140,9 @@ def text_analysis( ) -def text_and_call_analysis(df_call: pd.DataFrame, df_text: pd.DataFrame, stamp: int, step_size: int) -> tuple: +def text_and_call_analysis( + df_call: pd.DataFrame, df_text: pd.DataFrame, stamp: int, step_size: int +) -> tuple: """Calculate the summary statistics for the call data in the given time interval. @@ -163,20 +165,24 @@ def text_and_call_analysis(df_call: pd.DataFrame, df_text: pd.DataFrame, stamp: """ # filter the data based on the timestamp - if df_call.shape > 0 + if df_call.shape[0] > 0: temp_call = df_call[ (df_call["timestamp"] / 1000 >= stamp) & (df_call["timestamp"] / 1000 < stamp + step_size) ] index_in_call = np.array(temp_call["call type"]) == "Incoming Call" index_out_call = np.array(temp_call["call type"]) == "Outgoing Call" + index_mis_call = np.array(temp_call["call type"]) == "Missed Call" calls_in = np.array(temp_call["hashed phone number"])[index_in_call] calls_out = np.array(temp_call["hashed phone number"])[index_out_call] - else: ## no calls were received, so no unique numbers will be used + calls_mis = np.array(temp_call["hashed phone number"])[index_mis_call] + + else: # no calls were received, so no unique numbers will be used calls_in = np.array([]) calls_out = np.array([]) + calls_mis = np.array([]) - if df_text.shape > 0: + if df_text.shape[0] > 0: temp_text = df_text[ (df_text["timestamp"] / 1000 >= stamp) & (df_text["timestamp"] / 1000 < stamp + step_size) @@ -186,15 +192,13 @@ def text_and_call_analysis(df_call: pd.DataFrame, df_text: 
pd.DataFrame, stamp: index_r = np.array(temp_text["sent vs received"]) == "received SMS" texts_in = np.array(temp_text["hashed phone number"])[index_r] texts_out = np.array(temp_text["hashed phone number"])[index_s] - else: ## no texts were received, so no unique numbers will be used + else: # no texts were received, so no unique numbers will be used texts_in = np.array([]) texts_out = np.array([]) num_uniq_individuals_call_or_text = len(np.unique(np.hstack( - calls_in, texts_in, texts_out, calls_out + [calls_in, texts_in, texts_out, calls_out, calls_mis] ))) - - return ( num_uniq_individuals_call_or_text, ) @@ -349,7 +352,9 @@ def comm_logs_summaries( else: newline += [pd.NA] * 8 if df_text.shape[0] > 0 or df_call.shape[0] > 0: - text_and_call_stats = text_and_call_analysis(df_text, df_call, stamp, step_size, frequency) + text_and_call_stats = text_and_call_analysis( + df_call, df_text, stamp, step_size + ) newline += list(text_and_call_stats) else: newline += [pd.NA] @@ -359,8 +364,6 @@ def comm_logs_summaries( newline += list(text_stats) else: newline += [pd.NA] * 10 - - if frequency == Frequency.DAILY: newline = [year, month, day] + newline else: @@ -492,6 +495,48 @@ def log_stats_main( tz_str, frequency, ) + # num_uniq_individuals_call_or_text is the cardinality + # of the union of several sets. It should always + # be at least as large as the cardinality of any one of + # the sets, and it should never be larger than the sum + # of the cardinalities of all of the sets + # (it may be equal if all the sets are disjoint) + sum_all_set_cols = pd.Series( + [0]*stats_pdframe.shape[0] + ) + for col in [ + "num_s_tel", "num_r_tel", "num_in_caller", + "num_out_caller", "num_mis_caller" + ]: + sum_all_set_cols += stats_pdframe[col] + if ( + stats_pdframe[ + "num_uniq_individuals_call_or_text" + ] < stats_pdframe[col] + ).any(): + logger.error( + "Error: " + "num_uniq_individuals_call_or_text " + "was found to be less than %s for at " + "least one time interval. 
This error " + "comes from an issue with the code," + " not an issue with the input data", + col + ) + if ( + stats_pdframe[ + "num_uniq_individuals_call_or_text" + ] > sum_all_set_cols + ).any(): + logger.error( + "Error: " + "num_uniq_individuals_call_or_text " + "was found to be larger than the sum " + "of individual cardinalities for at " + "least one time interval. This error " + "comes from an issue with the code," + " not an issue with the input data" + ) write_all_summaries(bid, stats_pdframe, output_folder) From 800eb218ca8e23ec7c9da925b6c052b7a899be5d Mon Sep 17 00:00:00 2001 From: Zachary Clement Date: Tue, 19 Sep 2023 19:43:50 -0400 Subject: [PATCH 5/7] addition to jasmine to stop an error that happened, also updated docstring --- forest/jasmine/traj2stats.py | 10 ++++++++++ forest/willow/log_stats.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/forest/jasmine/traj2stats.py b/forest/jasmine/traj2stats.py index 0e7fdfd1..877d2477 100644 --- a/forest/jasmine/traj2stats.py +++ b/forest/jasmine/traj2stats.py @@ -388,6 +388,16 @@ def gps_summaries( res += [0] * (2 * len(places_of_interest) + 1) summary_stats.append(res) continue + elif sum(index_rows) == 0 and not split_day_night: + ## There is no data and it is daily data, so we need to add empty rows + res = [year, month, day] + [0] * 3 + [pd.NA] * 15 + + if places_of_interest is not None: + # add empty data for places of interest + # for daytime/nighttime + other + res += [0] * (2 * len(places_of_interest) + 1) + summary_stats.append(res) + continue temp = traj[index_rows, :] # take a subset which is exactly one hour/day, diff --git a/forest/willow/log_stats.py b/forest/willow/log_stats.py index d931bf30..aae5fd82 100644 --- a/forest/willow/log_stats.py +++ b/forest/willow/log_stats.py @@ -143,7 +143,7 @@ def text_analysis( def text_and_call_analysis( df_call: pd.DataFrame, df_text: pd.DataFrame, stamp: int, step_size: int ) -> tuple: - """Calculate the summary statistics for the 
call data + """Calculate the summary statistics for the call and text data in the given time interval. Args: From 837ac83af13fbcbd8a2d1c11d8febd661f7cf965 Mon Sep 17 00:00:00 2001 From: Zachary Clement Date: Tue, 19 Sep 2023 19:47:52 -0400 Subject: [PATCH 6/7] more docstring updates --- forest/willow/log_stats.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/forest/willow/log_stats.py b/forest/willow/log_stats.py index aae5fd82..08b948a9 100644 --- a/forest/willow/log_stats.py +++ b/forest/willow/log_stats.py @@ -143,7 +143,7 @@ def text_analysis( def text_and_call_analysis( df_call: pd.DataFrame, df_text: pd.DataFrame, stamp: int, step_size: int ) -> tuple: - """Calculate the summary statistics for the call and text data + """Calculate the summary statistics for anything requiring both call and text data in the given time interval. Args: @@ -152,9 +152,9 @@ def text_and_call_analysis( df_text: pd.DataFrame dataframe of the text data stamp: int - starting timestamp of the study + starting timestamp of the interval step_size: int - ending timestamp of the study + length of the interval, added to stamp to get its end Returns: tuple of summary statistics containing: @@ -211,9 +211,9 @@ def call_analysis(df_call: pd.DataFrame, stamp: int, step_size: int) -> tuple: df_call: pd.DataFrame dataframe of the call data stamp: int - starting timestamp of the study + starting timestamp of the interval step_size: int - ending timestamp of the study + ending timestamp of the interval Returns: tuple of summary statistics containing: @@ -295,9 +295,9 @@ def comm_logs_summaries( df_call: pd.DataFrame dataframe of the call data stamp_start: int - starting timestamp of the study + starting timestamp of the interval stamp_end: int - ending timestamp of the study + ending timestamp of the interval tz_str: str timezone where the study was/is conducted frequency: Frequency class, From 26bf3e31350dae38309bcf318e4153e1f1a02c66 Mon Sep 17 00:00:00 2001 From: Zachary Clement 
Date: Tue, 19 Sep 2023 19:53:18 -0400 Subject: [PATCH 7/7] pep8 fixes --- forest/jasmine/traj2stats.py | 3 ++- forest/willow/log_stats.py | 5 ++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/forest/jasmine/traj2stats.py b/forest/jasmine/traj2stats.py index 877d2477..67c0b70c 100644 --- a/forest/jasmine/traj2stats.py +++ b/forest/jasmine/traj2stats.py @@ -389,7 +389,8 @@ def gps_summaries( summary_stats.append(res) continue elif sum(index_rows) == 0 and not split_day_night: - ## There is no data and it is daily data, so we need to add empty rows + # There is no data and it is daily data, so we need to add empty + # rows res = [year, month, day] + [0] * 3 + [pd.NA] * 15 if places_of_interest is not None: diff --git a/forest/willow/log_stats.py b/forest/willow/log_stats.py index 08b948a9..a40e1be0 100644 --- a/forest/willow/log_stats.py +++ b/forest/willow/log_stats.py @@ -143,9 +143,8 @@ def text_analysis( def text_and_call_analysis( df_call: pd.DataFrame, df_text: pd.DataFrame, stamp: int, step_size: int ) -> tuple: - """Calculate the summary statistics for anything requiring both call and text data - in the given time interval. - + """Calculate the summary statistics for anything requiring both call and + text data in the given time interval. Args: df_call: pd.DataFrame dataframe of the call data