Skip to content

Commit

Permalink
Merge pull request #200 from onnela-lab/add_union_degrees_willow
Browse files Browse the repository at this point in the history
Willow add num_uniq_individuals_call_or_text
  • Loading branch information
GeorgeEfstathiadis authored Sep 20, 2023
2 parents 73ce68b + 26bf3e3 commit 7a916d8
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 8 deletions.
3 changes: 3 additions & 0 deletions docs/source/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,9 @@ The summary statistics that are generated are listed below:
* - total_mins_out_call
- float
- The duration (minute) of all outgoing calls.
* - num_uniq_individuals_call_or_text
- int
- The total number of unique individuals who called or texted the subject, or who the subject called or texted. The total number of individuals who the subject had any kind of communication with.
* - num_s
- int
- The total number of sent SMS.
Expand Down
8 changes: 5 additions & 3 deletions docs/source/willow.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,12 @@ ___
| num_in_call | int | Total number of incoming calls |
| num_out_call | int | Total number of outgoing calls |
| num_mis_call | int | Total number of missed calls |
| num_uniq_in_call | float | Total number of unique incoming callers |
| num_uniq_out_call | int | Total number of unique outgoing calls |
| num_uniq_mis_call | float | Total number of unique callers missed |
| num_in_caller | float | Total number of unique incoming callers |
| num_out_caller | int | Total number of unique outgoing calls |
| num_mis_caller | float | Total number of unique callers missed |
| total_time_in_call | int | Total amount of minutes spent on incoming calls |
| total_time_out_call | int | Total amount of minutes spent on outgoing calls |
| num_uniq_individuals_call_or_text | float | Total number of unique individuals who called or texted the Beiwe user, or who the Beiwe user called or texted. The total number of individuals with any communication contact with the Beiwe user |
| num_s | float | Total number of sent SMS texts |
| num_r | int | Total number of received SMS texts |
| num_mms_s | int | Total number of sent MMS texts |
Expand All @@ -52,6 +53,7 @@ ___
| text_reciprocity_incoming | int | The total number of times a text is sent to a unique person without response |
| text_reciprocity_outgoing | int | The total number of times a text is received by a unique person without response |


## References

## Contact information for questions:
Expand Down
11 changes: 11 additions & 0 deletions forest/jasmine/traj2stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,17 @@ def gps_summaries(
res += [0] * (2 * len(places_of_interest) + 1)
summary_stats.append(res)
continue
elif sum(index_rows) == 0 and not split_day_night:
# There is no data and it is daily data, so we need to add empty
# rows
res = [year, month, day] + [0] * 3 + [pd.NA] * 15

if places_of_interest is not None:
# add empty data for places of interest
# for daytime/nighttime + other
res += [0] * (2 * len(places_of_interest) + 1)
summary_stats.append(res)
continue

temp = traj[index_rows, :]
# take a subset which is exactly one hour/day,
Expand Down
121 changes: 116 additions & 5 deletions forest/willow/log_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,68 @@ def text_analysis(
)


def text_and_call_analysis(
    df_call: pd.DataFrame, df_text: pd.DataFrame, stamp: int, step_size: int
) -> tuple:
    """Calculate the summary statistics for anything requiring both call and
    text data in the given time interval.

    Args:
        df_call: pd.DataFrame
            dataframe of the call data; read columns are "timestamp",
            "call type" and "hashed phone number"
        df_text: pd.DataFrame
            dataframe of the text data; read columns are "timestamp",
            "sent vs received" and "hashed phone number"
        stamp: int
            starting timestamp of the interval (inclusive)
        step_size: int
            length of the interval, in the same units as stamp; the
            interval covered is [stamp, stamp + step_size). The
            "timestamp" columns are divided by 1000 before being compared
            (presumably milliseconds to seconds)
    Returns:
        tuple of summary statistics containing:
            num_uniq_individuals_call_or_text: int
                number of people making incoming calls or texts to the Beiwe
                user or who the Beiwe user made outgoing calls or texts to.
                Missed calls are included. Only rows labelled exactly
                "sent SMS" / "received SMS" are counted for texts.
    """
    stamp_end = stamp + step_size

    # filter the call data based on the timestamp
    if df_call.shape[0] > 0:
        call_secs = df_call["timestamp"] / 1000
        temp_call = df_call[(call_secs >= stamp) & (call_secs < stamp_end)]

        call_types = np.array(temp_call["call type"])
        call_numbers = np.array(temp_call["hashed phone number"])
        calls_in = call_numbers[call_types == "Incoming Call"]
        calls_out = call_numbers[call_types == "Outgoing Call"]
        calls_mis = call_numbers[call_types == "Missed Call"]
    else:  # no call data at all, so calls contribute no unique numbers
        calls_in = np.array([])
        calls_out = np.array([])
        # calls_mis must be defined here too: it is used in the union below
        # even when only text data is available
        calls_mis = np.array([])

    # filter the text data based on the timestamp
    if df_text.shape[0] > 0:
        text_secs = df_text["timestamp"] / 1000
        temp_text = df_text[(text_secs >= stamp) & (text_secs < stamp_end)]

        text_types = np.array(temp_text["sent vs received"])
        text_numbers = np.array(temp_text["hashed phone number"])
        texts_in = text_numbers[text_types == "received SMS"]
        texts_out = text_numbers[text_types == "sent SMS"]
    else:  # no text data at all, so texts contribute no unique numbers
        texts_in = np.array([])
        texts_out = np.array([])

    # cardinality of the union of all contacted / contacting numbers
    num_uniq_individuals_call_or_text = len(np.unique(np.hstack(
        [calls_in, texts_in, texts_out, calls_out, calls_mis]
    )))
    return (
        num_uniq_individuals_call_or_text,
    )


def call_analysis(df_call: pd.DataFrame, stamp: int, step_size: int) -> tuple:
"""Calculate the summary statistics for the call data
in the given time interval.
Expand All @@ -148,9 +210,9 @@ def call_analysis(df_call: pd.DataFrame, stamp: int, step_size: int) -> tuple:
df_call: pd.DataFrame
dataframe of the call data
stamp: int
starting timestamp of the study
starting timestamp of the interval
step_size: int
ending timestamp of the study
ending timestamp of the interval
Returns:
tuple of summary statistics containing:
Expand Down Expand Up @@ -232,9 +294,9 @@ def comm_logs_summaries(
df_call: pd.DataFrame
dataframe of the call data
stamp_start: int
starting timestamp of the study
starting timestamp of the interval
stamp_end: int
ending timestamp of the study
ending timestamp of the interval
tz_str: str
timezone where the study was/is conducted
frequency: Frequency class,
Expand Down Expand Up @@ -288,13 +350,19 @@ def comm_logs_summaries(
newline += list(call_stats)
else:
newline += [pd.NA] * 8
if df_text.shape[0] > 0 or df_call.shape[0] > 0:
text_and_call_stats = text_and_call_analysis(
df_call, df_text, stamp, step_size
)
newline += list(text_and_call_stats)
else:
newline += [pd.NA]

if df_text.shape[0] > 0:
text_stats = text_analysis(df_text, stamp, step_size, frequency)
newline += list(text_stats)
else:
newline += [pd.NA] * 10

if frequency == Frequency.DAILY:
newline = [year, month, day] + newline
else:
Expand All @@ -311,6 +379,7 @@ def comm_logs_summaries(
"num_mis_caller",
"total_mins_in_call",
"total_mins_out_call",
"num_uniq_individuals_call_or_text",
"num_s",
"num_r",
"num_mms_s",
Expand Down Expand Up @@ -425,6 +494,48 @@ def log_stats_main(
tz_str,
frequency,
)
# num_uniq_individuals_call_or_text is the cardinality
# of the union of several sets. It should always
# be at least as large as the cardinality of any one of
# the sets, and it should never be larger than the sum
# of the cardinalities of all of the sets
# (it may be equal if all the sets are disjoint)
sum_all_set_cols = pd.Series(
[0]*stats_pdframe.shape[0]
)
for col in [
"num_s_tel", "num_r_tel", "num_in_caller",
"num_out_caller", "num_mis_caller"
]:
sum_all_set_cols += stats_pdframe[col]
if (
stats_pdframe[
"num_uniq_individuals_call_or_text"
] < stats_pdframe[col]
).any():
logger.error(
"Error: "
"num_uniq_individuals_call_or_text "
"was found to be less than %s for at "
"least one time interval. This error "
"comes from an issue with the code,"
" not an issue with the input data",
col
)
if (
stats_pdframe[
"num_uniq_individuals_call_or_text"
] > sum_all_set_cols
).any():
logger.error(
"Error: "
"num_uniq_individuals_call_or_text "
"was found to be larger than the sum "
"of individual cardinalities for at "
"least one time interval. This error "
"comes from an issue with the code,"
" not an issue with the input data"
)

write_all_summaries(bid, stats_pdframe, output_folder)

Expand Down

0 comments on commit 7a916d8

Please sign in to comment.