From 1a9117992106c4dcc19153cbeb97d227a552f797 Mon Sep 17 00:00:00 2001 From: Julia Poncela-Casasnovas Date: Mon, 15 Oct 2018 15:17:16 -0500 Subject: [PATCH] added more codes CK time series& cutting --- Haroldo_weigh_history_sel.py | 59 ++ ...equency_gaps_in_time_series_frequencies.py | 220 +++++++ ...gaps_in_time_series_frequencies_EDIT_DB.py | 266 ++++++++ ...s_average_first_isolated_points_EDIT_DB.py | 244 +++++++ ...equencies_first_isolated_points_EDIT_DB.py | 291 +++++++++ auxiliar.py | 64 ++ ...segments_time_series_for_Mech_Turk_task.py | 62 ++ baduserslist.py | 54 ++ ...es_Bonnies_lab_objective_weight_history.py | 157 +++++ ...s_Bonnies_lab_subjective_weight_history.py | 218 +++++++ ...e_and_populate_tables_MT_time_series_DB.py | 164 +++++ ...bles_MT_time_series_DB_free_time_stamps.py | 168 +++++ ...plot_scripts_for_multiplot_time_series_.py | 105 +++ get_more_statistics_time_series_cutting.py | 246 ++----- plot-freq-vs-fit-cuts.py | 253 ++++++++ ..._histogram_DW_score_cutting_time_series.py | 156 +++++ ...togram_number_cuts_per_user_time_series.py | 101 +++ plot_scatter_plots.py | 110 ++++ plottools_pygrace_Laura.py | 394 ++++++++++++ queries_weight_ins_and_activity_histogram.py | 12 +- rearrange_fake_time_series.py | 66 ++ segmentation_time_series.py | 5 +- ...ation_time_series_frequencies_iterative.py | 4 +- ...quencies_iterative_meaningful_threshold.py | 22 +- ..._iterative_meaningful_threshold_EDIT_DB.py | 400 ++++++++++++ ...gful_threshold_include_gap_info_EDIT_DB.py | 576 +++++++++++++++++ segmentation_time_series_iterative.py | 3 +- ...s_queries_weight_loss_50top_frequencies.py | 16 +- ...eight_loss_with_frequencies_and_filters.py | 8 +- ...n_only_positive_negative_plus_freq_info.py | 20 +- ...or_time_series_freq_weigh_cuts_and_gaps.py | 455 +++++++++++++ weight_change_after_gap.py | 330 ++++++++++ weight_change_after_gap_R6comparison.py | 607 ++++++++++++++++++ 33 files changed, 5633 insertions(+), 223 deletions(-) create mode 100644 Haroldo_weigh_history_sel.py create mode 100644 analyze_frequency_gaps_in_time_series_frequencies.py create mode 100644 analyze_frequency_gaps_in_time_series_frequencies_EDIT_DB.py create mode 100644 analyze_frequency_gaps_in_time_series_frequencies_defined_as6times_average_first_isolated_points_EDIT_DB.py create mode 100644 analyze_frequency_gaps_in_time_series_frequencies_first_isolated_points_EDIT_DB.py create mode 100644 auxiliar.py create mode 100644 average_segments_time_series_for_Mech_Turk_task.py create mode 100644 baduserslist.py create mode 100644 create_and_populate_tables_Bonnies_lab_objective_weight_history.py create mode 100644 create_and_populate_tables_Bonnies_lab_subjective_weight_history.py create mode 100644 create_and_populate_tables_MT_time_series_DB.py create mode 100644 create_and_populate_tables_MT_time_series_DB_free_time_stamps.py create mode 100644 generate_gnuplot_scripts_for_multiplot_time_series_.py create mode 100644 plot-freq-vs-fit-cuts.py create mode 100644 plot_histogram_DW_score_cutting_time_series.py create mode 100644 plot_histogram_number_cuts_per_user_time_series.py create mode 100644 plot_scatter_plots.py create mode 100644 plottools_pygrace_Laura.py create mode 100644 rearrange_fake_time_series.py create mode 100644 segmentation_time_series_frequencies_iterative_meaningful_threshold_EDIT_DB.py create mode 100644 segmentation_time_series_frequencies_iterative_meaningful_threshold_include_gap_info_EDIT_DB.py create mode 100644 transition_matrix_for_time_series_freq_weigh_cuts_and_gaps.py create mode 
100644 weight_change_after_gap.py create mode 100644 weight_change_after_gap_R6comparison.py diff --git a/Haroldo_weigh_history_sel.py b/Haroldo_weigh_history_sel.py new file mode 100644 index 0000000..96ace29 --- /dev/null +++ b/Haroldo_weigh_history_sel.py @@ -0,0 +1,59 @@ +#! /usr/bin/env python + +import sys +import os +from database import * +from datetime import * + +def main (): + + database="calorie_king_social_networking_2010" + server="tarraco.chem-eng.northwestern.edu" + user="calorieking" + passwd="n1ckuDB!" + + db=Connection(server, database, user, passwd) + + query="""Select `weigh_in_cuts`.`id` , `weigh_in_cuts`.`ck_id`, `weigh_in_cuts`.`fit_type`, `weigh_in_cuts`.`start_idx`, `weigh_in_cuts`.`stop_idx`, + `weigh_in_cuts`.`param1`,`weigh_in_cuts`.`param2`,`weigh_in_cuts`.`param3` + From `weigh_in_cuts` + Order by `weigh_in_cuts`.`ck_id`""" + selfile=db.query(query) + + + + + for row in selfile: + query="SELECT weigh_in_history.ck_id, weigh_in_history.on_day,weigh_in_history.weight FROM weigh_in_history WHERE weigh_in_history.ck_id='" + row['ck_id'] + "' Order by weigh_in_history.on_day" + res=db.query(query) + + + first_day=row['start_day'] + last_day=row['stop_day'] + + fist_date=res[0]['on_date'] + last_date=res[-1]['on_date'] + + + count=0 + for his in res: + + + day=his['on_day'] + if (day>=) and (count> file, ck_id + "\t" + str(on_day) +"\t" + str(weigh) + + +if __name__== "__main__": + main() diff --git a/analyze_frequency_gaps_in_time_series_frequencies.py b/analyze_frequency_gaps_in_time_series_frequencies.py new file mode 100644 index 0000000..03e2689 --- /dev/null +++ b/analyze_frequency_gaps_in_time_series_frequencies.py @@ -0,0 +1,220 @@ + +#! /usr/bin/env python + +""" +Created by Julia Poncela of October 2011 + +Given a file for a non-stationary time serie, it calculates the optimum points to cut it, that mark different trends. + +More info: It follows the method proposed by Fukuda, Stanley and Amaral PRL 69, 2004. + + + +""" + + +import sys +import os +from datetime import * +import math +import numpy as np +from scipy import stats +from database import * #codigo para manejar bases de datos + + +def main (): + + + top=50 #max: 8921 with filters + + + + database = "calorie_king_social_networking_2010" + server="tarraco.chem-eng.northwestern.edu" + user="calorieking" + passwd="n1ckuDB!" + + db= Connection(server, database, user, passwd) + + + db.execute ("DROP TABLE IF EXISTS gaps_by_frequency") #i remove the (old) table + + + db.execute (""" + CREATE TABLE gaps_by_frequency + ( + file_index INT, + ck_id CHAR (20), + start_date INT, + end_date INT, + start_day INT, + end_day INT, + days_gap INT, + zscore_gap FLOAT + ) + """) # if i use triple quotation marks, i can have jumps of line no problem, but not with single ones + + #query="""describe gaps_by_frequency""" + #db.execute ("DROP TABLE IF EXISTS animal") + # query="""show tables""" + + + + + + query="""select * from gaps_by_frequency""" + + + + # db.execute ("INSERT INTO gaps_by_frequency (file_index, ck_id, start_date, end_date, start_day, end_day, days_gap, std_freq, zscore_gap) VALUES (1, 'reptile',7, 4,1,20,18, 2.,3.) 
") + + # db.execute ("INSERT INTO gaps_by_frequency (file_index, ck_id, start_date, end_date, start_day, end_day, days_gap, std_freq, zscore_gap) VALUES ("+str(1)+", 'reptile',"+str(1)+", "+str(1)+","+str(1)+","+str(1)+","+str(1)+", "+str(1.)+","+str(1.)+") ") + + + + + #query="""show tables""" + + + # query="""select * from gaps_by_frequency""" + # result1 = db.query(query) # is a list of dict. + # for r1 in result1: + # print r1 + + + + + + list_all_average_frequencies=[] + histogram_all_freq_no_averaged=[0]*1000 + num_events_all_freq_no_averaged=0. + + + for index_file in range(top): + index_file+=1 + + list_average_frequencies_one_user=[] + histogram_idiv=[0]*1000 + num_events_indiv=0. + + +#input file: + file_name="temporal_series/most_weigh_ins/weigh_in_time_serie_days"+str(index_file)+"_top50" + #file_name="temporal_series/most_weigh_ins/weigh_in_time_serie_days"+str(index_file)+"_filters" + + + + file=open(file_name+".dat",'r') + list_lines_file=file.readlines() + + + list_dates=[] + list_days=[] + list_frequencies=[] + cont=0 + for line in list_lines_file: + if cont>0: # i skip the first line,cos it doesnt have an associated freq. + + list=line.split(" ") + + ck_id=list[10] + + print line + try: + list_frequencies.append(float(list[9])) #frequency + list_days.append(float(list[4])) #relative day + list_dates.append(list[7]) #dates + + + except IndexError: + + list_frequencies.append(float(0.0)) #frequency + list_days.append(float(list[4])) #day + list_dates.append(list[7]) #dates + + cont+=1 + + print list_dates + + print "\n\n" + + list_zscores= stats.zs(list_frequencies) + + for i in range(len(list_zscores)): + + if list_zscores[i] >=3.0: # statistically significant gap if zs>=3 std + if list_frequencies[i] >15.:# dont consider it a gap if it is shorter than 2weeks + if i>2: #or happens for the very second measurement + + print "on file",index_file,"between days:",list_days[i-1],"-",list_days[i], "there is a gap. freq:", list_frequencies[i],"zscore:",list_zscores[i] + + time_gap=list_days[i]-list_days[i-1] + + db.execute ("INSERT INTO gaps_by_frequency (file_index, ck_id, start_date, end_date, start_day, end_day, days_gap, zscore_gap) VALUES ("+str(index_file)+", "+str(ck_id)+","+str(list_dates[i-1])+", "+str(list_dates[i])+","+str(list_days[i-1])+","+str(list_days[i])+","+str(time_gap)+", "+str(list_zscores[i])+" ") + + + + print "\n","on file",index_file,"mean freq:",np.asanyarray(list_frequencies).mean(axis=0),"std:",np.asanyarray(list_frequencies).std(axis=0, ddof=0) + + + + + + + + + raw_input() + + +################################## + +def zscore(a, axis=0, ddof=0): + """ + Calculates the z score of each value in the sample, relative to the sample + mean and standard deviation. + + Parameters + ---------- + a: array_like + An array like object containing the sample data + axis: int or None, optional + If axis is equal to None, the array is first ravel'd. If axis is an + integer, this is the axis over which to operate. Defaults to 0. + + + ddof : int, optional + Degrees of freedom correction in the calculation + of the standard deviation. Default is 0. + + Returns + ------- + zscore: array_like + the z-scores, standardized by mean and standard deviation of input + array + + Notes + ----- + This function does not convert array classes, and works also with + matrices and masked arrays. 
+ + """ + a = np.asanyarray(a) + mns = a.mean(axis=axis) + sstd = a.std(axis=axis, ddof=ddof) + if axis and mns.ndim < a.ndim: + return ((a - np.expand_dims(mns, axis=axis) / + np.expand_dims(sstd,axis=axis))) + else: + return (a - mns) / sstd + + + +######################### + +if __name__== "__main__": + + main() + + + +############################################## diff --git a/analyze_frequency_gaps_in_time_series_frequencies_EDIT_DB.py b/analyze_frequency_gaps_in_time_series_frequencies_EDIT_DB.py new file mode 100644 index 0000000..5b38dcc --- /dev/null +++ b/analyze_frequency_gaps_in_time_series_frequencies_EDIT_DB.py @@ -0,0 +1,266 @@ + +#! /usr/bin/env python + +""" +Created by Julia Poncela on February 2012. + +Given a set of time series (in files), calculates the outliers in terms of times between events, to find statistically significant gaps. +Also, adds a new table to the original DB with the gap info. + + + + +""" + + +import sys +import os +from datetime import * +import math +import numpy as np +from scipy import stats +from database import * #package to handle databases + + +def main (): + + + top=8924 #max: 8924 for the files with filters (>=10 days, >=10weigh-ins >= 1/30 weigh-ins per day). max:50 for the top50 longest time series (no filter) + + + zscore_threshold=1. # it is a statistically significant gap if zs>= X std + + min_freq=10. # to consider something a gap + + database = "calorie_king_social_networking_2010" + server="tarraco.chem-eng.northwestern.edu" + user="calorieking" + passwd="n1ckuDB!" + + db= Connection(server, database, user, passwd) + + + db.execute ("DROP TABLE IF EXISTS gaps_by_frequency") #i remove the old table + + + #i create a new table in an existing DB + db.execute (""" + CREATE TABLE gaps_by_frequency + ( + file_index INT, + ck_id CHAR (36), + index_start_day INT, + index_end_day INT, + start_day INT, + end_day INT, + days_gap INT, + zscore_gap FLOAT, + average_freq FLOAT + + + ) + """) # if i use triple quotation marks, i can have jumps of line no problem, but not with single ones + + #query="""describe gaps_by_frequency""" + #db.execute ("DROP TABLE IF EXISTS animal") + # query="""show tables""" + + + + + + query="""select * from gaps_by_frequency""" + + + + # db.execute ("INSERT INTO gaps_by_frequency (file_index, ck_id, start_date, end_date, start_day, end_day, days_gap, std_freq, zscore_gap) VALUES (1, 'reptile',7, 4,1,20,18, 2.,3.) ") + + # db.execute ("INSERT INTO gaps_by_frequency (file_index, ck_id, start_date, end_date, start_day, end_day, days_gap, std_freq, zscore_gap) VALUES ("+str(1)+", 'reptile',"+str(1)+", "+str(1)+","+str(1)+","+str(1)+","+str(1)+", "+str(1.)+","+str(1.)+") ") + + + + + #query="""show tables""" + + + # query="""select * from gaps_by_frequency""" + # result1 = db.query(query) # is a list of dict. + # for r1 in result1: + # print r1 + + + + + + + for index_file in range(top): + + + index_file+=1 + print "\n\n",index_file + + + +#input file: + #file_name="temporal_series/most_weigh_ins/weigh_in_time_serie_days"+str(index_file)+"_top50" + file_name="temporal_series/most_weigh_ins/weigh_in_time_serie_days"+str(index_file)+"_filters" + + +# OJO!!!!!!!! EN ESTE ARCHIVO, EL DIA (RELATIVO AL PRIMERO) ES LA COLUNMA 4, NO LA 0 !!!!!!!!!! + + + file=open(file_name+".dat",'r') + list_lines_file=file.readlines() + + + list_dates=[] + list_days=[] + list_frequencies=[] + cont=0 + for line in list_lines_file: + if cont>0: # i skip the first line,cos it doesnt have an associated freq. 
+ + list=line.split(" ") + + ck_id=list[8].strip("\n") + + try: + list_frequencies.append(float(list[7])) #frequency + list_days.append(float(list[4])) #relative day to the sign-up date + list_dates.append(list[5]) #dates + + + except IndexError: + + list_frequencies.append(float(0.0)) #frequency + list_days.append(float(list[4])) #day + list_dates.append(list[5]) #dates + + cont+=1 + + + + average_freq= np.mean(list_frequencies) + + list_zscores= stats.zs(list_frequencies) + + + + +# OJO!!!!!!!!! list_zscores[0] (o tb list_frequencies[0]) corresponde a la diff entre la primera y la segunda entrada de list_days, por lo que en realindad +#hay un desfase de una unidad entre los indices de las dos listas + num_gaps=0 + for i in range(len(list_zscores)): + + if list_zscores[i] >=zscore_threshold: # it is a statistically significant gap if zs>= zscore_threshold + if list_frequencies[i] > min_freq:# dont consider it a gap if it is shorter than x days + if i>=1: #because of the python thing about list[-1]=last_element_of_list) + + + print " between days:",list_days[i-1],"-",list_days[i], "there is a gap. freq:", list_frequencies[i],"zscore:",list_zscores[i],"average freq: ",average_freq, ck_id,"on file",index_file + + time_gap=list_days[i]-list_days[i-1] + + + + + + # db.execute (""" + # INSERT INTO gaps_by_frequency (file_index, ck_id, start_date, end_date, start_day, end_day, days_gap, zscore_gap) + #VALUES (%s, %s, %s,%s, %s, %s,%s, %s,%s, %s) + #""", str(index_file), str(ck_id),str(list_dates[i-1]), str(list_dates[i]),str(list_days[i-1]),str(list_days[i]),str(time_gap), str(list_zscores[i]), str(np.asanyarray(list_frequencies).mean(axis=0)), str(np.asanyarray(list_frequencies).std(axis=0, ddof=0))) NO FUNCIONA!! + + + db.execute (""" + INSERT INTO gaps_by_frequency (file_index, ck_id, start_day, end_day, index_start_day, index_end_day, days_gap, zscore_gap, average_freq) + VALUES (%s, %s, %s, %s,%s, %s, %s, %s, %s) + """, str(index_file), str(ck_id),str(list_days[i-1]),str(list_days[i]),i,i+1,str(time_gap), str(list_zscores[i]), str(average_freq)) + +# note: to get the index (of the point) for the days, it is i+1, because i corresponds to the serie of freq. (also, remember that it starts ato 0 index) + + + + + + num_gaps+=1 + + + + if ck_id== "34214d9b-3fae-43d5-a961-bf7a94e22a3c" : + for ii in range(len(list_zscores)): + print list_days[ii],list_frequencies[ii],list_zscores[ii] + + raw_input() + + print str(ck_id),str(list_days[i-1]),str(list_days[i]),i,i+1,str(time_gap), str(list_zscores[i]), str(average_freq) + + + + # raw_input() + + print "on file",index_file,"mean freq:",np.asanyarray(list_frequencies).mean(axis=0),"std:",np.asanyarray(list_frequencies).std(axis=0, ddof=0) + + + + # if num_gaps>=8: #only one time series (file_index=2040) have more than 8 zs=3 gaps + # print index_file, "file has",num_gaps,"or more gaps! in ", len(list_days),"measurements" + # raw_input() + + + + + + + +################################## + +def zscore(a, axis=0, ddof=0): + """ + Calculates the z score of each value in the sample, relative to the sample + mean and standard deviation. + + Parameters + ---------- + a: array_like + An array like object containing the sample data + axis: int or None, optional + If axis is equal to None, the array is first ravel'd. If axis is an + integer, this is the axis over which to operate. Defaults to 0. + + + ddof : int, optional + Degrees of freedom correction in the calculation + of the standard deviation. Default is 0. 
+ + Returns + ------- + zscore: array_like + the z-scores, standardized by mean and standard deviation of input + array + + Notes + ----- + This function does not convert array classes, and works also with + matrices and masked arrays. + + """ + a = np.asanyarray(a) + mns = a.mean(axis=axis) + sstd = a.std(axis=axis, ddof=ddof) + if axis and mns.ndim < a.ndim: + return ((a - np.expand_dims(mns, axis=axis) / + np.expand_dims(sstd,axis=axis))) + else: + return (a - mns) / sstd + + + +######################### + +if __name__== "__main__": + + main() + + + +############################################## diff --git a/analyze_frequency_gaps_in_time_series_frequencies_defined_as6times_average_first_isolated_points_EDIT_DB.py b/analyze_frequency_gaps_in_time_series_frequencies_defined_as6times_average_first_isolated_points_EDIT_DB.py new file mode 100644 index 0000000..67cdd18 --- /dev/null +++ b/analyze_frequency_gaps_in_time_series_frequencies_defined_as6times_average_first_isolated_points_EDIT_DB.py @@ -0,0 +1,244 @@ + +#! /usr/bin/env python + +""" +Created by Julia Poncela on February 2012. + +Given a set of time series (in files), calculates the outliers in terms of times between events, to find statistically significant gaps. +Also, adds a new table to the original DB with the gap info. + + + + +""" + + +import sys +import os +from datetime import * +import math +import numpy as np +from scipy import stats +from database import * #package to handle databases + + +def main (): + + + top=8924 #max: 8924 for the files with filters (>=10 days, >=10weigh-ins >= 1/30 weigh-ins per day). max:50 for the top50 longest time series (no filter) + + + times_avg_freq=2. # it is a statistically significant gap if >= X times de average freq + + min_freq=3. # to consider something a gap + + + + + database = "calorie_king_social_networking_2010" + server="tarraco.chem-eng.northwestern.edu" + user="calorieking" + passwd="n1ckuDB!" + + db= Connection(server, database, user, passwd) + + + db.execute ("DROP TABLE IF EXISTS gaps_by_frequency_6times_avg") #i remove the old table + + db.execute ("DROP TABLE IF EXISTS gaps_by_frequency_2times_avg") #i remove the old table + + + #i create a new table in an existing DB + db.execute (""" + CREATE TABLE gaps_by_frequency_2times_avg + ( + file_index INT, + ck_id CHAR (36), + index_start_day INT, + index_end_day INT, + start_day INT, + end_day INT, + days_gap INT, + times_avg_freq FLOAT, + average_freq FLOAT + + + ) + """) # if i use triple quotation marks, i can have jumps of line no problem, but not with single ones + + + + + + + list_all_average_frequencies=[] + + + for index_file in range(top): + + + index_file+=1 + print "\n\n",index_file + + +#input file: + #file_name="temporal_series/most_weigh_ins/weigh_in_time_serie_days"+str(index_file)+"_top50" + file_name="temporal_series/most_weigh_ins/weigh_in_time_serie_days"+str(index_file)+"_filters" + + +# OJO!!!!!!!! EN ESTE ARCHIVO, EL DIA (RELATIVO AL PRIMERO) ES LA COLUNMA 4, NO LA 0 !!!!!!!!!! + + + file=open(file_name+".dat",'r') + list_lines_file=file.readlines() + + + list_dates=[] + list_days=[] + list_frequencies=[] + cont=0 + for line in list_lines_file: + if cont>0: # i skip the first line,cos it doesnt have an associated freq. 
+ + list=line.split(" ") + + ck_id=list[8].strip("\n") + + try: + list_frequencies.append(float(list[7])) #frequency + list_days.append(float(list[4])) #relative day to the sign-up date + list_dates.append(list[5]) #dates + + + except IndexError: + + list_frequencies.append(float(0.0)) #frequency + list_days.append(float(list[4])) #day + list_dates.append(list[5]) #dates + + cont+=1 + + + + average_freq= np.mean(list_frequencies) + + + + + +# OJO!!!!!!!!! list_frequencies[0] corresponde a la diff entre la primera y la segunda entrada de list_days, por lo que en realindad #hay un desfase de una unidad entre los indices de las dos listas + + + num_gaps=0 + for i in range(len(list_frequencies)): # loop over all interevent times + + if list_frequencies[i] >= average_freq* times_avg_freq: # it is a statistically significant gap if freq >= x times the avg freq + if list_frequencies[i] > min_freq:# dont consider it a gap if it is shorter than x days + if i>=1: #because of the python thing about list[-1]=last_element_of_list) + + + print " between days:",list_days[i-1],"-",list_days[i], "there is a gap. freq:", list_frequencies[i],", ",float(list_frequencies[i])/float(average_freq), " times the average, which is is: ",average_freq,ck_id,"on file",index_file + + time_gap=list_days[i]-list_days[i-1] + + + + db.execute (""" + INSERT INTO gaps_by_frequency_2times_avg (file_index, ck_id, start_day, end_day, index_start_day, index_end_day, days_gap, times_avg_freq, average_freq) + VALUES (%s, %s, %s, %s,%s, %s, %s, %s, %s) + """, str(index_file), str(ck_id),str(list_days[i-1]),str(list_days[i]),i,i+1,str(time_gap), str(float(list_frequencies[i])/float(average_freq)), str(average_freq)) + +# note: to get the index (of the point) for the days, it is i+1, because i corresponds to the serie of freq. (also, remember that it starts at 0 index) + + + + num_gaps+=1 + + + + else: # for the very first point + + + time_gap=list_days[i] + + db.execute (""" + INSERT INTO gaps_by_frequency_2times_avg (file_index, ck_id, start_day, end_day, index_start_day, index_end_day, days_gap, times_avg_freq, average_freq) + VALUES (%s, %s, %s, %s,%s, %s, %s, %s, %s) + """, str(index_file), str(ck_id),str(0),str(list_days[i]),i,i+1,str(time_gap), str(float(list_frequencies[i])/float(average_freq)), str(average_freq)) + +# note: to get the index (of the point) for the days, it is i+1, because i corresponds to the serie of freq. (also, remember that it starts at 0 index) + + + num_gaps+=1 + + + + + + + + + + + + + print "on file",index_file,"mean freq:",np.asanyarray(list_frequencies).mean(axis=0),"std:",np.asanyarray(list_frequencies).std(axis=0, ddof=0) + + + + + + + +################################## + +def zscore(a, axis=0, ddof=0): + """ + Calculates the z score of each value in the sample, relative to the sample + mean and standard deviation. + + Parameters + ---------- + a: array_like + An array like object containing the sample data + axis: int or None, optional + If axis is equal to None, the array is first ravel'd. If axis is an + integer, this is the axis over which to operate. Defaults to 0. + + + ddof : int, optional + Degrees of freedom correction in the calculation + of the standard deviation. Default is 0. + + Returns + ------- + zscore: array_like + the z-scores, standardized by mean and standard deviation of input + array + + Notes + ----- + This function does not convert array classes, and works also with + matrices and masked arrays. 
+ + """ + a = np.asanyarray(a) + mns = a.mean(axis=axis) + sstd = a.std(axis=axis, ddof=ddof) + if axis and mns.ndim < a.ndim: + return ((a - np.expand_dims(mns, axis=axis) / + np.expand_dims(sstd,axis=axis))) + else: + return (a - mns) / sstd + + + +######################### + +if __name__== "__main__": + + main() + + + +############################################## diff --git a/analyze_frequency_gaps_in_time_series_frequencies_first_isolated_points_EDIT_DB.py b/analyze_frequency_gaps_in_time_series_frequencies_first_isolated_points_EDIT_DB.py new file mode 100644 index 0000000..591bb38 --- /dev/null +++ b/analyze_frequency_gaps_in_time_series_frequencies_first_isolated_points_EDIT_DB.py @@ -0,0 +1,291 @@ + +#! /usr/bin/env python + +""" +Created by Julia Poncela on February 2012. + +Given a set of time series (in files), calculates the outliers in terms of times between events, to find statistically significant gaps. +Also, adds a new table to the original DB with the gap info. + + + + +""" + + +import sys +import os +from datetime import * +import math +import numpy as np +from scipy import stats +from database import * #package to handle databases + + +def main (): + + + top=8924 #max: 8924 for the files with filters (>=10 days, >=10weigh-ins >= 1/30 weigh-ins per day). max:50 for the top50 longest time series (no filter) + + + zscore_threshold=1. # it is a statistically significant gap if zs>=3 std + + min_freq=10. # to consider something a gap + + database = "calorie_king_social_networking_2010" + server="tarraco.chem-eng.northwestern.edu" + user="calorieking" + passwd="n1ckuDB!" + + db= Connection(server, database, user, passwd) + + + db.execute ("DROP TABLE IF EXISTS gaps_by_frequency") #i remove the old table + + + #i create a new table in an existing DB + db.execute (""" + CREATE TABLE gaps_by_frequency + ( + file_index INT, + ck_id CHAR (36), + index_start_day INT, + index_end_day INT, + start_day INT, + end_day INT, + days_gap INT, + zscore_gap FLOAT, + average_freq FLOAT + + + ) + """) # if i use triple quotation marks, i can have jumps of line no problem, but not with single ones + + #query="""describe gaps_by_frequency""" + #db.execute ("DROP TABLE IF EXISTS animal") + # query="""show tables""" + + + + + + query="""select * from gaps_by_frequency""" + + + + # db.execute ("INSERT INTO gaps_by_frequency (file_index, ck_id, start_date, end_date, start_day, end_day, days_gap, std_freq, zscore_gap) VALUES (1, 'reptile',7, 4,1,20,18, 2.,3.) ") + + # db.execute ("INSERT INTO gaps_by_frequency (file_index, ck_id, start_date, end_date, start_day, end_day, days_gap, std_freq, zscore_gap) VALUES ("+str(1)+", 'reptile',"+str(1)+", "+str(1)+","+str(1)+","+str(1)+","+str(1)+", "+str(1.)+","+str(1.)+") ") + + + + + #query="""show tables""" + + + # query="""select * from gaps_by_frequency""" + # result1 = db.query(query) # is a list of dict. + # for r1 in result1: + # print r1 + + + + + + list_all_average_frequencies=[] + histogram_all_freq_no_averaged=[0]*1000 + num_events_all_freq_no_averaged=0. + + + for index_file in range(top): + + + index_file+=1 + print "\n\n",index_file + list_average_frequencies_one_user=[] + histogram_idiv=[0]*1000 + num_events_indiv=0. + + +#input file: + #file_name="temporal_series/most_weigh_ins/weigh_in_time_serie_days"+str(index_file)+"_top50" + file_name="temporal_series/most_weigh_ins/weigh_in_time_serie_days"+str(index_file)+"_filters" + + +# OJO!!!!!!!! EN ESTE ARCHIVO, EL DIA (RELATIVO AL PRIMERO) ES LA COLUNMA 4, NO LA 0 !!!!!!!!!! 
+ + + file=open(file_name+".dat",'r') + list_lines_file=file.readlines() + + + list_dates=[] + list_days=[] + list_frequencies=[] + cont=0 + for line in list_lines_file: + if cont>0: # i skip the first line,cos it doesnt have an associated freq. + + list=line.split(" ") + + ck_id=list[8].strip("\n") + + try: + list_frequencies.append(float(list[7])) #frequency + list_days.append(float(list[4])) #relative day to the sign-up date + list_dates.append(list[5]) #dates + + + except IndexError: + + list_frequencies.append(float(0.0)) #frequency + list_days.append(float(list[4])) #day + list_dates.append(list[5]) #dates + + cont+=1 + + + + average_freq= np.mean(list_frequencies) + + list_zscores= stats.zs(list_frequencies) + + + + +# OJO!!!!!!!!! list_zscores[0] (o tb list_frequencies[0]) corresponde a la diff entre la primera y la segunda entrada de list_days, por lo que en realindad +#hay un desfase de una unidad entre los indices de las dos listas + num_gaps=0 + for i in range(len(list_zscores)): + + if list_zscores[i] >=zscore_threshold: # it is a statistically significant gap if zs>= zscore_threshold + if list_frequencies[i] > min_freq:# dont consider it a gap if it is shorter than x days + if i>=1: #because of the python thing about list[-1]=last_element_of_list) + + + print " between days:",list_days[i-1],"-",list_days[i], "there is a gap. freq:", list_frequencies[i],"zscore:",list_zscores[i],"average freq: ",average_freq, ck_id,"on file",index_file + + time_gap=list_days[i]-list_days[i-1] + + + + + + # db.execute (""" + # INSERT INTO gaps_by_frequency (file_index, ck_id, start_date, end_date, start_day, end_day, days_gap, zscore_gap) + #VALUES (%s, %s, %s,%s, %s, %s,%s, %s,%s, %s) + #""", str(index_file), str(ck_id),str(list_dates[i-1]), str(list_dates[i]),str(list_days[i-1]),str(list_days[i]),str(time_gap), str(list_zscores[i]), str(np.asanyarray(list_frequencies).mean(axis=0)), str(np.asanyarray(list_frequencies).std(axis=0, ddof=0))) NO FUNCIONA!! + + + db.execute (""" + INSERT INTO gaps_by_frequency (file_index, ck_id, start_day, end_day, index_start_day, index_end_day, days_gap, zscore_gap, average_freq) + VALUES (%s, %s, %s, %s,%s, %s, %s, %s, %s) + """, str(index_file), str(ck_id),str(list_days[i-1]),str(list_days[i]),i,i+1,str(time_gap), str(list_zscores[i]), str(average_freq)) + +# note: to get the index (of the point) for the days, it is i+1, because i corresponds to the serie of freq. (also, remember that it starts ato 0 index) + + + + + + num_gaps+=1 + + + + # if ck_id== "34214d9b-3fae-43d5-a961-bf7a94e22a3c" : + # for ii in range(len(list_zscores)): + # print list_days[ii],list_frequencies[ii],list_zscores[ii] + + # raw_input() + +# print str(ck_id),str(list_days[i-1]),str(list_days[i]),i,i+1,str(time_gap), str(list_zscores[i]), str(average_freq) + + else: # for the very first point + + + time_gap=list_days[i] + + db.execute (""" + INSERT INTO gaps_by_frequency (file_index, ck_id, start_day, end_day, index_start_day, index_end_day, days_gap, zscore_gap, average_freq) + VALUES (%s, %s, %s, %s,%s, %s, %s, %s, %s) + """, str(index_file), str(ck_id),str(0),str(list_days[i]),i,i+1,str(time_gap), str(list_zscores[i]), str(average_freq)) + +# note: to get the index (of the point) for the days, it is i+1, because i corresponds to the serie of freq. 
(also, remember that it starts at index 0) + + + + + num_gaps+=1 + + + + + + + + + + + + + print "on file",index_file,"mean freq:",np.asanyarray(list_frequencies).mean(axis=0),"std:",np.asanyarray(list_frequencies).std(axis=0, ddof=0) + + + + + + + +################################## + +def zscore(a, axis=0, ddof=0): + """ + Calculates the z score of each value in the sample, relative to the sample + mean and standard deviation. + + Parameters + ---------- + a: array_like + An array like object containing the sample data + axis: int or None, optional + If axis is equal to None, the array is first ravel'd. If axis is an + integer, this is the axis over which to operate. Defaults to 0. + + + ddof : int, optional + Degrees of freedom correction in the calculation + of the standard deviation. Default is 0. + + Returns + ------- + zscore: array_like + the z-scores, standardized by mean and standard deviation of input + array + + Notes + ----- + This function does not convert array classes, and works also with + matrices and masked arrays. + + """ + a = np.asanyarray(a) + mns = a.mean(axis=axis) + sstd = a.std(axis=axis, ddof=ddof) + if axis and mns.ndim < a.ndim: + return ((a - np.expand_dims(mns, axis=axis)) / + np.expand_dims(sstd,axis=axis)) + else: + return (a - mns) / sstd + + + +######################### + +if __name__== "__main__": + + main() + + + +############################################## diff --git a/auxiliar.py b/auxiliar.py new file mode 100644 index 0000000..c817c24 --- /dev/null +++ b/auxiliar.py @@ -0,0 +1,64 @@ + + +import sys +import os +from datetime import * +from database import * #package to handle databases +import math +import numpy +from scipy import stats + + + +def main (): + + + database = "calorie_king_social_networking_2010" + server="tarraco.chem-eng.northwestern.edu" + user="calorieking" + passwd="n1ckuDB!" + db= Connection(server, database, user, passwd) + + + + query="""select * from users""" + result1 = db.query(query) # is a list of dictionaries + + + + contador=0 + + for r1 in result1: #loop over users + contador+=1 + + ck_id=r1['ck_id'] + + print contador, r1 + + query3="select * from gaps_by_frequency where (ck_id ='"+str(ck_id)+"') order by start_day asc" + result3 = db.query(query3) + + + num_gaps=len(result3) + + print "# gaps:",num_gaps,ck_id + + + if num_gaps>0: + for r3 in result3: + + print r3 + starting_gap=r3['start_date'] + ending_gap=r3['end_date'] + trend="gap" + zscore_gap=r3['zscore_gap'] # threshold to consider a gap statistically significant zs>=3 (imposed like that in: analyze_fr print num_gaps + raw_input() + + +######################### + +if __name__== "__main__": + + main() + +########################## diff --git a/average_segments_time_series_for_Mech_Turk_task.py b/average_segments_time_series_for_Mech_Turk_task.py new file mode 100644 index 0000000..fc56673 --- /dev/null +++ b/average_segments_time_series_for_Mech_Turk_task.py @@ -0,0 +1,62 @@ +#!
/usr/bin/env python + +""" +Created by Julia Poncela of February 2012 + +It calculates the average of each column for a file with multiple columns + +""" + + + +import sys +import os +from scipy import stats +import scipy +import math +import numpy + + +def main (): + + file_name="Mech_Turk_experiment_input.dat" + + file=open(file_name,'r') + list_lines_file=file.readlines() + + cont=0 + list_initial=[] + list_final=[] + list_w_ins=[] + list_num_days=[] + for line in list_lines_file: #i create the list for the original file + if cont>0: + list=line.split("\t") # initial_weight final_weight num_w-ins num_days + + + + list_initial.append(float(list[0])) + list_final.append(float(list[1])) + list_w_ins.append(float(list[2])) + list_num_days.append(float(list[3])) + + + cont+=1 + + + + print "avg initial weight:",numpy.mean(list_initial),"\navg final weight:",numpy.mean(list_final),"\navg #w ins:",numpy.mean(list_w_ins),"\navg num days:",numpy.mean(list_num_days),"\n\n" + + + print "median initial weight:",numpy.median(list_initial),"\nmedian final weight:",numpy.median(list_final),"\nmedian #w ins:",numpy.median(list_w_ins),"\nmedian num days:",numpy.median(list_num_days) + +######################### + +if __name__== "__main__": + + main() + + +############################################## + + diff --git a/baduserslist.py b/baduserslist.py new file mode 100644 index 0000000..3a1e654 --- /dev/null +++ b/baduserslist.py @@ -0,0 +1,54 @@ +""" +Parses the bad users list and provides a function for querying if a user is in +the list. +""" + +# Matching comment lines: +import re +is_comment_line = re.compile(r'\s*#') +is_id_line = re.compile(r'([0-9a-f\-]+)\s*(.*)\n') + +bad_ids = dict() + +list_fh = open('bad-users.txt', 'r') +for line in list_fh: + # Skip comment lines + if is_comment_line.match(line): + continue + m = is_id_line.match(line) + if m == None: + continue + # Store the explanation under the user id: + bad_ids[m.group(1)] = m.group(2) + +def is_bad_user (ck_id): + if ck_id in bad_ids: + return True + return False + +def why_bad_user (ck_id): + if ck_id in bad_ids: + return bad_ids[ck_id] + return ck_id + ' is a good user...?' + + +# Some tests +if __name__ == '__main__': + """ + A quick test to make sure that baduserlist.py works properly + """ + + from baduserslist import * + + print "1..2" + + message = "ok 1 - user f00a9af0-a is in bad list" + if not is_bad_user('f00a9af0-a'): + message = "not " + message + print message + + message = "ok 2 - message for f00a9af0-a is accurate" + if not why_bad_user('f00a9af0-a') == 'too many zero entries': + message = "not " + message + print message + diff --git a/create_and_populate_tables_Bonnies_lab_objective_weight_history.py b/create_and_populate_tables_Bonnies_lab_objective_weight_history.py new file mode 100644 index 0000000..ad0b078 --- /dev/null +++ b/create_and_populate_tables_Bonnies_lab_objective_weight_history.py @@ -0,0 +1,157 @@ + +#! /usr/bin/env python + +""" +Created by Julia Poncela on July 2012. + +Given a set of time series (in files), it creates a new table in the MT_time_series DB and populates it with the data +from the MT fake time series. 
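Expected input and behaviour, as inferred from the parsing code below (the column
layout is a reading of the script, not a documented format): each CSV row is one
subject; the first field is the subject ID (reused as both ck_id and id), and the
remaining fields hold one weight per week, starting at week 1 (the script shifts it
to week 0). Each surviving weight is inserted into the objective_weigh_in_history
table with

    on_day = arbitrary_initial_date + timedelta(7 * week_index)

and the per-subject .dat files written alongside store the relative day, the percent
weight change, and the weight, where

    percent_change = 100. * (weight - first_weight) / first_weight

Usage, as printed by the __main__ block at the bottom:

    python create_and_populate_tables_Bonnies_lab_objective_weight_history.py path/csv_file_objective_weight_time_series.csv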
+ + + + +""" + +import csv +import sys +import os +from datetime import * +import math +import numpy as np +from scipy import stats +from database import * #package to handle databases + + +def main (csv_file): + + + arbitrary_initial_date=datetime(2009, 1, 1, 0, 0) # because the MT time series are generated starting at an arbitrary day=0 + + + + + database = "MT_time_series" + server="tarraco.chem-eng.northwestern.edu" + user="julia" + passwd="tiyp,julia" + db= Connection(server, database, user, passwd) + + db.execute ("DROP TABLE IF EXISTS objective_weigh_in_history") #i remove the old table + + + + + #i create a new table in an existing DB + db.execute (""" + CREATE TABLE objective_weigh_in_history + ( + ck_id INT, + weight FLOAT, + on_day DATETIME, + id INT, + activity_flag CHAR (2) + + ) + """) # if i use triple quotation marks, i can have jumps of line no problem, but not with single ones + + + + + + activity_flag='WI' + + resultado_csv= csv.reader(open(csv_file, 'rb'), delimiter=',')#, quotechar='"') + #weight_week1, weight_week_week2, weight_week3,... weight_week8 + + cont=0 + for row in resultado_csv: + + if cont>0: # i skip the header line + + print "line:",cont+1,row + list_weight_values=[] + list_percent_weight_changes=[] + list_values_days=[] + cont_week=-1 + + for element in row: + if len(element) >0: # if it is not an empty enty + if cont_week==-1: # this file starts on week1, i transform it to week0 + ck_id=element # the fake ck_id (and the id) will both be the ID field + id=element + else: + if cont_week==0: # first weight + first_weight=float(element) + try: + value=float(element) + list_weight_values.append(value) + list_values_days.append(cont_week*7) + list_percent_weight_changes.append(100.*(value-first_weight)/first_weight) + except ValueError: pass # in case the value is not a numberic character + + cont_week+=1 + + + + file_name="fake_time_series/Bonnies_study/objective_weight_time_series_"+str(ck_id)+".dat" + file=open(file_name,'wt') + + for i in range(len(list_weight_values)): + # print list_values_days[i],timedelta(list_values_days[i])+arbitrary_initial_date,list_weight_values[i] + + weight=list_weight_values[i] + on_day=timedelta(list_values_days[i])+arbitrary_initial_date + + db.execute (""" + INSERT INTO objective_weigh_in_history (ck_id, weight, on_day, id, activity_flag) + VALUES (%s, %s, %s, %s,%s) + """, str(ck_id), str(weight),str(on_day),str(id), str(activity_flag)) + + print str(ck_id), str(weight),str(on_day),str(id), str(activity_flag) + + + + + + + + print >> file, list_values_days[i],list_percent_weight_changes[i],list_weight_values[i] + + + + cont+=1 + + + + + + + + + + #query="""describe weigh_in_history""" + + + + + + + + + + + + + +############################################## +if __name__ == '__main__': + if len(sys.argv) > 1: + csv_file = sys.argv[1] + + + + main(csv_file) + else: + print "usage: python whatever.py path/csv_file_objective_weight_time_series.csv " + + diff --git a/create_and_populate_tables_Bonnies_lab_subjective_weight_history.py b/create_and_populate_tables_Bonnies_lab_subjective_weight_history.py new file mode 100644 index 0000000..8897e7e --- /dev/null +++ b/create_and_populate_tables_Bonnies_lab_subjective_weight_history.py @@ -0,0 +1,218 @@ + +#! /usr/bin/env python + +""" +Created by Julia Poncela on July 2012. + +Given a set of time series (in files), it creates a new table in the MT_time_series DB and populates it with the data +from the MT fake time series. 
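Expected input, as inferred from the parsing code below (not a documented format): a
long-format CSV with one weigh-in per row, where column 0 holds the subject id,
column 1 a date written as year-month-day, and column 2 the weight, with all rows for
a given subject grouped together. Note that the id of the first subject in the file
is hard-coded below (currently 51) and may need to be changed for other files. A
hypothetical row, with made-up values:

    51, 2010-03-12, 215.5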
+ + + + +""" + +import csv +import sys +import os +from datetime import * +import math +import numpy as np +from scipy import stats +from database import * #package to handle databases + + +def main (csv_file): + + + + database = "MT_time_series" + server="tarraco.chem-eng.northwestern.edu" + user="julia" + passwd="tiyp,julia" + db= Connection(server, database, user, passwd) + + db.execute ("DROP TABLE IF EXISTS subjective_weigh_in_history") #i remove the old table + + + + + #i create a new table in an existing DB + db.execute (""" + CREATE TABLE subjective_weigh_in_history + ( + ck_id INT, + weight FLOAT, + on_day DATETIME, + id INT, + activity_flag CHAR (2) + + ) + """) # if i use triple quotation marks, i can have jumps of line no problem, but not with single ones + + + + + + activity_flag='WI' + + resultado_csv= csv.reader(open(csv_file, 'rb'), delimiter=',')#, quotechar='"') + #weight_week1, weight_week_week2, weight_week3,... weight_week8 + + + + + + current_ck_id=str(51) # first id in the file THIS MAY CHANGE WITH OTHER FILES!!!!!!!!!! + + old_ck_id=str(51) + + list_weight_values=[] + list_percent_weight_changes=[] + list_dates=[] + print "first id", current_ck_id + + + cont=0 # for the tot number of lines of the one and only datafile + for row in resultado_csv: + + if cont>0: # i skip the header line + ck_id=str(row[0]) + + print ck_id + + if ck_id== current_ck_id: + + + weight=float(row[2]) + + complete_date=row[1].split("-") + year=int(complete_date[0].strip(" ")) + month=int(complete_date[1].strip(" ")) + day=int(complete_date[2].strip(" ")) + + weight_date=datetime(year, month, day, 0, 0) + + list_weight_values.append(float(weight)) + list_dates.append(weight_date) + + + # print len(list_weight_values) + if len(list_percent_weight_changes) ==0: + first_weight=float(weight) + first_date=weight_date + print "first_weight:",first_weight,"first enty" + + else: + first_weight =list_weight_values[0] + first_date=list_dates[0] + + print " first_weight:",first_weight,"rest of them" + + + list_percent_weight_changes.append(100.*(weight-first_weight)/first_weight) + + # print " ",ck_id,weight_date, weight_date-first_date,weight,100.*(weight-first_weight)/first_weight + + + + + else : + print "new id", ck_id + cont_entries_one_user=1 + + + file_name="fake_time_series/Bonnies_study/subjective_weight_time_series_"+str(old_ck_id)+".dat" + file=open(file_name,'wt') + + + for i in range(len(list_weight_values)): # i print the series for the previous user + + weight=list_weight_values[i] + on_day=list_dates[i] + + db.execute (""" + INSERT INTO subjective_weigh_in_history (ck_id, weight, on_day, id, activity_flag) + VALUES (%s, %s, %s, %s,%s) + """, str(old_ck_id), str(weight),str(on_day),str(old_ck_id), str(activity_flag)) + + + + print >> file, (on_day-first_date).days,list_percent_weight_changes[i],list_weight_values[i],list_dates[i] + print old_ck_id, (on_day-first_date).days,list_percent_weight_changes[i],list_weight_values[i],list_dates[i] + + + list_weight_values=[] # i get ready for the next user, and i save this entry as the first one of the new time series + list_dates=[] + list_percent_weight_changes=[] + current_ck_id=ck_id + old_ck_id=ck_id + + weight=float(row[2]) + + + complete_date=row[1].split("-") + year=int(complete_date[0].strip(" ")) + month=int(complete_date[1].strip(" ")) + day=int(complete_date[2].strip(" ")) + weight_date=datetime(year, month, day, 0, 0) + + + list_weight_values.append(float(weight)) + list_dates.append(weight_date) + + + if 
len(list_percent_weight_changes) ==0: + first_weight=float(weight) + first_date=weight_date + # print "first_weight:",first_weight,"first enty" + + else: + first_weight =list_weight_values[0] + first_date=list_dates[0] + + # print " first_weight:",first_weight,"rest of them" + + + + + list_percent_weight_changes.append(100.*(weight-first_weight)/first_weight) + # print " ",ck_id,weight_date, (weight_date-first_date).days, weight, 100.*(weight-first_weight)/first_weight + + + cont+=1 + + + + + + + + + + #query="""describe weigh_in_history""" + + + + + + + + + + + + + +############################################## +if __name__ == '__main__': + if len(sys.argv) > 1: + csv_file = sys.argv[1] + + + + main(csv_file) + else: + print "usage: python whatever.py path/csv_file_subjective_weight_time_series.csv " + + diff --git a/create_and_populate_tables_MT_time_series_DB.py b/create_and_populate_tables_MT_time_series_DB.py new file mode 100644 index 0000000..42bfde6 --- /dev/null +++ b/create_and_populate_tables_MT_time_series_DB.py @@ -0,0 +1,164 @@ + +#! /usr/bin/env python + +""" +Created by Julia Poncela on July 2012. + +Given a set of time series (in files), it creates a new table in the MT_time_series DB and populates it with the data +from the MT fake time series. + + + + +""" + +import csv +import sys +import os +from datetime import * +import math +import numpy as np +from scipy import stats +from database import * #package to handle databases + + +def main (csv_file): + + + arbitrary_initial_date=datetime(2009, 1, 1, 0, 0) # because the MT time series are generated starting at an arbitrary day=0 + + + + + database = "MT_time_series" + server="tarraco.chem-eng.northwestern.edu" + user="julia" + passwd="tiyp,julia" + + db= Connection(server, database, user, passwd) + + + db.execute ("DROP TABLE IF EXISTS weigh_in_history") #i remove the old table + + + + + #i create a new table in an existing DB + db.execute (""" + CREATE TABLE weigh_in_history + ( + ck_id INT, + weight FLOAT, + on_day DATETIME, + id INT, + activity_flag CHAR (2) + + ) + """) # if i use triple quotation marks, i can have jumps of line no problem, but not with single ones + + + + + + + resultado_csv= csv.reader(open(csv_file, 'rb'), delimiter=',')#, quotechar='"') + #weight_week0, weight_week_week1, weight_week2,... 
weight_week29 + + cont=0 + for row in resultado_csv: + ck_id=cont+1 # the fake ck_id (and the id) will both be the line count from the csv file + id=cont+1 + activity_flag='WI' + + if cont>0: # i skip the header line + + print "line:",cont+1,row + list_weight_values=[] + flag_skip_line=0 + list_values_days=[] + cont_week=0 + + for element in row: + if len(element) >0: # if it is not an empty enty + try: + value=float(element) + if value >200.0 and value < 400.0: # i remove bad data + list_weight_values.append(value) + list_values_days.append(cont_week*7) + # else: + # flag_skip_line=1 + except ValueError: pass # in case the value is not a numberic character + + cont_week+=1 + + if len(list_weight_values)<10: # if after removing bad data i still have a long enough time serie + flag_skip_line=1 + + + if flag_skip_line==0: + for i in range(len(list_weight_values)): + print list_values_days[i],timedelta(list_values_days[i])+arbitrary_initial_date,list_weight_values[i] + + + + weight=list_weight_values[i] + on_day=timedelta(list_values_days[i])+arbitrary_initial_date + + db.execute (""" + INSERT INTO weigh_in_history (ck_id, weight, on_day, id, activity_flag) + VALUES (%s, %s, %s, %s,%s) + """, str(ck_id), str(weight),str(on_day),str(id), str(activity_flag)) + + delta=(len(row)-1)*7 + print delta,timedelta(delta)+arbitrary_initial_date ,240. # fixed final value + weight=240. + on_day=timedelta(delta)+arbitrary_initial_date + + db.execute (""" + INSERT INTO weigh_in_history (ck_id, weight, on_day, id, activity_flag) + VALUES (%s, %s, %s, %s,%s) + """, str(ck_id), str(weight),str(on_day),str(id), str(activity_flag)) + + + + + + #raw_input() + else: + print "bad data!" + cont+=1 + + + + + + + + + + #query="""describe weigh_in_history""" + + + + + + + + + + + + + +############################################## +if __name__ == '__main__': + if len(sys.argv) > 1: + csv_file = sys.argv[1] + + + + main(csv_file) + else: + print "usage: python whatever.py path/csv_file_all_MT_series.csv " + + diff --git a/create_and_populate_tables_MT_time_series_DB_free_time_stamps.py b/create_and_populate_tables_MT_time_series_DB_free_time_stamps.py new file mode 100644 index 0000000..4c28cbc --- /dev/null +++ b/create_and_populate_tables_MT_time_series_DB_free_time_stamps.py @@ -0,0 +1,168 @@ + +#! /usr/bin/env python + +""" +Created by Julia Poncela on July 2012. + +Given a set of time series (in files), it creates a new table in the MT_time_series DB and populates it with the data +from the MT fake time series. 
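This variant differs from create_and_populate_tables_MT_time_series_DB.py as follows
(summarised from the code below, a reading of the script rather than a documented
spec): the entries of each row are treated as daily values (day 1, 2, ... capped
below day 141) instead of weekly ones, weights are kept only if they lie strictly
between 100 and 450, a row is skipped unless at least 10 valid points survive, and
no fixed final weight is appended to the series.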
+ + + + +""" + +import csv +import sys +import os +from datetime import * +import math +import numpy as np +from scipy import stats +from database import * #package to handle databases + + +def main (csv_file): + + + arbitrary_initial_date=datetime(2009, 1, 1, 0, 0) # because the MT time series are generated starting at an arbitrary day=0 + + + + + database = "MT_time_series" + server="tarraco.chem-eng.northwestern.edu" + user="julia" + passwd="tiyp,julia" + + db= Connection(server, database, user, passwd) + + + db.execute ("DROP TABLE IF EXISTS weigh_in_history") #i remove the old table + + + + + #i create a new table in an existing DB + db.execute (""" + CREATE TABLE weigh_in_history + ( + ck_id INT, + weight FLOAT, + on_day DATETIME, + id INT, + activity_flag CHAR (2) + + ) + """) # if i use triple quotation marks, i can have jumps of line no problem, but not with single ones + + + + + + + resultado_csv= csv.reader(open(csv_file, 'rb'), delimiter=',')#, quotechar='"') + #weight_week0, weight_week_week1, weight_week2,... weight_week29 + + cont=0 + for row in resultado_csv: + ck_id=cont+1 # the fake ck_id (and the id) will both be the line count from the csv file + id=cont+1 + activity_flag='WI' + + if cont>0: # i skip the header line + + print "line:",cont+1,row + #raw_input() + list_weight_values=[] + flag_skip_line=0 + list_values_days=[] + cont_week=0 + + day=1 + for element in row: + if len(element) >0 and day <141: # if it is not an empty enty + try: + value=float(element) + if value >100.0 and value < 450.0: # i remove bad data + list_weight_values.append(value) + list_values_days.append(day) + + # else: + # flag_skip_line=1 + except ValueError: pass # in case the value is not a numberic character + day+=1 + + + + if len(list_weight_values)<10: # if after removing bad data i still have a long enough time serie + flag_skip_line=1 + + + if flag_skip_line==0: + for i in range(len(list_weight_values)): + print list_values_days[i],timedelta(list_values_days[i])+arbitrary_initial_date,list_weight_values[i] + + + + weight=list_weight_values[i] + on_day=timedelta(list_values_days[i])+arbitrary_initial_date + + db.execute (""" + INSERT INTO weigh_in_history (ck_id, weight, on_day, id, activity_flag) + VALUES (%s, %s, %s, %s,%s) + """, str(ck_id), str(weight),str(on_day),str(id), str(activity_flag)) + + # delta=(len(row)-1)*7 + # print delta,timedelta(delta)+arbitrary_initial_date ,240 # fixed final value + #weight=240. + #on_day=timedelta(delta)+arbitrary_initial_date + + #db.execute (""" + # INSERT INTO weigh_in_history (ck_id, weight, on_day, id, activity_flag) + # VALUES (%s, %s, %s, %s,%s) + # """, str(ck_id), str(weight),str(on_day),str(id), str(activity_flag)) + + + + + + #raw_input() + else: + print "bad data!" + cont+=1 + + + + + + + + + + #query="""describe weigh_in_history""" + + + + + + + + + + + + + +############################################## +if __name__ == '__main__': + if len(sys.argv) > 1: + csv_file = sys.argv[1] + + + + main(csv_file) + else: + print "usage: python whatever.py path/csv_file_all_MT_series.csv " + + diff --git a/generate_gnuplot_scripts_for_multiplot_time_series_.py b/generate_gnuplot_scripts_for_multiplot_time_series_.py new file mode 100644 index 0000000..b9ee099 --- /dev/null +++ b/generate_gnuplot_scripts_for_multiplot_time_series_.py @@ -0,0 +1,105 @@ +#! 
/usr/bin/env python + +""" +Created by Julia Poncela of November 2011 + +It reads the CK time series and writes a gnuplot script to (multi)plot 10 at the time, and the calls gnuplot to do it every time. + + +It doesnt take any arguments. + +""" + + + +import sys +import os +import subprocess as sp + +#p = sp.Popen(["python","hola.py"]) + +def main (): + + + + + file=open("number_segments_acording_to_frequencies.dat",'r') # i read the file with the number of segments in each time serie + list_lines_file=file.readlines() + dicc_number_segments={} + + for line in list_lines_file: + list=line.split(" ") + key=str(list[0]) #index file + value=int(list[1]) # corresponding number of segments + dicc_number_segments[key]=value + + #print dicc_number_segments + + + + + + + +# Text for the gnuplot script: + + header="reset ; set term post enhanced color dashed 8 ; set output 'Multiplot_time_series_indiv_frequencies_segments_meaningful_A_test.eps' ;NX=1; NY=2 ; SX=1; SY=0.5 ; set size SX*NX,SY*NY ;set multiplot; " + + file="1" + num_segments=dicc_number_segments[file] + + + for index in range(num_segments): + + print index + + figure_partial="set size SX,SY; set origin 0,0.5 ; set pointsize 0.2; set yrange [-15:10]; set nokey; set ylabel '% weight change' ;set xlabel ''; p 'weigh_in_time_serie_days1_top50_frequencies_t_points_segments10_threshold.dat' u 1:3 index "+str(index)+" with points ls "+str(index+1) + + figure_partial2=";set size SX,SY; set origin 0,0 ; set pointsize 0.2; set yrange [-15:10]; set nokey; set ylabel '% weight change' ;set xlabel ''; p 'weigh_in_time_serie_days1_top50_t_points_residuals.dat' index "+str(index)+" with points ls "+str(index+1) + + + + + figure=figure_partial+figure_partial2 + + + + #figure2=";set size SX,SY; set origin 0,0.6 ; set pointsize 0.2; set yrange [-15:10]; set nokey; set ylabel '% weight change' ;set xlabel ""; p 'temporal_series/most_weigh_ins/weigh_in_time_serie_days2_top50_frequencies_t_points_segments10_threshold.dat' u 1:3 index 0 with points ls 1 ,'temporal_series/most_weigh_ins/weigh_in_time_serie_days2_top50_frequencies_t_points_segments10_threshold.dat' u 1:3 index 1 with points ls 2 ;" + + ending=";set nomultiplot; reset " + + + + + output_script="test_gnuplot.gpt" + file = open(output_script,'wt') + print >> file, header,figure, ending + file.close() + + + + p = sp.Popen(["gnuplot","test_gnuplot.gpt"]) + + # salida=p.communicate() #es una tupla en la q guarda lo que sale + # print salida + + + + + +############################################ + +if __name__== "__main__": + + main() + + try: + import psyco + except ImportError: pass + + + +############################################## + + + diff --git a/get_more_statistics_time_series_cutting.py b/get_more_statistics_time_series_cutting.py index db7fa27..dde490e 100644 --- a/get_more_statistics_time_series_cutting.py +++ b/get_more_statistics_time_series_cutting.py @@ -4,8 +4,7 @@ """ Created by Julia Poncela of January 2013 -Get some statistics on time series cutting from the database: -average number of segments, type of segments, gaps,... +Get some statistics on time series cutting from the database @@ -21,21 +20,17 @@ from database import * #package to handle databases import histograma_gral import histograma_bines_gral -import operator - def main (): min_wi=20 #Filter1: min number of weigh ins >= - min_timespan=0 # Filter2: min length of the serie - - + min_timespan=0 # Filter2: min lenght of the serie - - max_num_users_for_testing=1000 # this is just while i test the code!! 
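The hunk that follows adds a quick test loop built on a parameterized query. A
minimal sketch of that pattern, assuming the local database.Connection wrapper used
throughout these scripts (its query() takes a SQL string with %s placeholders plus
the arguments):

    db = Connection(server, database, user, passwd)
    rows = db.query('''SELECT on_day, weight
                       FROM weigh_in_history
                       WHERE ck_id LIKE %s
                       ORDER BY on_day''', ck_id + '%')
    # rows is a list of dicts, e.g. rows[0]['on_day'], rows[0]['weight']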
- + + filename4="./Results/More_summary_statistics_cutting_time_series.dat" + file4=open(filename4,'wt') @@ -57,15 +52,35 @@ def main (): num_users=len(result1) - dir="/home/staff/julia/at_Northwestern/calorieking/time_series/temporal_series/most_weigh_ins/" # to save the datafiles for the time series + cont=0 + for r1 in result1: #loop over users to get their number_of_weigh-ins + + ck_id=r1['ck_id'] ## the ck_id from the weigh_in_cut table is a shorter id than the one in the users table + + +#weigh_ins_query = db.query(''' + # SELECT on_day, weight + # FROM weigh_in_history + # WHERE ck_id LIKE %s + # ORDER BY on_day + # ''', user_id + '%') + + - # result4 = db.query(''' SELECT on_day, weight FROM weigh_in_history WHERE ck_id LIKE %s ORDER BY on_day''', ck_id + '%')#THIS QUERY IS JUST TO TEST IT OUT!! + result4 = db.query(''' SELECT on_day, weight FROM weigh_in_history WHERE ck_id LIKE %s ORDER BY on_day''', ck_id + '%')#THIS QUERY IS JUST TO TEST IT OUT!! # is a list of dict. + print result4 + for r4 in result4: + print r4 + + cont+=1 + print cont + + exit() + - - list_num_segments_per_user=[] @@ -74,44 +89,23 @@ def main (): num_segments=0 num_lin_segments=0 - num_const_segments=0 num_exp_segments=0 num_isolated=0 num_gaps=0 + num_valid_users=0 - num_users_with_enough_wi=0 - num_users_with_valid_segments=0 - num_users_with_gaps=0 - - list_lengths=[] - list_lin_lengths=[] - list_const_lengths=[] - list_exp_lengths=[] + list_lenghts=[] + list_lin_lenghts=[] + list_exp_lenghts=[] - list_gap_lengths=[] - - - list_gap_alternation_values=[] - list_seg_alternation_values=[] - cont=0 for r1 in result1: #loop over users to get their number_of_weigh-ins - - - # if cont <=max_num_users_for_testing: #COMMENT THIS LINE FOR THE FINAL RUN!!! - - - - print cont - cont+=1 - - + ## r1 is a dict.: {'ck_id': u'bd84dbe2-dd6e-4125-b006-239442df2ff6', 'age': 52L, 'state': u'', 'height': 64L, 'join_date': datetime.datetime(2009, 11, 27, 10, 41, 5), 'is_staff': u'public', 'most_recent_weight': 142.0, 'initial_weight': 144.0, 'id': 1L} ck_id=r1['ck_id'] - id=r1['id'] query2="select * from weigh_in_history where (ck_id ='"+str(ck_id)+"') order by on_day asc" @@ -130,43 +124,21 @@ def main (): if r1['num_wi']>=min_wi :#and int(time_span_days) >= min_timespan: - dict_start_day_type_segment={} - segment_index_current_user=0 - - num_users_with_enough_wi+=1 - - - - - - - output_file2=dir+"Time_series_user_id"+str(id)+".dat" - file2 = open(output_file2,'wt') - - - for r2 in result2: - weigh_in_day=r2['on_day'] - weight=float(r2['weight']) - print >> file2, (weigh_in_day-first).days+1,weight - file2.close() - - print "written file:",dir+"Time_series_user_id"+str(id)+".dat" - - + num_valid_users=num_valid_users+1 + print num_valid_users - query3="select * from weigh_in_cuts where (ck_id ='"+str(ck_id)+"') order by start_day" + query3="select * from weigh_in_cuts where (ck_id ='"+str(ck_id)+"') order by id, start_day" result3 = db.query(query3) # is a list of dict. 
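The loop added just below walks result3, one dict per segment of the weigh_in_cuts
table, and keeps separate counters and length lists per fit type. A compact sketch of
the same bookkeeping, for illustration only (fit-type labels as used in the added
code: "isolated", "linear", "exponential"; segment length taken as
stop_day - start_day + 1):

    from collections import defaultdict

    counts = defaultdict(int)
    lengths = defaultdict(list)
    for seg in result3:
        fit_type = str(seg['fit_type'])
        counts[fit_type] += 1
        lengths[fit_type].append(int(seg['stop_day']) - int(seg['start_day']) + 1)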
- - - + list_num_segments_per_user.append(len(result3)) + + for r3 in result3: # each line is a dict, each line is a segment - - segment_index_current_user+=1 + fit_type=str(r3['fit_type']) start_day=int(r3['start_day']) stop_day=int(r3['stop_day']) @@ -174,141 +146,53 @@ def main (): stop_weight=float(r3['stop_weight']) - - - if fit_type != "isolated": - dict_start_day_type_segment[start_day]=fit_type - list_lengths.append(stop_day-start_day+1) - - - + list_lenghts.append(stop_day-start_day+1) + if fit_type == "isolated": num_isolated+=1 elif fit_type == "linear": num_lin_segments+=1 - list_lin_lengths.append(stop_day-start_day+1) + list_lin_lenghts.append(stop_day-start_day+1) - elif fit_type == "constant": - num_const_segments+=1 - list_const_lengths.append(stop_day-start_day+1) - - elif fit_type == "exponent": + elif fit_type == "exponential": num_exp_segments+=1 - list_exp_lengths.append(stop_day-start_day+1) - - # else: - # print "other fit type", ck_id, fit_type - # raw_input() - - - - - - if len(dict_start_day_type_segment)>0: # user with at least one useful segment + list_exp_lenghts.append(stop_day-start_day+1) - num_users_with_valid_segments+=1 - list_num_segments_per_user.append(len(dict_start_day_type_segment)) # excluding isolated (not including gaps, either) - + - query4="SELECT * FROM frequency_gaps where (ck_id ='"+str(ck_id)+"') order by start_day" #gap info - result4 = db.query(query4) # is a list of dict. + query4="SELECT * FROM weigh_in_cuts where (ck_id ='"+str(ck_id)+"') order start_day" #gap info 3THIS QUERY IS JUST TO TEST IT OUT!! + # query4="select * from frequency_cuts where (ck_id ='"+str(ck_id)+"') order start_day" #gap info + result4 = db.query(query4) # is a list of dict. - - list_num_gaps_per_user.append(len(result4)) - num_gaps+=len(result4) - num_users_with_gaps+=1 - - for r4 in result4: - segment_index_current_user+=1 - - start_day=int(r4['start_day']) - stop_day=int(r4['stop_day']) - start_weight=float(r4['start_weight']) - stop_weight=float(r4['stop_weight']) - - list_gap_lengths.append(stop_day-start_day+1) + + for r4 in result4: + if r4['param3']==Null: - dict_start_day_type_segment[start_day]="gap" - - - list_tuples_sorted_dict = sorted(dict_start_day_type_segment.iteritems(), key=operator.itemgetter(0)) # the index of itermgetter is by which i order the list of tuples that was the dictionary - print id," ",ck_id - - alternation_segs=[] - alternation_gaps=[] - i=0 - if len(list_tuples_sorted_dict)>1: # if there is only one behavior, no alternation calculations - for item in list_tuples_sorted_dict: - - print item[0], item[1] - type_segment=item[1] - if i >0: - if old_type_segment == type_segment: - if type_segment == "gap": - alternation_gaps.append(0.) - else: - alternation_segs.append(0.) - else: - alternation_gaps.append(1.) - - - alternation_segs.append(1.) 
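Aside (sketch, not part of the patch): the alternation bookkeeping in the hunk above walks one user's segments in start_day order and, for each consecutive pair, records 1 when the segment type changes and 0 when it repeats, keeping gap repeats and fitted-segment repeats in separate tallies (a change counts toward both) before averaging. A simplified stand-alone version that folds the two tallies into one:

def alternation(types):
    # types: segment types ordered by start_day, e.g. ["linear", "gap", "linear"]
    changes = [0.0 if prev == cur else 1.0
               for prev, cur in zip(types, types[1:])]
    return sum(changes) / len(changes) if changes else None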
- i+=1 - old_type_segment=type_segment - - print " average gap alternation:",numpy.mean(alternation_gaps)," number of cuts:",len(alternation_gaps) - print " average segment alternation:",numpy.mean(alternation_segs)," number of cuts:",len(alternation_segs) + raw_input() - - if len(alternation_gaps)>0: - list_gap_alternation_values.append(numpy.mean(alternation_gaps)) - else: - print "empty alternation gap list" - - if len(alternation_segs)>0: - list_seg_alternation_values.append(numpy.mean(alternation_segs)) - else: - print "empty alternation segment list" - - else: - print " single segment user" - print "# users with enough wi:",num_users_with_enough_wi, " # users with valid segments: ",num_users_with_valid_segments - print "\n" + list_num_gaps_per_user.append(len(result4)) + num_gaps+=len(result4) -#list_sorted_dict=[(u'Weiss', 5.0), (u'Wunderink', 5.0), (u'Keller', 4.0), (u'Go', 3.0), (u'Cuttica', 3.0), (u'Rosario', 2.0), (u'Radigan', 2.0), (u'Smith', 2.0), (u'RosenbergN', 2.0), (u'Gillespie', 1.0), (u'Osher', 1.0), (u'Mutlu', 1.0), (u'Dematte', 1.0), (u'Hawkins', 1.0), (u'Gates', 1.0)] - - - filename4="../Results/More_summary_statistics_cutting_time_series.dat" - file4=open(filename4,'wt') - print >> file4, "\n\nSummary results cutting time series (applying dynamics programing first for frequencies, and thento get the fits for the different trends):\n\n" + + print >> file4, "Summary results cutting time series:\n\n" - print >> file4,"Total number of users:",num_users - print >> file4," Number users with at least 20 weigh-ins:", num_users_with_enough_wi - print >> file4," and with at least one valid segment:",num_users_with_valid_segments,"(all statistics on this dataset)\n" - - print >> file4,"Total number of segments:", sum(list_num_segments_per_user), ", of average length:", numpy.mean(list_lengths),"days" - print >> file4,"Average number of segments per individual:",numpy.mean(list_num_segments_per_user),"\n" - + print >> file4,"Total number of users:",num_users,", Number users with at least 20 weigh-ins:",num_valid_users + print >> file4,"Number of segments:", sum(list_num_segments_per_user) #not including one-point segments + print >> file4,"Average number of segments per individual:",numpy.mean(list_num_segments_per_user) print >> file4,"Number of one-point segments:",num_isolated print >> file4, "Number segments by type:" - print >> file4, " Linear: ",num_lin_segments, ", of average length:", numpy.mean(list_lin_lengths ),"days" - print >> file4, " Constant: ",num_const_segments, ", of average length:", numpy.mean(list_const_lengths ),"days" - print >> file4, " Exponential: ",num_exp_segments, ", of average length:", numpy.mean(list_exp_lengths ),"days","\n" - - print >> file4, "Total number of gaps:",num_gaps - print >> file4,"Average number of gaps per individual:",numpy.mean(list_num_gaps_per_user),', of average length:',numpy.mean(list_gap_lengths),"days (number of users with at least one gap:",num_users_with_gaps,")\n" + print >> file4, " Linear: ",num_lin_segments + print >> file4, " Exponential: ",num_exp_segments,"\n" + print >> file4, "Number of gaps:",num_gaps + print >> file4,"Average number of segments per individual:",numpy.mean(list_num_gaps_per_user) - print >> file4,"Average gap alternation:",numpy.mean(list_gap_alternation_values) - print >> file4,"Average segment alternation:",numpy.mean(list_seg_alternation_values),"\n" - - file4.close() diff --git a/plot-freq-vs-fit-cuts.py b/plot-freq-vs-fit-cuts.py new file mode 100644 index 0000000..2cdb26d --- /dev/null 
+++ b/plot-freq-vs-fit-cuts.py @@ -0,0 +1,253 @@ +""" +The name of this script is no longer appropriate. This script uses PyGrace to +make plots of the fit cuts, which have been informed by the weigh-in gaps. This +script takes a single argument, the minimum number of weigh-ins that you wish to +include in the analysis. For example, this will run the analysis and generate +the plots for all users who have 400 or more weigh-ins: + + python plot-freq-vs-fit-cuts.py 400 + +""" +######### +# Setup # +######### + +N_plots_per_panel = 15 + +# Set up the database connection: +from database import * # module in PYTHONPATH +database = "calorie_king_social_networking_2010" +server = "tarraco.chem-eng.northwestern.edu" +user = "calorieking" +passwd = "n1ckuDB!" +db = Connection(server, database, user, passwd) + +# Get the number of weigh-ins from the scrip's command-line invocation +import sys +if len(sys.argv) < 2: + raise AssertionError('You must provide a minimum number of weigh-ins') +N_weigh_ins = sys.argv[1] + +# Get a list of applicable users sorted by decreasing variance. The return value +# from this query is a list of dictionaries. +count_query_list = db.query(""" + SELECT + C.ck_id, + C.n_weigh_ins / MSE.mse AS variance + FROM + weigh_in_counts C + JOIN ( + SELECT ck_id, sum(quality) AS mse + FROM weigh_in_cuts + GROUP BY ck_id + ) MSE + ON (C.ck_id = MSE.ck_id) + WHERE C.n_weigh_ins > %s + ORDER BY variance DESC + +""" % (N_weigh_ins)) +# This is the old query that didn't pay attention to the variance: +# SELECT ck_id +# FROM weigh_in_counts +# WHERE n_weigh_ins >= %s + +# Kick out bad users: --> skipping this for now +#import baduserslist + +# Make sure we got at least one user: +if len(count_query_list) == 0: + raise ValueError('Did not get any users with more than %s weigh-ins' % (N_weigh_ins)) + +# The on_day values are datetime objects, and we'll need to do some arithmetic +from datetime import * + +# Needed to compute the curve for the exponential fit: +import math + +# Needed, obviously, for plotting +from PyGrace.grace import Grace +from PyGrace.Styles.el import * + +# Create multi-panel pages: +from PyGrace.Extensions.panel import Panel, MultiPanelGrace + +######################################## +# A function that creates output files # +######################################## +# The resulting individual postscript files are eventually concatenated into +# one large file; this simply generates one file per page. 
+ +def make_output_for (grace, panel): +# for g in grace.graphs: +# g.world.ymax = 500 + grace.automulti(width_to_height_ratio=1.0,hgap=0.05,vgap=0.15, + hoffset=(0.1,0.05),voffset=(0.05,0.1)) + grace.scale_suffix(0.3,"major_size") + grace.scale_suffix(0.3,"minor_size") + grace.scale_suffix(0.3,"char_size") + grace.write_file('%03d.ps' % panel) + +n_plots_this_panel = 0 +n_panels = 0 +grace = MultiPanelGrace() + +################################## +# Loop over all applicable users # +################################## + +for count_query in count_query_list: + n_plots_this_panel += 1 + ck_id = count_query['ck_id'] + short_id = ck_id[:10] + +# # Skip bad users: --> skipping this for now +# if (baduserslist.is_bad_user(short_id)): +# print "Skipping %s: %s" % (short_id,baduserslist.why_bad_user(short_id)) +# continue + + print "Working with user %s (%s); variance %f" % \ + (short_id, ck_id, count_query['variance']) + + # Create the graph object and set the (sub)title + graph = grace.add_graph(Panel) + graph.subtitle.text = str(short_id) + graph.subtitle.size = 1 + + # Set the x and y axes to decimal format (rather than scientific notation) + graph.xaxis.ticklabel.configure(format='decimal') + graph.yaxis.ticklabel.configure(format='decimal') + + #################################### + # Pull the actual time series data # + #################################### + + time_series = db.query(""" + SELECT on_day,weight + FROM weigh_in_history + WHERE ck_id = "%s" + ORDER BY on_day + """ % (ck_id)) + + # Calculate the 'days since first weigh-in' + days = [entry['on_day'] for entry in time_series] + first_day = days[0] + days = [int((day - first_day).days) for day in days] + + # Calculate the percentage weight change + weights = [entry['weight'] for entry in time_series] + # Get the first nonzero weight. Not a problem if we've removed zeroes, + # but here in case we decide to view such time series: + first_weight = [w for w in weights if w > 0][0] + pct_changes = [(w - first_weight) / first_weight * 100 for w in weights] + # Filter out weigh-ins of zero weight, or enormous values: + #t_series = filter(lambda x: x[1] > -100 and x[1] < 200, zip (days, pct_changes)) + t_series = zip (days, pct_changes) + + #################################### + # Shaded background indicates gaps # + #################################### + + freq_cuts = db.query(""" + SELECT start_day,end_day + FROM gaps_by_frequency + WHERE ck_id LIKE "%s" + ORDER BY start_day + """ % (ck_id)) + + y_max = max(pct_changes) + for freq_cut in freq_cuts: + # add a dataset from the upper left to the upper right corner of the + # grey area: + background = graph.add_dataset([(freq_cut['start_day'], y_max), \ + (freq_cut['end_day'], y_max)], legend = '') + background.symbol.shape = 0 # no symbols + background.line.style = 0 # hide lines ... + background.line.pattern = 0 # ... leave ... + background.line.color = 1 # ... only ... + background.line.linewidth = 1 # ... 
shades + + # This sets the filling type as "to_baseline", baseline as "x-axis" and + # shade color as lightgrey + background.fill.type = 2 + background.fill.color = 7 # lightgrey + background.baseline.type = 3 + + ############################### + # Plot the actual time series # + ############################### + + # Add the data to the graph + dataset = graph.add_dataset(t_series, ElCircleDataSet, 1) + dataset.symbol.size = 0.2 + graph.autoscale() + + + ################### + # The fit results # + ################### + + # Plot the fit curves using alternating colors + fits = db.query(""" + SELECT fit_type,start_day,stop_day,param1,param2,param3,quality + FROM weigh_in_cuts + WHERE ck_id = "%s" + ORDER BY start_day + """ % (ck_id)) + + line_color = 2 # alternate line colors + + for fit in fits: + # draw lines/curves for linear/exponential fits + xs = range (fit['start_day'], fit['stop_day']) + if fit['fit_type'] == 'lin': + ys = [fit['param1'] + fit['param2'] * x for x in xs]; + else: + ys = [fit['param1'] + fit['param2'] * math.exp(x * fit['param3']) \ + for x in xs]; + + fit_curve = graph.add_dataset(zip (xs, ys), ElLineDataSet, line_color) + fit_curve.line.configure(linewidth = 2.0) + + # Alternate colors: + line_color = 3 if line_color == 2 else 2 + + ########################### + # Force a common y extent # + ########################### + + # Make all graphs have a y-extent of at least 40 + y_min = min(pct_changes) + y_max = max(pct_changes) + if y_max - y_min < 30: + extra = 30 - (y_max - y_min) + graph.world.ymin = y_min - extra/2 + graph.world.ymax = y_max + extra/2 + + + + ############################ + # Save the panel to a file # + ############################ + + if n_plots_this_panel == 15: + # print the grace to a file (.agr format) + n_panels += 1 + make_output_for(grace, n_panels) + # Set up the grace object for the next round + n_plots_this_panel = 0 + grace = MultiPanelGrace() + +# Finish by flushing out the last panel if it has any plots +if n_plots_this_panel > 0: + n_panels += 1 + make_output_for(grace, n_panels) + +####################################### +# Finally, merge all the panel images # +####################################### + +import subprocess +# Merge the ps files +subprocess.call("psmerge -oimages.ps [0-9][0-9][0-9].ps", shell=True) # merge all individuals pages into a single figure +# Remove individual .ps files +subprocess.call("rm [0-9][0-9][0-9].ps", shell=True) # remove all individual figures-pages diff --git a/plot_histogram_DW_score_cutting_time_series.py b/plot_histogram_DW_score_cutting_time_series.py new file mode 100644 index 0000000..0c61c28 --- /dev/null +++ b/plot_histogram_DW_score_cutting_time_series.py @@ -0,0 +1,156 @@ +import csv +import sys +from PyGrace.grace import Grace +from PyGrace.colors import ColorBrewerScheme +def main (): + + + + + + + filename1="./Results/Distribution_DW_scores_lin_segments.dat" + # filename2="./Results/Distribution_DW_scores_const_segments.dat" + filename3="./Results/Distribution_DW_scores_exp_segments.dat" + + + list_filenames=[filename1,filename3] + + + + xs=[] + ys=[] + for filename in list_filenames: + + file=open(filename,'r') + lista_lines=file.readlines() # each element is a line of the file STRING TYPE!! 
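Aside (sketch, not part of the patch): plot-freq-vs-fit-cuts.py above draws each fit directly from the parameters stored in weigh_in_cuts: 'lin' rows are evaluated as param1 + param2*x, everything else as the exponential param1 + param2*exp(param3*x). The same evaluation as a stand-alone function:

import math

def fit_curve(fit):
    # fit: one row of weigh_in_cuts with fit_type, start_day, stop_day, param1..param3
    xs = range(fit['start_day'], fit['stop_day'])
    if fit['fit_type'] == 'lin':
        ys = [fit['param1'] + fit['param2'] * x for x in xs]
    else:
        ys = [fit['param1'] + fit['param2'] * math.exp(fit['param3'] * x) for x in xs]
    return zip(xs, ys)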
+ + + listX=[] + listY=[] + + + + + + for line in lista_lines: + + elements=[] + items=line.strip('\n').split(' ') +# print items + + listX.append(float(items[0])) + # listY.append(float(items[1])) # Bin_count/(Total_count*Bin_size) + listY.append(float(items[5])) #Bin_count/Total_count + + xs.append(listX) + ys.append(listY) + + + + list_yLegends=["Lin. segments","Exp. segments"] + + xTitle="DW score" + yTitle="P(DW score)" + title="" #empty + subtitle="" #empty + filename="./Results/Distribution_DW_score_lin_exp_segments.eps" + + colors = [1,2,3] + + + realmultilinegraph(xs,ys,xTitle,yTitle,list_yLegends,filename,colors) + + print "\n printed out:",filename + + + + + +############################ + +#################################### + + +def realmultilinegraph(xs,ys,xTitle,yTitle,list_yLegends,filename, colors): +#ys is a list of y-series +#xs is a list of x-series + + + +#1:black +#2:red +#3: light green +#4:dark blue +#5:yellow +#6:light brown +#7:grey +#8:purple +#9:cyan +#10:pink +#11:orange +#12: purple2 +#13:maroon +#14:cyan2 +#15:dark green + + grace = Grace() + graph = grace.add_graph() + # graph.title.text = graphTitle + graph.title.size = 1.2 + #graph.subtitle.text = subtitle + graph.subtitle.size = .6 + + graph.xaxis.label.text = xTitle + graph.xaxis.label.char_size = 2 + graph.xaxis.ticklabel.format = 'Decimal' + graph.xaxis.ticklabel.char_size = 2 + graph.xaxis.ticklabel.prec = 2 + + graph.yaxis.label.text = yTitle + graph.yaxis.label.char_size = 2 + graph.yaxis.ticklabel.char_size = 2 + graph.yaxis.ticklabel.format = 'Decimal' + graph.yaxis.ticklabel.prec = 2 + + + + + + for [i,y] in enumerate(ys): # enumerate([j,u,l,i,a]) returs a list of tuples: [(0,j),(1,u),(2,l),(3,i),(4,a)] + # zip([a,b,c],[1,2,3]) returns [(a,1),(b,2),(c,3)] + dataset = graph.add_dataset(zip(xs[i],y),legend=list_yLegends[i]) + dataset.symbol.shape = 1 + dataset.symbol.size = 0.7 + dataset.symbol.color = colors[i] + dataset.symbol.fill_color = colors[i] + dataset.line.color = colors[i] + if "Training" in list_yLegends[i]: + dataset.symbol.fill_color=0 + + graph.legend.char_size = 1. + graph.legend.box_linestyle=0 # NO legend box + graph.legend.loc = (.8,.8) # coordinate system from 0 to 1 for both axes (NOT world coordenates, but scaled) + + + + + + + grace.autoscale() + grace.write_file(filename) + + + + + +###################################### +if __name__ == '__main__': + #if len(sys.argv) > 1: + # graph_filename = sys.argv[1] + + main() + # else: + # print "Usage: python script.py path/network.gml" + + diff --git a/plot_histogram_number_cuts_per_user_time_series.py b/plot_histogram_number_cuts_per_user_time_series.py new file mode 100644 index 0000000..80e2f7e --- /dev/null +++ b/plot_histogram_number_cuts_per_user_time_series.py @@ -0,0 +1,101 @@ +import csv +import sys +from PyGrace.grace import Grace +from PyGrace.colors import ColorBrewerScheme +def main (): + + + + + + + filename_actual_evol="./Results/Distribution_num_segments_per_user.dat" + file=open(filename_actual_evol,'r') + + listX=[] + listY=[] + + + + + lista_lines=file.readlines() # each element is a line of the file STRING TYPE!! 
+ + + + + for line in lista_lines: + + elements=[] + items=line.strip('\n').split(' ') + # print items + + listX.append(int(items[0])) + listY.append(float(items[1])) + + + + xTitle="Number of segments per user" + yTitle="P(number of segments)" + title="" #empty + subtitle="" #empty + filename="./Results/histogram_num_segments_per_user.png" + + + linegraph(listX,listY,xTitle,yTitle,title,subtitle,filename) + + print "\n printed out:",filename + + + + + +############################ + +def linegraph(x,y,xTitle,yTitle,title,subtitle,filename): + + data = zip(x,y) + + grace = Grace() + graph = grace.add_graph() + + graph.xaxis.label.text = xTitle + graph.xaxis.label.char_size =1.5 + graph.xaxis.ticklabel.format = 'Decimal' + graph.xaxis.ticklabel.char_size = 1 + graph.xaxis.ticklabel.prec = 0 + + graph.yaxis.label.text = yTitle + graph.yaxis.label.char_size = 1.5 + graph.yaxis.ticklabel.char_size = 1 + graph.yaxis.ticklabel.format = 'Decimal' + graph.yaxis.ticklabel.prec = 0 + + graph.title.text = title + graph.title.size = .9 + graph.subtitle.text = subtitle + graph.subtitle.size = .7 + + + dataset1 = graph.add_dataset(data) + dataset1.symbol.shape = 1 # 0: no symbol + dataset1.symbol.color =2 + dataset1.symbol.fill_color =2 + dataset1.symbol.size = 1 + + dataset1.line.color = 2 + graph.legend.box_linestyle=0 # NO legend box + + grace.autoscale() + grace.write_file(filename) + + +###################################### +if __name__ == '__main__': + #if len(sys.argv) > 1: + # graph_filename = sys.argv[1] + + main() + # else: + # print "Usage: python script.py path/network.gml" + + diff --git a/plot_scatter_plots.py b/plot_scatter_plots.py new file mode 100644 index 0000000..416011c --- /dev/null +++ b/plot_scatter_plots.py @@ -0,0 +1,110 @@ + +import sys +from PyGrace.grace import Grace +from PyGrace.colors import ColorBrewerScheme +def main (): + + + + + + what_scatterplot='exp' # 'exp' or lin + + if what_scatterplot == 'lin': + filename_actual_evol="./Results/Scatter_plot_length_slope_lin.dat" + elif what_scatterplot == 'exp': + filename_actual_evol="./Results/Scatter_plot_tau_deltaY_exp.dat" + + + file=open(filename_actual_evol,'r') + + listX=[] + listY=[] + + + lista_lines=file.readlines() # each element is a line of the file STRING TYPE!! 
+ + + + + for line in lista_lines: + + elements=[] + items=line.strip('\n').split(' ') + # print items + + listX.append(float(items[0])) + listY.append(float(items[1])) + + + + if what_scatterplot == 'lin': + xTitle="Duration segment (days)" + yTitle="Slope" + elif what_scatterplot == 'exp': + xTitle="Tau (days)" + yTitle="Delta Weight" + + + + title="" #empty + subtitle="" #empty + + + if what_scatterplot == 'lin': + filename="./Results/Scatterplot_duration_vs_slope.agr" + elif what_scatterplot == 'exp': + filename="./Results/Scatterplot_tau_vs_deltaWeight.agr" + + + + + scatterplot(listX,listY,xTitle,yTitle,title,subtitle,filename) + + print "\n printed out:",filename + + + +###################################### + +def scatterplot(x_points,y_points,xTitle,yTitle,graphTitle,subTitle,filename): + + grace = Grace() + graph = grace.add_graph() + graph.title.text = graphTitle + graph.title.size = .9 + graph.subtitle.text = subTitle + graph.subtitle.size = .7 + + graph.xaxis.label.text = xTitle + graph.xaxis.label.char_size = 1.5 + graph.xaxis.ticklabel.format = 'Decimal' + graph.xaxis.ticklabel.char_size = 1 + graph.xaxis.ticklabel.prec = 2 + + graph.yaxis.label.text = yTitle + graph.yaxis.label.char_size = 1.5 + graph.yaxis.ticklabel.char_size = 1 + graph.yaxis.ticklabel.format = 'Decimal' + graph.yaxis.ticklabel.prec = 2 + + points = graph.add_dataset(zip(x_points,y_points)) + points.symbol.shape = 1 + points.symbol.color = 2 # perimeter of the symbol + points.symbol.fill_color = 2 # inside the symbol + + points.line.color = 0 #NO LINE + + grace.autoscale() + grace.write_file(filename) + +###################################### +if __name__ == '__main__': + #if len(sys.argv) > 1: + # graph_filename = sys.argv[1] + + main() + # else: + # print "Usage: python script.py path/network.gml" + + diff --git a/plottools_pygrace_Laura.py b/plottools_pygrace_Laura.py new file mode 100644 index 0000000..c1acffe --- /dev/null +++ b/plottools_pygrace_Laura.py @@ -0,0 +1,394 @@ + #plottools.py + +''' +Goodies included in this file, with usage: + +import plottools as pt + +pt.linegraph(x,y,xTitle,yTitle,title,subtitle,filename) + Plot a simple line graph + x and y are lists of points. + +pt.linegraphwpts(x,y,xTitle,yTitle,title,subtitle,filename) + A linegraph with symbols + +pt.multilinegraph(x,ys,xTitle,yTitle,yLabel,graphTitle,subtitle,filename) + A multiple line graph where all y-series use the same x-values. + x is a list of points, ys is a list of lists. + +pt.realmultilinegraph(xs,ys,xTitle,yTitle,yLabel,graphTitle,subtitle,filename) + A multiple line graph where each series has its own x and y values. + xis is a list of lists, ys is a list of lists. + +pt.pointlinegraph(x_points,y_points,x,y,xTitle,yTitle,graphTitle,graphSubtitle,filename) + Plot data points and a line on the same graph. + +pt.pointjgraph(x_points,y_points,xTitle,yTitle,graphTitle,graphSubtitle,filename) + Plot data points + +pt.loglogpointgraph(x_points,y_points,xTitle,yTitle,graphTitle,subTitle,filename) + Plot data points on log scale. + +More code at the bottom... either not finished or very specific application. +''' + +# IMPORTS ---------------------------------------------------------------------------------- +from PyGrace.grace import Grace +from PyGrace.colors import ColorBrewerScheme + +#----------------------------------------------------------------------------------------------- + +# Series of colors to use throughout. 
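Aside (usage sketch, not part of the patch): the helpers listed in the plottools docstring above all take lists of points plus axis/title strings and write a Grace file. A hypothetical call with made-up data and a made-up output name:

import plottools_pygrace_Laura as pt   # the "plottools" module of the docstring

days = [1, 5, 9, 14, 20]
pct_change = [0.0, -1.2, -2.5, -2.1, -3.0]
pt.linegraph(days, pct_change,
             "day", "% weight change",
             "example user", "",          # title, subtitle
             "example_linegraph.agr")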
+colors = [2,4,15,14,8,11] + +#----------------------------------------------------------------------------------------------- + +def linegraph(x,y,xTitle,yTitle,title,subtitle,filename): + + data = zip(x,y) + + grace = Grace() + graph = grace.add_graph() + + graph.xaxis.label.text = xTitle + graph.xaxis.label.char_size = .8 + graph.xaxis.ticklabel.format = 'Decimal' + graph.xaxis.ticklabel.char_size = .8 + graph.xaxis.ticklabel.prec = 0 + + graph.yaxis.label.text = yTitle + graph.yaxis.label.char_size = .8 + graph.yaxis.ticklabel.char_size = .8 + graph.yaxis.ticklabel.format = 'Decimal' + graph.yaxis.ticklabel.prec = 0 + + graph.title.text = title + graph.title.size = .9 + graph.subtitle.text = subtitle + graph.subtitle.size = .7 + + + dataset1 = graph.add_dataset(data) + dataset1.symbol.shape = 0 + grace.autoscale() + grace.write_file(filename) + +#----------------------------------------------------------------------------------------------- + +def linegraphwpts(x,y,xTitle,yTitle,title,subtitle,filename): + + data = zip(x,y) + + grace = Grace() + graph = grace.add_graph() + + graph.xaxis.label.text = xTitle + graph.xaxis.label.char_size = .8 + graph.xaxis.ticklabel.format = 'Decimal' + graph.xaxis.ticklabel.char_size = .8 + graph.xaxis.ticklabel.prec = 0 + + graph.yaxis.label.text = yTitle + graph.yaxis.label.char_size = .8 + graph.yaxis.ticklabel.char_size = .8 + graph.yaxis.ticklabel.format = 'Decimal' + graph.yaxis.ticklabel.prec = 0 + + graph.title.text = title + graph.title.size = 1.2 + graph.subtitle.text = subtitle + graph.subtitle.size = .7 + + + dataset1 = graph.add_dataset(data) + dataset1.symbol.shape = 1 + dataset1.symbol.size = 1.1 + grace.autoscale() + grace.write_file(filename) + +#----------------------------------------------------------------------------------------------- + + +def multilinegraph(x,ys,xTitle,yTitle,yLabel,graphTitle,subtitle,filename): +#ys is a list of y-series + + grace = Grace() + graph = grace.add_graph() + graph.title.text = graphTitle + graph.title.size = .8 + graph.subtitle.text = subtitle + graph.subtitle.size = .6 + + graph.xaxis.label.text = xTitle + graph.xaxis.label.char_size = 1 + graph.xaxis.ticklabel.format = 'Decimal' + graph.xaxis.ticklabel.char_size = .8 + graph.xaxis.ticklabel.prec = 2 + + graph.yaxis.label.text = yTitle + graph.yaxis.label.char_size = 1 + graph.yaxis.ticklabel.char_size = .8 + graph.yaxis.ticklabel.format = 'Decimal' + graph.yaxis.ticklabel.prec = 2 + + for [i,y] in enumerate(ys): + dataset = graph.add_dataset(zip(x,y),legend=yLabel[i]) + dataset.symbol.shape = 0 + dataset.line.color = 1 + + grace.autoscale() + grace.write_file(filename) + +#----------------------------------------------------------------------------------------------- + +def realmultilinegraph(xs,ys,xTitle,yTitle,yLabel,graphTitle,subtitle,filename): +#ys is a list of y-series + + grace = Grace() + graph = grace.add_graph() + graph.title.text = graphTitle + graph.title.size = .8 + graph.subtitle.text = subtitle + graph.subtitle.size = .6 + + graph.xaxis.label.text = xTitle + graph.xaxis.label.char_size = .8 + graph.xaxis.ticklabel.format = 'Decimal' + graph.xaxis.ticklabel.char_size = .6 + graph.xaxis.ticklabel.prec = 2 + + graph.yaxis.label.text = yTitle + graph.yaxis.label.char_size = .8 + graph.yaxis.ticklabel.char_size = .6 + graph.yaxis.ticklabel.format = 'Decimal' + graph.yaxis.ticklabel.prec = 2 + + for [i,y] in enumerate(ys): + dataset = graph.add_dataset(zip(xs[i],y),legend=yLabel[i]) + dataset.symbol.shape = 0 + dataset.line.color = 
colors[i] + + graph.legend.char_size = .6 + graph.legend.loc = (.2,.75) + + grace.autoscale() + grace.write_file(filename) + + +#---------------------------------------------------------------------------------------------------- + +def pointlinegraph(x_points,y_points,x,y,xTitle,yTitle,graphTitle,graphSubtitle,filename): +#I think this adds datasets in the reverse order that I want? Fix me! + + grace = Grace() + graph = grace.add_graph() + graph.title.text = graphTitle + graph.title.size = .8 + graph.subtitle.text = graphSubtitle + graph.subtitle.size = .6 + + graph.xaxis.label.text = xTitle + graph.xaxis.label.char_size = 1 + graph.xaxis.ticklabel.format = 'Decimal' + graph.xaxis.ticklabel.char_size = .8 + graph.xaxis.ticklabel.prec = 2 + + graph.yaxis.label.text = yTitle + graph.yaxis.label.char_size = 1 + graph.yaxis.ticklabel.char_size = .8 + graph.yaxis.ticklabel.format = 'Decimal' + graph.yaxis.ticklabel.prec = 2 + + points = graph.add_dataset(zip(x_points,y_points)) + points.symbol.shape = 3 + points.line.type = 0 + + line = graph.add_dataset(zip(x,y)) + line.symbol.shape = 0 + line.line.color = 1 + + grace.autoscale() + grace.write_file(filename) + + +#---------------------------------------------------------------------------------------------------- + +def pointgraph(x_points,y_points,xTitle,yTitle,graphTitle,subTitle,filename): + + grace = Grace() + graph = grace.add_graph() + graph.title.text = graphTitle + graph.title.size = .9 + graph.subtitle.text = subTitle + graph.subtitle.size = .7 + + graph.xaxis.label.text = xTitle + graph.xaxis.label.char_size = .8 + graph.xaxis.ticklabel.format = 'Decimal' + graph.xaxis.ticklabel.char_size = .8 + graph.xaxis.ticklabel.prec = 2 + + graph.yaxis.label.text = yTitle + graph.yaxis.label.char_size = .8 + graph.yaxis.ticklabel.char_size = .8 + graph.yaxis.ticklabel.format = 'Decimal' + graph.yaxis.ticklabel.prec = 2 + + points = graph.add_dataset(zip(x_points,y_points)) + points.symbol.shape = 1 + points.line.color = 0 + + grace.autoscale() + grace.write_file(filename) + + + + #---------------------------------------------------------------------------------------------------- + +def loglogpointgraph(x_points,y_points,xTitle,yTitle,graphTitle,subTitle,filename): + + grace = Grace() + graph = grace.add_graph() + graph.title.text = graphTitle + graph.title.size = .8 + graph.subtitle.text = subTitle + graph.subtitle.size = .6 + + graph.xaxis.label.text = xTitle + graph.xaxis.label.char_size = 1 + graph.xaxis.ticklabel.format = 'Decimal' + graph.xaxis.ticklabel.char_size = .8 + graph.xaxis.ticklabel.prec = 2 + graph.xaxis.scale = 'Logarithmic' + + graph.yaxis.label.text = yTitle + graph.yaxis.label.char_size = 1 + graph.yaxis.ticklabel.char_size = .8 + graph.yaxis.ticklabel.format = 'Decimal' + graph.yaxis.ticklabel.prec = 2 + graph.yaxis.scale = 'Logarithmic' + + points = graph.add_dataset(zip(x_points,y_points)) + points.symbol.shape = 1 + points.line.color = 0 + + grace.autoscale() + grace.write_file(filename) + + + #---------------------------------------------------------------------------------------------------- + +def bargraph(x,y,xTitle,yTitle,title,subtitle,filename): + + data = zip(x,y) + + grace = Grace() + graph = grace.add_graph() + + graph.xaxis.label.text = xTitle + graph.xaxis.label.char_size = .9 + graph.xaxis.ticklabel.format = 'Decimal' + graph.xaxis.ticklabel.char_size = .7 + graph.xaxis.ticklabel.prec = 0 + + graph.yaxis.label.text = yTitle + graph.yaxis.label.char_size = .9 + graph.yaxis.ticklabel.char_size = .7 + 
graph.yaxis.ticklabel.format = 'Decimal' + graph.yaxis.ticklabel.prec = 0 + + graph.title.text = title + graph.title.size = 1 + graph.subtitle.text = subtitle + graph.subtitle.size = .8 + + dataset1 = graph.add_dataset(data,type = 'bar') + dataset1.symbol.shape = 0 + dataset1.line.type = 0 + grace.autoscale() + grace.write_file(filename) + + #---------------------------------------------------------------------------------------------------- + +def sectionedbargraph(ys,xTitle,yTitle,yLabel,title,subtitle,filename): + + grace = Grace() + graph = grace.add_graph() + + graph.xaxis.label.text = xTitle + graph.xaxis.label.char_size = .8 + graph.xaxis.ticklabel.char_size = .8 + graph.xaxis.ticklabel.format = 'Decimal' + graph.xaxis.ticklabel.prec = 0 + + + graph.yaxis.label.text = yTitle + graph.yaxis.label.char_size = .8 + graph.yaxis.ticklabel.char_size = .8 + graph.yaxis.ticklabel.format = 'Decimal' + graph.yaxis.ticklabel.prec = 0 + + graph.title.text = title + graph.title.size = .9 + graph.subtitle.text = subtitle + graph.subtitle.size = .7 + + base = 0 + for [i,y] in enumerate(ys): + ylen = len(y) + x = range(base, base + ylen) + base += ylen - 1 + dataset = graph.add_dataset(zip(x,y), type = 'bar', legend=yLabel[i]) + dataset.line.type = 0 + dataset.symbol.color = colors[i] + dataset.symbol.fill_color = colors[i] + dataset.symbol.size = .2 + + graph.legend.char_size = .6 + graph.legend.loc = (.8,.75) + + grace.autoscale() + grace.write_file(filename) + + + + #---------------------------------------------------------------------------------------------------- + +def realmultilinegraph_log(xs,ys,xTitle,yTitle,yLabel,graphTitle,subtitle,filename): +#ys is a list of y-series + + grace = Grace() + graph = grace.add_graph() + graph.title.text = graphTitle + graph.title.size = .8 + graph.subtitle.text = subtitle + graph.subtitle.size = .6 + + graph.xaxis.label.text = xTitle + graph.xaxis.label.char_size = .8 + graph.xaxis.ticklabel.format = 'Scientific' + graph.xaxis.ticklabel.char_size = .6 + graph.xaxis.ticklabel.prec = 2 + graph.xaxis.scale = 'Logarithmic' + + graph.yaxis.label.text = yTitle + graph.yaxis.label.char_size = .8 + graph.yaxis.ticklabel.char_size = .6 + graph.yaxis.ticklabel.format = 'Scientific' + graph.yaxis.ticklabel.prec = 2 + graph.yaxis.scale = 'Logarithmic' + + for [i,y] in enumerate(ys): + dataset = graph.add_dataset(zip(xs[i],y),legend=yLabel[i]) + dataset.symbol.shape = 3 + dataset.line.color = colors[i] + dataset.symbol.color = colors[i] + dataset.symbol.fill_color = colors[i] + + graph.legend.char_size = .6 + graph.legend.loc = (.2,.75) + + grace.autoscale() + grace.write_file(filename) + + diff --git a/queries_weight_ins_and_activity_histogram.py b/queries_weight_ins_and_activity_histogram.py index bec0184..0017690 100644 --- a/queries_weight_ins_and_activity_histogram.py +++ b/queries_weight_ins_and_activity_histogram.py @@ -65,8 +65,6 @@ def main (): total_histogr_interevent_act=[0]*10000 - cum_histogr_interevent_act=[0]*10000 - cont_interevent=0. 
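Aside (sketch, not part of the patch): the multi-series helpers in plottools_pygrace_Laura.py above colour datasets with colors[i], so a plot with more series than the six palette entries raises IndexError. A hypothetical wrap-around variant of that lookup:

colors = [2, 4, 15, 14, 8, 11]

def color_for(i):
    # cycle through the palette instead of indexing past its end
    return colors[i % len(colors)]

# inside the plotting loop:  dataset.line.color = color_for(i)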
control=0 @@ -171,17 +169,11 @@ def main (): - for i in range(len(total_histogr_interevent_act)): - for j in range(len(total_histogr_interevent_act)): - if j>=i: - cum_histogr_interevent_act[i]+=total_histogr_interevent_act[j] - - name="temporal_series/most_weigh_ins/histogram_interevent_activity.dat" file = open(name,'wt') for i in range(len(total_histogr_interevent_act)): if total_histogr_interevent_act[i]!=0: - print >> file, i, total_histogr_interevent_act[i]/float(cont_interevent), cum_histogr_interevent_act[i]/float(cont_interevent) + print >> file, i, total_histogr_interevent_act[i]/float(cont_interevent) file.close() @@ -190,7 +182,7 @@ def main (): exit() -########################################################## + ######################################################## sorted_list_of_dict=[] for i in range(top): diff --git a/rearrange_fake_time_series.py b/rearrange_fake_time_series.py new file mode 100644 index 0000000..15acab9 --- /dev/null +++ b/rearrange_fake_time_series.py @@ -0,0 +1,66 @@ +#! /usr/bin/env python + + +import sys +import os +import numpy +import csv +from matplotlib import mlab + +def main(): + + + + input_name="/home/staff/julia/at_Northwestern/calorieking/time_series/fake_time_series/Batch_830054_batch_results_just_time_series.csv" + resultado= csv.reader(open(input_name, 'rb'), delimiter=',')#, quotechar='"') + + + file1=open("./fake_time_series/all_rearrange_fake_time_series.dat",'wt') + + cont_valid_series=0 + cont_lines=-1 + for row in resultado: + + cont_lines+=1 + if cont_lines>0: # the first line is just the headers + + file2=open("./fake_time_series/rearrange_fake_time_series"+str(cont_lines)+".dat",'wt') + + cont_week=1 + list_weigh_ins=[] + for item in row: + try: + if float(item)>=200. and float(item)<=400. : # IS IT OK TO JUST REMOVE BAD DATA FOR PART OF A TIME SERIES??? 
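Aside (sketch, not part of the patch): the cumulative inter-event histogram dropped in the hunk above was filled with a nested loop over all bins; if it is ever reinstated, the same complementary cumulative falls out of a single reverse pass:

def survival_from_histogram(counts, total):
    # fraction of inter-event times >= i, computed in one pass instead of the
    # O(n^2) double loop removed above
    survival = [0.0] * len(counts)
    running = 0
    for i in range(len(counts) - 1, -1, -1):
        running += counts[i]
        survival[i] = running / float(total)
    return survival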
+ pair_week_weight=[] + pair_week_weight.append(cont_week) + pair_week_weight.append(float(item)) + + list_weigh_ins.append(pair_week_weight) + else: + print "bad data in line", cont_lines+1,"week",cont_week + + except ValueError: + print "empty value in line:",cont_lines+1 + + cont_week+=1 + + + print "line:",cont_lines+1 + if len(list_weigh_ins)>2: + for pair in list_weigh_ins: + print pair[0],pair[1] + print >> file1,pair[0],pair[1] + print >> file2,pair[0],pair[1] + + print >> file1,"\n" + cont_valid_series+=1 + else: + print "too short a time series" + #raw_input() + print "total # of valid time series:",cont_valid_series +#################################### +###################################### +if __name__ == '__main__': + + main() + diff --git a/segmentation_time_series.py b/segmentation_time_series.py index a3b1c1b..db7c085 100644 --- a/segmentation_time_series.py +++ b/segmentation_time_series.py @@ -24,7 +24,7 @@ def main (): - significance_threshold=0.95 + significance_threshold=0.95 # value to accept a cut (set to this in Luis's paper) for index_file in range(50): index_file+=1 @@ -68,6 +68,9 @@ def main (): num_lines=len(values_time_serie) + + +# LOOP OVER THE SERIES, TO FIND THE OPTIMUM CUTTING POINT: t_max=0.0 index_max_t=0 diff --git a/segmentation_time_series_frequencies_iterative.py b/segmentation_time_series_frequencies_iterative.py index 5b3f20d..6324aec 100644 --- a/segmentation_time_series_frequencies_iterative.py +++ b/segmentation_time_series_frequencies_iterative.py @@ -26,9 +26,9 @@ def main (): significance_threshold=0.95 - min_lenght=10 # to cut the series + min_lenght=10 # (# w-ins) to cut the series - top=50 #max: 8921 with filters + top=50 #max: 8921 with filters , all sorted from longest (in # w-ins) to shortest record list_all_average_frequencies=[] histogram_all_freq_no_averaged=[0]*1000 diff --git a/segmentation_time_series_frequencies_iterative_meaningful_threshold.py b/segmentation_time_series_frequencies_iterative_meaningful_threshold.py index fa34f83..7e5d1e3 100644 --- a/segmentation_time_series_frequencies_iterative_meaningful_threshold.py +++ b/segmentation_time_series_frequencies_iterative_meaningful_threshold.py @@ -2,7 +2,7 @@ #! /usr/bin/env python """ -Created by Julia Poncela in October 2011 +Created by Julia Poncela of October 2011 Given a file for a non-stationary time serie, it calculates the optimum points to cut it, that mark different trends. @@ -25,10 +25,15 @@ def main (): + + + + + significance_threshold=0.95 min_lenght=10 # to cut the series (in terms of num of points) - top= 8921 #max: 8921 with filters + top= 8924 #max: 8924 with filters freq_threshold=0.2 # meaningful threshold for (freq1-freq2)/Average(freq1,freq2) @@ -87,19 +92,19 @@ def main (): list=line.split(" ") - + ck_id=list[8] vector=[] try: vector.append(float(list[4])) #day - vector.append(float(list[9])) #frequency + vector.append(float(list[7])) #frequency vector.append(float(list[2])) #%weight change - list_values_for_average.append(float(list[9])) + list_values_for_average.append(float(list[7])) - index=int(round(float(list[9]))) #for the histogram of frequencies (not averaged) + index=int(round(float(list[7]))) #for the histogram of frequencies (not averaged) histogram_all_freq_no_averaged[index]+=1 num_events_all_freq_no_averaged+=1. 
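Aside (sketch, not part of the patch): the "meaningful" condition used by the frequency segmentation above accepts a cut only when the two mean frequencies differ by at least freq_threshold (0.2) of their average:

def is_meaningful_cut(mu1, mu2, freq_threshold=0.2):
    # relative difference of the two segment mean frequencies, as in the scripts above
    return abs(mu1 - mu2) / ((mu1 + mu2) / 2.0) >= freq_threshold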
@@ -251,7 +256,7 @@ def main (): - print >> file2, "\n" + print >> file2, "\n" # to separate segments of the same time series if cut == list_cut_times[-1]: @@ -265,6 +270,9 @@ def main (): cut_inferior=cut + + + else: # if only one frequency for the whole serie for vector in list_vectors: print >> file2, vector[0],vector[1],vector[2] diff --git a/segmentation_time_series_frequencies_iterative_meaningful_threshold_EDIT_DB.py b/segmentation_time_series_frequencies_iterative_meaningful_threshold_EDIT_DB.py new file mode 100644 index 0000000..ad47c43 --- /dev/null +++ b/segmentation_time_series_frequencies_iterative_meaningful_threshold_EDIT_DB.py @@ -0,0 +1,400 @@ + +#! /usr/bin/env python + +""" +Created by Julia Poncela of October 2011 + +Given a file for a non-stationary time serie, it calculates the optimum points to cut it, that mark different trends. + +More info: It follows the method proposed by Fukuda, Stanley and Amaral PRL 69, 2004. + + + +""" + + +import sys +import os +from datetime import * +import math +import numpy +from scipy import stats +from database import * #package to handle databases + + + + +def main (): + + + + database = "calorie_king_social_networking_2010" + server="tarraco.chem-eng.northwestern.edu" + user="calorieking" + passwd="n1ckuDB!" + + db= Connection(server, database, user, passwd) + + + db.execute ("DROP TABLE IF EXISTS frequency_cuts") #i remove the old table + + + #i create a new table in an existin DB + db.execute (""" + CREATE TABLE frequency_cuts + ( + file_index INT, + ck_id CHAR (200), + start_day INT, + end_day INT, + mean_freq FLOAT, + std_freq FLOAT, + tot_num_cuts INT + + + ) + """) # if i use triple quotation marks, i can have jumps of line no problem, but not with single ones + + + + + + significance_threshold=0.95 + min_lenght=10 # to cut the series (in terms of num of points) + + top=8924 #max: 8924 with filters, 50 for the top50 longest series + + + freq_threshold=0.2 # meaningful threshold for (freq1-freq2)/Average(freq1,freq2) + + + list_all_average_frequencies=[] + num_events_all_freq_no_averaged=0. + + + + + file22=open("temporal_series/most_weigh_ins/number_segments_acording_to_frequencies.dat",'wt') + + + + + for index_file in range(top): + index_file+=1 + + + list_average_frequencies_one_user=[] + num_events_indiv=0. + + file33=open("temporal_series/most_weigh_ins/average_frequencies_segments_"+str(index_file)+".dat",'wt') + + + + +#input file: + #file_name="temporal_series/most_weigh_ins/weigh_in_time_serie_days"+str(index_file)+"_top50" + file_name="temporal_series/most_weigh_ins/weigh_in_time_serie_days"+str(index_file)+"_filters" + + + + + file=open(file_name+".dat",'r') + list_lines_file=file.readlines() + + + + + file2=open(file_name+"_frequencies_t_points_segments"+str(min_lenght)+"_threshold.dat",'wt') + + + + + list_vectors=[] + list_series=[] + list_values_for_average=[] # to do the average if the whole serie is just one piece + + cont=0 + for line in list_lines_file: + if cont>0: # i skip the first line,cos it doesnt have an associated freq. + + list=line.split(" ") + + ck_id=list[8] + + vector=[] + + # try: + + vector.append(float(list[4])) #day + vector.append(float(list[7])) #frequency + vector.append(float(list[2])) #%weight change + + list_values_for_average.append(float(list[7])) + + index=int(round(float(list[7]))) #for the histogram of frequencies (not averaged) + num_events_all_freq_no_averaged+=1. + + num_events_indiv+=1. 
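Aside (reference sketch, not part of the patch): the per-line parsing above relies on a fixed column layout of the weigh_in_time_serie_days*_filters.dat files. A small reader for the same columns, with the indices taken from the code above:

def parse_series_line(line):
    # column layout used above: col 2 = % weight change, col 4 = day,
    # col 7 = frequency, col 8 = ck_id (the file's first line has no frequency)
    cols = line.split(" ")
    return {"day": float(cols[4]),
            "freq": float(cols[7]),
            "pct_change": float(cols[2]),
            "ck_id": cols[8].strip()}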
+ + + + + vector.append(0.0) # here it will go the value for t + list_vectors.append(vector) + cont+=1 + end=list_vectors[-1][0] #last day + + num_lines=len(list_vectors) + + list_series.append(list_vectors) + + + list_cut_times=[] + + for list_evolution in list_series: # i analyze the current time serie (or fragment of it) + + + num_points=len(list_evolution) + + + if num_points>=min_lenght: # if the serie is too short, i wont cut it any further + + t_max=0.0 + + for index in range(num_points): + + if index>=1 and index < num_points-1: # to cut the serie, at least need one point in each list + + list1=[] #first segment of the list + list2=[] #second segment of the list + + + for x1 in range(num_points): + + if x1 <= index: + list1.append(list_evolution[x1][1]) #only the list of values (not times!) + else: + list2.append(list_evolution[x1][1]) + + + + mu1=numpy.mean(list1) + mu2=numpy.mean(list2) + + sd1=numpy.std(list1) + sd2=numpy.std(list2) + + N1=float(len(list1)) + N2=float(len(list2)) + + S_D=math.sqrt(((N1-1)*sd1*sd1 + (N2-1)*sd2*sd2)/(N1+N2-2))*math.sqrt(1.0/N1 + 1.0/N2) + t=math.fabs((mu1-mu2)/S_D) + + + + + list_evolution[index][3]=t + + + + if t >= t_max: + t_max=t + index_max_t=index + time_max=list_evolution[index][0] + + + segment1=[] + segment2=[] + for x2 in range(num_points): # i save the definitive two segments + + if x2 <= index_max_t: + segment1.append(list_evolution[x2]) #list of events (time, value,t) + else: + segment2.append(list_evolution[x2]) + + + + + + eta=4.19*math.log(float(num_points))-11.54 + delta=0.40 + nu=float(num_points)-2.0 + + a=delta*nu #for the Incomplete beta function + b=delta + x=nu/(nu+t_max*t_max) + I=stats.mstats.betai(a,b,x) + + #print eta, nu, a, b,x,I + try: + significance_t_max=math.pow((1.0-I),eta) #Return x raised to the power y. + except ValueError: + significance_t_max=0. #in case I=1.0 + + + # print "independently of the signif. and meaninfl",mu1, mu2, abs(mu1-mu2)/((mu1+mu2)/2.) + + if significance_t_max>significance_threshold and (abs(mu1-mu2)/((mu1+mu2)/2.) >= freq_threshold or len(list_cut_times) ==0) : #so i allow the first cut always + if len(segment1)>min_lenght and len(segment2)>min_lenght: + + #print "file:",index_file,"max_t:", t_max, "at time:",list_evolution[index_max_t][0],"significance:",significance_t_max," I:",I,"eta:",eta,"nu:",nu,"x:",x,"a:",a,"N:",num_points, "diff freq:",abs(mu1-mu2)/((mu1+mu2)/2.),mu1,mu2 + + list_series.append(segment1) # next i will analyze the two segments independently + list_series.append(segment2) + + #print " ",len(segment1), len(segment2) + + + list_cut_times.append(time_max) + + if round(mu1) not in list_average_frequencies_one_user: # for the histogram of frequencies + list_average_frequencies_one_user.append(round(mu1)) + if round(mu2) not in list_average_frequencies_one_user: + list_average_frequencies_one_user.append(round(mu2)) + + + else: + pass + # print " file:",index_file,"max_t:", t_max, "at time:",list_evolution[index_max_t][0],"NON significant!:",significance_t_max," I:",I,"eta:",eta,"nu:",nu,"x:",x,"a:",a,"N:",num_points + + + + + + +########################################################### +# print out the cut time series to files and add info to DB: +########################################################## + + start=0. 
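Aside (sketch, not part of the patch): the inner loop above is the Fukuda/Stanley/Amaral procedure: a pooled-standard-deviation t statistic for every candidate cut point, and an approximate significance for the best one; the cut is kept when that significance exceeds 0.95 and the frequency difference is meaningful. Restated as stand-alone functions, with scipy.special.betainc playing the role of the stats.mstats.betai call used above:

import math
import numpy
from scipy.special import betainc   # regularized incomplete beta function

def t_statistic(values1, values2):
    # pooled-standard-deviation t between the two candidate segments, as above
    mu1, mu2 = numpy.mean(values1), numpy.mean(values2)
    sd1, sd2 = numpy.std(values1), numpy.std(values2)
    n1, n2 = float(len(values1)), float(len(values2))
    pooled = ((n1 - 1) * sd1 ** 2 + (n2 - 1) * sd2 ** 2) / (n1 + n2 - 2)
    s_d = math.sqrt(pooled) * math.sqrt(1.0 / n1 + 1.0 / n2)
    return math.fabs((mu1 - mu2) / s_d)

def cut_significance(t_max, num_points):
    # approximate significance of the best cut, same constants as above
    eta = 4.19 * math.log(float(num_points)) - 11.54
    delta = 0.40
    nu = float(num_points) - 2.0
    I = betainc(delta * nu, delta, nu / (nu + t_max * t_max))
    try:
        return math.pow(1.0 - I, eta)
    except ValueError:   # I == 1.0
        return 0.0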
+ list_cut_times=sorted(list_cut_times) + #if len(list_cut_times)>0: + print "\n\nfile:",index_file,"number of cutting points:",len(list_cut_times),"namely:",list_cut_times + + cut_inferior=0.0 + + if len(list_cut_times)!=0: + for cut in list_cut_times: + + + list_freq_that_segment=[] #list of freq only of that segment + for vector in list_vectors: + + if vector[0]>= cut_inferior and vector[0]<= cut: + print >> file2, vector[0],vector[1],vector[2] + list_freq_that_segment.append(vector[1]) + + + print >> file2, "\n" # to separate segments of the same time series + + + + print str(index_file),str(start),str(cut),str(numpy.mean(list_freq_that_segment)), str(numpy.std(list_freq_that_segment)) ,str(len(list_cut_times)), len(list_freq_that_segment) + + db.execute (""" + INSERT INTO frequency_cuts (file_index, ck_id, start_day, end_day, mean_freq, std_freq, tot_num_cuts) + VALUES (%s, %s, %s,%s, %s, %s, %s) + """, str(index_file), str(ck_id),str(start),str(cut),str(numpy.mean(list_freq_that_segment)), str(numpy.std(list_freq_that_segment)) ,str(len(list_cut_times)) ) + + + + + + + if cut == list_cut_times[-1]: #for the last segment (also if there is just one cut) + list_freq_that_segment=[] + for vector in list_vectors: + if vector[0]> cut: + print >> file2, vector[0],vector[1],vector[2] + list_freq_that_segment.append(vector[1]) + + + #i write the freq. segments info into the DB + + print str(index_file),str(cut),str(list_vectors[-1][0]),str(numpy.mean(list_freq_that_segment)), str(numpy.std(list_freq_that_segment)) ,str(len(list_cut_times)), len(list_freq_that_segment) + + db.execute (""" + INSERT INTO frequency_cuts (file_index, ck_id, start_day, end_day, mean_freq, std_freq, tot_num_cuts) + VALUES (%s, %s, %s,%s, %s, %s, %s) + """, str(index_file), str(ck_id),str(cut),str(list_vectors[-1][0]),str(numpy.mean(list_freq_that_segment)), str(numpy.std(list_freq_that_segment)) ,str(len(list_cut_times)) ) + + + + print >> file2, "\n" + + cut_inferior=cut + + start=cut + + + else: # if only one frequency for the whole serie + list_freq_that_segment=[] + + for vector in list_vectors: + print >> file2, vector[0],vector[1],vector[2] + list_freq_that_segment.append(vector[1]) + + + + print str(index_file), str(list_vectors[0][0]),str(list_vectors[-1][0]),str(numpy.mean(list_freq_that_segment)), str(numpy.std(list_freq_that_segment)) ,str(len(list_cut_times)), len(list_freq_that_segment) + + db.execute (""" + INSERT INTO frequency_cuts (file_index, ck_id, start_day, end_day, mean_freq, std_freq, tot_num_cuts) + VALUES (%s, %s, %s,%s, %s, %s, %s) + """, str(index_file), str(ck_id),str(list_vectors[0][0]),str(list_vectors[-1][0]),str(numpy.mean(list_freq_that_segment)), str(numpy.std(list_freq_that_segment)) ,str(len(list_cut_times)) ) + + + + + + + #i calculate the average frequency of the whole serie + mu=round(numpy.mean(list_values_for_average)) + if mu not in list_average_frequencies_one_user: + list_average_frequencies_one_user.append(mu) + + file2.close() + + + + + + + + for item in list_average_frequencies_one_user: + list_all_average_frequencies.append(int(item)) + + + + + + + + + # i create a file with the number of segments for every time serie (i need that info so i can plot them automatically) + + print >> file22,index_file,len(list_cut_times)+1 + + for i in list_average_frequencies_one_user: + print >> file33, i + + + + file22.close() + file33.close() + + + # print list_all_average_frequencies + + + + +######################### + +if __name__== "__main__": + + main() + + + 
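Aside (hypothetical check, not part of the patch): for this variant, which writes one frequency_cuts row per segment, each user should end up with tot_num_cuts + 1 rows. A quick consistency query against the freshly populated table:

rows = db.query("""
    SELECT ck_id, COUNT(*) AS n_segments, MAX(tot_num_cuts) AS n_cuts
    FROM frequency_cuts
    GROUP BY ck_id
    """)
for r in rows:
    if r['n_segments'] != r['n_cuts'] + 1:
        print("frequency_cuts mismatch for %s: %s segments, %s cuts"
              % (r['ck_id'], r['n_segments'], r['n_cuts']))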
+############################################## diff --git a/segmentation_time_series_frequencies_iterative_meaningful_threshold_include_gap_info_EDIT_DB.py b/segmentation_time_series_frequencies_iterative_meaningful_threshold_include_gap_info_EDIT_DB.py new file mode 100644 index 0000000..c0ffb68 --- /dev/null +++ b/segmentation_time_series_frequencies_iterative_meaningful_threshold_include_gap_info_EDIT_DB.py @@ -0,0 +1,576 @@ + +#! /usr/bin/env python + +""" +Created by Julia Poncela of October 2011 + +Cut a time series according to diff. frequency trends. Also, takes into account gap info (table from the database) + +More info: It follows the method proposed by Fukuda, Stanley and Amaral PRL 69, 2004. + + + +""" + + +import sys +import os +from datetime import * +import math +import numpy +from scipy import stats +from database import * #package to handle databases + + + + +def main (): + + + + database = "calorie_king_social_networking_2010" + server="tarraco.chem-eng.northwestern.edu" + user="calorieking" + passwd="n1ckuDB!" + + db= Connection(server, database, user, passwd) + + + db.execute ("DROP TABLE IF EXISTS frequency_cuts") #i remove the old table + + + #i create a new table in an existin DB + db.execute (""" + CREATE TABLE frequency_cuts + ( + file_index INT, + ck_id CHAR (200), + start_day INT, + end_day INT, + mean_freq FLOAT, + std_freq FLOAT, + tot_num_cuts INT + + + ) + """) # if i use triple quotation marks, i can have jumps of line no problem, but not with single ones + + + + + + significance_threshold=0.95 + min_lenght=5 # to cut the series (in terms of num of points) + + top=8924 #max: 8924 with filters, 50 for the top50 longest series + + + freq_threshold=0.2 # meaningful threshold for (freq1-freq2)/Average(freq1,freq2) + + + list_all_average_frequencies=[] + num_events_all_freq_no_averaged=0. + + + + + file22=open("temporal_series/most_weigh_ins/number_segments_acording_to_frequencies.dat",'wt') + + + + + for index_file in range(top): + index_file+=1 + + + list_cut_times=[] #OJO!! now i consider cuttings from gaps AND from freq. changes + + + list_average_frequencies_one_user=[] + num_events_indiv=0. + + file33=open("temporal_series/most_weigh_ins/average_frequencies_segments_"+str(index_file)+".dat",'wt') + + + + +#input file: + #file_name="temporal_series/most_weigh_ins/weigh_in_time_serie_days"+str(index_file)+"_top50" + file_name="temporal_series/most_weigh_ins/weigh_in_time_serie_days"+str(index_file)+"_filters" + + + + + file=open(file_name+".dat",'r') + list_lines_file=file.readlines() + + + + file2=open(file_name+"_frequencies_t_points_segments"+str(min_lenght)+"_threshold.dat",'wt') + + + list_vectors=[] + list_series=[] + list_values_for_average=[] # to do the average if the whole serie is just one piece + + + + + cont=0 + for line in list_lines_file: + if cont>0: # i skip the first line,cos it doesnt have an associated freq. + + list=line.split(" ") + + ck_id=list[8].strip("\n") + + vector=[] + + + vector.append(float(list[4])) #day + vector.append(float(list[7])) #frequency + vector.append(float(list[2])) #%weight change + + list_values_for_average.append(float(list[7])) + + index=int(round(float(list[7]))) #for the histogram of frequencies (not averaged) + num_events_all_freq_no_averaged+=1. + + num_events_indiv+=1. 
+ + + + + vector.append(0.0) # here it will go the value for t + list_vectors.append(vector) + cont+=1 + end=list_vectors[-1][0] #last day + + num_lines=len(list_vectors) + + list_series.append(list_vectors) + + + + # i find out if there is a gap in the time series that i need to consider: + + query22= "select * from gaps_by_frequency where ck_id='"+str(ck_id)+"' order by start_day" + result22 = db.query(query22) + + + + current_time_serie=[] + for element in list_series[0]: # i made a copy of the current time serie (to cut it according to gaps) + current_time_serie.append(element) + + + + + print " index_file:",index_file, " num_gaps:",len(result22),"lenght current time serie:",len(list_series[0]) + + + if len(result22)>0: ## if there is a gap in the time series, i have to pre-cut it before i run the algorithm for freq. + + list_series=[] # # i overwrite the previosly un-cut time series, and i substitute it by the fragments + + + + last_index=0 # for the case of multiple gaps + last_end_index=0 + + for r22 in result22: # to cut the time serie + last_index+=1 + + segmento=[] + + index_start=r22['index_start_day'] + index_end=r22['index_end_day'] + + + if len(result22)==1 : ######### if only one gap + + segmento1=[] + segmento2=[] + + for index in range(len(current_time_serie)): # i save the definitive two segments + if index+1 <= index_start: # OJO!!! index+1 to compensate the index diff. between weight change time series and freq time series + segmento1.append(current_time_serie[index]) #list of events (time, value,t) + + elif index+1 >= index_end: + segmento2.append(current_time_serie[index]) + else: + print "something is wrong", index,current_time_serie[index] ,index_start, index_end + + + + + + + + if len(segmento1)>0: # to avoid isolated points? + if segmento1[-1][0] not in list_cut_times: + list_cut_times.append(segmento1[-1][0]) # i adda as cutting point the starting mark of the gap + if len(segmento2)>0: # to avoid isolated points? + if segmento2[0][0] not in list_cut_times: + list_cut_times.append(segmento2[0][0]) # i adda as cutting point the ending mark of the gap + + + try: + del segmento2[0] # because now the freq. of the fist point of the second segment is not defined + except: pass + + list_series.append(segmento1) + list_series.append(segmento2) + + + + + + else: # if several gaps + + + if last_index= last_end_index: + if index+1 <= index_start: # OJO!!! index+1 to compensate the index diff. between weight change time series and freq time series + segmento1.append(current_time_serie[index]) #list of events (time, value,t) + + elif index+1 >= index_end: + segmento2.append(current_time_serie[index]) + else: + print "something is wrong", index,current_time_serie[index] ,index_start, index_end + + + + + if len(segmento1)>0: # to avoid isolated points? + if segmento1[-1][0] not in list_cut_times: + list_cut_times.append(segmento1[-1][0]) # i adda as cutting point the starting mark of the gap + if len(segmento2)>0: # to avoid isolated points? + if segmento2[0][0] not in list_cut_times: + list_cut_times.append(segmento2[0][0]) # i adda as cutting point the ending mark of the gap + + + + if last_index>1: # if it is not the first segment + try: + del segmento1[0] # because now the freq. 
of the fist point of the second segment is not defined + except: pass + + + + + list_series.append(segmento1) + + last_end_index=index_end + + else: # if it is the last segment + + segmento1=[] + segmento2=[] + + for index in range(len(current_time_serie)): # i save the definitive two segments + if index+1 >= last_end_index: + if index+1 <= index_start: # OJO!!! index+1 to compensate the index diff. between weight change time series and freq time series + segmento1.append(current_time_serie[index]) #list of events (time, value,t) + + elif index+1 >= index_end: + segmento2.append(current_time_serie[index]) + else: + print "something is wrong", index,current_time_serie[index] ,index_start, index_end + + + + + + if len(segmento1)>0: # to avoid isolated points? + if segmento1[-1][0] not in list_cut_times: + list_cut_times.append(segmento1[-1][0]) # i adda as cutting point the starting mark of the gap + if len(segmento1)>0: # to avoid isolated points? + if segmento2[0][0] not in list_cut_times: + list_cut_times.append(segmento2[0][0]) # i adda as cutting point the ending mark of the gap + + + + try: + del segmento2[0] # now the freq. of the fist point of the second segment is not defined + except : pass + + + + list_series.append(segmento1) + list_series.append(segmento2) + + + + + + + print "num segm. for freq. cutting:",len(list_series),"\n" + print index_file,list_cut_times + + + + for list_evolution in list_series: # i analyze the current time serie (or fragment of it) + + num_points=len(list_evolution) + + + + + if num_points>=min_lenght: # if the serie is too short, i wont cut it any further + + t_max=0.0 + + for index in range(num_points): + + if index>=1 and index < num_points-1: # to cut the serie, at least need one point in each list + + list1=[] #first segment of the list + list2=[] #second segment of the list + + + for x1 in range(num_points): + + if ( (len(list_series)>1 and x1>0) or (len(list_series)==1) ): # if there have been a gap cut, then i ignore the first point (not correct freq) + + if x1 <= index: + list1.append(list_evolution[x1][1]) #only the list of values of freq. (not times!) + else: + list2.append(list_evolution[x1][1]) + + + + mu1=numpy.mean(list1) + mu2=numpy.mean(list2) + + sd1=numpy.std(list1) + sd2=numpy.std(list2) + + N1=float(len(list1)) + N2=float(len(list2)) + + S_D=math.sqrt(((N1-1)*sd1*sd1 + (N2-1)*sd2*sd2)/(N1+N2-2))*math.sqrt(1.0/N1 + 1.0/N2) + t=math.fabs((mu1-mu2)/S_D) + + + + + list_evolution[index][3]=t + + + + if t >= t_max: + t_max=t + index_max_t=index + time_max=list_evolution[index][0] + + + segment1=[] + segment2=[] + for x2 in range(num_points): # i save the definitive two segments + + if x2 <= index_max_t: + segment1.append(list_evolution[x2]) #list of events (time, value,t) + else: + segment2.append(list_evolution[x2]) + + + + + + eta=4.19*math.log(float(num_points))-11.54 + delta=0.40 + nu=float(num_points)-2.0 + + a=delta*nu #for the Incomplete beta function + b=delta + x=nu/(nu+t_max*t_max) + I=stats.mstats.betai(a,b,x) + + + try: + significance_t_max=math.pow((1.0-I),eta) #Return x raised to the power y. + except ValueError: + significance_t_max=0. #in case I=1.0 + + + + + + + + + + if significance_t_max>significance_threshold and ( abs(mu1-mu2)/((mu1+mu2)/2.) >= freq_threshold or len(list_cut_times) ==0) : #so i allow the first cut always + if len(segment1)>min_lenght and len(segment2)>min_lenght: + + + list_series.append(segment1) # next i will analyze the two segments independently + list_series.append(segment2) + + print " FREQ. 
CUT!!!" + ############ raw_input() + + + list_cut_times.append(time_max) + + if round(mu1) not in list_average_frequencies_one_user: # for the histogram of frequencies + list_average_frequencies_one_user.append(round(mu1)) + if round(mu2) not in list_average_frequencies_one_user: + list_average_frequencies_one_user.append(round(mu2)) + + + + + + + print index_file,list_cut_times + # raw_input() + +########################################################### +# print out the cut time series to files and add info to DB: +########################################################## + + start=0. + list_cut_times=sorted(list_cut_times) + #if len(list_cut_times)>0: + ######################### print "\n\nfile:",index_file,"number of cutting points:",len(list_cut_times),"namely:",list_cut_times + + cut_inferior=0.0 + + if len(list_cut_times)!=0: + + for cut in list_cut_times: + + + list_freq_that_segment=[] #list of freq only of that segment + for vector in list_vectors: + + if vector[0]>= cut_inferior and vector[0]<= cut: + print >> file2, vector[0],vector[1],vector[2] + list_freq_that_segment.append(vector[1]) + + + print >> file2, "\n" # to separate segments of the same time series + + + + + + + + if len(list_freq_that_segment)>2: + + db.execute (""" + INSERT INTO frequency_cuts (file_index, ck_id, start_day, end_day, mean_freq, std_freq, tot_num_cuts) + VALUES (%s, %s, %s,%s, %s, %s, %s) + """, str(index_file), str(ck_id),str(start),str(cut),str(numpy.mean(list_freq_that_segment)), str(numpy.std(list_freq_that_segment)) ,str(len(list_cut_times)) ) + + + + + + + if cut == list_cut_times[-1]: #for the last segment (also if there is just one cut) + list_freq_that_segment=[] + for vector in list_vectors: + if vector[0]> cut: + print >> file2, vector[0],vector[1],vector[2] + list_freq_that_segment.append(vector[1]) + + + #i write the freq. 
segments info into the DB + + + if len(list_freq_that_segment)>2: + + db.execute (""" + INSERT INTO frequency_cuts (file_index, ck_id, start_day, end_day, mean_freq, std_freq, tot_num_cuts) + VALUES (%s, %s, %s,%s, %s, %s, %s) + """, str(index_file), str(ck_id),str(cut),str(list_vectors[-1][0]),str(numpy.mean(list_freq_that_segment)), str(numpy.std(list_freq_that_segment)) ,str(len(list_cut_times)) ) + + + + print >> file2, "\n" + + cut_inferior=cut + + + start=cut + + + else: # if only one frequency for the whole serie + list_freq_that_segment=[] + + for vector in list_vectors: + print >> file2, vector[0],vector[1],vector[2] + list_freq_that_segment.append(vector[1]) + + + if len(list_freq_that_segment)>2: + + db.execute (""" + INSERT INTO frequency_cuts (file_index, ck_id, start_day, end_day, mean_freq, std_freq, tot_num_cuts) + VALUES (%s, %s, %s,%s, %s, %s, %s) + """, str(index_file), str(ck_id),str(list_vectors[0][0]),str(list_vectors[-1][0]),str(numpy.mean(list_freq_that_segment)), str(numpy.std(list_freq_that_segment)) ,str(len(list_cut_times)) ) + + + + + + + #i calculate the average frequency of the whole serie + mu=round(numpy.mean(list_values_for_average)) + if mu not in list_average_frequencies_one_user: + list_average_frequencies_one_user.append(mu) + + file2.close() + + + + + + + + for item in list_average_frequencies_one_user: + list_all_average_frequencies.append(int(item)) + + + + + + + + + # i create a file with the number of segments for every time serie (i need that info so i can plot them automatically) + + print >> file22,index_file,len(list_cut_times)+1 + + for i in list_average_frequencies_one_user: + print >> file33, i + + + + file22.close() + file33.close() + + + # print list_all_average_frequencies + + + + +######################### + +if __name__== "__main__": + + main() + + + +############################################## diff --git a/segmentation_time_series_iterative.py b/segmentation_time_series_iterative.py index 8d0e7aa..b48ca8a 100644 --- a/segmentation_time_series_iterative.py +++ b/segmentation_time_series_iterative.py @@ -15,6 +15,7 @@ import sys import os +from datetime import * import math import numpy from scipy import stats @@ -22,7 +23,7 @@ def main (): - + significance_threshold=0.95 min_lenght=30 # to cut the series diff --git a/time_series_queries_weight_loss_50top_frequencies.py b/time_series_queries_weight_loss_50top_frequencies.py index 5d770d8..8f28034 100644 --- a/time_series_queries_weight_loss_50top_frequencies.py +++ b/time_series_queries_weight_loss_50top_frequencies.py @@ -28,7 +28,7 @@ def main (): - top=500 + top=50 @@ -63,8 +63,7 @@ def main (): query2="select * from weigh_in_history where (ck_id ='"+str(ck_id)+"') order by on_day asc" result2 = db.query(query2) - - + # result2 is a list of dict.: [{'id': 163978L, 'activity_flag': u'WI', 'weight': 144.0, 'on_day': datetime.datetime(2009, 11, 27, 0, 0), 'ck_id': u'bd84dbe2-dd6e-4125-b006-239442df2ff6'}, {'id': 163979L, 'activity_flag': u'WI', 'weight': 143.09999999999999, 'on_day': datetime.datetime(2009, 12, 15, 0, 0), 'ck_id': u'bd84dbe2-dd6e-4125-b006-239442df2ff6'}, ...,] r1['num_wi']=len(result2) # i add another key-value to the dict. -->> with this i ALSO modify the list of dict. result!!! 
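As an editorial aside (not part of the committed files): the cut-selection loop above in analyze_frequency_gaps_in_time_series_frequencies.py scans every candidate index, compares the two resulting segments with a pooled t-statistic, and turns the maximal t into an approximate significance through the regularized incomplete beta function. Below is a minimal standalone sketch of that computation; the helper names and the toy series are invented for illustration, and scipy.special.betainc is used in place of the older stats.mstats.betai call (both evaluate the same regularized incomplete beta function).

import math
import numpy
from scipy.special import betainc

def pooled_t(list1, list2):
    # t-statistic between two candidate segments, using a pooled standard deviation
    mu1, mu2 = numpy.mean(list1), numpy.mean(list2)
    sd1, sd2 = numpy.std(list1), numpy.std(list2)
    N1, N2 = float(len(list1)), float(len(list2))
    S_D = math.sqrt(((N1 - 1)*sd1*sd1 + (N2 - 1)*sd2*sd2)/(N1 + N2 - 2)) * math.sqrt(1.0/N1 + 1.0/N2)
    return math.fabs((mu1 - mu2)/S_D)

def best_cut(values):
    # return (cut index, t_max, approximate significance) for one segment
    N = len(values)
    t_max, index_max = 0.0, None
    for i in range(1, N - 1):              # keep at least one point on each side
        t = pooled_t(values[:i + 1], values[i + 1:])
        if t >= t_max:
            t_max, index_max = t, i
    eta = 4.19*math.log(float(N)) - 11.54  # same empirical constants as in the script
    delta = 0.40
    nu = float(N) - 2.0
    I = betainc(delta*nu, delta, nu/(nu + t_max*t_max))
    try:
        significance = math.pow(1.0 - I, eta)
    except ValueError:
        significance = 0.0                 # in case I == 1.0
    return index_max, t_max, significance

toy_serie = [7, 8, 7, 6, 7, 7, 8, 7, 7, 6, 7, 8, 7, 6, 7] + \
            [20, 21, 19, 22, 20, 21, 20, 19, 21, 20, 22, 20, 21, 19, 20]   # invented frequencies
print(best_cut(toy_serie))   # the best cut falls where the mean frequency jumps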
@@ -74,7 +73,7 @@ def main (): - sorted_list_of_dict=[] + sorted_list_of_dict=[] # i sort the time series from the longest (highest # w-ins) to shortest for i in range(top): try: @@ -98,7 +97,7 @@ def main (): cont=0 #number of users - + print cont ck_id=r1['ck_id'] id=r1['id'] @@ -107,6 +106,7 @@ def main (): + print ck_id num_user=num_user+1 @@ -142,7 +142,7 @@ def main (): if (float(row['weight'])>10.0): # to eliminate some error in the data - print >> file, contador,row['weight'],float(row['weight'])*703.0/(float(r1['height'])*float(r1['height'])),(float(row['weight'])-initial_weight)*100.0/initial_weight,row['on_day']-first_day ,row['on_day'] + print >> file, contador,row['weight'],float(row['weight'])*703.0/(float(r1['height'])*float(r1['height'])),(float(row['weight'])-initial_weight)*100.0/initial_weight,row['on_day']-first_day ,row['on_day'] , ck_id contador=contador+1 print >> file,"\n" #to separate users @@ -169,10 +169,10 @@ def main (): first_day=row['on_day'] - interevent=row['on_day']-previous # time between events + interevent=(row['on_day']-previous).days # time between events if (float(row['weight'])>10.0): # to eliminate some error data - print >> file, contador2,row['weight'],(float(row['weight'])-initial_weight)*100.0/initial_weight,float(row['weight'])*703.0/(float(r1['height'])*float(r1['height'])),row['on_day']-first_day ,row['on_day'],interevent + print >> file, contador2,row['weight'],(float(row['weight'])-initial_weight)*100.0/initial_weight,float(row['weight'])*703.0/(float(r1['height'])*float(r1['height'])),(row['on_day']-first_day).days ,row['on_day'],interevent, ck_id contador2=contador2+1 diff --git a/time_series_queries_weight_loss_with_frequencies_and_filters.py b/time_series_queries_weight_loss_with_frequencies_and_filters.py index a08abda..543504d 100644 --- a/time_series_queries_weight_loss_with_frequencies_and_filters.py +++ b/time_series_queries_weight_loss_with_frequencies_and_filters.py @@ -53,7 +53,7 @@ def main (): - + num_user=0 for r1 in result1: #loop over users to get their number_of_weigh-ins @@ -109,11 +109,13 @@ def main (): first_day=row['on_day'] - interevent=row['on_day']-previous # time between events + interevent=(row['on_day']-previous).days # time between events if (float(row['weight'])>10.0): # to eliminate some error data - print >> file, contador2,row['weight'],(float(row['weight'])-initial_weight)*100.0/initial_weight,float(row['weight'])*703.0/(float(r1['height'])*float(r1['height'])),row['on_day']-first_day ,row['on_day'],interevent + print >> file,contador2,row['weight'],(float(row['weight'])-initial_weight)*100.0/initial_weight,float(row['weight'])*703.0/(float(r1['height'])*float(r1['height'])),(row['on_day']-first_day).days ,row['on_day'],interevent,ck_id + # print contador2,row['weight'],(float(row['weight'])-initial_weight)*100.0/initial_weight,float(row['weight'])*703.0/(float(r1['height'])*float(r1['height'])),(row['on_day']-first_day).days ,row['on_day'],interevent,ck_id + #raw_input() contador2=contador2+1 previous=row['on_day'] diff --git a/transition_matrix_for_time_series_coef_lin_only_positive_negative_plus_freq_info.py b/transition_matrix_for_time_series_coef_lin_only_positive_negative_plus_freq_info.py index 49b5ddf..7c5f532 100644 --- a/transition_matrix_for_time_series_coef_lin_only_positive_negative_plus_freq_info.py +++ b/transition_matrix_for_time_series_coef_lin_only_positive_negative_plus_freq_info.py @@ -27,7 +27,7 @@ def main (): max_coef_slope=5 # +- that %pwc in one day!! 
- tot_num_transitions=00 # across all users + tot_num_transitions=0 # across all users list_diff_states=[] list_all_transitions=[] @@ -54,7 +54,7 @@ def main (): ################################ # lin: intersection + X*slope # -#exponential: A+ B*exp(lambda*t)# +#exponential: A+B*exp(lambda*t)# ################################ cont_lines=1 @@ -125,10 +125,10 @@ def main (): if len(list_lines_file) == 1: # if only one line in the file --> only one state --> diagonal term for the matrix - lista=[] - lista.append(state) - lista.append(state) - list_all_transitions.append(lista) + list=[] + list.append(state) + list.append(state) + list_all_transitions.append(list) tot_num_transitions+=1.0 @@ -137,10 +137,10 @@ def main (): old_state=state else: new_state=state - lista=[] - lista.append(old_state) - lista.append(new_state) - list_all_transitions.append(lista) + list=[] + list.append(old_state) + list.append(new_state) + list_all_transitions.append(list) old_state=state # i update for the next transition tot_num_transitions+=1.0 diff --git a/transition_matrix_for_time_series_freq_weigh_cuts_and_gaps.py b/transition_matrix_for_time_series_freq_weigh_cuts_and_gaps.py new file mode 100644 index 0000000..effa357 --- /dev/null +++ b/transition_matrix_for_time_series_freq_weigh_cuts_and_gaps.py @@ -0,0 +1,455 @@ +#! /usr/bin/env python + +""" +Created by Julia Poncela on March 2012 + +It queries the DB for the info about diff. behaviors in the time series, in terms of trend, frequency and gaps, and it calculates the transition probability matrix from every state to every state, by counting the number of times each transition occurs accros all time series. + + +""" + + +import sys +import os +from datetime import * +from database import * #codigo para manejar bases de datos +import math +import numpy +from scipy import stats + + + +def main (): + + + + max_coef_exp=1000 + max_coef_slope=5 # +- that %pwc in one day!! + + + tot_num_transitions=0 # across all users + list_diff_states=[] # for the transition matrix + list_all_transitions=[] # for the transition matrix + + + + database = "calorie_king_social_networking_2010" + server="tarraco.chem-eng.northwestern.edu" + user="calorieking" + passwd="n1ckuDB!" + db= Connection(server, database, user, passwd) + + + + query="""select * from users""" + result1 = db.query(query) # is a list of dictionaries + + + + contador_time_series=0 + contador=0 + + for r1 in result1: #loop over users + contador+=1 + + ck_id=r1['ck_id'] + + + + + list_start_end_dates_one_user=[] #without considering the gaps + + list_states_one_user=[] #without considering the gaps (just diff. 
behaviors) + + + dicc_states_one_user={} # the key is the initial_day, the value is the type of trend or gap + temp_dicc_states={} + + try: #only for the users with an entry in that time_series related table: one of the (filtered) time series + + ###################################### + # for the info about different trends# (for a given, fixed user) + ###################################### + + contador_time_series+=1 + print "\n\n\n",contador, ck_id + + + + query2="select * from weigh_in_cuts where (ck_id ='"+str(ck_id)+"') order by start_day asc" + result2 = db.query(query2) + + + #result2: [{'fit_type': u'exp', 'stop_idx': 4L, 'ck_id': u'a48eba6e-51ad-42bc-b367-a24bb6504f0a', 'id': 68008L, 'cost': 0.5, 'param3': -0.60660499999999995, 'param2': 3.4280400000000002, 'param1': -3.1829299999999998, 'start_idx': 0L}, {'fit_type': u'lin', 'stop_idx': 16L, 'ck_id': u'a48eba6e-51ad-42bc-b367-a24bb6504f0a', 'id': 68007L, 'cost': 0.5, 'param3': None, 'param2': 0.015299699999999999, 'param1': -4.1967499999999998, 'start_idx': 5L}] + + + num_trends=len(result2) + + + + for r2 in result2: + + start_stop_days=[] # for that particular segment + + starting_day=int(r2['start_day']) + ending_day=int(r2['stop_day']) + trend=r2['fit_type'] + param1=r2['param1'] + param2=r2['param2'] + param3=r2['param3'] + + + + start_stop_days.append(starting_day) + start_stop_days.append(ending_day) + + + list_start_end_dates_one_user.append(start_stop_days) + + + + + if trend == "exp": + #i dont really care about the interception + coef1_exp=param1 + coef2_exp=param2 + coef3_exp=param3 ################################ + # lin: intersection + X*slope # + #exponential: A+B*exp(lambda*t)# + ################################ + + + if coef1_exp <= max_coef_exp and coef1_exp >= -max_coef_exp : # to avoid weird fits + if coef2_exp <= max_coef_exp and coef2_exp >= -max_coef_exp : + if coef3_exp <= max_coef_exp and coef3_exp >= -max_coef_exp : + + + if coef2_exp <0 and coef3_exp >0: # i only care about exp increase/decrease + state="exp_down" + elif coef2_exp >0 and coef3_exp <0: + state="exp_down" + elif coef2_exp >0 and coef3_exp >0: + state="exp_up" + elif coef2_exp <0 and coef3_exp <0: + state="exp_up" + + # elif coef2_exp==0.0 or coef2_exp== -0.0: + # state="flat" + else: + print ck_id, coef2_exp, coef3_exp,"not one of the two exp types" + raw_input() + + + + + + else: + print "values for the fit coef. 
too weird!", param1,param2,param3, ck_id + + elif trend == "lin": + # i dont really care about the interception + coef2_lin=param2 + + + + if coef2_lin <= max_coef_slope and coef2_lin >= -max_coef_slope : # to avoid weird fits + + if coef2_lin <0: # i only care about linear increase/decrease + state="lin_down" + + if coef2_lin > -0.0001: + state="flat" + + elif coef2_lin==0.0: + state="flat" + else: + state="lin_up" + if coef2_lin < 0.0001: + state="flat" + + + + list_states_one_user.append(state) + temp_dicc_states[starting_day]=state # i save the pair starting_day, trend for that user + if state not in list_diff_states: + list_diff_states.append(state) + + + + ####################### end loop over result2 (info diff trends) + + + except MySQLdb.Error: pass #for the users without an entry in that time_series related table: not one of the (filtered) time series + + + + + + + + ########################## + # for the gap info # (for the same given, fixed user ck_id) + ########################## + + + query3="select * from gaps_by_frequency where (ck_id ='"+str(ck_id)+"') order by start_day asc" + result3 = db.query(query3) + + #result3: [{'file_index': 1408, 'ck_id': 8647c765-e37e-4024-92da-be838b792379, 'start_date':2009-05-07 00:00:00, 'end_date':2009-06-08 00:00:00, 'start_day': 108, 'end_day': 140, 'days_gap': 32, 'zscore_gap': 3.83125},{'file_index': 1408, 'ck_id': 8647c765-e37e-4024-92da-be838b792379, 'start_date':2009-08-04 00:00:00, 'end_date':2009-09-30 00:00:00, 'start_day': 197, 'end_day': 254, 'days_gap': 57, 'zscore_gap': 7.1496} ] + + + num_gaps=len(result3) + + + + + if num_gaps>0: + + + + for r3 in result3: + + file_index=r3['file_index'] + + starting_gap=int(r3['start_day']) + ending_gap=int(r3['end_day']) + trend="gap" + zscore_gap=r3['zscore_gap'] # threshold to consider a gap statistically sifnificant zs>=3 (imposed like that in: analyze_frequency_gaps_in_time_series_frequencies_EDIT_DB.py) + + + + if trend not in list_diff_states: + list_diff_states.append(trend) + + cont=-1 #to go over list_states_one_user + for segment in list_start_end_dates_one_user: # the list is sorted chronologically + cont+=1 + + start_behavior=segment[0] + end_behavior=segment[1] + + if (starting_gap <= end_behavior) and (ending_gap >= start_behavior): # if there is a gap + # in the middle of a behavior + num_trends+=2 + ## i cut the trend in two segments, with a gap in between + new_segment1=[start_behavior,starting_gap] + new_segment2=[starting_gap,ending_gap] + new_segment3=[ending_gap,end_behavior] + + + + + temp_dicc_states[start_behavior]=list_states_one_user[cont] + temp_dicc_states[starting_gap]=trend + temp_dicc_states[ending_gap]=list_states_one_user[cont] + + + #print "gap cutting in the middle of a single behavior:",new_segment1,new_segment2,new_segment3, ck_id + + + + + + + for i in range(len(list_start_end_dates_one_user)): # i check whether there is a gap in between DIFF behaviors + try: + + old_ending=list_start_end_dates_one_user[i][1] + new_beginnig=list_start_end_dates_one_user[i+1][0] + + + if (starting_gap >= old_ending) and (ending_gap <= new_beginnig): + + # print "gap in the middle of two diff. 
behaviors:",list_states_one_user[i],trend,list_states_one_user[i+1] + + + + temp_dicc_states[starting_gap]=trend + temp_dicc_states[ending_gap]=list_states_one_user[i+1] + + + + + except IndexError: pass + #print "no room for any more gaps,",len(list_start_end_dates_one_user),i + + + + + + + # create a final list of all states, and then go for state in list_states: and copy code from aux, line 70 on + + + + + + + + + + # end loop over result3 (gap info) + + + + else: #if the gap info doesnt change the number of trends (NO gaps) + pass + + + + for key in temp_dicc_states.keys(): # i make a copy of the dicc, to be the final one + dicc_states_one_user[key]=temp_dicc_states[key] + + + + +# i account for all possible states and transsitions between states: + + if len(dicc_states_one_user)>1: # several trends for the time series + + cont=0 + for key in sorted(dicc_states_one_user.keys()): # i print out the result of combining weigh_in cuts and gaps: + print key,dicc_states_one_user[key] + state=dicc_states_one_user[key] + if state not in list_diff_states: + list_diff_states.append(state) + + + list=[] + if cont==0: # for the first state + state1=dicc_states_one_user[key] + else: # for all the rest of states in the sorted dictionary + state2=dicc_states_one_user[key] + list.append(state1) + list.append(state2) + list_all_transitions.append(list) + + state1=state2 + cont+=1 + + + elif len(dicc_states_one_user)==1: # one single trend for the time series + list=[] + for key in dicc_states_one_user: + list.append(dicc_states_one_user[key]) + list.append(dicc_states_one_user[key]) + print key,dicc_states_one_user[key] + if state not in list_diff_states: + list_diff_states.append(state) + + list_all_transitions.append(list) + + #print list_all_transitions + +# if num_gaps>0: + # raw_input() + + + + + ################################# end loop over result1 (loop over users) + + + + + + +# i create the empty transition matrix + + list_diff_states=sorted(list_diff_states) # this is the order of the trends for the transition matrix + + matrix=[] + for i in range(len(list_diff_states)): + matrix.append([0.000]*len(list_diff_states)) #ojo!!!!!!!!!!!!!!! si la creo: matrix.append(list), con list=[0.]*len(list_diff_states), al modificar unos elementos, modificare otros sin querer!!!!!!!!!!!!!!!!!!! + + + + + + print "num transitions:",len(list_all_transitions), "diff. states:",list_diff_states + + + for transition in list_all_transitions: + for i in range(len(list_diff_states)): + old_state=list_diff_states[i] + for j in range(len(list_diff_states)): + + new_state=list_diff_states[j] + + if transition[0]== old_state and transition[1]== new_state : + matrix[i][j]+=1.000 + + + + + file_name1="temporal_series/transiton_probability_matrix_test_with_gap_info.dat" + file1=open(file_name1,'wt') + + for i in range(len(list_diff_states)): + for j in range(len(list_diff_states)): + print >> file1,round(matrix[i][j]/float(len(list_all_transitions)),3), + + print >> file1,"\n" + file1.close() + + + + #### PLOTGRACE IMPLEMENTATION ### + # Draw the matrix + + + from PlotGrace import plot_matrix # i use a function from the module + # and just modify some of the default options + + plot_matrix(matrix,"temporal_series/transition_prob_matrix_with_gaps.agr", + xticklabels = list_diff_states, + yticklabels = list_diff_states, + xticklabelangle = 90, + colorscheme='YlOrRd', + #logcolorbar=True, + reversecolorscheme = False, + mincolor=(255,255,255), + title="Transition Probabilities") + + + + + + +# mx = dictmatrix(0.) #a diff. 
way of doing it: feeding it a dicc. of dicc. instead of a list of lists + +# for i in range(len(list_diff_states)): +# rowlabel = list_diff_states[i] +# for j in range(len(list_diff_states)): +# collabel = list_diff_states[j] +# value = matrix[i][j] +# mx.put(rowlabel,collabel,value) + +# plot_matrix(mx, "transition_prob_matrix.agr", +# xticklabelangle = 90, +# colorscheme='YlOrRd', +# reversecolorscheme = True, +# title="Transition Probabilities") + + + + + + + + + + + + + + + + +######################### + +if __name__== "__main__": + + main() + +########################## diff --git a/weight_change_after_gap.py b/weight_change_after_gap.py new file mode 100644 index 0000000..d2c911e --- /dev/null +++ b/weight_change_after_gap.py @@ -0,0 +1,330 @@ + +#! /usr/bin/env python + +""" +Created by Julia Poncela of June 2012 + +Analyze weight change after a gap. + + + +""" + + +import sys +import os +from datetime import * +import math +import numpy +from scipy import stats +from database import * #package to handle databases +import datetime +import random + + +def main (): + + max_index=2000 # just to try out the program with a few time series + index=-1 + + min_num_weigh_ins=30 # according to the time series filtering we agree on + + few_points=4 # to classify gaps in the beginning/ end of the time series + + + Niter_bootstrap=1000 + + ending_date_DB=datetime.datetime(2010, 12, 31, 0, 0) + reasonable_ending=datetime.datetime(2010, 12, 1, 0, 0) # i only consider users active until 1month before the ending date for the DB + + + + + + database = "calorie_king_social_networking_2010" + server="tarraco.chem-eng.northwestern.edu" + user="calorieking" + passwd="n1ckuDB!" + + db= Connection(server, database, user, passwd) + + + + + + query="""select * from users""" + result1 = db.query(query) # is a list of dict. + + + + list_weight_changes_gaps=[] + list_weight_changes_gaps_network=[] + list_all_weight_changes=[] + list_all_weight_changes_network=[] + + list_net_weight_changes=[] + list_net_weight_changes_network=[] + + num_users=0. + num_users_with_gaps=0. + num_network_users_with_gaps=0. + num_network_users=0. + + list_gaps_no_gaps_all=[] # prob. of having a gap + list_gaps_no_gaps_network=[] + + + list_weight_gain_after_gap_all=[] # prob. of gaining weight after a gap + list_weight_gain_after_gap_network=[] + + + list_init_gap_all=[] # prob. of having a gap initally/in the middle/ at the end + list_init_gap_network=[] + + list_middle_gap_all=[] + list_middle_gap_network=[] + + list_end_gap_all=[] + list_end_gap_network=[] + + + + for r1 in result1: #loop over users to get their gap info + + + index+=1 + #if index <= max_index: # just to try out the program with a few time series + + print index + + ck_id=r1['ck_id'] + + + + query2="select * from gaps_by_frequency where (ck_id ='"+str(ck_id)+"') order by start_day asc" + result2 = db.query(query2) + + + + query3="select * from weigh_in_history where (ck_id ='"+str(ck_id)+"') order by on_day asc" + result3 = db.query(query3) + + + + query4="select * from friends where (src ='"+str(ck_id)+"')or (dest ='"+str(ck_id)+"') " + result4= db.query(query4) + + + + + if len(result3)>=min_num_weigh_ins: ################### ONLY if more than X weigh-ins + num_users+=1. 
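+            # Editorial note on the block that follows: for each qualifying user it fills
+            #   - list_all_weight_changes with the change between every pair of consecutive
+            #     weigh-ins (initial_weight is updated at every step), and
+            #   - list_net_weight_changes with last weight minus first weight,
+            # keeping parallel "_network" copies for users with at least one friend
+            # (len(result4)>0), before turning to the gap records in result2.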
+ + + cont=0 + initial_weight=float(result3[0]['weight']) + for r3 in result3: # to calculate weight-change, i skip the first entry + if cont>0: + list_all_weight_changes.append(float(r3['weight'])-initial_weight) + + if len(result4)>0: # if he/she has friends + list_all_weight_changes_network.append(float(r3['weight'])-initial_weight) + + initial_weight=float(r3['weight']) + cont+=1 + list_net_weight_changes.append(float(result3[-1]['weight'])-float(result3[0]['weight'])) + + + if len(result4)>0: + num_network_users+=1. + list_net_weight_changes_network.append(float(result3[-1]['weight'])-float(result3[0]['weight'])) + + + if len(result2)>0: # if there is a gap record for that user + num_users_with_gaps+=1. + list_gaps_no_gaps_all.append(1.) + + if len(result4)>0: + num_network_users_with_gaps+=1. + + + + + + for r2 in result2: + + start_index=int(r2['index_start_day']) # index for the entry in the weigh_in_history table where the gap is (point number...): starting on 0 + end_index=int(r2['index_end_day']) + + + + +# to compute probabilities of having gaps at the beginning/middle/end, i am carefull about the time series +# not ending JUST because the DB ended + + date_end_serie=result3[-1]['on_day'] + if date_end_serie <=reasonable_ending: + print "gap",date_end_serie,"prior to the reasonable ending date of the DB",ending_date_DB + + + if start_index <= few_points: + list_init_gap_all.append(1.0) # prob. of having a gap initally/in the middle/ at the end + list_middle_gap_all.append(0.0) + list_end_gap_all.append(0.0) + + if len(result4)>0: + list_init_gap_network.append(1.0) + list_middle_gap_network.append(0.0) + list_end_gap_network.append(0.0) + + elif len(result3)-int(end_index)<=few_points: + list_init_gap_all.append(0.0) + list_middle_gap_all.append(0.0) + list_end_gap_all.append(1.0) + + if len(result4)>0: + list_init_gap_network.append(0.0) + list_middle_gap_network.append(0.0) + list_end_gap_network.append(1.0) + + else: + list_init_gap_all.append(0.0) + list_middle_gap_all.append(1.0) + list_end_gap_all.append(0.0) + + if len(result4)>0: + list_init_gap_network.append(0.0) + list_middle_gap_network.append(1.0) + list_end_gap_network.append(0.0) + + + else: + print date_end_serie,"after reasonable ending date of the DB",ending_date_DB + + + + tot_weight_change_gap=float(result3[end_index]['weight'])-float(result3[start_index]['weight']) + + list_weight_changes_gaps.append(tot_weight_change_gap) + + + if tot_weight_change_gap>=0.: + list_weight_gain_after_gap_all.append(1.) + else: + list_weight_gain_after_gap_all.append(0.) + + + + if len(result4)>0: + + if tot_weight_change_gap>=0.: + list_weight_gain_after_gap_network.append(1.) + else: + list_weight_gain_after_gap_network.append(0.) + + + list_gaps_no_gaps_network.append(1.) + list_weight_changes_gaps_network.append(float(result3[end_index]['weight'])- float(result3[start_index]['weight']) ) + + + + + else: # no gaps + list_gaps_no_gaps_all.append(0.) + if len(result4)>0: + list_gaps_no_gaps_network.append(0.) 
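As an editorial aside (not part of the committed files): the beginning/middle/end bookkeeping above reduces to a simple rule on the gap indices, applied only to series whose last weigh-in falls before reasonable_ending. A standalone sketch of that rule, with invented index values:

few_points = 4   # same threshold as in the script

def classify_gap(start_index, end_index, series_length):
    # position of a gap within a weigh-in series, as used for the probability lists above
    if start_index <= few_points:
        return "initial"
    elif series_length - end_index <= few_points:
        return "end"
    else:
        return "middle"

print(classify_gap(1, 9, 120))      # -> initial (gap starts within the first few points)
print(classify_gap(50, 60, 120))    # -> middle
print(classify_gap(100, 118, 120))  # -> end (fewer than few_points weigh-ins remain)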
+ + + + print 'total # users with >=30 w-ins: ',num_users,' total # users with gaps: ',num_users_with_gaps + print 'network users with >=30 w-ins: ',num_network_users,' network users with gaps: ',num_network_users_with_gaps,"\n" + + + print 'num_users_with_gaps/num_users:',num_users_with_gaps/num_users,' weight_change gaps all users:',numpy.mean(list_weight_changes_gaps),' +/- ',numpy.std(list_weight_changes_gaps) + print 'num_network_users_with_gaps/num_network_users',num_network_users_with_gaps/num_network_users,' weight_change gaps all ntwk users:',numpy.mean(list_weight_changes_gaps_network),' +/- ',numpy.std(list_weight_changes_gaps_network),"\n" + + print 'average weight change all changes, all users:',numpy.mean(list_all_weight_changes),' +/- ',numpy.std(list_all_weight_changes) + print 'average weight change all changes, network users:',numpy.mean(list_all_weight_changes_network),' +/- ',numpy.std(list_all_weight_changes_network) + + + print '# all weight changes:',len(list_all_weight_changes),' # gap changes all users:',len(list_weight_changes_gaps),' # gap changes network users:',len(list_weight_changes_gaps_network),"\n" + + + + print '# net weight changes:',numpy.mean(list_net_weight_changes),numpy.std(list_net_weight_changes),' # events::',len(list_net_weight_changes) + print '# net weight changes network:',numpy.mean(list_net_weight_changes_network),numpy.std(list_net_weight_changes_network),' # events:',len(list_net_weight_changes_network),"\n" + + + + + print "prob. having a gap for tot population with more than 30weigh-ins:",numpy.mean(list_gaps_no_gaps_all)," # events:",len(list_gaps_no_gaps_all) + print "prob. having a gap for network population with more than 30weigh-ins:",numpy.mean(list_gaps_no_gaps_network),"# events: ",len(list_gaps_no_gaps_network),"\n" + + + + print "prob. gaining weight after a gap, for tot population with gaps and more than 30weigh-ins:",numpy.mean(list_weight_gain_after_gap_all)," # events:",len(list_weight_gain_after_gap_all) + print "prob. gaining weight after a gap, for network population with gaps and more than 30weigh-ins:",numpy.mean(list_weight_gain_after_gap_network)," # events:",len(list_weight_gain_after_gap_network),"\n" + + + + print "prob. init. gap all:",numpy.mean(list_init_gap_all)," middle:",numpy.mean(list_middle_gap_all)," end:",numpy.mean(list_end_gap_all) + print "prob. init. 
gap network:",numpy.mean(list_init_gap_network)," middle:",numpy.mean(list_middle_gap_network)," end:",numpy.mean(list_end_gap_network),"\n" + + + + + print "zscore between all weight changes and gap changes (All):",bootstrap(list_all_weight_changes, len(list_weight_changes_gaps),numpy.std(list_weight_changes_gaps),Niter_bootstrap),Niter_bootstrap,"iter",len(list_all_weight_changes), len(list_weight_changes_gaps) + + print "zscore between all weight changes and gap changes (Network):",bootstrap(list_all_weight_changes_network, len(list_weight_changes_gaps_network),numpy.std(list_weight_changes_gaps_network),Niter_bootstrap),Niter_bootstrap,"iter",len(list_all_weight_changes_network), len(list_weight_changes_gaps_network) + + print "zscore between all weight changes (All) and gap changes (Network):",bootstrap(list_all_weight_changes, len(list_weight_changes_gaps_network),numpy.std(list_weight_changes_gaps_network),Niter_bootstrap),Niter_bootstrap,"iter",len(list_all_weight_changes), len(list_weight_changes_gaps_network) + + + + + + +####################################### +def sample_with_replacement(population, k): + "Chooses k random elements (with replacement) from a population" + n = len(population) + _random, _int = random.random, int # speed hack + result = [None] * k + for i in xrange(k): + j = _int(_random() * n) + result[i] = population[j] + return result + + +############################################# +def bootstrap(list, sample_size,real_average_value,Niter): + + + list_synth_average_values=[] + + + for iter in range (Niter): + + list_synth=sample_with_replacement(list,sample_size) + + list_synth_average_values.append(numpy.mean(list_synth)) + + + + zscore=(real_average_value-numpy.mean(list_synth_average_values))/numpy.std(list_synth_average_values) + + + return zscore + + + + +######################### + +if __name__== "__main__": + + main() + + + +############################################## diff --git a/weight_change_after_gap_R6comparison.py b/weight_change_after_gap_R6comparison.py new file mode 100644 index 0000000..0529146 --- /dev/null +++ b/weight_change_after_gap_R6comparison.py @@ -0,0 +1,607 @@ + +#! /usr/bin/env python + +""" +Created by Julia Poncela of June 2012 + +Analyze weight change after a gap. + + + +""" + + +import sys +import os +from datetime import * +import math +import numpy +from scipy import stats +from database import * #package to handle databases +import datetime +import random +import networkx as nx + + + +def main (graph_name1, graph_name2): + + max_index=5000 # just to try out the program with a few time series + index=-1 + + min_num_weigh_ins=30 # according to the time series filtering we agree on + + few_points=4 # to classify gaps in the beginning/ end of the time series + + + Niter_bootstrap=1000 + + ending_date_DB=datetime.datetime(2010, 12, 31, 0, 0) + reasonable_ending=datetime.datetime(2010, 12, 1, 0, 0) # i only consider users active until 1month before the ending date for the DB + + + + + + database = "calorie_king_social_networking_2010" + server="tarraco.chem-eng.northwestern.edu" + user="calorieking" + passwd="n1ckuDB!" + + db= Connection(server, database, user, passwd) + + query="""select * from users""" + result1 = db.query(query) # is a list of dict. 
+ + + + + # cont=0 + #num_friendships=0 + + # for r1 in result1: + # ck_id=r1['ck_id'] + + # query4="select * from friends where (src ='"+str(ck_id)+"')or (dest ='"+str(ck_id)+"') " + # result4= db.query(query4) + # if len(result4)>0: + # cont+=1 + # num_friendships+=len(result4) + + # query11="""SELECT * FROM private_messages""" + # result11 = db.query(query11) + + # query22="""SELECT * FROM forum_posts""" + # result22 = db.query(query22) + + # query33="""SELECT * FROM blog_comments """ + # result33 = db.query(query33) + + # query44="""SELECT * FROM homepage_comment""" + # result44 = db.query(query44) + + + # print "number of users with k>0",cont,"num links:",float(num_friendships)/2., "tot. num. messages",len(result11), "tot. num. forum posts",len(result22), "tot. num. blog posts",len(result33), "tot. num. homepage posts",len(result44) + + # exit() + + + H1 = nx.read_gml(graph_name1) + G1 = nx.connected_component_subgraphs(H1)[0] # Giant component + + + + H2 = nx.read_gml(graph_name2) + G2 = nx.connected_component_subgraphs(H2)[0] # Giant component + + dict_R6overlap={} + for i in range(max(list(G1.degree().values()))): + dict_R6overlap[i]=[] # {0:[],1:[],2:[],...} dict of lists of nodes with 0, 1, 2,... R6s_overlap + + + + + list_hub_ids=[] + list_R6_ids=[] + list_R6overlap_ids=[] # connected to R6s + for node in G1.nodes(): + if (G1.node[node]['role'] =="R6"): + list_R6_ids.append(int(G1.node[node]['label']))# this actually corresponds to the id from the users table in the DB + + if (int(G1.node[node]['degree']) >=30): + list_hub_ids.append(int(G1.node[node]['label'])) + + + if (int(G1.node[node]['R6_overlap']) >0): + list_R6overlap_ids.append(int(G1.node[node]['label'])) + + index=int(G1.node[node]['R6_overlap']) + dict_R6overlap[index].append(int(G1.node[node]['label'])) + else: + dict_R6overlap[0].append(int(G1.node[node]['label'])) + + + + + top_kshell=12 # because i know is that so + list_top_kshell_ids=[] + for node in G2.nodes(): + if (int(G2.node[node]['kshell_index']) >= top_kshell): + list_top_kshell_ids.append(int(G2.node[node]['label']))# this actually corresponds to the id from the users table in the DB + + + + print "# R6s: ",len(list_R6_ids),"# hubs: ",len(list_hub_ids),"# 12-kshells: ",len(list_top_kshell_ids) ## 55 of them + + + + + + list_weight_changes_gaps=[] + list_weight_changes_gaps_network=[] + list_weight_changes_gaps_R6s=[] + list_weight_changes_gaps_hubs=[] + list_weight_changes_gaps_top_kshells=[] + list_weight_changes_gaps_R6overlap=[] # people connected to R6s + + + list_all_weight_changes=[] + list_all_weight_changes_network=[] + list_all_weight_changes_R6s=[] + list_all_weight_changes_hubs=[] + list_all_weight_changes_top_kshells=[] + list_all_weight_changes_R6overlap=[] + + + list_net_weight_changes=[] + list_net_weight_changes_network=[] + list_net_weight_changes_R6s=[] + list_net_weight_changes_hubs=[] + list_net_weight_changes_top_kshells=[] + list_net_weight_changes_R6overlap=[] + + + num_users=0. + num_users_with_gaps=0. + num_network_users_with_gaps=0. + num_network_users=0. + + list_gaps_no_gaps_all=[] # prob. of having a gap + list_gaps_no_gaps_network=[] + list_gaps_no_gaps_R6s=[] + list_gaps_no_gaps_hubs=[] + list_gaps_no_gaps_top_kshells=[] + list_gaps_no_gaps_R6overlap=[] + + list_weight_gain_after_gap_all=[] # prob. 
of gaining weight after a gap + list_weight_gain_after_gap_network=[] + list_weight_gain_after_gap_R6s=[] + list_weight_gain_after_gap_hubs=[] + list_weight_gain_after_gap_top_kshells=[] + list_weight_gain_after_gap_R6overlap=[] + + + list_init_gap_all=[] # prob. of having a gap initally/in the middle/ at the end + list_init_gap_network=[] + list_init_gap_R6s=[] + list_init_gap_hubs=[] + list_init_gap_top_kshells=[] + list_init_gap_R6overlap=[] + + list_middle_gap_all=[] + list_middle_gap_network=[] + list_middle_gap_R6s=[] + list_middle_gap_hubs=[] + list_middle_gap_top_kshells=[] + list_middle_gap_R6overlap=[] + + list_end_gap_all=[] + list_end_gap_network=[] + list_end_gap_R6s=[] + list_end_gap_hubs=[] + list_end_gap_top_kshells=[] + list_end_gap_R6overlap=[] + + + for r1 in result1: #loop over users to get their gap info + + + index+=1 + # if index <= max_index: # just to try out the program with a few time series + + print index + + ck_id=r1['ck_id'] + n_id=int(r1['id']) #this corresponds with the 'label' attribute in the .gml file + + + + + query2="select * from gaps_by_frequency where (ck_id ='"+str(ck_id)+"') order by start_day asc" + result2 = db.query(query2) + + + + query3="select * from weigh_in_history where (ck_id ='"+str(ck_id)+"') order by on_day asc" + result3 = db.query(query3) + + + + query4="select * from friends where (src ='"+str(ck_id)+"')or (dest ='"+str(ck_id)+"') " + result4= db.query(query4) + + + + + if len(result3)>=min_num_weigh_ins: ################### ONLY if more than X weigh-ins + num_users+=1. + + + cont=0 + initial_weight=float(result3[0]['weight']) + for r3 in result3: # to calculate weight-change, i skip the first entry + if cont>0: + list_all_weight_changes.append(float(r3['weight'])-initial_weight) + + if len(result4)>0: # if he/she has friends + list_all_weight_changes_network.append(float(r3['weight'])-initial_weight) + + if n_id in list_R6_ids: + list_all_weight_changes_R6s.append(float(r3['weight'])-initial_weight) + if n_id in list_hub_ids: + list_all_weight_changes_hubs.append(float(r3['weight'])-initial_weight) + if n_id in list_top_kshell_ids: + list_all_weight_changes_top_kshells.append(float(r3['weight'])-initial_weight) + + if n_id in list_R6overlap_ids: + list_all_weight_changes_R6overlap.append(float(r3['weight'])-initial_weight) + + + + + initial_weight=float(r3['weight']) + cont+=1 + list_net_weight_changes.append(float(result3[-1]['weight'])-float(result3[0]['weight'])) + + + if len(result4)>0: + num_network_users+=1. + list_net_weight_changes_network.append(float(result3[-1]['weight'])-float(result3[0]['weight'])) + + if n_id in list_R6_ids: + list_net_weight_changes_R6s.append(float(result3[-1]['weight'])-float(result3[0]['weight'])) + if n_id in list_hub_ids: + list_net_weight_changes_hubs.append(float(result3[-1]['weight'])-float(result3[0]['weight'])) + if n_id in list_top_kshell_ids: + list_net_weight_changes_top_kshells.append(float(result3[-1]['weight'])-float(result3[0]['weight'])) + + if n_id in list_R6overlap_ids: + list_net_weight_changes_R6overlap.append(float(result3[-1]['weight'])-float(result3[0]['weight'])) + + + if len(result2)>0: # if there is a gap record for that user + num_users_with_gaps+=1. + list_gaps_no_gaps_all.append(1.) + + if len(result4)>0: + num_network_users_with_gaps+=1. + + + list_gaps_no_gaps_network.append(1.) + + if n_id in list_R6_ids: + list_gaps_no_gaps_R6s.append(1.) + if n_id in list_hub_ids: + list_gaps_no_gaps_hubs.append(1.) 
+ if n_id in list_top_kshell_ids: + list_gaps_no_gaps_top_kshells.append(1.) + if n_id in list_R6overlap_ids: + list_gaps_no_gaps_R6overlap.append(1.) + + + for r2 in result2: + + start_index=int(r2['index_start_day']) # index for the entry in the gaps_by_frequency table where the gap is (point number...): starting on 0 + end_index=int(r2['index_end_day']) + + + + +# to compute probabilities of having gaps at the beginning/middle/end, i am carefull about the time series +# not ending JUST because the DB ended + + date_end_serie=result3[-1]['on_day'] + if date_end_serie <=reasonable_ending: + # print "gap",date_end_serie,"prior to the reasonable ending date of the DB",ending_date_DB + + + if start_index <= few_points: + list_init_gap_all.append(1.0) # prob. of having a gap initally/in the middle/ at the end + list_middle_gap_all.append(0.0) + list_end_gap_all.append(0.0) + + if len(result4)>0: + list_init_gap_network.append(1.0) + list_middle_gap_network.append(0.0) + list_end_gap_network.append(0.0) + + + if n_id in list_R6_ids: + list_init_gap_R6s.append(1.0) + list_middle_gap_R6s.append(0.0) + list_end_gap_R6s.append(0.0) + if n_id in list_hub_ids: + list_init_gap_hubs.append(1.0) + list_middle_gap_hubs.append(0.0) + list_end_gap_hubs.append(0.0) + if n_id in list_top_kshell_ids: + list_init_gap_top_kshells.append(1.0) + list_middle_gap_top_kshells.append(0.0) + list_end_gap_top_kshells.append(0.0) + if n_id in list_R6overlap_ids: + list_init_gap_R6overlap.append(1.0) + list_middle_gap_R6overlap.append(0.0) + list_end_gap_R6overlap.append(0.0) + + + elif len(result3)-int(end_index)<=few_points: + list_init_gap_all.append(0.0) + list_middle_gap_all.append(0.0) + list_end_gap_all.append(1.0) + + if len(result4)>0: + list_init_gap_network.append(0.0) + list_middle_gap_network.append(0.0) + list_end_gap_network.append(1.0) + + if n_id in list_R6_ids: + list_init_gap_R6s.append(0.0) + list_middle_gap_R6s.append(0.0) + list_end_gap_R6s.append(1.0) + if n_id in list_hub_ids: + list_init_gap_hubs.append(0.0) + list_middle_gap_hubs.append(0.0) + list_end_gap_hubs.append(1.0) + if n_id in list_top_kshell_ids: + list_init_gap_top_kshells.append(0.0) + list_middle_gap_top_kshells.append(0.0) + list_end_gap_top_kshells.append(1.0) + if n_id in list_R6overlap_ids: + list_init_gap_R6overlap.append(0.0) + list_middle_gap_R6overlap.append(0.0) + list_end_gap_R6overlap.append(1.0) + + else: + list_init_gap_all.append(0.0) + list_middle_gap_all.append(1.0) + list_end_gap_all.append(0.0) + + if len(result4)>0: + list_init_gap_network.append(0.0) + list_middle_gap_network.append(1.0) + list_end_gap_network.append(0.0) + + if n_id in list_R6_ids: + list_init_gap_R6s.append(0.0) + list_middle_gap_R6s.append(1.0) + list_end_gap_R6s.append(0.0) + if n_id in list_hub_ids: + list_init_gap_hubs.append(0.0) + list_middle_gap_hubs.append(1.0) + list_end_gap_hubs.append(0.0) + if n_id in list_top_kshell_ids: + list_init_gap_top_kshells.append(0.0) + list_middle_gap_top_kshells.append(1.0) + list_end_gap_top_kshells.append(0.0) + if n_id in list_R6overlap_ids: + list_init_gap_R6overlap.append(0.0) + list_middle_gap_R6overlap.append(1.0) + list_end_gap_R6overlap.append(0.0) + else: #i dont consider a gap too close to the end of the DB final date + print date_end_serie,"after reasonable ending date of the DB",ending_date_DB + + + tot_weight_change_gap=float(result3[end_index]['weight'])-float(result3[start_index]['weight']) + + list_weight_changes_gaps.append(tot_weight_change_gap) + + + if tot_weight_change_gap>=0.: + 
list_weight_gain_after_gap_all.append(1.) + else: + list_weight_gain_after_gap_all.append(0.) + + + + if len(result4)>0: + + if tot_weight_change_gap>=0.: + list_weight_gain_after_gap_network.append(1.) + + if n_id in list_R6_ids: + list_weight_gain_after_gap_R6s.append(1.) + if n_id in list_hub_ids: + list_weight_gain_after_gap_hubs.append(1.) + if n_id in list_top_kshell_ids: + list_weight_gain_after_gap_top_kshells.append(1.) + if n_id in list_R6overlap_ids: + list_weight_gain_after_gap_R6overlap.append(1.) + else: + list_weight_gain_after_gap_network.append(0.) + + if n_id in list_R6_ids: + list_weight_gain_after_gap_R6s.append(0.) + if n_id in list_hub_ids: + list_weight_gain_after_gap_hubs.append(0.) + if n_id in list_top_kshell_ids: + list_weight_gain_after_gap_top_kshells.append(0.) + if n_id in list_R6overlap_ids: + list_weight_gain_after_gap_R6overlap.append(0.) + + + + + list_weight_changes_gaps_network.append(float(result3[end_index]['weight'])- float(result3[start_index]['weight']) ) + + if n_id in list_R6_ids: + list_weight_changes_gaps_R6s.append(float(result3[end_index]['weight'])- float(result3[start_index]['weight'])) + if n_id in list_hub_ids: + list_weight_changes_gaps_hubs.append(float(result3[end_index]['weight'])- float(result3[start_index]['weight'])) + if n_id in list_top_kshell_ids: + list_weight_changes_gaps_top_kshells.append(float(result3[end_index]['weight'])- float(result3[start_index]['weight'])) + + if n_id in list_R6overlap_ids: + list_weight_changes_gaps_R6overlap.append(float(result3[end_index]['weight'])- float(result3[start_index]['weight'])) + + + + + else: # no gaps + list_gaps_no_gaps_all.append(0.) + if len(result4)>0: + list_gaps_no_gaps_network.append(0.) + + if n_id in list_R6_ids: + list_gaps_no_gaps_R6s.append(0.) + if n_id in list_hub_ids: + list_gaps_no_gaps_hubs.append(0.) + if n_id in list_top_kshell_ids: + list_gaps_no_gaps_top_kshells.append(0.) + if n_id in list_R6overlap_ids: + list_gaps_no_gaps_R6overlap.append(0.) 
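As an editorial aside: all of the cohort probabilities printed below rely on the same convention used throughout this loop, namely that each list holds one 1.0/0.0 indicator per user (or per gap), so numpy.mean of the list is the corresponding probability. A toy illustration with invented values:

import numpy

has_gap_indicator = [1., 0., 1., 1., 0.]   # e.g. 3 of 5 users in a cohort had a gap
print(numpy.mean(has_gap_indicator))       # -> 0.6, the probability of having a gap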
+ + + + print 'total # users with >=',min_num_weigh_ins,'w-ins: ',num_users,' total # users with gaps: ',num_users_with_gaps + print 'network users with >=',min_num_weigh_ins, 'w-ins: ',num_network_users,' network users with gaps: ',num_network_users_with_gaps,"\n" + + + print 'weight_change gaps all users:',numpy.mean(list_weight_changes_gaps),' +/- ',numpy.std(list_weight_changes_gaps) + print ' ntwk users:',numpy.mean(list_weight_changes_gaps_network),' +/- ',numpy.std(list_weight_changes_gaps_network) + print ' R6s:',numpy.mean(list_weight_changes_gaps_R6s),' +/- ',numpy.std(list_weight_changes_gaps_R6s) + print ' hubs:',numpy.mean(list_weight_changes_gaps_hubs),' +/- ',numpy.std(list_weight_changes_gaps_hubs) + print ' top kshells:',numpy.mean(list_weight_changes_gaps_top_kshells),' +/- ',numpy.std(list_weight_changes_gaps_top_kshells) + print ' R6overlap:',numpy.mean(list_weight_changes_gaps_R6overlap),' +/- ',numpy.std(list_weight_changes_gaps_R6overlap),"\n" + + + print 'average weight change all changes, all users:',numpy.mean(list_all_weight_changes),' +/- ',numpy.std(list_all_weight_changes) + print 'average weight change all changes, network users:',numpy.mean(list_all_weight_changes_network),' +/- ',numpy.std(list_all_weight_changes_network),"\n" + + + + print '# all weight changes:',len(list_all_weight_changes),' # gap changes all users:',len(list_weight_changes_gaps),' # gap changes network users:',len(list_weight_changes_gaps_network),"\n" + + + + print '# net weight changes all users:',numpy.mean(list_net_weight_changes),numpy.std(list_net_weight_changes),' # events::',len(list_net_weight_changes) + print '# network:',numpy.mean(list_net_weight_changes_network),numpy.std(list_net_weight_changes_network),' # events:',len(list_net_weight_changes_network) + print '# R6s:',numpy.mean(list_net_weight_changes_R6s),numpy.std(list_net_weight_changes_R6s),' # events:',len(list_net_weight_changes_R6s) + print '# hubs:',numpy.mean(list_net_weight_changes_hubs),numpy.std(list_net_weight_changes_hubs),' # events:',len(list_net_weight_changes_hubs) + print '# top kshells:',numpy.mean(list_net_weight_changes_top_kshells),numpy.std(list_net_weight_changes_top_kshells),' # events:',len(list_net_weight_changes_top_kshells) + print '# R6overlap:',numpy.mean(list_net_weight_changes_R6overlap),numpy.std(list_net_weight_changes_R6overlap),' # events:',len(list_net_weight_changes_R6overlap),"\n" + + + + print "prob. having a gap for tot population with more than 30weigh-ins:",numpy.mean(list_gaps_no_gaps_all)," # events:",len(list_gaps_no_gaps_all) + print " for network population with more than 30weigh-ins:",numpy.mean(list_gaps_no_gaps_network),"# events: ",len(list_gaps_no_gaps_network) + print " for R6s with more than 30weigh-ins:",numpy.mean(list_gaps_no_gaps_R6s),"# events: ",len(list_gaps_no_gaps_R6s) + print " for hubs with more than 30weigh-ins:",numpy.mean(list_gaps_no_gaps_hubs),"# events: ",len(list_gaps_no_gaps_hubs) + print " for 12-kshells with more than 30weigh-ins:",numpy.mean(list_gaps_no_gaps_top_kshells),"# events: ",len(list_gaps_no_gaps_top_kshells) + print " for R6overlap with more than 30weigh-ins:",numpy.mean(list_gaps_no_gaps_R6overlap),"# events: ",len(list_gaps_no_gaps_R6overlap),"\n" + + + + print "prob. 
gaining weight after a gap, for tot population with gaps and more than 30weigh-ins:",numpy.mean(list_weight_gain_after_gap_all)," # events:",len(list_weight_gain_after_gap_all) + print " for network population with gaps and more than 30weigh-ins:",numpy.mean(list_weight_gain_after_gap_network)," # events:",len(list_weight_gain_after_gap_network) + print " for R6s with gaps and more than 30weigh-ins:",numpy.mean(list_weight_gain_after_gap_R6s)," # events:",len(list_weight_gain_after_gap_R6s) + print " for hubs with gaps and more than 30weigh-ins:",numpy.mean(list_weight_gain_after_gap_hubs)," # events:",len(list_weight_gain_after_gap_hubs) + print " for 12-kshells with gaps and more than 30weigh-ins:",numpy.mean(list_weight_gain_after_gap_top_kshells)," # events:",len(list_weight_gain_after_gap_top_kshells) + print " for R6overlap with gaps and more than 30weigh-ins:",numpy.mean(list_weight_gain_after_gap_R6overlap)," # events:",len(list_weight_gain_after_gap_R6overlap),"\n" + + + + + print "prob. init. gap for all:",numpy.mean(list_init_gap_all)," middle:",numpy.mean(list_middle_gap_all)," end:",numpy.mean(list_end_gap_all) + print " for network:",numpy.mean(list_init_gap_network)," middle:",numpy.mean(list_middle_gap_network)," end:",numpy.mean(list_end_gap_network) + print " for R6s:",numpy.mean(list_init_gap_R6s)," middle:",numpy.mean(list_middle_gap_R6s)," end:",numpy.mean(list_end_gap_R6s) + print " for hubs:",numpy.mean(list_init_gap_hubs)," middle:",numpy.mean(list_middle_gap_hubs)," end:",numpy.mean(list_end_gap_hubs) + print " for top kshell:",numpy.mean(list_init_gap_top_kshells)," middle:",numpy.mean(list_middle_gap_top_kshells)," end:",numpy.mean(list_end_gap_top_kshells) + + print " for R6overlap:",numpy.mean(list_init_gap_R6overlap)," middle:",numpy.mean(list_middle_gap_R6overlap)," end:",numpy.mean(list_end_gap_R6overlap),"\n" + + + + print "zscore between all weight changes and gap changes (All):",bootstrap(list_all_weight_changes, len(list_weight_changes_gaps),numpy.std(list_weight_changes_gaps),Niter_bootstrap),Niter_bootstrap,"iter",len(list_all_weight_changes), len(list_weight_changes_gaps) + + print "zscore between all weight changes and gap changes (Network):",bootstrap(list_all_weight_changes_network, len(list_weight_changes_gaps_network),numpy.std(list_weight_changes_gaps_network),Niter_bootstrap),Niter_bootstrap,"iter",len(list_all_weight_changes_network), len(list_weight_changes_gaps_network) # which of the 2 comparisons makes more sense?? 
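+    # Note on the z-scores printed here: bootstrap() draws Niter_bootstrap resamples
+    # (with replacement, each of the given sample_size) from the first list, and returns
+    # how many standard deviations the third argument lies from the mean of those
+    # resampled means (the calls pass numpy.std(...) of the gap-change list as that
+    # reference value).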
+ + print "zscore between all weight changes (R6s) and gap changes (R6s):",bootstrap(list_all_weight_changes_R6s, len(list_weight_changes_gaps_R6s),numpy.std(list_weight_changes_gaps_R6s),Niter_bootstrap),Niter_bootstrap,"iter",len(list_all_weight_changes_R6s), len(list_weight_changes_gaps_R6s) + + print "zscore between all weight changes (hubs) and gap changes (hubs):",bootstrap(list_all_weight_changes_hubs, len(list_weight_changes_gaps_hubs),numpy.std(list_weight_changes_gaps_hubs),Niter_bootstrap),Niter_bootstrap,"iter",len(list_all_weight_changes_hubs), len(list_weight_changes_gaps_hubs) + + print "zscore between all weight changes (top kshells) and gap changes (top kshells):",bootstrap(list_all_weight_changes_top_kshells, len(list_weight_changes_gaps_top_kshells),numpy.std(list_weight_changes_gaps_top_kshells),Niter_bootstrap),Niter_bootstrap,"iter",len(list_all_weight_changes_top_kshells), len(list_weight_changes_gaps_top_kshells) + + print "zscore between all weight changes (R6overlap) and gap changes (R6overlap):",bootstrap(list_all_weight_changes_R6overlap, len(list_weight_changes_gaps_R6overlap),numpy.std(list_weight_changes_gaps_R6overlap),Niter_bootstrap),Niter_bootstrap,"iter",len(list_all_weight_changes_R6overlap), len(list_weight_changes_gaps_R6overlap),"\n" + + + + + print "zscore between gap changes (All) and gap changes (Network):",bootstrap(list_weight_changes_gaps, len(list_weight_changes_gaps_network),numpy.std(list_weight_changes_gaps_network),Niter_bootstrap),Niter_bootstrap,"iter",len(list_weight_changes_gaps), numpy.mean(list_weight_changes_gaps), numpy.std(list_weight_changes_gaps),len(list_weight_changes_gaps_network),numpy.mean(list_weight_changes_gaps_network),numpy.std(list_weight_changes_gaps_network) # which of the 2 comparisons makes more sense?? 
+ + print "zscore between gap changes (All) and gap changes (R6overlap):",bootstrap(list_weight_changes_gaps, len(list_weight_changes_gaps_R6overlap),numpy.std(list_weight_changes_gaps_R6overlap),Niter_bootstrap),Niter_bootstrap,"iter",len(list_weight_changes_gaps), numpy.mean(list_weight_changes_gaps), numpy.std(list_weight_changes_gaps),len(list_weight_changes_gaps_R6overlap) ,numpy.mean(list_weight_changes_gaps_R6overlap),numpy.std(list_weight_changes_gaps_R6overlap) + + print "zscore between gap changes (Networked) and gap changes (R6overlap):",bootstrap(list_weight_changes_gaps_network, len(list_weight_changes_gaps_R6overlap),numpy.std(list_weight_changes_gaps_R6overlap),Niter_bootstrap),Niter_bootstrap,"iter",len(list_weight_changes_gaps_network), numpy.mean(list_weight_changes_gaps_network), numpy.std(list_weight_changes_gaps_network),len(list_all_weight_changes_R6overlap), numpy.mean(list_weight_changes_gaps_R6overlap), numpy.std(list_weight_changes_gaps_R6overlap) + + +####################################### +def sample_with_replacement(population, k): + "Chooses k random elements (with replacement) from a population" + n = len(population) + _random, _int = random.random, int # speed hack + result = [None] * k + for i in xrange(k): + j = _int(_random() * n) + result[i] = population[j] + return result + + +############################################# +def bootstrap(lista, sample_size,real_average_value,Niter): + + + list_synth_average_values=[] + + + for iter in range (Niter): + + list_synth=sample_with_replacement(lista,sample_size) + + list_synth_average_values.append(numpy.mean(list_synth)) + + + + zscore=(real_average_value-numpy.mean(list_synth_average_values))/numpy.std(list_synth_average_values) + + + return zscore + + + + +######################### + +if __name__== "__main__": + if len(sys.argv) > 2: + graph_filename1 = sys.argv[1] + graph_filename2 = sys.argv[2] + + + main(graph_filename1,graph_filename2) + else: + print "usage: python whatever.py path/network_file1_R6s_info.gml path/network_file2_kshell_info.gml " + + + + + +##############################################