diff --git a/benchmarks/tabular/analysis.py b/benchmarks/tabular/analysis.py new file mode 100644 index 00000000..2608da52 --- /dev/null +++ b/benchmarks/tabular/analysis.py @@ -0,0 +1,798 @@ +import json + +import matplotlib.pyplot as plt +import numpy as np + +with open("tabular_results.json", "r") as j: + metrics = json.loads(j.read()) + +# ~~~REGRESSION~~~ + +# MAP +map_nlls = [ + metrics["regression"][k]["map"]["nll"] for k in metrics["regression"].keys() +] +map_quantiles_nlls = np.percentile(map_nlls, [10, 20, 30, 40, 50, 60, 70, 80, 90]) + +map_picp_errors = [ + np.abs(0.95 - metrics["regression"][k]["map"]["picp"]) + for k in metrics["regression"].keys() +] +map_quantiles_picp_errors = np.percentile( + map_picp_errors, [10, 20, 30, 40, 50, 60, 70, 80, 90] +) + +map_times = [ + metrics["regression"][k]["map"]["time"] for k in metrics["regression"].keys() +] + +# TEMPERATURE SCALING +temp_scaling_nlls = [ + metrics["regression"][k]["temp_scaling"]["nll"] + for k in metrics["regression"].keys() +] +temp_scaling_quantiles_nlls = np.percentile( + temp_scaling_nlls, [10, 20, 30, 40, 50, 60, 70, 80, 90] +) +win_temp_scaling_nlls = np.sum(np.array(temp_scaling_nlls) / np.array(map_nlls) <= 1) +winlose_temp_scaling_nlls = ( + f"{win_temp_scaling_nlls} / {len(map_nlls) - win_temp_scaling_nlls}" +) +rel_improve_temp_scaling_nlls = ( + np.array(map_nlls) - np.array(temp_scaling_nlls) +) / np.array(map_nlls) +max_loss_temp_scaling_nlls = ( + str( + np.round( + 100 + * np.abs( + np.max(rel_improve_temp_scaling_nlls[rel_improve_temp_scaling_nlls < 0]) + ), + 2, + ) + ) + + "%" +) +med_improv_temp_scaling_nlls = ( + f"{np.round(np.median(rel_improve_temp_scaling_nlls), 2)}" +) + + +temp_scaling_picp_errors = [ + np.abs(0.95 - metrics["regression"][k]["temp_scaling"]["picp"]) + for k in metrics["regression"].keys() +] +temp_scaling_quantiles_picp_errors = np.percentile( + temp_scaling_picp_errors, [10, 20, 30, 40, 50, 60, 70, 80, 90] +) +win_temp_scaling_picp_errors = np.sum( + np.array(temp_scaling_picp_errors) / np.array(map_picp_errors) <= 1 +) +winlose_temp_scaling_picp_errors = f"{win_temp_scaling_picp_errors} / {len(map_picp_errors) - win_temp_scaling_picp_errors}" +rel_improve_temp_scaling_picp_errors = ( + np.array(map_picp_errors) - np.array(temp_scaling_picp_errors) +) / np.array(map_picp_errors) +max_loss_temp_scaling_picp_errors = ( + str( + np.round( + 100 + * np.abs( + np.max( + rel_improve_temp_scaling_picp_errors[ + rel_improve_temp_scaling_picp_errors < 0 + ] + ) + ), + 2, + ) + ) + + "%" +) +med_improv_temp_scaling_picp_errors = ( + f"{np.round(np.median(rel_improve_temp_scaling_picp_errors), 2)}" +) + +temp_scaling_times = [ + metrics["regression"][k]["temp_scaling"]["time"] + for k in metrics["regression"].keys() +] + +# CQR +cqr_picp_errors = [ + np.abs(0.95 - metrics["regression"][k]["cqr"]["picp"]) + for k in metrics["regression"].keys() +] +cqr_quantiles_picp_errors = np.percentile( + cqr_picp_errors, [10, 20, 30, 40, 50, 60, 70, 80, 90] +) +win_cqr_picp_errors = np.sum(np.array(cqr_picp_errors) / np.array(map_picp_errors) <= 1) +winlose_cqr_picp_errors = ( + f"{win_cqr_picp_errors} / {len(map_picp_errors) - win_cqr_picp_errors}" +) +rel_improve_cqr_picp_errors = ( + np.array(map_picp_errors) - np.array(cqr_picp_errors) +) / np.array(map_picp_errors) +max_loss_cqr_picp_errors = ( + str( + np.round( + 100 + * np.abs( + np.max(rel_improve_cqr_picp_errors[rel_improve_cqr_picp_errors < 0]) + ), + 2, + ) + ) + + "%" +) +med_improv_cqr_picp_errors = 
f"{np.round(np.median(rel_improve_cqr_picp_errors), 2)}"
+
+cqr_times = [
+    metrics["regression"][k]["cqr"]["time"] for k in metrics["regression"].keys()
+]
+
+# TEMPERED CQR
+temp_cqr_picp_errors = [
+    np.abs(0.95 - metrics["regression"][k]["temp_cqr"]["picp"])
+    for k in metrics["regression"].keys()
+]
+temp_cqr_quantiles_picp_errors = np.percentile(
+    temp_cqr_picp_errors, [10, 20, 30, 40, 50, 60, 70, 80, 90]
+)
+winlose_temp_cqr_picp_errors = f"{np.sum(np.array(temp_cqr_picp_errors) / np.array(map_picp_errors) <= 1)} / {len(map_picp_errors)}"
+med_improv_temp_cqr_picp_errors = f"{np.round(np.median((np.array(map_picp_errors) - np.array(temp_cqr_picp_errors)) / np.array(map_picp_errors)), 2)}"
+
+temp_cqr_times = [
+    metrics["regression"][k]["temp_cqr"]["time"] for k in metrics["regression"].keys()
+]
+
+plt.figure(figsize=(8, 6))
+plt.suptitle("Quantile-quantile plots of metrics on regression datasets")
+
+plt.subplot(2, 2, 1)
+plt.title("NLL")
+plt.scatter(map_quantiles_nlls, temp_scaling_quantiles_nlls, s=3)
+_min, _max = min(map_quantiles_nlls.min(), temp_scaling_quantiles_nlls.min()), max(
+    map_quantiles_nlls.max(), temp_scaling_quantiles_nlls.max()
+)
+plt.plot([_min, _max], [_min, _max], color="gray", linestyle="--", alpha=0.2)
+plt.xlabel("MAP quantiles")
+plt.ylabel("temp scaling quantiles")
+plt.grid()
+
+plt.subplot(2, 2, 2)
+plt.title("PICP absolute error")
+plt.scatter(map_quantiles_picp_errors, temp_scaling_quantiles_picp_errors, s=3)
+_min, _max = min(
+    map_quantiles_picp_errors.min(), temp_scaling_quantiles_picp_errors.min()
+), max(map_quantiles_picp_errors.max(), temp_scaling_quantiles_picp_errors.max())
+plt.plot([_min, _max], [_min, _max], color="gray", linestyle="--", alpha=0.2)
+plt.xlabel("MAP quantiles")
+plt.ylabel("temp scaling quantiles")
+plt.grid()
+
+plt.subplot(2, 2, 4)
+plt.title("PICP absolute error")
+plt.scatter(map_quantiles_picp_errors, cqr_quantiles_picp_errors, s=3)
+_min, _max = min(map_quantiles_picp_errors.min(), cqr_quantiles_picp_errors.min()), max(
+    map_quantiles_picp_errors.max(), cqr_quantiles_picp_errors.max()
+)
+plt.plot([_min, _max], [_min, _max], color="gray", linestyle="--", alpha=0.2)
+plt.xlabel("MAP quantiles")
+plt.ylabel("CQR quantiles")
+plt.grid()
+
+plt.tight_layout()
+
+plt.show()
+
+print("~~~REGRESSION~~~\n")
+print("## TEMPERATURE SCALING ##")
+print(
+    f"Wins / losses of temp_scaling vs MAP w.r.t. the NLL: {winlose_temp_scaling_nlls}"
+)
+print(
+    f"Wins / losses of temp_scaling vs MAP w.r.t. the PICP error: {winlose_temp_scaling_picp_errors}"
+)
+print()
+print(
+    f"Median of relative NLL improvement given by temp_scaling: {med_improv_temp_scaling_nlls}"
+)
+print(
+    f"Median of relative PICP error improvement given by temp_scaling: {med_improv_temp_scaling_picp_errors}"
+)
+print()
+print()
+print("## CQR ##")
+print(
+    f"Wins / losses of CQR vs MAP w.r.t. the PICP error: {winlose_cqr_picp_errors}"
+)
+print()
+print(
+    f"Median of relative PICP error improvement given by CQR: {med_improv_cqr_picp_errors}"
+)
+
+
+# ~~~CLASSIFICATION~~~
+
+# MAP
+map_nlls = [
+    metrics["classification"][k]["map"]["nll"] for k in metrics["classification"].keys()
+]
+map_quantiles_nlls = np.percentile(map_nlls, [10, 20, 30, 40, 50, 60, 70, 80, 90])
+
+map_mse = [
+    metrics["classification"][k]["map"]["mse"] for k in metrics["classification"].keys()
+]
+map_quantiles_mse = np.percentile(map_mse, [10, 20, 30, 40, 50, 60, 70, 80, 90])
+
+map_ece = [
+    metrics["classification"][k]["map"]["ece"] for k in metrics["classification"].keys()
+]
+map_quantiles_ece = np.percentile(map_ece, [10, 20, 30, 40, 50, 60, 70, 80, 90])
+
+map_rocauc = [
+    metrics["classification"][k]["map"]["rocauc"]
+    for k in metrics["classification"].keys()
+    if "rocauc" in metrics["classification"][k]["map"]
+]
+map_quantiles_rocauc = np.percentile(map_rocauc, [10, 20, 30, 40, 50, 60, 70, 80, 90])
+
+map_prauc = [
+    metrics["classification"][k]["map"]["prauc"]
+    for k in metrics["classification"].keys()
+    if "prauc" in metrics["classification"][k]["map"]
+]
+map_quantiles_prauc = np.percentile(map_prauc, [10, 20, 30, 40, 50, 60, 70, 80, 90])
+
+map_acc = [
+    metrics["classification"][k]["map"]["accuracy"]
+    for k in metrics["classification"].keys()
+]
+
+map_times = [
+    metrics["classification"][k]["map"]["time"]
+    for k in metrics["classification"].keys()
+]
+
+# TEMPERATURE SCALING
+temp_scaling_nlls = [
+    metrics["classification"][k]["temp_scaling"]["nll"]
+    for k in metrics["classification"].keys()
+]
+temp_scaling_quantiles_nlls = np.percentile(
+    temp_scaling_nlls, [10, 20, 30, 40, 50, 60, 70, 80, 90]
+)
+win_temp_scaling_nlls = np.sum(np.array(temp_scaling_nlls) / np.array(map_nlls) <= 1)
+winlose_temp_scaling_nlls = (
+    f"{win_temp_scaling_nlls} / {len(map_nlls) - win_temp_scaling_nlls}"
+)
+rel_improve_temp_scaling_nlls = (
+    np.array(map_nlls) - np.array(temp_scaling_nlls)
+) / np.array(map_nlls)
+max_loss_temp_scaling_nlls = (
+    str(
+        np.round(
+            100
+            * np.abs(
+                np.max(rel_improve_temp_scaling_nlls[rel_improve_temp_scaling_nlls < 0])
+            ),
+            2,
+        )
+    )
+    + "%"
+)
+med_improv_temp_scaling_nlls = (
+    f"{np.round(np.median(rel_improve_temp_scaling_nlls), 2)}"
+)
+
+temp_scaling_mse = [
+    metrics["classification"][k]["temp_scaling"]["mse"]
+    for k in metrics["classification"].keys()
+]
+temp_scaling_quantiles_mse = np.percentile(
+    temp_scaling_mse, [10, 20, 30, 40, 50, 60, 70, 80, 90]
+)
+win_temp_scaling_mse = np.sum(np.array(temp_scaling_mse) / np.array(map_mse) <= 1)
+winlose_temp_scaling_mse = (
+    f"{win_temp_scaling_mse} / {len(map_mse) - win_temp_scaling_mse}"
+)
+rel_improve_temp_scaling_mse = (
+    np.array(map_mse) - np.array(temp_scaling_mse)
+) / np.array(map_mse)
+max_loss_temp_scaling_mse = (
+    str(
+        np.round(
+            100
+            * np.abs(
+                np.max(rel_improve_temp_scaling_mse[rel_improve_temp_scaling_mse < 0])
+            ),
+            2,
+        )
+    )
+    + "%"
+)
+med_improv_temp_scaling_mse = f"{np.round(np.median(rel_improve_temp_scaling_mse), 2)}"
+
+temp_scaling_ece = [
+    metrics["classification"][k]["temp_scaling"]["ece"]
+    for k in metrics["classification"].keys()
+]
+temp_scaling_quantiles_ece = np.percentile(
+    temp_scaling_ece, [10, 20, 30, 40, 50, 60, 70, 80, 90]
+)
+win_temp_scaling_ece = np.sum(np.array(temp_scaling_ece) / np.array(map_ece) <= 1)
+winlose_temp_scaling_ece = (
+    f"{win_temp_scaling_ece} / {len(map_ece) - win_temp_scaling_ece}"
+)
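For reference, the win/lose and median-relative-improvement summaries computed throughout this script follow the convention sketched below with made-up numbers: lower is better for NLL, MSE, ECE and the absolute PICP error, so a score ratio of at most 1 counts as a win for the calibration method.

```python
import numpy as np

# Hypothetical per-dataset scores where lower is better (e.g. NLLs).
map_scores = np.array([1.2, 0.8, 2.0, 1.5])
method_scores = np.array([1.0, 0.9, 1.6, 1.5])

# A ratio <= 1 means the method is at least on a par with MAP.
wins = int(np.sum(method_scores / map_scores <= 1))
winlose = f"{wins} / {len(map_scores) - wins}"  # -> "3 / 1"

# Positive relative improvement means the method beats MAP on that dataset.
rel_improve = (map_scores - method_scores) / map_scores
print(winlose, np.round(np.median(rel_improve), 2))  # -> 3 / 1 0.08
```

+rel_improve_temp_scaling_ece = (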
np.array(map_ece) - np.array(temp_scaling_ece) +) / np.array(map_ece) +max_loss_temp_scaling_ece = ( + str( + np.round( + 100 + * np.abs( + np.max(rel_improve_temp_scaling_ece[rel_improve_temp_scaling_ece < 0]) + ), + 2, + ) + ) + + "%" +) +med_improv_temp_scaling_ece = f"{np.round(np.median(rel_improve_temp_scaling_ece), 2)}" + +temp_scaling_rocauc = [ + metrics["classification"][k]["temp_scaling"]["rocauc"] + for k in metrics["classification"].keys() + if "rocauc" in metrics["classification"][k]["temp_scaling"] +] +temp_scaling_quantiles_rocauc = np.percentile( + temp_scaling_rocauc, [10, 20, 30, 40, 50, 60, 70, 80, 90] +) +win_temp_scaling_rocauc = np.sum( + np.array(temp_scaling_rocauc) / np.array(map_rocauc) <= 1 +) +winlose_temp_scaling_rocauc = ( + f"{win_temp_scaling_rocauc} / {len(map_rocauc) - win_temp_scaling_rocauc}" +) +rel_improve_temp_scaling_rocauc = ( + np.array(map_rocauc) - np.array(temp_scaling_rocauc) +) / np.array(map_rocauc) +max_loss_temp_scaling_rocauc = ( + str( + np.round( + 100 + * np.abs( + np.max( + rel_improve_temp_scaling_rocauc[rel_improve_temp_scaling_rocauc < 0] + ) + ), + 2, + ) + ) + + "%" +) +med_improv_temp_scaling_rocauc = ( + f"{np.round(np.median(rel_improve_temp_scaling_rocauc), 2)}" +) + +temp_scaling_prauc = [ + metrics["classification"][k]["temp_scaling"]["prauc"] + for k in metrics["classification"].keys() + if "prauc" in metrics["classification"][k]["temp_scaling"] +] +temp_scaling_quantiles_prauc = np.percentile( + temp_scaling_prauc, [10, 20, 30, 40, 50, 60, 70, 80, 90] +) +win_temp_scaling_prauc = np.sum(np.array(temp_scaling_prauc) / np.array(map_prauc) <= 1) +winlose_temp_scaling_prauc = ( + f"{win_temp_scaling_prauc} / {len(map_prauc) - win_temp_scaling_prauc}" +) +rel_improve_temp_scaling_prauc = ( + np.array(map_prauc) - np.array(temp_scaling_prauc) +) / np.array(map_prauc) +max_loss_temp_scaling_prauc = ( + str( + np.round( + 100 + * np.abs( + np.max( + rel_improve_temp_scaling_prauc[rel_improve_temp_scaling_prauc < 0] + ) + ), + 2, + ) + ) + + "%" +) +med_improv_temp_scaling_prauc = ( + f"{np.round(np.median(rel_improve_temp_scaling_prauc), 2)}" +) + +temp_scaling_acc = [ + metrics["classification"][k]["temp_scaling"]["accuracy"] + for k in metrics["classification"].keys() +] + +temp_scaling_times = [ + metrics["classification"][k]["temp_scaling"]["time"] + for k in metrics["classification"].keys() +] + +# MULTICALIBRATE CONF +mc_conf_nlls = [ + metrics["classification"][k]["mc_conf"]["nll"] + for k in metrics["classification"].keys() +] +mc_conf_nlls = np.array(mc_conf_nlls) +mc_conf_quantiles_nlls = np.percentile( + mc_conf_nlls, [10, 20, 30, 40, 50, 60, 70, 80, 90] +) +win_mc_conf_nlls = np.sum(np.array(mc_conf_nlls) / np.array(np.array(map_nlls)) <= 1) +winlose_mc_conf_nlls = f"{win_mc_conf_nlls} / {len(mc_conf_nlls) - win_mc_conf_nlls}" +rel_improve_mc_conf_nlls = (np.array(map_nlls) - np.array(mc_conf_nlls)) / np.array( + map_nlls +) +max_loss_mc_conf_nlls = ( + str( + np.round( + 100 + * np.abs(np.max(rel_improve_mc_conf_nlls[rel_improve_mc_conf_nlls < 0])), + 2, + ) + ) + + "%" +) +med_improv_mc_conf_nlls = f"{np.round(np.median(rel_improve_mc_conf_nlls), 2)}" + +mc_conf_mse = [ + metrics["classification"][k]["mc_conf"]["mse"] + for k in metrics["classification"].keys() +] +mc_conf_mse = np.array(mc_conf_mse) +mc_conf_quantiles_mse = np.percentile(mc_conf_mse, [10, 20, 30, 40, 50, 60, 70, 80, 90]) +win_mc_conf_mse = np.sum(np.array(mc_conf_mse) / np.array(np.array(map_mse)) <= 1) +winlose_mc_conf_mse = f"{win_mc_conf_mse} / 
{len(mc_conf_mse) - win_mc_conf_mse}" +rel_improve_mc_conf_mse = (np.array(map_mse) - np.array(mc_conf_mse)) / np.array( + map_mse +) +max_loss_mc_conf_mse = ( + str( + np.round( + 100 * np.abs(np.max(rel_improve_mc_conf_mse[rel_improve_mc_conf_mse < 0])), + 2, + ) + ) + + "%" +) +med_improv_mc_conf_mse = f"{np.round(np.median(rel_improve_mc_conf_mse), 2)}" + +mc_conf_ece = [ + metrics["classification"][k]["mc_conf"]["ece"] + for k in metrics["classification"].keys() +] +mc_conf_quantiles_ece = np.percentile(mc_conf_ece, [10, 20, 30, 40, 50, 60, 70, 80, 90]) +win_mc_conf_ece = np.sum(np.array(mc_conf_ece) / np.array(map_ece) <= 1) +winlose_mc_conf_ece = f"{win_mc_conf_ece} / {len(map_ece) - win_mc_conf_ece}" +rel_improve_mc_conf_ece = (np.array(map_ece) - np.array(mc_conf_ece)) / np.array( + map_ece +) +max_loss_mc_conf_ece = ( + str( + np.round( + 100 * np.abs(np.max(rel_improve_mc_conf_ece[rel_improve_mc_conf_ece < 0])), + 2, + ) + ) + + "%" +) +med_improv_mc_conf_ece = f"{np.round(np.median(rel_improve_mc_conf_ece), 2)}" + +mc_conf_rocauc = [ + metrics["classification"][k]["mc_conf"]["rocauc"] + for k in metrics["classification"].keys() +] +mc_conf_rocauc = np.array(mc_conf_rocauc) +mc_conf_quantiles_rocauc = np.percentile( + mc_conf_rocauc, [10, 20, 30, 40, 50, 60, 70, 80, 90] +) +win_mc_conf_rocauc = np.sum( + np.array(mc_conf_rocauc) / np.array(np.array(map_rocauc)) <= 1 +) +winlose_mc_conf_rocauc = ( + f"{win_mc_conf_rocauc} / {len(mc_conf_rocauc) - win_mc_conf_rocauc}" +) +rel_improve_mc_conf_rocauc = ( + np.array(map_rocauc) - np.array(mc_conf_rocauc) +) / np.array(map_rocauc) +max_loss_mc_conf_rocauc = ( + str( + np.round( + 100 + * np.abs( + np.max(rel_improve_mc_conf_rocauc[rel_improve_mc_conf_rocauc < 0]) + ), + 2, + ) + ) + + "%" +) +med_improv_mc_conf_rocauc = f"{np.round(np.median(rel_improve_mc_conf_rocauc), 2)}" + +mc_conf_prauc = [ + metrics["classification"][k]["mc_conf"]["prauc"] + for k in metrics["classification"].keys() +] +mc_conf_prauc = np.array(mc_conf_prauc) +mc_conf_quantiles_prauc = np.percentile( + mc_conf_prauc, [10, 20, 30, 40, 50, 60, 70, 80, 90] +) +win_mc_conf_prauc = np.sum(np.array(mc_conf_prauc) / np.array(np.array(map_prauc)) <= 1) +winlose_mc_conf_prauc = ( + f"{win_mc_conf_prauc} / {len(mc_conf_prauc) - win_mc_conf_prauc}" +) +rel_improve_mc_conf_prauc = (np.array(map_prauc) - np.array(mc_conf_prauc)) / np.array( + map_prauc +) +max_loss_mc_conf_prauc = ( + str( + np.round( + 100 + * np.abs(np.max(rel_improve_mc_conf_prauc[rel_improve_mc_conf_prauc < 0])), + 2, + ) + ) + + "%" +) +med_improv_mc_conf_prauc = f"{np.round(np.median(rel_improve_mc_conf_prauc), 2)}" + +mc_conf_acc = [ + metrics["classification"][k]["mc_conf"]["accuracy"] + for k in metrics["classification"].keys() +] + +mc_conf_times = [ + metrics["classification"][k]["mc_conf"]["time"] + for k in metrics["classification"].keys() +] + +# TEMPERED MULTICALIBRATE CONF +temp_mc_conf_nlls = [ + metrics["classification"][k]["temp_mc_conf"]["nll"] + for k in metrics["classification"].keys() +] +temp_mc_conf_nlls = np.array(temp_mc_conf_nlls) +temp_mc_conf_quantiles_nlls = np.percentile( + temp_mc_conf_nlls, [10, 20, 30, 40, 50, 60, 70, 80, 90] +) +winlose_temp_mc_conf_nlls = f"{np.sum(np.array(temp_mc_conf_nlls) / np.array(np.array(map_nlls)) <= 1)} / {len(temp_mc_conf_nlls)}" +med_improv_temp_mc_conf_nlls = f"{np.round(np.median((np.array(map_nlls) - np.array(temp_mc_conf_nlls)) / np.array(map_nlls)), 2)}" + +temp_mc_conf_mse = [ + metrics["classification"][k]["temp_mc_conf"]["mse"] + for k 
in metrics["classification"].keys() +] +temp_mc_conf_mse = np.array(temp_mc_conf_mse) +temp_mc_conf_quantiles_mse = np.percentile( + temp_mc_conf_mse, [10, 20, 30, 40, 50, 60, 70, 80, 90] +) +winlose_temp_mc_conf_mse = f"{np.sum(np.array(temp_mc_conf_mse) / np.array(np.array(map_mse)) <= 1)} / {len(temp_mc_conf_mse)}" +med_improv_temp_mc_conf_mse = f"{np.round(np.median((np.array(map_mse) - np.array(temp_mc_conf_mse)) / np.array(map_mse)), 2)}" + +temp_mc_conf_ece = [ + metrics["classification"][k]["temp_mc_conf"]["ece"] + for k in metrics["classification"].keys() +] +temp_mc_conf_quantiles_ece = np.percentile( + temp_mc_conf_ece, [10, 20, 30, 40, 50, 60, 70, 80, 90] +) +winlose_temp_mc_conf_ece = ( + f"{np.sum(np.array(temp_mc_conf_ece) / np.array(map_ece) <= 1)} / {len(map_ece)}" +) +med_improv_temp_mc_conf_ece = f"{np.round(np.median((np.array(map_ece) - np.array(temp_mc_conf_ece)) / np.array(map_ece)), 2)}" + +temp_mc_conf_rocauc = [ + metrics["classification"][k]["temp_mc_conf"]["rocauc"] + for k in metrics["classification"].keys() +] +temp_mc_conf_rocauc = np.array(temp_mc_conf_rocauc) +temp_mc_conf_quantiles_rocauc = np.percentile( + temp_mc_conf_rocauc, [10, 20, 30, 40, 50, 60, 70, 80, 90] +) +winlose_temp_mc_conf_rocauc = f"{np.sum(np.array(temp_mc_conf_rocauc) / np.array(np.array(map_rocauc)) <= 1)} / {len(temp_mc_conf_rocauc)}" +med_improv_temp_mc_conf_rocauc = f"{np.round(np.median((np.array(map_rocauc) - np.array(temp_mc_conf_rocauc)) / np.array(map_rocauc)), 2)}" + +temp_mc_conf_prauc = [ + metrics["classification"][k]["temp_mc_conf"]["prauc"] + for k in metrics["classification"].keys() +] +temp_mc_conf_prauc = np.array(temp_mc_conf_prauc) +temp_mc_conf_quantiles_prauc = np.percentile( + temp_mc_conf_prauc, [10, 20, 30, 40, 50, 60, 70, 80, 90] +) +winlose_temp_mc_conf_prauc = f"{np.sum(np.array(temp_mc_conf_prauc) / np.array(np.array(map_prauc)) <= 1)} / {len(temp_mc_conf_prauc)}" +med_improv_temp_mc_conf_prauc = f"{np.round(np.median((np.array(map_prauc) - np.array(temp_mc_conf_prauc)) / np.array(map_prauc)), 2)}" + +temp_mc_conf_acc = [ + metrics["classification"][k]["temp_mc_conf"]["accuracy"] + for k in metrics["classification"].keys() +] + +temp_mc_conf_times = [ + metrics["classification"][k]["temp_mc_conf"]["time"] + for k in metrics["classification"].keys() +] + +# MULTICALIBRATE PROB +idx_overlap = [ + i + for i, k in enumerate(metrics["classification"]) + if len(metrics["classification"][k]["mc_prob"]) +] + +mc_prob_nlls = [ + metrics["classification"][k]["mc_prob"]["nll"] + for k in metrics["classification"].keys() + if "nll" in metrics["classification"][k]["mc_prob"] +] +mc_prob_quantiles_nlls = np.percentile( + mc_prob_nlls, [10, 20, 30, 40, 50, 60, 70, 80, 90] +) +winlose_mc_prob_nlls = f"{np.sum(np.array(mc_prob_nlls) / np.array(map_nlls)[idx_overlap] <= 1)} / {len(idx_overlap)}" +med_improv_mc_prob_nlls = f"{np.round(np.median((np.array(map_nlls)[idx_overlap] - np.array(mc_prob_nlls)) / np.array(map_nlls)[idx_overlap]), 2)}" + +mc_prob_mse = [ + metrics["classification"][k]["mc_prob"]["mse"] + for k in metrics["classification"].keys() + if "mse" in metrics["classification"][k]["mc_prob"] +] +mc_prob_quantiles_mse = np.percentile(mc_prob_mse, [10, 20, 30, 40, 50, 60, 70, 80, 90]) +winlose_mc_prob_mse = f"{np.sum(np.array(mc_prob_mse) / np.array(map_mse)[idx_overlap] <= 1)} / {len(idx_overlap)}" +med_improv_mc_prob_mse = f"{np.round(np.median((np.array(map_mse)[idx_overlap] - np.array(mc_prob_mse)) / np.array(map_mse)[idx_overlap]), 2)}" + +mc_prob_ece = [ + 
    metrics["classification"][k]["mc_prob"]["ece"]
+    for k in metrics["classification"].keys()
+    if "ece" in metrics["classification"][k]["mc_prob"]
+]
+mc_prob_quantiles_ece = np.percentile(mc_prob_ece, [10, 20, 30, 40, 50, 60, 70, 80, 90])
+winlose_mc_prob_ece = f"{np.sum(np.array(mc_prob_ece) / np.array(map_ece)[idx_overlap] <= 1)} / {len(idx_overlap)}"
+med_improv_mc_prob_ece = f"{np.round(np.median((np.array(map_ece)[idx_overlap] - np.array(mc_prob_ece)) / np.array(map_ece)[idx_overlap]), 2)}"
+
+mc_prob_rocauc = [
+    metrics["classification"][k]["mc_prob"]["rocauc"]
+    for k in metrics["classification"].keys()
+    if "rocauc" in metrics["classification"][k]["mc_prob"]
+]
+mc_prob_quantiles_rocauc = np.percentile(
+    mc_prob_rocauc, [10, 20, 30, 40, 50, 60, 70, 80, 90]
+)
+winlose_mc_prob_rocauc = f"{np.sum(np.array(mc_prob_rocauc) / np.array(map_rocauc)[idx_overlap] <= 1)} / {len(idx_overlap)}"
+med_improv_mc_prob_rocauc = f"{np.round(np.median((np.array(map_rocauc)[idx_overlap] - np.array(mc_prob_rocauc)) / np.array(map_rocauc)[idx_overlap]), 2)}"
+
+mc_prob_prauc = [
+    metrics["classification"][k]["mc_prob"]["prauc"]
+    for k in metrics["classification"].keys()
+    if "prauc" in metrics["classification"][k]["mc_prob"]
+]
+mc_prob_quantiles_prauc = np.percentile(
+    mc_prob_prauc, [10, 20, 30, 40, 50, 60, 70, 80, 90]
+)
+winlose_mc_prob_prauc = f"{np.sum(np.array(mc_prob_prauc) / np.array(map_prauc)[idx_overlap] <= 1)} / {len(idx_overlap)}"
+med_improv_mc_prob_prauc = f"{np.round(np.median((np.array(map_prauc)[idx_overlap] - np.array(mc_prob_prauc)) / np.array(map_prauc)[idx_overlap]), 2)}"
+
+mc_prob_acc = [
+    metrics["classification"][k]["mc_prob"]["accuracy"]
+    for k in metrics["classification"].keys()
+    if "accuracy" in metrics["classification"][k]["mc_prob"]
+]
+
+mc_prob_times = [
+    metrics["classification"][k]["mc_prob"]["time"]
+    for k in metrics["classification"].keys()
+    if "time" in metrics["classification"][k]["mc_prob"]
+]
+
+# The bottom row compares MAP against top-label multicalibration (TLMC) on five
+# metrics, so a 2x5 grid is used (slot 5 in the top row stays empty).
+plt.figure(figsize=(10, 6))
+plt.suptitle("Quantile-quantile plots of metrics on classification datasets")
+
+plt.subplot(2, 5, 1)
+plt.title("NLL")
+plt.scatter(map_quantiles_nlls, temp_scaling_quantiles_nlls, s=3)
+_min, _max = min(map_quantiles_nlls.min(), temp_scaling_quantiles_nlls.min()), max(
+    map_quantiles_nlls.max(), temp_scaling_quantiles_nlls.max()
+)
+plt.plot([_min, _max], [_min, _max], color="gray", linestyle="--", alpha=0.2)
+plt.xlabel("MAP quantiles")
+plt.ylabel("temp scaling quantiles")
+plt.grid()
+
+plt.subplot(2, 5, 2)
+plt.title("MSE")
+plt.scatter(map_quantiles_mse, temp_scaling_quantiles_mse, s=3)
+_min, _max = min(map_quantiles_mse.min(), temp_scaling_quantiles_mse.min()), max(
+    map_quantiles_mse.max(), temp_scaling_quantiles_mse.max()
+)
+plt.plot([_min, _max], [_min, _max], color="gray", linestyle="--", alpha=0.2)
+plt.xlabel("MAP quantiles")
+plt.ylabel("temp scaling quantiles")
+plt.grid()
+
+plt.subplot(2, 5, 3)
+plt.title("ROCAUC")
+plt.scatter(map_quantiles_rocauc, temp_scaling_quantiles_rocauc, s=3)
+_min, _max = min(map_quantiles_rocauc.min(), temp_scaling_quantiles_rocauc.min()), max(
+    map_quantiles_rocauc.max(), temp_scaling_quantiles_rocauc.max()
+)
+plt.plot([_min, _max], [_min, _max], color="gray", linestyle="--", alpha=0.2)
+plt.xlabel("MAP quantiles")
+plt.ylabel("temp scaling quantiles")
+plt.grid()
+
+plt.subplot(2, 5, 4)
+plt.title("PRAUC")
+plt.scatter(map_quantiles_prauc, temp_scaling_quantiles_prauc, s=3)
+_min, _max = min(map_quantiles_prauc.min(), temp_scaling_quantiles_prauc.min()), max(
+    map_quantiles_prauc.max(), temp_scaling_quantiles_prauc.max()
+)
+plt.plot([_min, _max], [_min, _max], color="gray", linestyle="--", alpha=0.2)
+plt.xlabel("MAP quantiles")
+plt.ylabel("temp scaling quantiles")
+plt.grid()
+
+plt.subplot(2, 5, 6)
+plt.title("NLL")
+plt.scatter(map_quantiles_nlls, mc_conf_quantiles_nlls, s=3)
+_min, _max = min(map_quantiles_nlls.min(), mc_conf_quantiles_nlls.min()), max(
+    map_quantiles_nlls.max(), mc_conf_quantiles_nlls.max()
+)
+plt.plot([_min, _max], [_min, _max], color="gray", linestyle="--", alpha=0.2)
+plt.xlabel("MAP quantiles")
+plt.ylabel("TLMC quantiles")
+plt.grid()
+
+plt.subplot(2, 5, 7)
+plt.title("ECE")
+plt.scatter(map_quantiles_ece, mc_conf_quantiles_ece, s=3)
+_min, _max = min(map_quantiles_ece.min(), mc_conf_quantiles_ece.min()), max(
+    map_quantiles_ece.max(), mc_conf_quantiles_ece.max()
+)
+plt.plot([_min, _max], [_min, _max], color="gray", linestyle="--", alpha=0.2)
+plt.xlabel("MAP quantiles")
+plt.ylabel("TLMC quantiles")
+plt.grid()
+
+plt.subplot(2, 5, 8)
+plt.title("MSE")
+plt.scatter(map_quantiles_mse, mc_conf_quantiles_mse, s=3)
+_min, _max = min(map_quantiles_mse.min(), mc_conf_quantiles_mse.min()), max(
+    map_quantiles_mse.max(), mc_conf_quantiles_mse.max()
+)
+plt.plot([_min, _max], [_min, _max], color="gray", linestyle="--", alpha=0.2)
+plt.xlabel("MAP quantiles")
+plt.ylabel("TLMC quantiles")
+plt.grid()
+
+plt.subplot(2, 5, 9)
+plt.title("ROCAUC")
+plt.scatter(map_quantiles_rocauc, mc_conf_quantiles_rocauc, s=3)
+_min, _max = min(map_quantiles_rocauc.min(), mc_conf_quantiles_rocauc.min()), max(
+    map_quantiles_rocauc.max(), mc_conf_quantiles_rocauc.max()
+)
+plt.plot([_min, _max], [_min, _max], color="gray", linestyle="--", alpha=0.2)
+plt.xlabel("MAP quantiles")
+plt.ylabel("TLMC quantiles")
+plt.grid()
+
+plt.subplot(2, 5, 10)
+plt.title("PRAUC")
+plt.scatter(map_quantiles_prauc, mc_conf_quantiles_prauc, s=3)
+_min, _max = min(map_quantiles_prauc.min(), mc_conf_quantiles_prauc.min()), max(
+    map_quantiles_prauc.max(), mc_conf_quantiles_prauc.max()
+)
+plt.plot([_min, _max], [_min, _max], color="gray", linestyle="--", alpha=0.2)
+plt.xlabel("MAP quantiles")
+plt.ylabel("TLMC quantiles")
+plt.grid()
+
+plt.tight_layout()
+plt.show()
diff --git a/benchmarks/tabular/dataset.py b/benchmarks/tabular/dataset.py
new file mode 100644
index 00000000..dfb8631b
--- /dev/null
+++ b/benchmarks/tabular/dataset.py
@@ -0,0 +1,907 @@
+"""
+A large part of this code is a refactoring of
+https://github.com/hughsalimbeni/bayesian_benchmarks/blob/master/bayesian_benchmarks/data.py.
+"""
+
+import abc
+from datetime import datetime
+import logging
+import os
+import tarfile
+from typing import (
+    List,
+    Tuple,
+)
+from urllib.request import urlopen
+import zipfile
+
+import numpy as np
+import pandas as pd
+from scipy.io import loadmat
+from scipy.io.arff import loadarff
+
+from fortuna.data.loader import DataLoader
+
+SUPPORTED_TASKS = ["regression", "classification"]
+
+_ALL_REGRESSION_DATASETS = {}
+_ALL_CLASSIFICATION_DATASETS = {}
+
+
+def add_regression(C):
+    _ALL_REGRESSION_DATASETS.update({C.name: C})
+    return C
+
+
+def add_classification(C):
+    _ALL_CLASSIFICATION_DATASETS.update({C.name: C})
+    return C
+
+
+class Dataset:
+    def __init__(self, name: str, url: str, task: str, dir: str):
+        if task not in SUPPORTED_TASKS:
+            raise ValueError(
+                """`task={}` not recognized. Only the following tasks are supported: {}.""".format(
+                    task, SUPPORTED_TASKS
+                )
+            )
+        if not os.path.isdir(dir):
+            raise ValueError(
+                f"`dir={dir}` was not found. 
Please pass an existing directory." + ) + self.name = name + self.url = url + self.task = task + self.dir = dir + + @property + def datadir(self): + return os.path.join(self.dir, self.name) + + @property + def datapath(self): + return os.path.join(self.datadir, self.url.split("/")[-1]) + + @abc.abstractmethod + def read(self) -> Tuple[np.ndarray, np.ndarray]: + pass + + def shuffle(self, X: np.array, Y: np.array, seed: int = 0): + N = X.shape[0] + perm = np.arange(N) + np.random.seed(seed) + np.random.shuffle(perm) + return X[perm], Y[perm] + + def normalize(self, Z: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + Z_mean = np.mean(Z, 0, keepdims=True) + Z_std = 1e-6 + np.std(Z, 0, keepdims=True) + return (Z - Z_mean) / Z_std, Z_mean, Z_std + + def preprocess(self, X: np.ndarray, Y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + X, self.X_mean, self.X_std = self.normalize(X) + if self.task == "regression": + Y, self.Y_mean, self.Y_std = self.normalize(Y) + return X, Y + + def split( + self, + X: np.array, + Y: np.array, + prop_train: float = 0.8, + prop_val: float = 0.1, + prop_test: int = 0.1, + ) -> Tuple[ + Tuple[np.ndarray, np.ndarray], + Tuple[np.ndarray, np.ndarray], + Tuple[np.ndarray, np.ndarray], + ]: + if prop_train + prop_val + prop_test != 1.0: + raise ValueError( + """The sum of `prop_train`, `prop_val` and `prop_test` must be 1.""" + ) + N = X.shape[0] + n_train = int(N * prop_train) + n_val = int(N * prop_val) + train_data = X[:n_train], Y[:n_train] + val_data = X[n_train : n_train + n_val], Y[n_train : n_train + n_val] + test_data = X[n_train + n_val :], Y[n_train + n_val :] + return train_data, val_data, test_data + + def batch( + self, + train_data: Tuple[np.ndarray, np.ndarray], + val_data: Tuple[np.ndarray, np.ndarray], + test_data: Tuple[np.ndarray, np.ndarray], + batch_size: int = 128, + shuffle_train: bool = False, + shuffle_val: bool = False, + prefetch: bool = True, + ) -> Tuple[DataLoader, DataLoader, DataLoader]: + train_data_loader = DataLoader.from_array_data( + train_data, batch_size=batch_size, shuffle=shuffle_train, prefetch=prefetch + ) + val_data_loader = DataLoader.from_array_data( + val_data, batch_size=batch_size, shuffle=shuffle_val, prefetch=prefetch + ) + test_data_loader = DataLoader.from_array_data( + test_data, batch_size=batch_size, prefetch=prefetch + ) + return train_data_loader, val_data_loader, test_data_loader + + def load( + self, + prop_train: float = 0.8, + prop_val: float = 0.1, + prop_test: int = 0.1, + batch_size: int = 128, + shuffle_train: bool = False, + shuffle_val: bool = False, + prefetch: bool = True, + ) -> Tuple[DataLoader, DataLoader, DataLoader]: + X, Y = self.read() + X, Y = self.preprocess(X, Y) + train_data, val_data, test_data = self.split( + X, Y, prop_train=prop_train, prop_val=prop_val, prop_test=prop_test + ) + return self.batch( + train_data, + val_data, + test_data, + batch_size=batch_size, + shuffle_train=shuffle_train, + shuffle_val=shuffle_val, + prefetch=prefetch, + ) + + @property + def needs_download(self): + return not os.path.isfile(self.datapath) + + def download(self): + if self.needs_download: + logging.info("\nDownloading {} data...".format(self.name)) + + if not os.path.isdir(self.datadir): + os.mkdir(self.datadir) + + filename = os.path.join(self.datadir, self.url.split("/")[-1]) + with urlopen(self.url) as response, open(filename, "wb") as out_file: + data = response.read() + out_file.write(data) + + is_zipped = np.any([z in self.url for z in [".gz", ".zip", ".tar"]]) + if is_zipped: + 
zip_ref = zipfile.ZipFile(filename, "r") + zip_ref.extractall(self.datadir) + zip_ref.close() + + logging.info("Download completed.".format(self.name)) + else: + logging.info("{} dataset is already available.".format(self.name)) + + +uci_base_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/" + + +@add_regression +class Boston(Dataset): + name = "boston" + url = uci_base_url + "housing/housing.data" + task = "regression" + + def __init__(self, dir, name=name, url=url, task=task): + super().__init__(name=name, url=url, task=task, dir=dir) + + def read(self) -> Tuple[np.ndarray, np.ndarray]: + data = pd.read_fwf(self.datapath, header=None).values + return data[:, :-1], data[:, -1].reshape(-1, 1) + + +@add_regression +class Concrete(Dataset): + name = "concrete" + url = uci_base_url + "concrete/compressive/Concrete_Data.xls" + task = "regression" + + def __init__(self, dir, name=name, url=url, task=task): + super().__init__(name=name, url=url, task=task, dir=dir) + + def read(self) -> Tuple[np.ndarray, np.ndarray]: + data = pd.read_excel(self.datapath).values + return data[:, :-1], data[:, -1].reshape(-1, 1) + + +@add_regression +class Energy(Dataset): + name = "energy" + url = uci_base_url + "00242/ENB2012_data.xlsx" + task = "regression" + + def __init__(self, dir, name=name, url=url, task=task): + super().__init__(name=name, url=url, task=task, dir=dir) + + def read(self) -> Tuple[np.ndarray, np.ndarray]: + data = pd.read_excel(self.datapath).values[:, :-1] + return data[:, :-1], data[:, -1].reshape(-1, 1) + + +@add_regression +class Kin8mn(Dataset): + name = "kin8mn" + url = "https://www.openml.org/data/download/3626/dataset_2175_kin8nm.arff" + task = "regression" + + def __init__(self, dir, name=name, url=url, task=task): + super().__init__(name=name, url=url, task=task, dir=dir) + + def download(self): + if self.needs_download: + logging.info("\nDownloading {} data...".format(self.name)) + + if not os.path.isdir(self.datadir): + os.mkdir(self.datadir) + + with urlopen(self.url) as response, open(self.datapath, "wb") as out_file: + data = response.read() + out_file.write(data) + logging.info("Download completed.".format(self.name)) + + def read(self) -> Tuple[np.ndarray, np.ndarray]: + data = np.array([list(row) for row in loadarff(self.datapath)[0]]) + return data[:, :-1], data[:, -1].reshape(-1, 1) + + +@add_regression +class Naval(Dataset): + name = "naval" + url = uci_base_url + "00316/UCI%20CBM%20Dataset.zip" + task = "regression" + + def __init__(self, dir, name=name, url=url, task=task): + super().__init__(name=name, url=url, task=task, dir=dir) + + @property + def datapath(self): + return os.path.join(self.datadir, "UCI CBM Dataset/data.txt") + + def read(self) -> Tuple[np.ndarray, np.ndarray]: + data = pd.read_fwf(self.datapath, header=None).values + X = data[:, :-2] + Y = data[:, -2].reshape(-1, 1) + X = np.delete(X, [8, 11], axis=1) + return X, Y + + +@add_regression +class Power(Dataset): + name = "power" + url = uci_base_url + "00294/CCPP.zip" + task = "regression" + + def __init__(self, dir, name=name, url=url, task=task): + super().__init__(name=name, url=url, task=task, dir=dir) + + @property + def datapath(self): + return os.path.join(self.datadir, "CCPP/Folds5x2_pp.xlsx") + + def read(self) -> Tuple[np.ndarray, np.ndarray]: + data = pd.read_excel(self.datapath).values + return data[:, :-1], data[:, -1].reshape(-1, 1) + + +@add_regression +class Protein(Dataset): + name = "protein" + url = uci_base_url + "00265/CASP.csv" + task = "regression" + + def 
__init__(self, dir, name=name, url=url, task=task): + super().__init__(name=name, url=url, task=task, dir=dir) + + def read(self) -> Tuple[np.ndarray, np.ndarray]: + data = pd.read_csv(self.datapath).values + return data[:, 1:], data[:, 0].reshape(-1, 1) + + +@add_regression +class WineRed(Dataset): + name = "winered" + url = uci_base_url + "wine-quality/winequality-red.csv" + task = "regression" + + def __init__(self, dir, name=name, url=url, task=task): + super().__init__(name=name, url=url, task=task, dir=dir) + + def read(self) -> Tuple[np.ndarray, np.ndarray]: + data = pd.read_csv(self.datapath, delimiter=";").values + return data[:, :-1], data[:, -1].reshape(-1, 1) + + +@add_regression +class WineWhite(Dataset): + name = "winewhite" + url = uci_base_url + "wine-quality/winequality-white.csv" + task = "regression" + + def __init__(self, dir, name=name, url=url, task=task): + super().__init__(name=name, url=url, task=task, dir=dir) + + def read(self) -> Tuple[np.ndarray, np.ndarray]: + data = pd.read_csv(self.datapath, delimiter=";").values + return data[:, :-1], data[:, -1].reshape(-1, 1) + + +@add_regression +class Yacht(Dataset): + name = "yacht" + url = uci_base_url + "/00243/yacht_hydrodynamics.data" + task = "regression" + + def __init__(self, dir, name=name, url=url, task=task): + super().__init__(name=name, url=url, task=task, dir=dir) + + def read(self) -> Tuple[np.ndarray, np.ndarray]: + data = pd.read_fwf(self.datapath, header=None).values[:-1, :] + return data[:, :-1], data[:, -1].reshape(-1, 1) + + +class Delgado(Dataset): + name = "delgado" + url = ( + "http://persoal.citius.usc.es/manuel.fernandez.delgado/papers/jmlr/data.tar.gz" + ) + task = "classification" + + def __init__(self, dir, name=name, url=url, task=task): + super().__init__(name=name, url=url, task=task, dir=dir) + + @property + def datadir(self): + return os.path.join(self.dir, "delgado") + + @property + def datapath(self): + return os.path.join(self.datadir, self.name, self.name + "_R.dat") + + @property + def needs_download(self): + path1 = os.path.join(self.datadir, self.name + "_train_R.dat") + path2 = os.path.join(self.datadir, self.name + "_test_R.dat") + return not os.path.isfile(self.datapath) and not ( + os.path.isfile(path1) and os.path.isfile(path2) + ) + + def download(self): + if self.needs_download: + logging.info("\nDownloading {} data...".format(self.name)) + + if not os.path.isdir(self.datadir): + os.mkdir(self.datadir) + + filename = os.path.join(self.datadir, "delgado.tar.gz") + with urlopen(self.url) as response, open(filename, "wb") as out_file: + data = response.read() + out_file.write(data) + + tar = tarfile.open(filename) + tar.extractall(path=self.datadir) + tar.close() + + logging.info("Download completed.".format(self.name)) + else: + logging.info("{} dataset already available.".format(self.name)) + + def read(self) -> Tuple[np.ndarray, np.ndarray]: + if os.path.isfile(self.datapath): + data = np.array( + pd.read_csv(self.datapath, header=0, delimiter="\t").values + ).astype(float) + else: + data_path1 = os.path.join( + self.datadir, self.name, self.name + "_train_R.dat" + ) + data1 = np.array( + pd.read_csv(data_path1, header=0, delimiter="\t").values + ).astype(float) + + data_path2 = os.path.join( + self.datadir, self.name, self.name + "_test_R.dat" + ) + data2 = np.array( + pd.read_csv(data_path2, header=0, delimiter="\t").values + ).astype(float) + + data = np.concatenate([data1, data2], 0) + + return data[:, :-1], data[:, -1].astype("int32") + + +delgado_datasets = [ + 
"heart-va", + "connect-4", + "wine", + "tic-tac-toe", + "fertility", + "statlog-german-credit", + "car", + "libras", + "spambase", + "pittsburg-bridges-MATERIAL", + "hepatitis", + "acute-inflammation", + "pittsburg-bridges-TYPE", + "arrhythmia", + "musk-2", + "twonorm", + "nursery", + "breast-cancer-wisc-prog", + "seeds", + "lung-cancer", + "waveform", + "audiology-std", + "trains", + "horse-colic", + "miniboone", + "pittsburg-bridges-SPAN", + "breast-cancer-wisc-diag", + "statlog-heart", + "blood", + "primary-tumor", + "cylinder-bands", + "glass", + "contrac", + "statlog-shuttle", + "zoo", + "musk-1", + "hill-valley", + "hayes-roth", + "optical", + "credit-approval", + "pendigits", + "pittsburg-bridges-REL-L", + "dermatology", + "soybean", + "ionosphere", + "planning", + "energy-y1", + "acute-nephritis", + "pittsburg-bridges-T-OR-D", + "letter", + "titanic", + "adult", + "lymphography", + "statlog-australian-credit", + "chess-krvk", + "bank", + "statlog-landsat", + "heart-hungarian", + "flags", + "mushroom", + "conn-bench-sonar-mines-rocks", + "image-segmentation", + "congressional-voting", + "annealing", + "semeion", + "echocardiogram", + "statlog-image", + "wine-quality-white", + "lenses", + "plant-margin", + "post-operative", + "thyroid", + "monks-2", + "molec-biol-promoter", + "chess-krvkp", + "balloons", + "low-res-spect", + "plant-texture", + "haberman-survival", + "spect", + "plant-shape", + "parkinsons", + "oocytes_merluccius_nucleus_4d", + "conn-bench-vowel-deterding", + "ilpd-indian-liver", + "heart-cleveland", + "synthetic-control", + "vertebral-column-2clases", + "teaching", + "cardiotocography-10clases", + "heart-switzerland", + "led-display", + "molec-biol-splice", + "wall-following", + "statlog-vehicle", + "ringnorm", + "energy-y2", + "oocytes_trisopterus_nucleus_2f", + "yeast", + "oocytes_merluccius_states_2f", + "oocytes_trisopterus_states_5b", + "breast-cancer-wisc", + "steel-plates", + "mammographic", + "monks-3", + "balance-scale", + "ecoli", + "spectf", + "monks-1", + "page-blocks", + "magic", + "pima", + "breast-tissue", + "ozone", + "iris", + "waveform-noise", + "cardiotocography-3clases", + "wine-quality-red", + "vertebral-column-3clases", + "breast-cancer", + "abalone", +] +delgado_datasets.sort() + +for name in delgado_datasets: + + @add_classification + class C(Delgado): + name1 = name + name = "delgado_" + name + url = Delgado.url + task = Delgado.task + + def __init__(self, dir, name=name1, url=url, task=task): + super().__init__(name=name, url=url, task=task, dir=dir) + + +class NYTaxiBase(Dataset, abc.ABC): + name = "nytaxi" + url = "https://www.kaggle.com/competitions/nyc-taxi-trip-duration/data" + task = "regression" + x_bounds = [-74.04, -73.75] + y_bounds = [40.62, 40.86] + too_close_radius = 0.00001 + min_duration = 30 + max_duration = 3 * 3600 + + def __init__(self, dir, name=name, url=url, task=task): + super().__init__(name=name, url=url, task=task, dir=dir) + + @property + def datapath(self): + return os.path.join(self.datadir, "train.csv") + + def download(self): + if self.needs_download: + filename = os.path.join(self.dir, "nyc-taxi-trip-duration.zip") + if not os.path.isfile(filename): + raise FileNotFoundError( + """In order to use this datasets, you need to manually download it from + `<{}>`_. 
Then you need to store it in {}/nyc-taxi-trip-duration.zip`.""".format( + self.url, self.dir + ) + ) + else: + logging.info("\nUnzipping the {} data...".format(self.name)) + zip_ref = zipfile.ZipFile(filename, "r") + zip_ref.extractall(self.datadir) + zip_ref.close() + + zip_ref = zipfile.ZipFile(os.path.join(self.datadir, "train.zip"), "r") + zip_ref.extractall(self.datadir) + zip_ref.close() + logging.info("Unzipping completed.") + else: + logging.info("{} dataset is already available.".format(self.name)) + + def rescale(self, x, a, b): + return b[0] + (b[1] - b[0]) * x / (a[1] - a[0]) + + def convert_to_day_minute(self, d): + day_of_week = self.rescale(float(d.weekday()), [0, 6], [0, 2 * np.pi]) + time_of_day = self.rescale( + d.time().hour * 60 + d.time().minute, [0, 24 * 60], [0, 2 * np.pi] + ) + return day_of_week, time_of_day + + def process_time(self, pickup_datetime, dropoff_datetime): + d_pickup = datetime.strptime(pickup_datetime, "%Y-%m-%d %H:%M:%S") + d_dropoff = datetime.strptime(dropoff_datetime, "%Y-%m-%d %H:%M:%S") + duration = (d_dropoff - d_pickup).total_seconds() + + pickup_day_of_week, pickup_time_of_day = self.convert_to_day_minute(d_pickup) + dropoff_day_of_week, dropoff_time_of_day = self.convert_to_day_minute(d_dropoff) + + return [ + pickup_day_of_week, + pickup_time_of_day, + dropoff_day_of_week, + dropoff_time_of_day, + duration, + ] + + def _read( + self, + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + data = pd.read_csv(self.datapath) + data = data.values + + pickup_loc = np.array((data[:, 5], data[:, 6])).T + dropoff_loc = np.array((data[:, 7], data[:, 8])).T + + ind = np.ones(len(data)).astype(bool) + ind[data[:, 5] < self.x_bounds[0]] = False + ind[data[:, 5] > self.x_bounds[1]] = False + ind[data[:, 6] < self.y_bounds[0]] = False + ind[data[:, 6] > self.y_bounds[1]] = False + + ind[data[:, 7] < self.x_bounds[0]] = False + ind[data[:, 7] > self.x_bounds[1]] = False + ind[data[:, 8] < self.y_bounds[0]] = False + ind[data[:, 8] > self.y_bounds[1]] = False + + logging.info( + "Discarding {} out of bounds {} {}.".format( + np.sum(np.invert(ind).astype(int)), self.x_bounds, self.y_bounds + ) + ) + + early_stop = (data[:, 5] - data[:, 7]) ** 2 + ( + data[:, 6] - data[:, 8] + ) ** 2 < self.too_close_radius + ind[early_stop] = False + logging.info( + "Discarding {} trip less than {} gp dist.".format( + np.sum(early_stop.astype(int)), self.too_close_radius**0.5 + ) + ) + + times = np.array( + [ + self.process_time(d_pickup, d_dropoff) + for (d_pickup, d_dropoff) in data[:, 2:4] + ] + ) + pickup_time = times[:, :2] + dropoff_time = times[:, 2:4] + duration = times[:, 4] + + short_journeys = duration < self.min_duration + ind[short_journeys] = False + logging.info( + "Discarding {} less than {}s journeys.".format( + np.sum(short_journeys.astype(int)), self.min_duration + ) + ) + + long_journeys = duration > self.max_duration + ind[long_journeys] = False + logging.info( + "Discarding {} more than {}h journeys.".format( + np.sum(long_journeys.astype(int)), self.max_duration / 3600.0 + ) + ) + + pickup_loc = pickup_loc[ind, :] + dropoff_loc = dropoff_loc[ind, :] + pickup_time = pickup_time[ind, :] + dropoff_time = dropoff_time[ind, :] + duration = duration[ind] + + logging.info( + "{} total rejected journeys.".format(np.sum(np.invert(ind).astype(int))) + ) + return pickup_loc, dropoff_loc, pickup_time, dropoff_time, duration + + +@add_regression +class NYTaxiTimePrediction(NYTaxiBase): + name = NYTaxiBase.name + "-time" + + def read(self): + 
path = os.path.join(self.datadir, "taxitime_preprocessed.npz") + if os.path.isfile(path): + with open(path, "rb") as file: + f = np.load(file) + X, Y = f["X"], f["Y"] + + else: + ( + pickup_loc, + dropoff_loc, + pickup_datetime, + dropoff_datetime, + duration, + ) = self._read() + + pickup_sc = np.array( + [ + np.sin(pickup_datetime[:, 0]), + np.cos(pickup_datetime[:, 0]), + np.sin(pickup_datetime[:, 1]), + np.cos(pickup_datetime[:, 1]), + ] + ).T + + X = np.concatenate([pickup_loc, dropoff_loc, pickup_sc], 1) + Y = duration.reshape(-1, 1) + X, Y = np.array(X).astype(float), np.array(Y).astype(float) + + with open(path, "wb") as file: + np.savez(file, X=X, Y=Y) + return X, Y + + +@add_regression +class NYTaxiLocationPrediction(NYTaxiBase): + name = NYTaxiBase.name + "-loc" + + def read(self): + path = os.path.join(self.datadir, "taxiloc_preprocessed.npz") + if os.path.isfile(path): + with open(path, "rb") as file: + f = np.load(file) + X, Y = f["X"], f["Y"] + else: + ( + pickup_loc, + dropoff_loc, + pickup_datetime, + dropoff_datetime, + duration, + ) = self._read() + + pickup_sc = np.array( + [ + np.sin(pickup_datetime[:, 0]), + np.cos(pickup_datetime[:, 0]), + np.sin(pickup_datetime[:, 1]), + np.cos(pickup_datetime[:, 1]), + ] + ).T + X = np.concatenate([pickup_loc, pickup_sc], 1) + Y = dropoff_loc + X, Y = np.array(X).astype(float), np.array(Y).astype(float) + + with open(path, "wb") as file: + np.savez(file, X=X, Y=Y) + + return X, Y + + +class Wilson(Dataset): + name = "wilson" + url = "https://drive.google.com/open?id=0BxWe_IuTnMFcYXhxdUNwRHBKTlU" + task = "regression" + + @property + def datadir(self): + return os.path.join(self.dir, "wilson") + + @property + def datapath(self): + return "{}/{}/{}.mat".format(self.datadir, self.name, self.name) + + def download(self): + if self.needs_download: + filename = os.path.join(self.dir, "wilson.tar.gz") + if not os.path.isfile(filename): + raise FileNotFoundError( + """In order to use this dataset, you must obtain permission to download a zipped file from + the following url: {}. 
Then you must rename it as `wilson.tar.gz`, and add it to the following + path: {}.""".format( + self.url, filename + ) + ) + else: + logging.info("\nUnzipping `wilson.tar.gz`...") + + def members(tf): + ll = len("uci/") + for member in tf.getmembers(): + if member.path.startswith("uci/"): + member.path = member.path[ll:] + yield member + + with tarfile.open(filename) as tar: + tar.extractall(path=self.datadir, members=members(tar)) + tar.close() + logging.info("Unzipping completed.") + else: + logging.info("{} dataset is already available.".format(self.name)) + + def read(self): + data = loadmat(self.datapath)["data"] + return data[:, :-1], data[:, -1, None] + + +wilson_datasets = [ + "3droad", + "challenger", + "gas", + "servo", + "tamielectric", + "airfoil", + "concrete", + "machine", + "skillcraft", + "wine", + "autompg", + "concreteslump", + "houseelectric", + "parkinsons", + "slice", + "yacht", + "autos", + "elevators", + "housing", + "pendulum", + "sml", + "bike", +] + +for name in wilson_datasets: + + @add_regression + class C(Wilson): + name1 = name + name = name + url = Wilson.url + task = Wilson.task + + def __init__(self, dir, name=name, url=url, task=task): + super().__init__(name=name, url=url, task=task, dir=dir) + + +regression_datasets = list(_ALL_REGRESSION_DATASETS.keys()) +regression_datasets.sort() + +classification_datasets = list(_ALL_CLASSIFICATION_DATASETS.keys()) +classification_datasets.sort() + +assert not bool(set(regression_datasets) & set(classification_datasets)) + + +def download_regression_dataset(name, dir, *args, **kwargs): + dataset = _ALL_REGRESSION_DATASETS[name](dir, *args, **kwargs) + dataset.download() + + +def download_classification_dataset(name, dir, *args, **kwargs): + dataset = _ALL_CLASSIFICATION_DATASETS[name](dir, *args, **kwargs) + dataset.download() + + +def download_all_regression_datasets(dir, *args, **kwargs): + for name in list(_ALL_REGRESSION_DATASETS.keys()): + download_regression_dataset(name, dir, *args, **kwargs) + + +def download_all_classification_datasets(dir, *args, **kwargs): + for name in list(_ALL_CLASSIFICATION_DATASETS.keys()): + download_classification_dataset(name, dir, *args, **kwargs) + + +def load_regression_dataset( + name, dir, *args, **kwargs +) -> Tuple[DataLoader, DataLoader, DataLoader]: + dataset = _ALL_REGRESSION_DATASETS[name](dir) + return dataset.load(*args, **kwargs) + + +def load_classification_dataset( + name, dir, *args, **kwargs +) -> Tuple[DataLoader, DataLoader, DataLoader]: + dataset = _ALL_CLASSIFICATION_DATASETS[name](dir) + return dataset.load(*args, **kwargs) + + +def get_all_classification_dataset_names() -> List[str]: + return classification_datasets + + +def get_all_regression_dataset_names() -> List[str]: + return regression_datasets diff --git a/benchmarks/tabular/run.py b/benchmarks/tabular/run.py new file mode 100644 index 00000000..b8c80b69 --- /dev/null +++ b/benchmarks/tabular/run.py @@ -0,0 +1,532 @@ +from copy import deepcopy +import json +import logging +import os +from time import time +from typing import Union + +from jax.nn import one_hot +import numpy as np +import optax +from sklearn.metrics import ( + average_precision_score, + roc_auc_score, +) +from tqdm import tqdm + +from benchmarks.tabular.dataset import ( + classification_datasets, + download_classification_dataset, + download_regression_dataset, + load_classification_dataset, + load_regression_dataset, + regression_datasets, +) +from fortuna.conformal import ( + BinaryClassificationMulticalibrator, + 
QuantileConformalRegressor,
+    TopLabelMulticalibrator,
+)
+from fortuna.data import DataLoader
+from fortuna.metric.classification import (
+    accuracy,
+    brier_score,
+    expected_calibration_error,
+    maximum_calibration_error,
+)
+from fortuna.metric.regression import (
+    picp,
+    rmse,
+)
+from fortuna.model import (
+    MLP,
+    ScalarHyperparameterModel,
+)
+from fortuna.prob_model import (
+    CalibConfig,
+    CalibOptimizer,
+    FitConfig,
+    FitMonitor,
+    FitOptimizer,
+    MAPPosteriorApproximator,
+    ProbClassifier,
+    ProbRegressor,
+)
+
+TASKS = ["regression", "classification"]
+DATA_DIR = "./datasets/"
+COVERAGE_ERROR = 0.05
+N_EPOCHS = 1000
+EARLY_STOPPING_PATIENCE = 2
+TRAIN_LR = 1e-2
+N_CALIB_EPOCHS = 300
+CALIB_LR = 1e-1
+BATCH_SIZE = 512
+PROP_TRAIN = 0.5
+PROP_VAL = 0.3
+PROP_TEST = 0.2
+
+assert PROP_TRAIN + PROP_VAL + PROP_TEST == 1
+
+
+if not os.path.isdir(DATA_DIR):
+    os.mkdir(DATA_DIR)
+
+
+def get_nll(one_hot_targets, probs):
+    # summed negative log-likelihood of the realized labels; zero-probability
+    # entries are masked out of the sum
+    _probs = one_hot_targets * probs
+    return -np.sum(np.log(_probs[_probs > 0]))
+
+
+def get_prob_model(
+    task: str, data_loader: DataLoader
+) -> Union[ProbRegressor, ProbClassifier]:
+    # NOTE: relies on the globals `prob_model_cls` and `output_dim` set in the
+    # loop below.
+    if task == "regression":
+        prob_model = prob_model_cls(
+            model=MLP(output_dim=output_dim),
+            likelihood_log_variance_model=ScalarHyperparameterModel(
+                output_dim, value=float(np.log(np.var(data_loader.to_array_targets())))
+            ),
+            posterior_approximator=MAPPosteriorApproximator(),
+        )
+    else:
+        prob_model = prob_model_cls(
+            model=MLP(output_dim=output_dim),
+            posterior_approximator=MAPPosteriorApproximator(),
+        )
+    return prob_model
+
+
+all_status = {task: dict() for task in TASKS}
+all_metrics = {task: dict() for task in TASKS}
+
+for task in TASKS:
+    if task == "regression":
+        download_fn = download_regression_dataset
+        load_fn = load_regression_dataset
+        datasets = regression_datasets
+        prob_model_cls = ProbRegressor
+    elif task == "classification":
+        download_fn = download_classification_dataset
+        load_fn = load_classification_dataset
+        datasets = classification_datasets
+        prob_model_cls = ProbClassifier
+    else:
+        raise ValueError(f"`task={task}` not supported.")
+
+    for dataset_name in tqdm(datasets, desc="Dataset"):
+        print(dataset_name)
+        # download and load data
+        download_fn(dataset_name, DATA_DIR)
+        train_data_loader, val_data_loader, test_data_loader = load_fn(
+            dataset_name,
+            DATA_DIR,
+            shuffle_train=True,
+            batch_size=BATCH_SIZE,
+            prop_train=PROP_TRAIN,
+            prop_val=PROP_VAL,
+            prop_test=PROP_TEST,
+        )
+        train_targets, val_targets, test_targets = (
+            train_data_loader.to_array_targets(),
+            val_data_loader.to_array_targets(),
+            test_data_loader.to_array_targets(),
+        )
+
+        if task == "classification":
+            train_unique_targets, val_unique_targets, test_unique_targets = (
+                np.unique(train_targets),
+                np.unique(val_targets),
+                np.unique(test_targets),
+            )
+            if (len(train_unique_targets) != len(val_unique_targets)) or (
+                len(val_unique_targets) != len(test_unique_targets)
+            ):
+                logging.warning(
+                    f"Skipping dataset {dataset_name} because the number of labels in the "
+                    f"train/val/test splits does not match."
+                )
+                continue
+            if not (
+                np.allclose(train_unique_targets, val_unique_targets)
+                and np.allclose(val_unique_targets, test_unique_targets)
+            ):
+                logging.warning(
+                    f"Skipping dataset {dataset_name} because the labels in the "
+                    f"train/val/test splits do not match."
+                )
+                continue
+            if not np.allclose(
+                train_unique_targets, np.arange(len(train_unique_targets))
+            ):
+                logging.warning(
+                    f"Skipping dataset {dataset_name} because targets do not follow canonical indices."
+                )
+                continue
+            if train_data_loader.num_unique_labels == 1:
+                logging.warning(
+                    f"Skipping dataset {dataset_name} because the number of unique labels per split is 1."
+                )
+                continue
+
+        # find output dimension
+        if task == "regression":
+            for batch_inputs, batch_targets in train_data_loader:
+                output_dim = batch_targets.shape[-1]
+                break
+        else:
+            output_dim = train_data_loader.num_unique_labels
+
+        if task == "regression" and output_dim > 1:
+            logging.warning(
+                f"The dimension of the target variables in the {dataset_name} regression dataset "
+                f"is greater than 1. Skipped."
+            )
+            continue
+
+        all_metrics[task][dataset_name] = dict(map=dict(), temp_scaling=dict())
+        if task == "regression":
+            all_metrics[task][dataset_name]["cqr"] = dict()
+            all_metrics[task][dataset_name]["multicalibrate"] = dict()
+            all_metrics[task][dataset_name]["temp_cqr"] = dict()
+        else:
+            all_metrics[task][dataset_name]["mc_conf"] = dict()
+            all_metrics[task][dataset_name]["mc_prob"] = dict()
+            all_metrics[task][dataset_name]["temp_mc_conf"] = dict()
+
+        # define probabilistic model
+        prob_model = get_prob_model(task, train_data_loader)
+
+        # train the probabilistic model
+        time_init = time()
+        all_status[task][dataset_name] = prob_model.train(
+            train_data_loader=train_data_loader,
+            val_data_loader=val_data_loader,
+            fit_config=FitConfig(
+                monitor=FitMonitor(early_stopping_patience=EARLY_STOPPING_PATIENCE),
+                optimizer=FitOptimizer(method=optax.adam(TRAIN_LR), n_epochs=N_EPOCHS),
+            ),
+        )
+        all_metrics[task][dataset_name]["map"]["time"] = time() - time_init
+
+        # compute predictive statistics
+        test_inputs_loader = test_data_loader.to_inputs_loader()
+
+        if task == "classification":
+            one_hot_test_targets = np.array(
+                one_hot(test_targets, test_data_loader.num_unique_labels)
+            )
+
+        test_means = prob_model.predictive.mean(
+            inputs_loader=test_inputs_loader, n_posterior_samples=1
+        )
+        test_modes = prob_model.predictive.mode(
+            inputs_loader=test_inputs_loader, n_posterior_samples=1
+        )
+
+        # compute metrics
+        all_metrics[task][dataset_name]["map"]["nll"] = float(
+            -prob_model.predictive.log_prob(
+                data_loader=test_data_loader, n_posterior_samples=1
+            ).sum()
+        )
+
+        if task == "regression":
+            test_cred_intervals = prob_model.predictive.credible_interval(
+                inputs_loader=test_inputs_loader, error=COVERAGE_ERROR
+            )
+
+            all_metrics[task][dataset_name]["map"]["rmse"] = float(
+                rmse(preds=test_modes, targets=test_targets)
+            )
+            all_metrics[task][dataset_name]["map"]["picp"] = float(
+                picp(
+                    lower_bounds=test_cred_intervals[:, 0],
+                    upper_bounds=test_cred_intervals[:, 1],
+                    targets=test_targets,
+                )
+            )
+        else:
+            all_metrics[task][dataset_name]["map"]["accuracy"] = float(
+                accuracy(preds=test_modes, targets=test_targets)
+            )
+            all_metrics[task][dataset_name]["map"]["mse"] = float(
+                brier_score(probs=test_means, targets=test_targets)
+            )
+            all_metrics[task][dataset_name]["map"]["ece"] = float(
+                expected_calibration_error(
+                    preds=test_modes, probs=test_means, targets=test_targets
+                )
+            )
+            all_metrics[task][dataset_name]["map"]["mce"] = float(
+                maximum_calibration_error(
+                    preds=test_modes, probs=test_means, targets=test_targets
+                )
+            )
+            all_metrics[task][dataset_name]["map"]["rocauc"] = roc_auc_score(
+                y_true=one_hot_test_targets, y_score=test_means, average="macro"
+            )
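A note on the `get_nll` helper defined above: it returns the summed negative log-likelihood of the realized labels, i.e. `n_samples` times scikit-learn's mean `log_loss`, except that it silently drops any true class predicted with probability exactly 0 instead of contributing an infinite penalty. A quick sanity check on hypothetical data:

```python
import numpy as np
from jax.nn import one_hot
from sklearn.metrics import log_loss

targets = np.array([0, 1, 1])
probs = np.array([[0.9, 0.1], [0.2, 0.8], [0.4, 0.6]])
one_hot_targets = np.array(one_hot(targets, 2))

# get_nll sums -log p(true class); log_loss averages it over samples.
nll = get_nll(one_hot_targets, probs)
assert np.isclose(nll, len(targets) * log_loss(targets, probs))
```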
all_metrics[task][dataset_name]["map"]["prauc"] = average_precision_score( + y_true=one_hot_test_targets, y_score=test_means, average="macro" + ) + + val_inputs_loader = val_data_loader.to_inputs_loader() + + if task == "regression": + # calibrate the credibility intervals + val_cred_intervals = prob_model.predictive.credible_interval( + inputs_loader=val_inputs_loader + ) + + time_init = time() + test_conformal_intervals = QuantileConformalRegressor().conformal_interval( + val_lower_bounds=val_cred_intervals[:, 0], + val_upper_bounds=val_cred_intervals[:, 1], + test_lower_bounds=test_cred_intervals[:, 0], + test_upper_bounds=test_cred_intervals[:, 1], + val_targets=val_targets, + error=COVERAGE_ERROR, + ) + all_metrics[task][dataset_name]["cqr"]["time"] = time() - time_init + all_metrics[task][dataset_name]["cqr"]["picp"] = float( + picp( + lower_bounds=test_conformal_intervals[:, 0], + upper_bounds=test_conformal_intervals[:, 1], + targets=test_targets, + ) + ) + else: + val_modes = prob_model.predictive.mode( + inputs_loader=val_inputs_loader, n_posterior_samples=1 + ) + val_means = prob_model.predictive.mean( + inputs_loader=val_inputs_loader, n_posterior_samples=1 + ) + + time_init = time() + mc_conf = TopLabelMulticalibrator( + n_classes=train_data_loader.num_unique_labels + ) + test_mc_conf, mc_status = mc_conf.calibrate( + targets=val_targets, + probs=val_means, + test_probs=test_means, + ) + all_metrics[task][dataset_name]["mc_conf"]["time"] = time() - time_init + + all_metrics[task][dataset_name]["mc_conf"]["accuracy"] = float( + accuracy(test_mc_conf.argmax(1), test_targets) + ) + all_metrics[task][dataset_name]["mc_conf"]["nll"] = float( + get_nll(one_hot_test_targets, test_mc_conf) + ) + all_metrics[task][dataset_name]["mc_conf"]["mse"] = float( + mc_conf.mean_squared_error(probs=test_mc_conf, targets=test_targets) + ) + all_metrics[task][dataset_name]["mc_conf"]["ece"] = float( + expected_calibration_error( + preds=test_modes, probs=test_mc_conf, targets=test_targets + ) + ) + all_metrics[task][dataset_name]["mc_conf"]["mce"] = float( + maximum_calibration_error( + preds=test_modes, probs=test_mc_conf, targets=test_targets + ) + ) + all_metrics[task][dataset_name]["mc_conf"]["rocauc"] = roc_auc_score( + y_true=one_hot_test_targets, y_score=test_mc_conf, average="macro" + ) + all_metrics[task][dataset_name]["mc_conf"][ + "prauc" + ] = average_precision_score( + y_true=one_hot_test_targets, y_score=test_mc_conf, average="macro" + ) + + if output_dim == 2: + time_init = time() + mc_prob = BinaryClassificationMulticalibrator() + test_mc_prob, mc_status = mc_prob.calibrate( + targets=val_targets, + probs=val_means[:, 1], + test_probs=test_means[:, 1], + ) + probs = np.stack([1 - test_mc_prob, test_mc_prob], axis=1) + all_metrics[task][dataset_name]["mc_prob"]["time"] = time() - time_init + + all_metrics[task][dataset_name]["mc_prob"]["accuracy"] = float( + accuracy(probs.argmax(1), test_targets) + ) + all_metrics[task][dataset_name]["mc_prob"]["nll"] = float( + get_nll(one_hot_test_targets, probs) + ) + all_metrics[task][dataset_name]["mc_prob"]["mse"] = float( + mc_prob.mean_squared_error(probs=test_mc_prob, targets=test_targets) + ) + all_metrics[task][dataset_name]["mc_prob"]["ece"] = float( + expected_calibration_error( + preds=probs.argmax(1), probs=probs, targets=test_targets + ) + ) + all_metrics[task][dataset_name]["mc_prob"]["mce"] = float( + maximum_calibration_error( + preds=probs.argmax(1), probs=probs, targets=test_targets + ) + ) + 
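# BinaryClassificationMulticalibrator patches only the positive-class
+                # probability, so the two-column matrix stacked above is rebuilt for
+                # metrics that expect a full distribution; since the columns are
+                # complementary, the macro ROC AUC below should coincide with the
+                # AUC of the positive-class score alone.
+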
all_metrics[task][dataset_name]["mc_prob"]["rocauc"] = roc_auc_score( + y_true=one_hot_test_targets, y_score=probs, average="macro" + ) + all_metrics[task][dataset_name]["mc_prob"][ + "prauc" + ] = average_precision_score( + y_true=one_hot_test_targets, y_score=probs, average="macro" + ) + + temp_scaling_prob_model = get_prob_model(task, train_data_loader) + temp_scaling_prob_model.posterior.state = deepcopy(prob_model.posterior.state) + + time_init = time() + temp_scaling_status = temp_scaling_prob_model.calibrate( + calib_data_loader=val_data_loader, + calib_config=CalibConfig( + optimizer=CalibOptimizer( + n_epochs=N_CALIB_EPOCHS, method=optax.adam(CALIB_LR) + ) + ), + ) + all_metrics[task][dataset_name]["temp_scaling"]["time"] = time() - time_init + + test_temp_means = temp_scaling_prob_model.predictive.mean( + inputs_loader=test_inputs_loader, n_posterior_samples=1 + ) + test_temp_modes = temp_scaling_prob_model.predictive.mode( + inputs_loader=test_inputs_loader, n_posterior_samples=1 + ) + + # compute metrics + all_metrics[task][dataset_name]["temp_scaling"]["nll"] = float( + -temp_scaling_prob_model.predictive.log_prob( + data_loader=test_data_loader, n_posterior_samples=1 + ).sum() + ) + + if task == "regression": + test_temp_cred_intervals = ( + temp_scaling_prob_model.predictive.credible_interval( + inputs_loader=test_inputs_loader, error=COVERAGE_ERROR + ) + ) + + all_metrics[task][dataset_name]["temp_scaling"]["rmse"] = float( + rmse(preds=test_temp_modes, targets=test_targets) + ) + all_metrics[task][dataset_name]["temp_scaling"]["picp"] = float( + picp( + lower_bounds=test_temp_cred_intervals[:, 0], + upper_bounds=test_temp_cred_intervals[:, 1], + targets=test_targets, + ) + ) + else: + all_metrics[task][dataset_name]["temp_scaling"]["accuracy"] = float( + accuracy(preds=test_temp_modes, targets=test_targets) + ) + all_metrics[task][dataset_name]["temp_scaling"]["mse"] = float( + brier_score(probs=test_temp_means, targets=test_targets) + ) + all_metrics[task][dataset_name]["temp_scaling"]["ece"] = float( + expected_calibration_error( + preds=test_temp_modes, probs=test_temp_means, targets=test_targets + ) + ) + all_metrics[task][dataset_name]["temp_scaling"]["mce"] = float( + maximum_calibration_error( + preds=test_temp_modes, probs=test_temp_means, targets=test_targets + ) + ) + all_metrics[task][dataset_name]["temp_scaling"]["rocauc"] = roc_auc_score( + y_true=one_hot_test_targets, y_score=test_temp_means, average="macro" + ) + all_metrics[task][dataset_name]["temp_scaling"][ + "prauc" + ] = average_precision_score( + y_true=one_hot_test_targets, y_score=test_temp_means, average="macro" + ) + + if task == "regression": + # calibrate the credibility intervals + temp_scaling_val_cred_intervals = ( + temp_scaling_prob_model.predictive.credible_interval( + inputs_loader=val_inputs_loader + ) + ) + + time_init = time() + temp_scaling_test_conformal_intervals = ( + QuantileConformalRegressor().conformal_interval( + val_lower_bounds=temp_scaling_val_cred_intervals[:, 0], + val_upper_bounds=temp_scaling_val_cred_intervals[:, 1], + test_lower_bounds=test_temp_cred_intervals[:, 0], + test_upper_bounds=test_temp_cred_intervals[:, 1], + val_targets=val_targets, + error=COVERAGE_ERROR, + ) + ) + all_metrics[task][dataset_name]["temp_cqr"]["time"] = time() - time_init + all_metrics[task][dataset_name]["temp_cqr"]["picp"] = float( + picp( + lower_bounds=temp_scaling_test_conformal_intervals[:, 0], + upper_bounds=temp_scaling_test_conformal_intervals[:, 1], + targets=test_targets, + ) + ) 
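+            # A sketch of what the conformal step above does (Fortuna's
+            # QuantileConformalRegressor, up to implementation details): with
+            # validation conformity scores s_i = max(lo_i - y_i, y_i - hi_i), the
+            # test intervals [lo, hi] are expanded to [lo - q, hi + q], where q is
+            # the ceil((n + 1) * (1 - error)) / n empirical quantile of the s_i;
+            # under exchangeability this gives marginal coverage of at least
+            # 1 - error. Note that the validation credible intervals (here and in
+            # the MAP branch above) are requested at the predictive's default
+            # error level, which is assumed to match COVERAGE_ERROR.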
+        else:
+            temp_val_modes = temp_scaling_prob_model.predictive.mode(
+                inputs_loader=val_inputs_loader, n_posterior_samples=1
+            )
+            temp_val_means = temp_scaling_prob_model.predictive.mean(
+                inputs_loader=val_inputs_loader, n_posterior_samples=1
+            )
+
+            time_init = time()
+            temp_mc_conf = TopLabelMulticalibrator(
+                n_classes=train_data_loader.num_unique_labels
+            )
+            # patch the temperature-scaled test probabilities, consistently with
+            # the temperature-scaled validation probabilities used to fit the patches
+            temp_test_mc_conf, mc_status = temp_mc_conf.calibrate(
+                targets=val_targets,
+                probs=temp_val_means,
+                test_probs=test_temp_means,
+            )
+            all_metrics[task][dataset_name]["temp_mc_conf"]["time"] = time() - time_init
+
+            all_metrics[task][dataset_name]["temp_mc_conf"]["accuracy"] = float(
+                accuracy(temp_test_mc_conf.argmax(1), test_targets)
+            )
+            all_metrics[task][dataset_name]["temp_mc_conf"]["nll"] = float(
+                get_nll(one_hot_test_targets, temp_test_mc_conf)
+            )
+            all_metrics[task][dataset_name]["temp_mc_conf"]["mse"] = float(
+                temp_mc_conf.mean_squared_error(
+                    probs=temp_test_mc_conf, targets=test_targets
+                )
+            )
+            all_metrics[task][dataset_name]["temp_mc_conf"]["ece"] = float(
+                expected_calibration_error(
+                    preds=test_temp_modes, probs=temp_test_mc_conf, targets=test_targets
+                )
+            )
+            all_metrics[task][dataset_name]["temp_mc_conf"]["mce"] = float(
+                maximum_calibration_error(
+                    preds=test_temp_modes, probs=temp_test_mc_conf, targets=test_targets
+                )
+            )
+            all_metrics[task][dataset_name]["temp_mc_conf"]["rocauc"] = roc_auc_score(
+                y_true=one_hot_test_targets, y_score=temp_test_mc_conf, average="macro"
+            )
+            all_metrics[task][dataset_name]["temp_mc_conf"][
+                "prauc"
+            ] = average_precision_score(
+                y_true=one_hot_test_targets, y_score=temp_test_mc_conf, average="macro"
+            )
+
+with open("tabular_results.json", "w") as fp:
+    json.dump(all_metrics, fp)
diff --git a/benchmarks/transformers/sagemaker_entrypoints/prob_model_text_classification_config/default.yaml b/benchmarks/transformers/sagemaker_entrypoints/prob_model_text_classification_config/default.yaml
new file mode 100644
index 00000000..5e88964c
--- /dev/null
+++ b/benchmarks/transformers/sagemaker_entrypoints/prob_model_text_classification_config/default.yaml
@@ -0,0 +1,45 @@
+defaults:
+  - task/sentiment
+  - model/roberta
+  - method/sgmcmc_ll
+  - hyperparams/sghmc_ll
+
+dataset:
+  base_data_path: ~
+  train_relative_path: ""
+  test_relative_path: ""
+  validation_relative_path: ""
+
+
+model:
+  hparams:
+    tokenizer_max_length: 512
+    max_grad_norm: 1
+    adam_eps: 0.00000001
+    adam_b2: 0.999
+    gradient_checkpointing: "true"
+    save_every_n_steps: 20000
+    keep_top_n_checkpoints: 1
+    seed: 42
+    disable_jit: False
+    devices: -1
+
+sagemaker:
+  account_id: ~
+  iam_role: ~
+  entrypoint: "benchmarks/transformers/prob_model_text_classification.py"
+  instance_type: "ml.g5.2xlarge"
+  profile: "default"
+  region: "us-east-1"
+  job_name_suffix: ~
+  metrics:
+    - {Name: "train_loss_step", Regex: 'loss: ([-+]?(\d+(\.\d*)?|\.\d+))'}
+    - {Name: "train_accuracy_step", Regex: 'accuracy: ([-+]?(\d+(\.\d*)?|\.\d+))'}
+    - {Name: "val_loss", Regex: 'val_loss: ([-+]?(\d+(\.\d*)?|\.\d+))'}
+    - {Name: "val_accuracy", Regex: 'val_accuracy: ([-+]?(\d+(\.\d*)?|\.\d+))'}
+    - {Name: "ind_accuracy", Regex: 'IND Test accuracy: ([-+]?(\d+(\.\d*)?|\.\d+))'}
+    - {Name: "ind_ece", Regex: 'IND ECE: ([-+]?(\d+(\.\d*)?|\.\d+))'}
+    - {Name: "ood_accuracy", Regex: 'OOD Test accuracy: ([-+]?(\d+(\.\d*)?|\.\d+))'}
+    - {Name: "ood_ece", Regex: 'OOD ECE: ([-+]?(\d+(\.\d*)?|\.\d+))'}
+
+output_data_path: ~
diff --git a/fortuna/conformal/classification/binary_multicalibrator.py
b/fortuna/conformal/classification/binary_multicalibrator.py index d10cadbb..9afa9580 100644 --- a/fortuna/conformal/classification/binary_multicalibrator.py +++ b/fortuna/conformal/classification/binary_multicalibrator.py @@ -57,8 +57,7 @@ def calibration_error( values=probs, ) - @staticmethod - def mean_squared_error(probs: Array, targets: Array) -> Array: + def mean_squared_error(self, probs: Array, targets: Array) -> Array: return super().mean_squared_error(values=probs, scores=targets) @staticmethod diff --git a/fortuna/conformal/classification/top_label_multicalibrator.py b/fortuna/conformal/classification/top_label_multicalibrator.py index c2b09671..6ad42aa8 100644 --- a/fortuna/conformal/classification/top_label_multicalibrator.py +++ b/fortuna/conformal/classification/top_label_multicalibrator.py @@ -95,7 +95,7 @@ def _get_b( def _patch( self, values: Array, patch: Array, bt: Array, ct: Array, _shift: bool = False ) -> Array: - if jnp.any(jnp.isnan(values)) or jnp.any(values.sum(1, keepdims=True) == 0.0): + if jnp.all(~jnp.isnan(values)) and jnp.all(values.sum(1, keepdims=True) != 0.0): values /= values.sum(1, keepdims=True) return super()._patch(values=values, patch=patch, bt=bt, ct=ct, _shift=_shift) diff --git a/fortuna/metric/classification.py b/fortuna/metric/classification.py index 3c6c8236..3b121f0f 100755 --- a/fortuna/metric/classification.py +++ b/fortuna/metric/classification.py @@ -71,10 +71,9 @@ def compute_counts_confs_accs( Tuple[jnp.ndarray, jnp.ndarray, jnp.ndarray] Number of inputs per bin, average confidence score per bin and average accuracy per bin. """ - if probs.ndim != 2: - raise ValueError("""`probs` must be a two-dimensional array.""") - thresholds = jnp.linspace(1 / probs.shape[1], 1, 10) - probs = probs.max(1) + if probs.ndim == 2: + probs = probs.max(-1) + thresholds = jnp.linspace(1 / len(probs), 1, 10) probs = jnp.array(probs) indices = [jnp.where(probs <= thresholds[0])[0]] indices += [ diff --git a/pyproject.toml b/pyproject.toml index fd82ab47..288a9c22 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "aws-fortuna" -version = "0.1.25" +version = "0.1.26" description = "A Library for Uncertainty Quantification." authors = ["Gianluca Detommaso ", "Alberto Gasparin "] license = "Apache-2.0"