diff --git a/carte/__init__.py b/carte/__init__.py new file mode 100644 index 0000000..c70c045 --- /dev/null +++ b/carte/__init__.py @@ -0,0 +1,4 @@ +from carte.src import * +from carte.configs import * +#from carte.data import * +from carte.scripts import * \ No newline at end of file diff --git a/carte/configs/__init__.py b/carte/configs/__init__.py new file mode 100644 index 0000000..69d54b2 --- /dev/null +++ b/carte/configs/__init__.py @@ -0,0 +1,4 @@ +from carte.configs.carte_configs import * +from carte.configs.directory import * +from carte.configs.model_parameters import * +from carte.configs.visuailization import * diff --git a/carte/configs/carte_configs.py b/carte/configs/carte_configs.py new file mode 100644 index 0000000..9d46cd5 --- /dev/null +++ b/carte/configs/carte_configs.py @@ -0,0 +1,166 @@ +"""Specific configurations for the CARTE paper.""" + +## Dataset names +carte_datalist = [ + "anime_planet", + "babies_r_us", + "beer_ratings", + "bikedekho", + "bikewale", + "buy_buy_baby", + "cardekho", + "chocolate_bar_ratings", + "clear_corpus", + "coffee_ratings", + "company_employees", + "employee_remuneration", + "employee_salaries", + "fifa22_players", + "filmtv_movies", + "journal_jcr", + "journal_sjr", + "jp_anime", + "k_drama", + "michelin", + "mlds_salaries", + "movies", + "museums", + "mydramalist", + "nba_draft", + "prescription_drugs", + "ramen_ratings", + "roger_ebert", + "rotten_tomatoes", + "spotify", + "us_accidents_counts", + "us_accidents_severity", + "us_presidential", + "used_cars_24", + "used_cars_benz_italy", + "used_cars_dot_com", + "used_cars_pakistan", + "used_cars_saudi_arabia", + "videogame_sales", + "whisky", + "wikiliq_beer", + "wikiliq_spirit", + "wina_pl", + "wine_dot_com_prices", + "wine_dot_com_ratings", + "wine_enthusiasts_prices", + "wine_enthusiasts_ratings", + "wine_vivino_price", + "wine_vivino_rating", + "yelp", + "zomato", +] + +## Dictionary of baseline methods +carte_singletable_baselines = dict() +carte_singletable_baselines["full"] = [ + "carte-gnn", + "catboost", + "sentence-llm-concat-num_histgb", + "sentence-llm-concat-num_xgb", + "sentence-llm-embed-num_histgb", + "sentence-llm-embed-num_xgb", + "tablevectorizer-fasttext_histgb", + "tablevectorizer-fasttext_xgb", + "tablevectorizer-llm_histgb", + "tablevectorizer-llm_xgb", + "tablevectorizer_histgb", + "tablevectorizer_logistic", + "tablevectorizer_mlp", + "tablevectorizer_randomforest", + "tablevectorizer_resnet", + "tablevectorizer_ridge", + "tablevectorizer_xgb", + "tablevectorizer_tabpfn", + "target-encoder_histgb", + "target-encoder_logistic", + "target-encoder_mlp", + "target-encoder_randomforest", + "target-encoder_resnet", + "target-encoder_ridge", + "target-encoder_xgb", + "target-encoder_tabpfn", +] + +carte_singletable_baselines["reduced"] = [ + "carte-gnn", + "catboost", + "sentence-llm-concat-num_xgb", + "sentence-llm-embed-num_xgb", + "tablevectorizer_logistic", + "tablevectorizer_mlp", + "tablevectorizer_randomforest", + "tablevectorizer_resnet", + "tablevectorizer_ridge", + "tablevectorizer_xgb", + "target-encoder_tabpfn", +] + +carte_multitable_baselines = [ + "original_carte-multitable", + "matched_carte-multitable", + "original_catboost-multitable", + "matched_catboost-multitable", + "original-sentence-llm_histgb-multitable", + "matched-sentence-llm_histgb-multitable", +] + + +## Dictionary of method mapping +carte_singletable_baseline_mapping = dict() +carte_singletable_baseline_mapping["carte-gnn"] = "CARTE" + +# Preprocessings 
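+# These prefix and estimator fragments are presumably composed by substring
+# replacement to build the display names used in tables and figures, e.g.
+# "tablevectorizer_xgb" -> "TabVec-XGB" and "sentence-llm-concat-num_histgb" -> "S-LLM-CN-HGB".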
+carte_singletable_baseline_mapping["tablevectorizer_"] = "TabVec-" +carte_singletable_baseline_mapping["tablevectorizer-"] = "TabVec-" +carte_singletable_baseline_mapping["target-encoder_"] = "TarEnc-" +carte_singletable_baseline_mapping["fasttext_"] = "FT-" +carte_singletable_baseline_mapping["llm_"] = "LLM-" +carte_singletable_baseline_mapping["sentence-llm-concat-num_"] = "S-LLM-CN-" +carte_singletable_baseline_mapping["sentence-llm-embed-num_"] = "S-LLM-EN-" + +# Estimators +carte_singletable_baseline_mapping["catboost"] = "CatBoost" +carte_singletable_baseline_mapping["xgb"] = "XGB" +carte_singletable_baseline_mapping["histgb"] = "HGB" +carte_singletable_baseline_mapping["randomforest"] = "RF" +carte_singletable_baseline_mapping["ridge"] = "Ridge" +carte_singletable_baseline_mapping["logistic"] = "Logistic" +carte_singletable_baseline_mapping["mlp"] = "MLP" +carte_singletable_baseline_mapping["resnet"] = "ResNet" +carte_singletable_baseline_mapping["tabpfn"] = "TabPFN" + +# Bagging +carte_singletable_baseline_mapping["bagging"] = "Bagging" + +## Colors for visualization +carte_singletable_color_palette = dict() +carte_singletable_color_palette["CARTE"] = "C3" +carte_singletable_color_palette["CatBoost"] = "C0" +carte_singletable_color_palette["TabVec-XGB"] = "C1" +carte_singletable_color_palette["TabVec-RF"] = "C2" +carte_singletable_color_palette["TabVec-Ridge"] = "C4" +carte_singletable_color_palette["TabVec-Logistic"] = "C5" +carte_singletable_color_palette["S-LLM-CN-XGB"] = "C6" +carte_singletable_color_palette["S-LLM-EN-XGB"] = "C7" +carte_singletable_color_palette["TabVec-ResNet"] = "C8" +carte_singletable_color_palette["TabVec-MLP"] = "C9" +carte_singletable_color_palette["TarEnc-TabPFN"] = "#A9561E" + +## Markers for visualization +carte_singletable_markers = dict() +carte_singletable_markers["CARTE"] = "o" +carte_singletable_markers["TabVec-XGB"] = (4, 0, 45) +carte_singletable_markers["TabVec-RF"] = "P" +carte_singletable_markers["CatBoost"] = "X" +carte_singletable_markers["S-LLM-CN-XGB"] = (4, 0, 0) +carte_singletable_markers["S-LLM-EN-XGB"] = "d" +carte_singletable_markers["TabVec-Ridge"] = "v" +carte_singletable_markers["TabVec-Logistic"] = "v" +carte_singletable_markers["TabVec-ResNet"] = "^" +carte_singletable_markers["TabVec-MLP"] = "p" +carte_singletable_markers["TarEnc-TabPFN"] = (5, 1, 0) diff --git a/carte/configs/directory.py b/carte/configs/directory.py new file mode 100644 index 0000000..0130146 --- /dev/null +++ b/carte/configs/directory.py @@ -0,0 +1,24 @@ +""" +Configurations for directory +""" + +from pathlib import Path + +base_path = Path().cwd() +config_directory = dict() +config_directory["base_path"] = base_path + +config_directory["data"] = str(base_path / "data/") +config_directory["pretrained_model"] = str(base_path / "data/etc/kg_pretrained.pt") +config_directory["data_raw"] = str(base_path / "data/data_raw/") +config_directory["data_singletable"] = str(base_path / "data/data_singletable/") +config_directory["data_yago"] = str(base_path / "data/data_yago/") +config_directory["etc"] = str(base_path / "data/etc/") + +config_directory["results"] = str(base_path / "results/") +config_directory["compiled_results"] = str(base_path / "results/compiled_results/") +config_directory["visualization"] = str(base_path / "visualization/") + +# Specify the directory in which you have downloaded each +config_directory["fasttext"] = str(base_path / "data/etc/cc.en.300.bin") +config_directory["ken_embedding"] = str(base_path / "data/etc/ken_embedding.parquet") diff 
--git a/carte/configs/model_parameters.py b/carte/configs/model_parameters.py new file mode 100644 index 0000000..70265f4 --- /dev/null +++ b/carte/configs/model_parameters.py @@ -0,0 +1,148 @@ +""" +Parameter distributions for hyperparameter optimization +""" + +import numpy as np +from scipy.stats import loguniform, randint, uniform, norm +import copy + + +class loguniform_int: + """Integer valued version of the log-uniform distribution""" + + def __init__(self, a, b): + self._distribution = loguniform(a, b) + + def rvs(self, *args, **kwargs): + """Random variable sample""" + return self._distribution.rvs(*args, **kwargs).astype(int) + + +class norm_int: + """Integer valued version of the normal distribution""" + + def __init__(self, a, b): + self._distribution = norm(a, b) + + def rvs(self, *args, **kwargs): + """Random variable sample""" + if self._distribution.rvs(*args, **kwargs).astype(int) < 1: + return 1 + else: + return self._distribution.rvs(*args, **kwargs).astype(int) + + +param_distributions_total = dict() + +# carte-gnn +param_distributions = dict() +lr_grid = [1e-4, 2.5e-4, 5e-4, 7.5e-4, 1e-3] +param_distributions["learning_rate"] = lr_grid +param_distributions_total["carte-gnn"] = param_distributions + +# histgb +param_distributions = dict() +param_distributions["learning_rate"] = loguniform(1e-2, 10) +param_distributions["max_depth"] = [None, 2, 3, 4] +param_distributions["max_leaf_nodes"] = norm_int(31, 5) +param_distributions["min_samples_leaf"] = norm_int(20, 2) +param_distributions["l2_regularization"] = loguniform(1e-6, 1e3) +param_distributions_total["histgb"] = param_distributions + +# catboost +param_distributions = dict() +param_distributions["max_depth"] = randint(2, 11) +param_distributions["learning_rate"] = loguniform(1e-5, 1) +param_distributions["bagging_temperature"] = uniform(0, 1) +param_distributions["l2_leaf_reg"] = loguniform(1, 10) +param_distributions["iterations"] = randint(400, 1001) +param_distributions["one_hot_max_size"] = randint(2, 26) +param_distributions_total["catboost"] = param_distributions + +# xgb +param_distributions = dict() +param_distributions["n_estimators"] = randint(50, 1001) +param_distributions["max_depth"] = randint(2, 11) +param_distributions["min_child_weight"] = loguniform(1, 100) +param_distributions["subsample"] = uniform(0.5, 1 - 0.5) +param_distributions["learning_rate"] = loguniform(1e-5, 1) +param_distributions["colsample_bylevel"] = uniform(0.5, 1 - 0.5) +param_distributions["colsample_bytree"] = uniform(0.5, 1 - 0.5) +param_distributions["gamma"] = loguniform(1e-8, 7) +param_distributions["lambda"] = loguniform(1, 4) +param_distributions["alpha"] = loguniform(1e-8, 100) +param_distributions_total["xgb"] = param_distributions + +# RandomForest +param_distributions = dict() +param_distributions["n_estimators"] = randint(50, 250) +param_distributions["max_depth"] = [None, 2, 3, 4] +param_distributions["max_features"] = [ + "sqrt", + "log2", + None, + 0.1, + 0.2, + 0.3, + 0.4, + 0.5, + 0.6, + 0.7, + 0.8, + 0.9, +] +param_distributions["min_samples_leaf"] = loguniform_int(0.5, 50.5) +param_distributions["bootstrap"] = [True, False] +param_distributions["min_impurity_decrease"] = [0.0, 0.01, 0.02, 0.05] +param_distributions_total["randomforest"] = param_distributions + + +# resnet +param_distributions = dict() +param_distributions["normalization"] = ["batchnorm", "layernorm"] +param_distributions["num_layers"] = randint(1, 9) +param_distributions["hidden_dim"] = randint(32, 513) +param_distributions["hidden_factor"] = 
randint(1, 3) +param_distributions["hidden_dropout_prob"] = uniform(0.0, 0.5) +param_distributions["residual_dropout_prob"] = uniform(0.0, 0.5) +param_distributions["learning_rate"] = loguniform(1e-5, 1e-2) +param_distributions["weight_decay"] = loguniform(1e-8, 1e-2) +param_distributions["batch_size"] = [16, 32] +param_distributions_total["resnet"] = param_distributions + +# mlp +param_distributions = dict() +param_distributions["hidden_dim"] = [2**x for x in range(4, 11)] +param_distributions["num_layers"] = randint(1, 5) +param_distributions["dropout_prob"] = uniform(0.0, 0.5) +param_distributions["learning_rate"] = loguniform(1e-5, 1e-2) +param_distributions["weight_decay"] = loguniform(1e-8, 1e-2) +param_distributions["batch_size"] = [16, 32] +param_distributions_total["mlp"] = param_distributions + +# ridge regression +param_distributions = dict() +param_distributions["solver"] = ["svd", "cholesky", "lsqr", "sag"] +param_distributions["alpha"] = loguniform(1e-5, 100) +param_distributions_total["ridge"] = param_distributions + +# logistic regression +param_distributions = dict() +param_distributions["solver"] = ["newton-cg", "lbfgs", "liblinear"] +param_distributions["penalty"] = ["none", "l1", "l2", "elasticnet"] +param_distributions["C"] = loguniform(1e-5, 100) +param_distributions_total["logistic"] = param_distributions + +# tabpfn +param_distributions = dict() +param_distributions_total["tabpfn"] = param_distributions + +# catboost-multitable +param_distributions = copy.deepcopy(param_distributions_total["catboost"]) +param_distributions["source_fraction"] = uniform(0, 1) +param_distributions_total["catboost-multitable"] = param_distributions + +# histgb-multitable +param_distributions = copy.deepcopy(param_distributions_total["histgb"]) +param_distributions["source_fraction"] = uniform(0, 1) +param_distributions_total["histgb-multitable"] = param_distributions diff --git a/carte/configs/visuailization.py b/carte/configs/visuailization.py new file mode 100644 index 0000000..4babbbc --- /dev/null +++ b/carte/configs/visuailization.py @@ -0,0 +1,43 @@ +""" +Visualization configurations +""" + +# Main models +model_color_palette = dict() +model_color_palette["CARTE"] = "C3" +model_color_palette["CatBoost"] = "C0" +model_color_palette["TabVec-XGB"] = "C1" +model_color_palette["TabVec-RF"] = "C2" +model_color_palette["TabVec-Ridge"] = "C4" +model_color_palette["TabVec-Logistic"] = "C5" +model_color_palette["S-LLM-CN-XGB"] = "C6" # "" +model_color_palette["S-LLM-EN-XGB"] = "C7" # "C7" "#C875C4" mediumorchid +model_color_palette["ResNet"] = "C8" +model_color_palette["MLP"] = "C9" +model_color_palette["TabPFN"] = "#A9561E" + +model_color_palette["TabVec-RandomForest"] = "C2" +model_color_palette["TabVec-ResNet"] = "C8" +model_color_palette["TabVec-MLP"] = "C9" +model_color_palette["TarEnc-TabPFN"] = "#A9561E" + + +# model_color_palette["CARTE-B"] = "C3" +# model_color_palette["CatBoost-B"] = "C0" +# model_color_palette["TabVec-XGB-B"] = "C1" +# model_color_palette["TabVec-RF-B"] = "C2" +# model_color_palette["TabVec-Ridge-B"] = "C4" +# model_color_palette["TabVec-Logistic-B"] = "C5" +# model_color_palette["S-LLM-CN-XGB-B"] = "C6" +# model_color_palette["S-LLM-EN-XGB-B"] = "C7" +# model_color_palette["ResNet-B"] = "C8" +# model_color_palette["MLP-B"] = "C9" +# model_color_palette["TabPFN-B"] = "#A9561E" + + +# model_color_palette["TabVec-HGB"] = "#650021" +# model_color_palette["TabVec-TabPFN"] = "#650021" +# model_color_palette["TabVec-FT-XGB"] = "#650021" +# 
model_color_palette["TabVec-FT-HGB"] = "#650021" + +# model_color_palette["TabLLM"] = "#653700" diff --git a/carte/data/__init__.py b/carte/data/__init__.py new file mode 100644 index 0000000..d153ea5 --- /dev/null +++ b/carte/data/__init__.py @@ -0,0 +1 @@ +from carte.data.data_singletable import * \ No newline at end of file diff --git a/carte/scripts/__init__.py b/carte/scripts/__init__.py new file mode 100644 index 0000000..ac3c58f --- /dev/null +++ b/carte/scripts/__init__.py @@ -0,0 +1,5 @@ +from carte.scripts.compile_results_singletable import * +from carte.scripts.download_data import * +from carte.scripts.evaluate_singletable import * +from carte.scripts.preprocess_lm import * +from carte.scripts.preprocess_raw import * \ No newline at end of file diff --git a/carte/scripts/compile_results_singletable.py b/carte/scripts/compile_results_singletable.py new file mode 100644 index 0000000..c50dc48 --- /dev/null +++ b/carte/scripts/compile_results_singletable.py @@ -0,0 +1,67 @@ +"""Script for compling results""" + +# >>> +if __name__ == "__main__": + import os + import sys + + _project_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) + os.environ["PROJECT_DIR"] = _project_dir + sys.path.append(_project_dir) + del _project_dir +# <<< + +import json +from glob import glob +from carte.configs.directory import config_directory +import numpy as np +import pandas as pd + + +def _load_config(data_name): + config_data_dir = ( + f"{config_directory['data_singletable']}/{data_name}/config_data.json" + ) + filename = open(config_data_dir) + config_data = json.load(filename) + filename.close() + return config_data + + +if __name__ == "__main__": + + result_dir_base = f"{config_directory['results']}/singletable" + result_filenames = glob(f"{result_dir_base}/*/*.csv*") + + df_score = pd.DataFrame() + for path in result_filenames: + data_name = path.split("/")[-2] + file_name = path.split("/")[-1] + method_name = file_name.split(f"{data_name}_")[1].split("_num_train")[0] + num_train = file_name.split("num_train-")[1].split("_")[0] + random_state = file_name.split("rs-")[1].split(".csv")[0] + + config_data = _load_config(data_name) + task = config_data["task"] + score_measure = "r2" if task == "regression" else "roc_auc" + + score_ = pd.read_csv(path) + score_col = [col for col in score_.columns if score_measure in col][0] + score_[score_col].iloc[0] + + df_score_ = dict() + df_score_["model"] = method_name + df_score_["score"] = score_[score_col].iloc[0] + df_score_["data_name"] = data_name + df_score_["num_train"] = num_train + df_score_["random_state"] = random_state + df_score_["task"] = task + df_score_ = pd.DataFrame([df_score_]) + + df_score = pd.concat([df_score, df_score_], axis=0) + + df_score.reset_index(drop=True, inplace=True) + save_dir = ( + f"{config_directory['compiled_results']}/results_carte_baseline_singletable.csv" + ) + df_score.to_csv(save_dir, index=False) diff --git a/carte/scripts/download_data.py b/carte/scripts/download_data.py new file mode 100644 index 0000000..af08cd5 --- /dev/null +++ b/carte/scripts/download_data.py @@ -0,0 +1,150 @@ +"""Script for downloading required data.""" + +# >>> +if __name__ == '__main__': + import os + import sys + + _project_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) + os.environ['PROJECT_DIR'] = _project_dir + sys.path.append(_project_dir) + del _project_dir +# <<< + +import shutil +import os +import requests +from zipfile import ZipFile +from carte.configs.directory import config_directory + + +def 
_download_with_request(url, download_path): + req = requests.get(url, stream=True) + with open(download_path,'wb') as f: + for chunk in req.iter_content(chunk_size=8192): + f.write(chunk) + + +def _download_fasttext(): + import fasttext.util + fasttext.util.download_model('en', if_exists='ignore') + ft_path = str(config_directory["base_path"] / "cc.en.300.bin") + shutil.move(ft_path, config_directory["fasttext"]) + os.remove(str(config_directory["base_path"] / "cc.en.300.bin.gz")) + + +def _download_ken(): + ken_url = "https://figshare.com/ndownloader/files/39142985" + ken_path = config_directory["ken_embedding"] + _download_with_request(ken_url, ken_path) + + +def _download_raw(option="carte"): + url = "https://huggingface.co/datasets/inria-soda/carte-benchmark/resolve/main/data_raw.zip" + download_path = str(config_directory["base_path"] / "data_raw.zip") + _download_with_request(url, download_path) + if option == "carte": + carte_example_data = ["wina_pl", "spotify", "wine_dot_com_prices", "wine_vivino_price"] + with ZipFile(download_path, 'r') as zObject: + for name in carte_example_data: + raw_data_path = f"data_raw/{name}.csv" + zObject.extract(raw_data_path, path=f"{config_directory['data']}") + elif option == "full": + with ZipFile(download_path, 'r') as zObject: + zObject.extractall(path=config_directory["data"]) + zObject.close() + os.remove(download_path) + + +def _download_preprocessed(option="carte", include_llm=False): + if include_llm: + url = "https://huggingface.co/datasets/inria-soda/carte-benchmark/resolve/main/data_singletable.zip" + else: + url = "https://huggingface.co/datasets/inria-soda/carte-benchmark/resolve/main/data_singletable_light.zip" + download_path = str(config_directory["base_path"] / "data_singletable.zip") + _download_with_request(url, download_path) + if option == "carte": + carte_example_data = ["wina_pl", "spotify", "wine_dot_com_prices", "wine_vivino_price"] + with ZipFile(download_path, 'r') as zObject: + for name in carte_example_data: + raw_data_path = f"data_singletable/{name}/raw.parquet" + config_path = f"data_singletable/{name}/config_data.json" + zObject.extract(raw_data_path, path=f"{config_directory['data']}") + zObject.extract(config_path, path=f"{config_directory['data']}") + if include_llm: + external_path = f"data_singletable/{name}/external.pickle" + zObject.extract(external_path, path=f"{config_directory['data']}") + elif option == "full": + with ZipFile(download_path, 'r') as zObject: + zObject.extractall(path=config_directory["data"]) + zObject.close() + os.remove(download_path) + + +# Main +def main(option = "carte", include_raw = False, include_ken = False): + + if os.path.exists(config_directory["fasttext"]): + pass + else: + _download_fasttext() + + if option == "carte": + option_ = "full" + else: + if option == "basic_examples": + option_, include_llm = "carte", False + elif option == "full_examples": + option_, include_llm = "full", False + elif option == "full_benchmark": + option_, include_llm = "full", True + _download_preprocessed(option_, include_llm) + + if include_raw: + _download_raw(option=option_) + + if include_ken: + _download_ken() + + return None + +if __name__ == "__main__": + + # Set parser + import argparse + + parser = argparse.ArgumentParser(description="Download data.") + parser.add_argument( + "-op", + "--option", + type=str, + help="option for downloading", + ) + parser.add_argument( + "-ir", + "--include_raw", + type=str, + help="include raw data for downloading", + ) + parser.add_argument( + "-ik", + 
"--include_ken", + type=str, + help="include ken data for downloading", + ) + args = parser.parse_args() + + if args.include_raw == "True": + include_raw = True + else: + include_raw = False + + if args.include_ken == "True": + include_ken = True + else: + include_ken = False + + main(args.option, include_raw, include_ken) + + + diff --git a/carte/scripts/download_data.sh b/carte/scripts/download_data.sh new file mode 100644 index 0000000..401fffe --- /dev/null +++ b/carte/scripts/download_data.sh @@ -0,0 +1,9 @@ +# Download data. See README for information on the variables. + +ENV_NAME="myenv" # Change the environment name accordingly +OPTIONS="basic_examples" +INCLUDE_RAW="False" +INCLUDE_KEN="False" + +conda run -n $ENV_NAME python -W ignore scripts/download_data.py -op $OPTIONS -ir $INCLUDE_RAW -ik $INCLUDE_KEN + diff --git a/carte/scripts/evaluate_singletable.py b/carte/scripts/evaluate_singletable.py new file mode 100644 index 0000000..0c25114 --- /dev/null +++ b/carte/scripts/evaluate_singletable.py @@ -0,0 +1,703 @@ +"""Script for evalutating a model of choice for singletables.""" + +# >>> +if __name__ == "__main__": + import os + import sys + + _project_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) + os.environ["PROJECT_DIR"] = _project_dir + sys.path.append(_project_dir) + del _project_dir +# <<< + +import os +import pickle +import json +import pandas as pd +import numpy as np +import copy + +from sklearn.pipeline import Pipeline +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import MinMaxScaler +from category_encoders import TargetEncoder +from sklearn.impute import SimpleImputer +from sklearn.decomposition import PCA +from sklearn.model_selection import ParameterGrid +from carte.configs.directory import config_directory +from carte.configs.carte_configs import carte_datalist, carte_singletable_baselines +from carte.src.evaluate_utils import * +from carte.src.carte_estimator_new import CARTERegressor, CARTEClassifier +from catboost import CatBoostRegressor, CatBoostClassifier +from xgboost import XGBRegressor, XGBClassifier +from tabpfn import TabPFNClassifier +from sklearn.ensemble import ( + HistGradientBoostingRegressor, + HistGradientBoostingClassifier, + RandomForestRegressor, + RandomForestClassifier, + BaggingRegressor, + BaggingClassifier, +) +from sklearn.linear_model import Ridge, LogisticRegression +from carte.src.baseline_singletable_nn import ( + MLPRegressor, + MLPClassifier, + RESNETRegressor, + RESNETClassifier, +) + + +def _load_data(data_name): + """Load data, external data, and configs.""" + + data_dir = f"{config_directory['data_singletable']}/{data_name}/raw.parquet" + data_additional_dir = ( + f"{config_directory['data_singletable']}/{data_name}/external.pickle" + ) + data = pd.read_parquet(data_dir) + data.fillna(value=np.nan, inplace=True) + with open(data_additional_dir, "rb") as pickle_file: + data_additional = pickle.load(pickle_file) + config_data_dir = ( + f"{config_directory['data_singletable']}/{data_name}/config_data.json" + ) + filename = open(config_data_dir) + config_data = json.load(filename) + filename.close() + return data, data_additional, config_data + + +def _prepare_carte_gnn( + data, + data_config, + num_train, + random_state, +): + """Preprocess for CARTE (graph construction).""" + from carte_table_to_graph_old import Table2GraphTransformer + + data_ = data.copy() + X_train, X_test, y_train, y_test = set_split( + data_, + data_config, + num_train, + random_state=random_state, + ) + 
preprocessor = Table2GraphTransformer() + X_train = preprocessor.fit_transform(X_train, y=y_train) + X_test = preprocessor.transform(X_test) + return X_train, X_test, y_train, y_test + + +def _prepare_catboost( + data, + data_config, + num_train, + random_state, +): + """Preprocess for CatBoost.""" + data_ = data.copy() + _, cat_col_names = col_names_per_type(data, data_config["target_name"]) + data_cat = data_[cat_col_names] + data_cat = data_cat.replace(np.nan, "nan", regex=True) + data_[cat_col_names] = data_cat + for col in cat_col_names: + data_[col] = data_[col].astype("category") + X_train, X_test, y_train, y_test = set_split( + data_, + data_config, + num_train, + random_state=random_state, + ) + # index of categorical columns + cat_features = [X_train.columns.get_loc(col) for col in cat_col_names] + return ( + np.array(X_train), + np.array(X_test), + np.array(y_train), + np.array(y_test), + cat_features, + ) + + +def _prepare_tablevectorizer( + data, + data_config, + num_train, + random_state, + estim_method, +): + """Preprocess with Tablevectorizer.""" + + from skrub import TableVectorizer + + data_ = data.copy() + X_train, X_test, y_train, y_test = set_split( + data_, + data_config, + num_train, + random_state=random_state, + ) + num_col_names, cat_col_names = col_names_per_type(data, data_config["target_name"]) + + # Set preprocessors for categorical and numerical + categorical_preprocessor = TableVectorizer(auto_cast=False, sparse_threshold=0) + numerical_preprocessor = SimpleImputer(strategy="mean") + + # Set final pipeline for preprocessing depending on the method + tree_based_methods = ["xgb", "histgb", "randomforest"] + if estim_method in tree_based_methods: + preprocessor_final = ColumnTransformer( + [ + ("numerical", "passthrough", num_col_names), + ("categorical", categorical_preprocessor, cat_col_names), + ] + ) + elif estim_method in ["tabpfn"]: + preprocessor = ColumnTransformer( + [ + ("numerical", numerical_preprocessor, num_col_names), + ("categorical", categorical_preprocessor, cat_col_names), + ] + ) + preprocessor_final = Pipeline( + [ + ("preprocess", preprocessor), + ("missing", SimpleImputer(strategy="mean")), + ] + ) + else: + preprocessor = ColumnTransformer( + [ + ("numerical", numerical_preprocessor, num_col_names), + ("categorical", categorical_preprocessor, cat_col_names), + ] + ) + preprocessor_final = Pipeline( + [ + ("preprocess", preprocessor), + ("minmax", MinMaxScaler()), + ("missing", SimpleImputer(strategy="mean")), + ] + ) + X_train = preprocessor_final.fit_transform(X_train, y=y_train) + X_test = preprocessor_final.transform(X_test) + + if estim_method in ["tabpfn"]: + if X_train.shape[1] > 100: + n_components = np.min([X_train.shape[0], 100]) + pca_ = PCA(n_components=n_components, svd_solver="full") + X_train = pca_.fit_transform(X_train) + X_test = pca_.transform(X_test) + + return X_train, X_test, y_train, y_test + + +def _prepare_target_encoder( + data, + data_config, + num_train, + random_state, + estim_method, +): + """Preprocess with Target Encoder.""" + data_ = data.copy() + X_train, X_test, y_train, y_test = set_split( + data_, + data_config, + num_train, + random_state=random_state, + ) + num_col_names, cat_col_names = col_names_per_type(data, data_config["target_name"]) + if data_config["task"] == "regression": + target_type = "continuous" + else: + target_type = "binary" + + # Set preprocessors for categorical and numerical + categorical_preprocessor = TargetEncoder( + categories="auto", + target_type=target_type, + 
random_state=random_state, + ) + numerical_preprocessor = SimpleImputer(strategy="mean") + + # Set final pipeline for preprocessing depending on the method + tree_based_methods = ["xgb", "histgb", "randomforest"] + if estim_method in tree_based_methods: + preprocessor_final = ColumnTransformer( + [ + ("numerical", "passthrough", num_col_names), + ("categorical", categorical_preprocessor, cat_col_names), + ] + ) + elif estim_method in ["tabpfn"]: + preprocessor_final = ColumnTransformer( + [ + ("numerical", numerical_preprocessor, num_col_names), + ("categorical", categorical_preprocessor, cat_col_names), + ] + ) + else: + preprocessor = ColumnTransformer( + [ + ("numerical", numerical_preprocessor, num_col_names), + ("categorical", categorical_preprocessor, cat_col_names), + ] + ) + preprocessor_final = Pipeline( + [ + ("preprocess", preprocessor), + ("minmax", MinMaxScaler()), + ] + ) + X_train = preprocessor_final.fit_transform(X_train, y=y_train) + X_test = preprocessor_final.transform(X_test) + + if estim_method in ["tabpfn"]: + if X_train.shape[1] > 100: + n_components = np.min([X_train.shape[0], 100]) + pca_ = PCA(n_components=n_components, svd_solver="full") + X_train = pca_.fit_transform(X_train) + X_test = pca_.transform(X_test) + + return X_train, X_test, y_train, y_test + + +def _prepare_llm( + data, + data_config, + num_train, + random_state, +): + """Prepare the llm data. It loads the preprocessed data.""" + data_ = data.copy() + data_.drop(columns=data_config["entity_name"], inplace=True) + X_train, X_test, y_train, y_test = set_split( + data_, + data_config, + num_train, + random_state, + ) + + col_llm, col_not_llm = X_train.columns[:1024], X_train.columns[1024:] + + X_train_llm, X_train_ = X_train[col_llm], X_train[col_not_llm] + X_test_llm, X_test_ = X_test[col_llm], X_test[col_not_llm] + + if num_train > 1024: + pca = PCA().set_output(transform="pandas") + reduced_data_train = pca.fit_transform(X_train_llm) + dim_reduce_ = np.where(np.cumsum(pca.explained_variance_ratio_) > 0.9)[0][0] + dim_reduce = min(dim_reduce_, 300) + reduced_data_train = reduced_data_train.iloc[:, : dim_reduce + 1] + reduced_data_test = pca.transform(X_test_llm).iloc[:, : dim_reduce + 1] + X_train = pd.concat([reduced_data_train, X_train_], axis=1) + X_train = X_train.to_numpy().astype(np.float32) + X_test = pd.concat([reduced_data_test, X_test_], axis=1) + X_test = X_test.to_numpy().astype(np.float32) + + return X_train, X_test, y_train, y_test + + +def _assign_estimator( + estim_method, + task, + device, + cat_features, + bagging, +): + """Assign the specific estimator to train model.""" + + # Set number of models for NN-based methods + if bagging: + num_model = 1 + else: + num_model = 15 + + if estim_method == "carte-gnn": + fixed_params = dict() + fixed_params["batch_size"] = 16 + fixed_params["num_model"] = num_model + fixed_params["device"] = device + fixed_params["n_jobs"] = num_model + fixed_params["random_state"] = 0 + if task == "regression": + estimator = CARTERegressor(**fixed_params) + else: + estimator = CARTEClassifier(**fixed_params) + elif estim_method == "catboost": + fixed_params = dict() + fixed_params["cat_features"] = cat_features + fixed_params["verbose"] = False + fixed_params["allow_writing_files"] = False + fixed_params["thread_count"] = 1 + fixed_params["leaf_estimation_iterations"] = 1 + fixed_params["max_ctr_complexity"] = 1 + if task == "regression": + estimator = CatBoostRegressor(**fixed_params) + else: + estimator = CatBoostClassifier(**fixed_params) + elif 
estim_method == "xgb": + fixed_params = dict() + fixed_params["booster"] = "gbtree" + fixed_params["tree_method"] = "exact" # exact approx hist + if task == "regression": + estimator = XGBRegressor(**fixed_params) + else: + estimator = XGBClassifier(**fixed_params) + elif estim_method == "histgb": + fixed_params = dict() + if task == "regression": + estimator = HistGradientBoostingRegressor(**fixed_params) + else: + estimator = HistGradientBoostingClassifier(**fixed_params) + elif estim_method == "randomforest": + fixed_params = dict() + if task == "regression": + estimator = RandomForestRegressor(**fixed_params) + else: + estimator = RandomForestClassifier(**fixed_params) + elif estim_method == "ridge": + fixed_params = dict() + estimator = Ridge(**fixed_params) + elif estim_method == "logistic": + fixed_params = dict() + estimator = LogisticRegression(**fixed_params) + elif estim_method == "mlp": + fixed_params = dict() + fixed_params["num_model"] = num_model + fixed_params["device"] = device + fixed_params["n_jobs"] = num_model + fixed_params["random_state"] = 0 + if task == "regression": + estimator = MLPRegressor(**fixed_params) + else: + estimator = MLPClassifier(**fixed_params) + elif estim_method == "resnet": + fixed_params = dict() + fixed_params["num_model"] = num_model + fixed_params["device"] = device + fixed_params["n_jobs"] = num_model + fixed_params["random_state"] = 0 + if task == "regression": + estimator = RESNETRegressor(**fixed_params) + else: + estimator = RESNETClassifier(**fixed_params) + elif estim_method == "tabpfn": + estimator = TabPFNClassifier() + return estimator + + +def _assign_bagging_estimator(estimator_base, estim_method, task): + """Assign the bagging estimator if bagging set to true.""" + bagging_estimator = copy.deepcopy(estimator_base) + if estim_method in ["carte-gnn", "mlp", "resnet"]: + fixed_params = dict() + fixed_params["num_model"] = 15 + fixed_params["n_jobs"] = 15 + bagging_estimator.__dict__.update(fixed_params) + else: + bagging_params = dict() + bagging_params["estimator"] = estimator_base + bagging_params["n_estimators"] = 15 + bagging_params["max_samples"] = 0.8 + bagging_params["n_jobs"] = 15 + bagging_params["random_state"] = 0 + if task == "regression": + bagging_estimator = BaggingRegressor(**bagging_params) + else: + bagging_estimator = BaggingClassifier(**bagging_params) + + return bagging_estimator + + +# Run evaluation +def run_model( + data_name, + num_train, + method, + random_state, + bagging, + device, +): + """Run model for specific experiment setting.""" + # Load data + data, data_additional, data_config = _load_data(data_name) + + # Basic settings + target_name = data_config["target_name"] + entity_name = data_config["entity_name"] + task = data_config["task"] + _, result_criterion = set_score_criterion(task) + cat_features = None # overriden by prepare_... 
functions if needed + + # Set methods + method_parse = method.split("_") + estim_method = method_parse[-1] + preprocess_method = method_parse[0] + + # Stop for exceptions - Regression/Classification only methods, tabpfn > 1024 + reg_only_methods = ["tablevectorizer_ridge", "target-encoder_ridge"] + cls_only_methods = [ + method for method in carte_singletable_baselines["full"] if "tabpfn" in method + ] + cls_only_methods += [ + method for method in carte_singletable_baselines["full"] if "logistic" in method + ] + if (data_config["task"] == "regression") and (method in cls_only_methods): + return None + elif (data_config["task"] == "classification") and (method in reg_only_methods): + return None + elif (num_train > 1024) and (estim_method == "tabpfn"): + return None + + # Prepare data + if "fasttext" in preprocess_method: + data_fasttext = data_additional["fasttext"].copy() + data_fasttext.drop_duplicates(subset=entity_name, inplace=True) + data = data.merge(right=data_fasttext, how="left", on=entity_name) + elif "llm" in preprocess_method: + if preprocess_method.split("-")[0] == "sentence": + data_ = data_additional[preprocess_method].copy() + data = pd.concat([data_, data[[target_name, entity_name]]], axis=1) + data.dropna(subset=target_name, inplace=True) + else: + data_llm = data_additional["llm"].copy() + data_llm.drop_duplicates(subset=entity_name, inplace=True) + data = data.merge(right=data_llm, how="left", on=entity_name) + else: + pass + + # Preprocess data + if "carte-gnn" in preprocess_method: + X_train, X_test, y_train, y_test = _prepare_carte_gnn( + data, + data_config, + num_train, + random_state, + ) + elif "catboost" in preprocess_method: + X_train, X_test, y_train, y_test, cat_features = _prepare_catboost( + data, + data_config, + num_train, + random_state, + ) + elif "tablevectorizer" in preprocess_method: + X_train, X_test, y_train, y_test = _prepare_tablevectorizer( + data, + data_config, + num_train, + random_state, + estim_method, + ) + elif "target-encoder" in preprocess_method: + X_train, X_test, y_train, y_test = _prepare_target_encoder( + data, + data_config, + num_train, + random_state, + estim_method, + ) + elif "llm" in preprocess_method: + X_train, X_test, y_train, y_test = _prepare_llm( + data, + data_config, + num_train, + random_state, + ) + + # Assign estimators + best_params = extract_best_params(data_name, method, num_train, random_state) + estimator = _assign_estimator( + estim_method, + task, + device, + cat_features, + bagging, + ) + estimator.__dict__.update(best_params) + estimator_bagging = _assign_bagging_estimator(estimator, estim_method, task) + + # Create directory for saving results + result_save_dir_base = f"{config_directory['results']}/singletable/{data_name}" + if not os.path.exists(result_save_dir_base): + os.makedirs(result_save_dir_base, exist_ok=True) + + # Run without bagging strategy + marker = f"{data_name}_{method}_num_train-{num_train}_rs-{random_state}" + results_model_dir = result_save_dir_base + f"/{marker}.csv" + + # Do not run if result already exists + if os.path.exists(results_model_dir): + pass + else: + estimator.fit(X_train, y_train) + if task == "regression": + y_pred = estimator.predict(X_test) + else: + y_pred = estimator.predict_proba(X_test) + y_pred = reshape_pred_output(y_pred) + y_pred = check_pred_output(y_train, y_pred) + score = return_score(y_test, y_pred, task) + + results_ = dict() + results_[result_criterion[0]] = score[0] + results_[result_criterion[1]] = score[1] + results_model = 
pd.DataFrame([results_], columns=result_criterion[:2]) + results_model.columns = f"{method}_" + results_model.columns + results_model.to_csv(results_model_dir, index=False) + + if bagging: + # Run with bagging strategy + estimator_bagging.fit(X_train, y_train) + if task == "regression": + y_pred = estimator_bagging.predict(X_test) + else: + y_pred = estimator_bagging.predict_proba(X_test) + y_pred = reshape_pred_output(y_pred) + y_pred = check_pred_output(y_train, y_pred) + score = return_score(y_test, y_pred, task) + + results_ = dict() + results_[result_criterion[0]] = score[0] + results_[result_criterion[1]] = score[1] + results_model = pd.DataFrame([results_], columns=result_criterion[:2]) + results_model.columns = f"{method}_" + results_model.columns + + marker = f"{data_name}_{method}-bagging_num_train-{num_train}_rs-{random_state}" + results_model_dir = result_save_dir_base + f"/{marker}.csv" + results_model.to_csv(results_model_dir, index=False) + + return None + + +# Main +def main(data_name, num_train, method, random_state, bagging, device): + + # Setting for train size + if "all" in data_name: + data_name_list = carte_datalist + else: + data_name_list = data_name + if isinstance(data_name_list, list) == False: + data_name_list = [data_name_list] + + # Setting for train size + if "all" in num_train: + num_train = [32, 64, 128, 256, 512, 1024, 2048] + else: + if isinstance(num_train, list) == False: + num_train = [num_train] + num_train = list(map(int, num_train)) + else: + num_train = list(map(int, num_train)) + + # Setting for bagging + if bagging == "True": + bagging = True + else: + bagging = False + + # Setting for methods + if "full" in method: + method_list = carte_singletable_baselines["full"] + elif "reduced" in method: + assert bagging == False + method_list = carte_singletable_baselines["reduced"] + elif "f-r" in method: + method_list = set(carte_singletable_baselines["full"]) + method_list -= set(carte_singletable_baselines["reduced"]) + method_list = list(method_list) + method_list.sort() + else: + method_list = method + if isinstance(method_list, list) == False: + method_list = [method_list] + + # Setting for random state + if "all" in random_state: + random_state = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + else: + if isinstance(random_state, list) == False: + random_state = [random_state] + random_state = list(map(int, random_state)) + else: + random_state = list(map(int, random_state)) + + # List out all the cases and run + args_dict = dict() + args_dict["data_name"] = data_name_list + args_dict["num_train"] = num_train + args_dict["method"] = method_list + args_dict["random_state"] = random_state + args_dict["device"] = [device] + args_dict["bagging"] = [bagging] + args_list = list(ParameterGrid(args_dict)) + + # Depending on the specific machine or computing environment, you may want to parallelize the evaluation. 
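+    # A possible (untested) parallel variant of the loop below, assuming joblib is installed:
+    #   from joblib import Parallel, delayed
+    #   Parallel(n_jobs=-1)(delayed(run_model)(**args) for args in args_list)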
+ for args in args_list: + run_model(**args) + marker = f"{args['data_name']}_{args['method']}_num_train-{args['num_train']}_rs-{args['random_state']}" + print(marker + " is complete") + + +if __name__ == "__main__": + + # Set parser + import argparse + + parser = argparse.ArgumentParser(description="Evaluate model for singletables.") + parser.add_argument( + "-dn", + "--data_name", + nargs="+", + type=str, + help="dataset to evaluate", + ) + parser.add_argument( + "-nt", + "--num_train", + nargs="+", + type=str, + help="Number of train", + ) + parser.add_argument( + "-m", + "--method", + nargs="+", + type=str, + help="Method to evaluate", + ) + parser.add_argument( + "-rs", + "--random_state", + nargs="+", + type=str, + help="Random_state", + ) + parser.add_argument( + "-b", + "--bagging", + type=str, + help="include bagging strategy for evaluation", + ) + parser.add_argument( + "-dv", + "--device", + type=str, + help="Device, cpu or cuda", + ) + args = parser.parse_args() + + main( + args.data_name, + args.num_train, + args.method, + args.random_state, + args.bagging, + args.device, + ) diff --git a/carte/scripts/preprocess_lm.py b/carte/scripts/preprocess_lm.py new file mode 100644 index 0000000..3916347 --- /dev/null +++ b/carte/scripts/preprocess_lm.py @@ -0,0 +1,121 @@ +""" Python script for preparing datasets for evaluation +""" + +# >>> +if __name__ == '__main__': + import os + import sys + + _project_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) + os.environ['PROJECT_DIR'] = _project_dir + sys.path.append(_project_dir) + del _project_dir +# <<< + +import pandas as pd +import numpy as np +import pickle +import json +import os +from carte.configs.directory import config_directory +from carte.configs.carte_configs import carte_datalist +from carte.src.preprocess_utils import ( + extract_fasttext_features, + extract_llm_features, + table2llmfeatures, +) + + +def data_preprocess(data_name: str, device: str = "cuda:0"): + + # Load data + data_pd_dir = f"{config_directory['data_singletable']}/{data_name}/raw.parquet" + data_pd = pd.read_parquet(data_pd_dir) + data_pd.fillna(value=np.nan, inplace=True) + + # Basic settings for the data + config_data_dir = f"{config_directory['data_singletable']}/{data_name}/config_data.json" + filename = open(config_data_dir) + config_data = json.load(filename) + + # Set the data without the target + data_X = data_pd.drop(columns=config_data["target_name"]) + + data = dict() + data_fasttext = None + data_llm = None + data_sentence_llm_embed_num = None + data_sentence_llm_concat_num = None + + if config_data["entity_name"] is not None: + data_fasttext = extract_fasttext_features( + data=data_X, + extract_col_name=config_data["entity_name"], + ) + data_llm = extract_llm_features( + data=data_X, + extract_col_name=config_data["entity_name"], + device=device, + ) + else: + pass + + data_sentence_llm_embed_num = table2llmfeatures( + data=data_X, + embed_numeric=True, + device=device, + ) + data_sentence_llm_concat_num = table2llmfeatures( + data=data_X, + embed_numeric=False, + device=device, + ) + + data["fasttext"] = data_fasttext + data["llm"] = data_llm + data["sentence-llm-embed-num"] = data_sentence_llm_embed_num + data["sentence-llm-concat-num"] = data_sentence_llm_concat_num + + save_dir = f"{config_directory['data_singletable']}/{data_name}/external.pickle" + + with open(save_dir, "wb") as pickle_file: + pickle.dump(data, pickle_file) + + +def main(datalist, device: str = "cuda:0"): + + datalist_total = carte_datalist + + # Setting 
methods + if "all" in datalist: + data_list = datalist_total + else: + data_list = datalist + if isinstance(data_list, list) == False: + data_list = list(data_list) + + for data_name in data_list: + data_preprocess(data_name=data_name, device=device) + print(f"{data_name} complete!") + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Data Preparation") + parser.add_argument( + "-dt", + "--datalist", + nargs="+", + type=str, + help="List of data", + ) + parser.add_argument( + "-de", + "--device", + type=str, + help="Device", + ) + args = parser.parse_args() + + main(args.datalist, args.device) diff --git a/carte/scripts/preprocess_raw.py b/carte/scripts/preprocess_raw.py new file mode 100644 index 0000000..26e83a6 --- /dev/null +++ b/carte/scripts/preprocess_raw.py @@ -0,0 +1,1444 @@ +"""Script for preprocessing raw data.""" + +# >>> +if __name__ == "__main__": + import os + import sys + + _project_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) + os.environ["PROJECT_DIR"] = _project_dir + sys.path.append(_project_dir) + del _project_dir +# <<< + +import os +import json +import ast +import pandas as pd +import numpy as np +from carte.configs.directory import config_directory +from carte.configs.carte_configs import carte_datalist + + +def _drop_high_null(data, proportion=0.5): + """Drop columns with high fraction of missing values""" + null_num = np.array([data[col].isnull().sum() for col in data.columns]) + null_crit = int(len(data) * proportion) + null_col = list(data.columns[null_num > null_crit]) + return data.drop(columns=null_col) + + +def _drop_single_unique(data): + """Drop columns with single unique values.""" + num_unique_cols = [col for col in data.columns if data[col].nunique() == 1] + return data.drop(columns=num_unique_cols) + + +def _load_raw_data(data_name, file_type="csv", sep=","): + """Load the raw data for preprocessing.""" + data_dir = f"{config_directory['data_raw']}/{data_name}.{file_type}" + if file_type == "csv": + data = pd.read_csv(data_dir, sep=sep) + elif file_type == "json": + data_file = open(data_dir) + data = [] + for line in data_file: + data.append(json.loads(line)) + data = pd.DataFrame(data) + data_file.close() + data.columns = data.columns.str.replace(" ", "_") + data.columns = data.columns.str.replace("\n", "_") + data.columns = data.columns.str.replace("%", "Percentage") + data.replace("\n", " ", regex=True, inplace=True) + return data + + +def _save_processed_data(data_name, data, target_name, entity_name, task, repeated): + """Save the preprocessed data and configs.""" + # save the data + save_dir = f"{config_directory['data_singletable']}/{data_name}/" + if not os.path.exists(save_dir): + os.makedirs(save_dir, exist_ok=True) + data.to_parquet(save_dir + "raw.parquet") + # save the config file + config = dict() + config["entity_name"] = entity_name + config["target_name"] = target_name + config["task"] = task + config["repeated"] = repeated + with open(save_dir + "config_data.json", "w") as outfile: + json.dump(config, outfile) + return None + + +def preprocess_data(data_name): + """Preprocess the raw data with the given name of the dataset.""" + + # Load data + data = _load_raw_data(data_name) + + # Preoprocess depending on each data + if data_name == "anime_planet": + # basic info + target_name = "Rating_Score" + entity_name = "Name" + task = "regression" + repeated = False + # preprocess + data.replace("Unknown", np.nan, inplace=True) + target_name = "Rating_Score" + 
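+        # Drop rows without a rating, cast the target to float, then prune
+        # mostly-missing and single-valued columns.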
data.dropna(subset=target_name, inplace=True) + data[target_name] = data[target_name].astype("float") + data.reset_index(drop=True, inplace=True) + data = _drop_high_null(data) + data = _drop_single_unique(data) + drop_col = [] + drop_col.append("Anime-PlanetID") + drop_col.append("Number_Votes") + drop_col.append("Url") + data.drop(columns=drop_col, inplace=True) + data["Finished"] = data["Finished"].astype("str") + data["Episodes"] = data["Episodes"].astype("float") + data["Duration"] = data["Duration"].astype("float") + elif data_name == "babies_r_us": + # basic info + target_name = "price" + entity_name = "title" + task = "regression" + repeated = False + # preprocess + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data[target_name] = np.log(data[target_name]) + data = _drop_high_null(data) + data = _drop_single_unique(data) + drop_col = [] + drop_col.append("int_id") + drop_col.append("ext_id") + drop_col.append("SKU") + data.drop(columns=drop_col, inplace=True) + temp = data["is_discounted"].copy() + temp = temp.astype("str") + data["is_discounted"] = temp + elif data_name == "beer_ratings": + # basic info + target_name = "review_overall" + entity_name = "Beer_Name" + task = "regression" + repeated = False + # preprocess + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data = _drop_high_null(data) + data = _drop_single_unique(data) + drop_col = [col for col in data.columns if "review" in col] + drop_col.remove(target_name) + data.drop(columns=drop_col, inplace=True) + numeric_cols = data.select_dtypes(exclude="object").columns.to_list() + data[numeric_cols] = data[numeric_cols].astype("float") + data.rename(columns={"Beer_Name_(Full)": "Beer_Name"}, inplace=True) + elif data_name == "bikedekho": + # basic info + target_name = "price" + entity_name = "bike_name" + task = "regression" + repeated = False + # preprocess + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data[target_name] = np.emath.logn(10, data[target_name]) + data.reset_index(drop=True, inplace=True) + data = _drop_high_null(data) + data = _drop_single_unique(data) + drop_col = [] + drop_col.append("id") + data.drop(columns=drop_col, inplace=True) + data["model_year"] = data["model_year"].astype("str") + data["km_driven"] = data["km_driven"].astype("float") + elif data_name == "bikewale": + # basic info + target_name = "price" + entity_name = "bike_name" + task = "regression" + repeated = False + # preprocess + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + mask = data[target_name] >= 500 + data = data[mask] + data[target_name] = np.emath.logn(10, data[target_name]) + data.reset_index(drop=True, inplace=True) + data = _drop_high_null(data) + data = _drop_single_unique(data) + drop_col = [] + drop_col.append("id") + data.drop(columns=drop_col, inplace=True) + data["model_year"] = data["model_year"].astype("str") + data["km_driven"] = data["km_driven"].astype("float") + elif data_name == "buy_buy_baby": + # basic info + target_name = "price" + entity_name = "title" + task = "regression" + repeated = False + # preprocess + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data[target_name] = np.log(data[target_name] + 1) + data = _drop_high_null(data) + data = _drop_single_unique(data) + drop_col = [] + drop_col.append("int_id") + drop_col.append("ext_id") + drop_col.append("SKU") + 
drop_col.append("company_free") + data.drop(columns=drop_col, inplace=True) + temp = data["is_discounted"].copy() + temp = temp.astype("str") + temp[temp == "True"] = "1" + temp[temp == "False"] = "0" + data["is_discounted"] = temp + elif data_name == "cardekho": + # basic info + target_name = "price" + entity_name = "model" + task = "regression" + repeated = False + # preprocess + data.rename(columns={"km": "mileage"}, inplace=True) + data["model_year"] = data["model_year"].astype(str) + target_name = "price" + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data[target_name] = np.emath.logn(100, data[target_name]) + data = _drop_high_null(data) + data = _drop_single_unique(data) + elif data_name == "chocolate_bar_ratings": + # basic info + target_name = "Rating" + entity_name = "Specific_Bean_Origin_or_Bar_Name" + task = "classification" + repeated = False + # preprocess + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + temp = data["Rating"].copy() + temp[temp < 3.25] = 0 + temp[temp != 0] = 1 + data["Rating"] = temp + data = _drop_high_null(data) + data = _drop_single_unique(data) + data.drop(columns="REF", inplace=True) + data.columns = data.columns.str.replace(" ", "_") + data["Review_Date"] = data["Review_Date"].astype("str") + data["Cocoa_Percent"] = data["Cocoa_Percent"].str.replace("%", "") + data["Cocoa_Percent"] = data["Cocoa_Percent"].astype("float") + elif data_name == "clear_corpus": + # basic info + target_name = "BT_Easiness" + entity_name = "Title" + task = "regression" + repeated = False + # preprocess + data = data.replace("?", np.nan) + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data = _drop_high_null(data) + data = _drop_single_unique(data) + drop_col = [] + drop_col.append("ID") + drop_col.append("BT_s.e.") + drop_col.append("MPAA_#Avg") + drop_col.append("MPAA__#Max") + data.drop(columns=drop_col, inplace=True) + data.reset_index(drop=True, inplace=True) + data["Pub_Year"] = data["Pub_Year"].astype("str") + data["Pub_Year"] = data["Pub_Year"].str.split(".").str[0] + numeric_cols = data.select_dtypes(exclude="object").columns.to_list() + data[numeric_cols] = data[numeric_cols].astype("float") + elif data_name == "coffee_ratings": + # basic info + target_name = "rating" + entity_name = "name" + task = "classification" + repeated = False + # preprocess + temp = data[target_name].copy() + temp[temp <= 93] = 0 + temp[temp != 0] = 1 + data[target_name] = temp + data.reset_index(drop=True, inplace=True) + data[target_name] = data[target_name].astype("float") + data.dropna(subset=target_name, inplace=True) + data.drop_duplicates(subset=["name"], inplace=True) + data.reset_index(drop=True, inplace=True) + data = _drop_high_null(data) + data = _drop_single_unique(data) + drop_col = [] + drop_col.append("slug") + drop_col.append("all_text") + drop_col.append("review_date") + drop_col.append("est_price") + drop_col.append("aroma") + drop_col.append("acid") + drop_col.append("body") + drop_col.append("flavor") + drop_col.append("aftertaste") + drop_col.append("agtron") + data.drop(columns=drop_col, inplace=True) + elif data_name == "company_employees": + # basic info + target_name = "current_employee_estimate" + entity_name = "name" + task = "regression" + repeated = False + # preprocess + data.drop(columns=["Unnamed:_0"], inplace=True) + data.dropna(subset=target_name, inplace=True) + data.drop_duplicates(subset="name", keep=False, 
inplace=True) + data.reset_index(drop=True, inplace=True) + data[target_name] = data[target_name].astype("float") + data[target_name] = np.emath.logn(10, data[target_name]) + data = _drop_high_null(data) + data = _drop_single_unique(data) + drop_cols = [] + drop_cols.append("country") + drop_cols.append("total_employee_estimate") + data.drop(columns=drop_cols, inplace=True) + data["year_founded"] = data["year_founded"].astype("str") + data["year_founded"] = data["year_founded"].str.split(".").str[0] + temp = data["year_founded"].copy() + temp[temp == "nan"] = np.nan + data["year_founded"] = temp + num_cols = data.select_dtypes(exclude="object").columns + data[num_cols] = data[num_cols].astype("float") + elif data_name == "employee_remuneration": + # Exception with different sep + data = _load_raw_data(data_name, sep=";") + # basic info + target_name = "Remuneration" + entity_name = "Title" + task = "regression" + repeated = False + # preprocess + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data[target_name] = np.emath.logn(10, data[target_name]) + data = _drop_high_null(data) + data = _drop_single_unique(data) + data.drop(columns=["Name"], inplace=True) + data["Year"] = data["Year"].astype("str") + elif data_name == "employee_salaries": + # basic info + target_name = "current_annual_salary" + entity_name = "employee_position_title" + task = "regression" + repeated = False + # preprocess + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data[target_name] = np.emath.logn(10, data[target_name]) + data = _drop_high_null(data) + data = _drop_single_unique(data) + data["year_first_hired"] = data["year_first_hired"].astype("str") + elif data_name == "fifa22_players": + # basic info + target_name = "wage_eur" + entity_name = "name" + task = "regression" + repeated = False + # preprocess + drop_col_url = [col for col in data.columns if "_url" in col] + drop_col_id = [col for col in data.columns if "_id" in col] + data.drop(columns=drop_col_url + drop_col_id, inplace=True) + data = data[data.columns[:-68]] + data.rename(columns={"short_name": "name"}, inplace=True) + data.drop_duplicates(subset=["name"], inplace=True) + data.reset_index(drop=True, inplace=True) + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data[target_name] = np.emath.logn(10, data[target_name]) + data = _drop_high_null(data) + data = _drop_single_unique(data) + drop_col = [] + drop_col.append("long_name") + drop_col.append("overall") + drop_col.append("potential") + drop_col.append("league_name") + drop_col.append("league_level") + drop_col.append("weak_foot") + drop_col.append("skill_moves") + drop_col.append("real_face") + data.drop(columns=drop_col, inplace=True) + data["club_jersey_number"] = data["club_jersey_number"].astype("str") + data["club_jersey_number"] = data["club_jersey_number"].str.split(".").str[0] + data["club_contract_valid_until"] = data["club_contract_valid_until"].astype( + "str" + ) + data["club_contract_valid_until"] = ( + data["club_contract_valid_until"].str.split(".").str[0] + ) + num_cols = data.select_dtypes(exclude="object").columns + data[num_cols] = data[num_cols].astype("float") + elif data_name == "filmtv_movies": + # basic info + target_name = "public_vote" + entity_name = "title" + task = "regression" + repeated = False + # preprocess + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data = _drop_high_null(data) + 
data = _drop_single_unique(data) + data["year"] = data["year"].astype(str) + data["duration"] = data["duration"].astype(float) + drop_col = [] + drop_col.append("filmtv_id") + drop_col.append("avg_vote") + drop_col.append("total_votes") + drop_col.append("humor") + drop_col.append("rhythm") + drop_col.append("effort") + drop_col.append("tension") + drop_col.append("erotism") + data.drop(columns=drop_col, inplace=True) + elif data_name == "journal_jcr": + # basic info + target_name = "2021_JIF" + entity_name = "Journal_name" + task = "regression" + repeated = False + # preprocess + data.replace("N/A", np.nan, regex=True, inplace=True) + num_cols = data.columns[4:8] + num_cols = num_cols.append(data.columns[10:]) + for col in num_cols: + data[col] = data[col].str.replace(",", "") + data[col] = data[col].astype("float") + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data[target_name] = np.log(data[target_name] + 1) + data = _drop_high_null(data) + data = _drop_single_unique(data) + remove_cols = [] + remove_cols.append("Total_Citations") + remove_cols.append("2021_JCI") + remove_cols.append("JIF_Without_Self_Cites") + remove_cols.append("5_Year_JIF") + remove_cols.append("Immediacy_Index") + remove_cols.append("Normalized_Eigenfactor") + remove_cols.append("Eigenfactor") + remove_cols.append("Article_Influence_Score") + remove_cols.append("Total_Articles") + data.drop(columns=remove_cols, inplace=True) + elif data_name == "journal_sjr": + # Exception with different sep + data = _load_raw_data(data_name, sep=";") + # basic info + target_name = "H_index" + entity_name = "Title" + task = "regression" + repeated = False + # preprocess + col_keep = list(data.columns[[2, 3, 4, 7]]) + list(data.columns)[-6:] + data = data[col_keep] + data.columns = data.columns.str.replace(" ", "_") + temp1 = data["Issn"].str.split(",").str[0] + temp1 = temp1.rename("ISSN") + data["Issn"] = temp1 + temp2 = data["Issn"].str.split(",").str[1] + temp2 = temp2.rename("e-ISSN") + data["e-ISSN"] = temp2 + data.drop_duplicates(subset="Title", inplace=True) + target_name = "H_index" + data.dropna(subset=target_name, inplace=True) + data[target_name] = np.log10(data[target_name] + 1) + data.reset_index(drop=True, inplace=True) + data = _drop_high_null(data) + data = _drop_single_unique(data) + elif data_name == "jp_anime": + # basic info + target_name = "Score" + entity_name = "Name" + task = "regression" + repeated = False + # preprocess + data.replace("UNKNOWN", np.nan, inplace=True) + data.replace("Unknown", np.nan, inplace=True) + mask = data["English_name"].isnull() + temp = data["English_name"].copy() + temp[mask] = data["Name"][mask] + data["English_name"] = temp + data.reset_index(drop=True, inplace=True) + data.dropna(subset=target_name, inplace=True) + data[target_name] = data[target_name].astype("float") + data.reset_index(drop=True, inplace=True) + data[target_name] = np.log(data[target_name]) + temp = data["Aired"].copy() + data["Start_Date"] = temp.str.split(" to ").str[0] + data["End_Date"] = temp.str.split(" to ").str[1] + keep_col = list(data.columns) + keep_col.remove("anime_id") + keep_col.remove("Name") + keep_col.remove("Other_name") + keep_col.remove("Scored_By") + keep_col.remove("Image_URL") + keep_col.remove("Rank") + keep_col.remove("Aired") + data = data[keep_col] + data.rename(columns={"English_name": "Name"}, inplace=True) + num_cols = data.select_dtypes(exclude="object").columns + data[num_cols] = data[num_cols].astype("float") + data["Rating"] = 
data["Rating"].str.split(" - ").str[0] + data.drop_duplicates(subset="Name", inplace=True) + data.reset_index(drop=True, inplace=True) + data["Episodes"] = data["Episodes"].astype(float) + temp = data["Duration"].copy() + temp = temp.astype(str) + temp = temp.str.replace(" per ep", "", regex=False) + temp1 = temp.str.split(" hr").str[0] + temp1[~temp1.str.isnumeric()] = "0" + temp1 = temp1.astype(float) * 60 + temp2 = temp.str.split(" hr").str[1] + temp2 = temp2.astype(str) + temp2 = temp2.str.replace(" min", "", regex=False) + temp2 = temp2.str.replace(" ", "", regex=False) + temp2[~temp2.str.isnumeric()] = "0" + temp2 = temp2.astype(float) + temp3 = temp.copy() + temp3[temp.str.contains("hr")] = "nan" + temp3 = temp3.str.replace(" min", "", regex=False) + temp3 = temp3.str.replace(" ", "", regex=False) + temp3[~temp3.str.isnumeric()] = "0" + temp3 = temp3.astype(float) + temp = temp1 + temp2 + temp3 + temp[temp == 0] = np.nan + data["Duration"] = temp + data = _drop_high_null(data) + data = _drop_single_unique(data) + elif data_name == "k_drama": + # basic info + target_name = "score" + entity_name = "Kdrama_name" + task = "regression" + repeated = False + # preprocess + data.dropna(subset=target_name, inplace=True) + data[target_name] = data[target_name].astype("float") + data.reset_index(drop=True, inplace=True) + data[target_name] = np.log(data[target_name]) + data = _drop_high_null(data) + data = _drop_single_unique(data) + drop_cols = [] + drop_cols.append("scored_by") + drop_cols.append("Ranked") + data.drop(columns=drop_cols, inplace=True) + data["Content_Rating"] = data["Content_Rating"].str.split(" - ").str[0] + elif data_name == "michelin": + # basic info + target_name = "Award" + entity_name = "Kdrama_name" + task = "classification" + repeated = False + # preprocess + temp = data["Award"].copy() + temp[temp.str.contains("MICHELIN")] = "1" + temp[temp.str.contains("Bib Gourmand")] = "0" + temp = temp.astype("float") + data["Award"] = temp + data.rename(columns={"WebsiteUrl": "Website_Url"}, inplace=True) + data.rename( + columns={"FacilitiesAndServices": "Facilities_And_Services"}, inplace=True + ) + data.rename(columns={"PhoneNumber": "Phone_Number"}, inplace=True) + data = _drop_high_null(data) + data = _drop_single_unique(data) + data["Facilities_And_Services"] = data["Facilities_And_Services"].str.replace( + ",", ", " + ) + drop_col = [] + drop_col.append("Phone_Number") + drop_col.append("Url") + drop_col.append("Price") + drop_col.append("Facilities_And_Services") + data.drop(columns=drop_col, inplace=True) + elif data_name == "mlds_salaries": + # basic info + target_name = "salary_in_usd" + entity_name = "job_title" + task = "regression" + repeated = False + # preprocess + data.dropna(subset=target_name, inplace=True) + data[target_name] = data[target_name].astype("float") + data[target_name] = np.log10(data[target_name]) + data.reset_index(drop=True, inplace=True) + data = _drop_high_null(data) + data = _drop_single_unique(data) + data["work_year"] = data["work_year"].astype("str") + data["remote_ratio"] = data["remote_ratio"].astype("str") + mapping = dict() + mapping["experience_level"] = dict() + mapping["experience_level"]["SE"] = "Senior-level / Expert" + mapping["experience_level"]["EN"] = "Entry-level / Junior" + mapping["experience_level"]["MI"] = "Mid-level / Intermediate" + mapping["experience_level"]["EX"] = "Executive-level / Director" + mapping["employment_type"] = dict() + mapping["employment_type"]["FT"] = "Full-time" + mapping["employment_type"]["PT"] = 
"Part-time" + mapping["employment_type"]["CT"] = "Contract" + mapping["employment_type"]["FL"] = "Freelance" + mapping["remote_ratio"] = dict() + mapping["remote_ratio"]["0"] = "No remote work" + mapping["remote_ratio"]["50"] = "Partially remote" + mapping["remote_ratio"]["100"] = "Fully remote" + mapping["company_size"] = dict() + mapping["company_size"]["M"] = "Medium" + mapping["company_size"]["L"] = "Large" + mapping["company_size"]["S"] = "Small" + for name in mapping.keys(): + temp = data[name].copy() + temp = temp.map(mapping[name]) + data[name] = temp + drop_col = [] + drop_col.append("salary") + drop_col.append("salary_currency") + data.drop(columns=drop_col, inplace=True) + elif data_name == "movies": + # basic info + target_name = "revenue" + entity_name = "title" + task = "regression" + repeated = False + # preprocess + mask = data["revenue"] >= 1000 # >= 10000000 + data = data[mask] + data.dropna(subset="revenue", inplace=True) + data.reset_index(drop=True, inplace=True) + temp = data["budget"].copy() + mask = temp.str.contains(".jpg") + data = data[~mask] + data.reset_index(drop=True, inplace=True) + temp = data["runtime"].copy() + mask = temp == 0 + data = data[~mask] + data.reset_index(drop=True, inplace=True) + data["budget"] = data["budget"].astype("float") + temp = data["budget"].copy() + temp[temp == 0] = np.nan + data["budget"] = temp + data["popularity"] = data["popularity"].astype("float") + data.fillna(value=np.nan, inplace=True) + data.columns = data.columns.str.replace(" ", "_") + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data[target_name] = np.emath.logn(10, data[target_name]) + data = _drop_high_null(data) + data = _drop_single_unique(data) + adjust_cols = [ + "belongs_to_collection", + "genres", + "production_companies", + "production_countries", + "spoken_languages", + ] + extract_name = ["name", "name", "name", "name", "iso_639_1"] + for i in range(len(adjust_cols)): + if adjust_cols[i] in data.columns: + col = [] + for idx in range(len(data)): + temp = data[adjust_cols[i]][idx] + if str(temp) == "nan": + col.append(np.nan) + else: + temp = ast.literal_eval(temp) + if isinstance(temp, list) is False: + temp = [temp] + if len(temp) == 0: + col.append(np.nan) + else: + temp = pd.DataFrame(temp, index=None) + temp[extract_name[i]] = temp[extract_name[i]] + ", " + col.append(temp[extract_name[i]].sum()[:-2]) + col = pd.Series(col) + col = col.rename(adjust_cols[i]) + data[adjust_cols[i]] = col + else: + pass + drop_col = [] + drop_col.append("id") + drop_col.append("imdb_id") + drop_col.append("overview") + drop_col.append("poster_path") + drop_col.append("original_title") + drop_col.append("original_language") + data.drop(columns=drop_col, inplace=True) + data.drop_duplicates(subset=["title", target_name], inplace=True) + data.reset_index(drop=True, inplace=True) + elif data_name == "museums": + # basic info + target_name = "Revenue" + entity_name = "Museum_Name" + task = "regression" + repeated = False + # preprocess + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + mask = data[target_name] > 0 + data = data[mask].copy() + data.reset_index(drop=True, inplace=True) + data[target_name] = np.emath.logn(100, data[target_name]) + data = _drop_high_null(data) + data = _drop_single_unique(data) + drop_col = [] + drop_col.append("Museum_ID") + drop_col.append("Income") + data.drop(columns=drop_col, inplace=True) + num_cols = 
data.select_dtypes(exclude="object").columns.tolist() + num_cols.remove(target_name) + data[num_cols] = data[num_cols].astype("str") + for col in num_cols: + data[col] = data[col].str.strip(".0") + data.reset_index(drop=True, inplace=True) + elif data_name == "mydramalist": + # basic info + target_name = "rating" + entity_name = "Name" + task = "regression" + repeated = False + # preprocess + temp = data["category"].copy() + mask = temp == "Drama" + data = data[mask] + data.reset_index(drop=True, inplace=True) + temp = data["country"].copy() + mask = temp == "South Korea" + data = data[~mask] + data.reset_index(drop=True, inplace=True) + for col in data.select_dtypes(include="object").columns: + temp = data[col].copy() + temp = temp.astype(str) + temp[temp.str.isspace()] = np.nan + temp[temp == "nan"] = np.nan + data[col] = temp + data.replace("“", "", regex=True, inplace=True) + data.replace("”", "", regex=True, inplace=True) + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data = _drop_high_null(data) + data = _drop_single_unique(data) + drop_col = [] + drop_col.append("url") + data.drop(columns=drop_col, inplace=True) + temp = data["duration"].copy() + temp = temp.astype(str) + mask = temp.str.contains("hr") + temp[~mask] = "0" + temp = temp.str.split("hr").str[0] + temp1 = temp.astype(float) * 60 + temp = data["duration"].copy() + temp = temp.astype(str) + temp[mask] = "0" + temp = temp.str.split("min").str[0] + temp2 = temp.astype(float) + data["duration"] = temp1 + temp2 + elif data_name == "nba_draft": + # basic info + target_name = "value_over_replacement" + entity_name = "player" + task = "classification" + repeated = False + # preprocess + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data = _drop_high_null(data) + data = _drop_single_unique(data) + temp = data[target_name].copy() + temp[temp <= 0] = 0 + temp[temp != 0] = 1 + data[target_name] = temp + keep_col = [] + keep_col.append(target_name) + keep_col.append("year") + keep_col.append("overall_pick") + keep_col.append("team") + keep_col.append("player") + keep_col.append("college") + keep_col.append("years_active") + data = data[keep_col] + data["year"] = data["year"].astype("str") + data["overall_pick"] = data["overall_pick"].astype("str") + data.reset_index(drop=True, inplace=True) + elif data_name == "prescription_drugs": + # basic info + target_name = "WAC_at_Introduction" + entity_name = "Drug_Product_Description" + task = "regression" + repeated = False + # preprocess + unnamed_col = [col for col in data.columns if "Unnamed:" in col] + data.drop(columns=unnamed_col, inplace=True) + temp = data["Estimated_Number_of_Patients"].copy() + temp[temp == 0] = np.nan + data["Estimated_Number_of_Patients"] = temp + temp = data["Date_Introduced_to_Market"].copy() + temp = temp.str.split("-").str[0] + data["Date_Introduced_to_Market"] = temp + data.dropna( + subset=["Drug_Product_Description", "WAC_at_Introduction"], inplace=True + ) + data.reset_index(drop=True, inplace=True) + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data[target_name] = np.emath.logn(10, data[target_name]) + data = _drop_high_null(data, 0.9) + data = _drop_single_unique(data) + drop_col = [] + drop_col.append("NDC_Number") + data.drop(columns=drop_col, inplace=True) + elif data_name == "ramen_ratings": + # basic info + target_name = "Stars" + entity_name = "Variety" + task = "classification" + repeated = False + # 
preprocess + data["Stars"] = data["Stars"].str.replace("NS", "-1") + data["Stars"] = data["Stars"].str.replace("NR", "-1") + data["Stars"] = data["Stars"].str.replace("Unrated", "-1") + data["Stars"] = data["Stars"].str.split("/").str[0] + data["Stars"] = data["Stars"].astype("float") + temp = data["Stars"].copy() + temp[temp == -1] = np.nan + data["Stars"] = temp + data.dropna(subset="Stars", inplace=True) + data.reset_index(drop=True, inplace=True) + data = _drop_high_null(data) + data = _drop_single_unique(data) + drop_col = [] + drop_col.append("Review_#") + data.drop(columns=drop_col, inplace=True) + temp = data["Stars"].copy() + temp[temp < 4] = 0 + temp[temp != 0] = 1 + data["Stars"] = temp + data.drop_duplicates(inplace=True) + data.reset_index(drop=True, inplace=True) + elif data_name == "roger_ebert": + # basic info + target_name = "critic_rating" + entity_name = "movie_name" + task = "classification" + repeated = False + # preprocess + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + mask = data[target_name] > 2 + data = data[mask] + data.reset_index(drop=True, inplace=True) + temp = data[target_name].copy() + temp[temp < 3.5] = 0 + temp[temp != 0] = 1 + data[target_name] = temp + data = _drop_high_null(data) + data = _drop_single_unique(data) + data.drop(columns="id", inplace=True) + data["year"] = data["year"].astype("str") + data["year"] = data["year"].str[:4] + temp = data["duration"].str.extract(r"([0-9]+)")[0] + temp = temp.astype("float") + data["duration"] = temp + elif data_name == "rotten_tomatoes": + # basic info + target_name = "Rating_Value" + entity_name = "Name" + task = "regression" + repeated = False + # preprocess + data.drop(columns="Id", inplace=True) + data.drop(columns="ReviewCount", inplace=True) + data.drop(columns="Actors", inplace=True) + data.rename(columns={"RatingValue": "Rating_Value"}, inplace=True) + data.rename(columns={"RatingCount": "Rating_Count"}, inplace=True) + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data[target_name] = np.log(data[target_name]) + data["Year"] = data["Year"].astype("str") + data["Creator"] = data["Creator"].str.replace(",", ", ", regex=False) + data["Cast"] = data["Cast"].str.replace(",", ", ", regex=False) + data["Genre"] = data["Genre"].str.replace(",", ", ", regex=False) + data["Country"] = data["Country"].str.replace(",", ", ", regex=False) + data["Language"] = data["Language"].str.replace(",", ", ", regex=False) + data["Release_Date"] = data["Release_Date"].str.split("(").str[0] + data["Duration"] = data["Duration"].str.replace("min", "") + data["Duration"] = data["Duration"].astype("float") + data["Rating_Count"] = data["Rating_Count"].str.replace(",", "").astype("float") + data = _drop_high_null(data) + data = _drop_single_unique(data) + elif data_name == "spotify": + # basic info + target_name = "popularity" + entity_name = "track" + task = "classification" + repeated = False + # preprocess + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data = _drop_high_null(data) + data = _drop_single_unique(data) + drop_cols = [] + drop_cols.append("uri") + data.drop(columns=drop_cols, inplace=True) + data["time_signature"] = data["time_signature"].astype("str") + data["sections"] = data["sections"].astype("str") + data["key"] = data["key"].astype("str") + data["duration_ms"] = data["duration_ms"].astype("float") + temp = data["mode"].copy() + mapping = {1: "Major", 0: "Minor"} + temp = 
temp.map(mapping) + data["mode"] = temp + elif data_name == "us_accidents_counts": + # basic info + target_name = "Counts" + entity_name = "City" + task = "regression" + repeated = False + # preprocess + elif data_name == "us_accidents_severity": + # basic info + target_name = "Severity" + entity_name = "Location" + task = "classification" + repeated = False + # preprocess + elif data_name == "us_presidential": + # basic info + target_name = "target" + entity_name = "region" + task = "regression" + repeated = False + # preprocess + elif data_name == "used_cars_24": + # basic info + target_name = "Price" + entity_name = "Model" + task = "regression" + repeated = False + # preprocess + drop_col = [] + drop_col.append("Unnamed:_0") + drop_col.append("EMI_(monthly)") + data.drop(columns=drop_col, inplace=True) + data.rename(columns={"Driven_(Kms)": "Mileage"}, inplace=True) + data["Model"] = data["Car_Brand"] + " " + data["Model"] + temp = data["Ownership"].copy() + temp = temp.astype(str) + temp[temp == "1"] = "First" + temp[temp == "2"] = "Second" + temp[temp == "3"] = "Third" + temp[temp == "4"] = "Fourth" + data["Ownership"] = temp + data["Model_Year"] = data["Model_Year"].astype(str) + data["Mileage"] = data["Mileage"].astype(float) + data["Price"] = data["Price"].astype(float) + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data[target_name] = np.emath.logn(100, data[target_name]) + data = _drop_high_null(data) + data = _drop_single_unique(data) + for col in data.select_dtypes(include="object").columns: + temp = data[col].copy() + temp = temp.astype(str) + temp[temp == "nan"] = np.nan + data[col] = temp + elif data_name == "used_cars_benz_italy": + # Exception with different sep + data = _load_raw_data(data_name, sep=";") + # basic info + target_name = "price" + entity_name = "model" + task = "regression" + repeated = False + # preprocess + data.replace("unknown", np.nan, inplace=True) + drop_col = [] + drop_col.append("Unnamed:_0") + data.drop(columns=drop_col, inplace=True) + data["model"] = data["brand"] + " " + data["model"] + mapping = dict() + mapping["fuel"] = dict() + mapping["fuel"]["d"] = "diesel" + mapping["fuel"]["g"] = "petrol" + mapping["fuel"]["e"] = "electric" + mapping["fuel"]["l"] = "lpg" + mapping["seller_type"] = dict() + mapping["seller_type"]["d"] = "dealer" + mapping["seller_type"]["p"] = "private" + for col in mapping.keys(): + temp = data[col].copy() + temp = temp.map(mapping[col]) + data[col] = temp + rename_map = dict() + rename_map["first_reg"] = "first_registration_date" + rename_map["mileage_km"] = "mileage" + rename_map["power_hp"] = "power" + data.rename(columns=rename_map, inplace=True) + data["mileage"] = data["mileage"].astype(float) + data["price"] = data["price"].astype(float) + temp = data["power"].copy() + temp = temp.astype(str) + temp[~temp.str.isnumeric()] = np.nan + temp = temp.astype(float) + data["power"] = temp + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + mask = data[target_name] > 100 + data = data[mask] + data.reset_index(drop=True, inplace=True) + data[target_name] = np.emath.logn(10, data[target_name]) + data = _drop_high_null(data) + data = _drop_single_unique(data) + elif data_name == "used_cars_dot_com": + # basic info + target_name = "price" + entity_name = "model" + task = "regression" + repeated = False + # preprocess + data.rename(columns={"milage": "mileage"}, inplace=True) + data["model_year"] = data["model_year"].astype(str) + temp = 
data["mileage"].copy() + temp = temp.str.replace(" mi.", "", regex=False).str.replace( + ",", "", regex=False + ) + temp = temp.astype(float) + data["mileage"] = temp + temp = data["price"].copy() + temp = temp.str.replace("$", "", regex=False).str.replace(",", "", regex=False) + temp = temp.astype(float) + data["price"] = temp + data["model"] = data["brand"] + " " + data["model"] + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data[target_name] = np.emath.logn(100, data[target_name]) + data = _drop_high_null(data) + data = _drop_single_unique(data) + elif data_name == "used_cars_pakistan": + # basic info + target_name = "Price" + entity_name = "Model" + task = "regression" + repeated = False + # preprocess + data.rename(columns={"Make": "Brand"}, inplace=True) + data.rename(columns={"Make_Year": "Year"}, inplace=True) + data.rename(columns={"CC": "Engine_Capacity"}, inplace=True) + data["Year"] = data["Year"].astype(str) + data["Engine_Capacity"] = data["Engine_Capacity"].astype(float) + data["Mileage"] = data["Mileage"].astype(float) + data["Model"] = data["Brand"] + " " + data["Model"] + ", " + data["Version"] + drop_col = [] + drop_col.append("Brand") + drop_col.append("Version") + data.drop(columns=drop_col, inplace=True) + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data[target_name] = np.emath.logn(100, data[target_name]) + data = _drop_high_null(data) + data = _drop_single_unique(data) + elif data_name == "used_cars_saudi_arabia": + # basic info + target_name = "Price" + entity_name = "Model" + task = "regression" + repeated = False + # preprocess + data["Year"] = data["Year"].astype(str) + data["Mileage"] = data["Mileage"].astype(float) + data["Negotiable"] = data["Negotiable"].astype(str) + data["Model"] = data["Make"] + " " + data["Type"] + drop_col = [] + drop_col.append("Make") + drop_col.append("Type") + data.drop(columns=drop_col, inplace=True) + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + mask = data[target_name] < 10 + data = data[~mask] + data.reset_index(drop=True, inplace=True) + data[target_name] = np.emath.logn(100, data[target_name]) + data = _drop_high_null(data) + data = _drop_single_unique(data) + elif data_name == "videogame_sales": + # basic info + target_name = "Global_Sales" + entity_name = "Name" + task = "regression" + repeated = False + # preprocess + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data.reset_index(drop=True, inplace=True) + data[target_name] = np.log10(data[target_name] * 1e6) + drop_col = [col for col in data.columns if "Sales" in col] + drop_col.remove(target_name) + drop_col.append("Rank") + data.drop(columns=drop_col, inplace=True) + data["Year"] = data["Year"].astype("str") + data["Year"] = data["Year"].str.split(".").str[0] + temp = data["Year"].copy() + temp[temp == "nan"] = np.nan + data["Year"] = temp + data = _drop_high_null(data) + data = _drop_single_unique(data) + data.drop_duplicates(subset=["Name", "Year", "Global_Sales"], inplace=True) + data.reset_index(drop=True, inplace=True) + elif data_name == "whisky": + # basic info + target_name = "Meta_Critic" + entity_name = "Whisky" + task = "classification" + repeated = False + # preprocess + temp = data["Cost"] + map = dict() + map["$$$$$+"] = "over 300 CAD" + map["$$$$$"] = "between 125 and 300 CAD" + map["$$$$"] = "between 70 and 125 CAD" + map["$$$"] = "between 50 and 70 CAD" + map["$$"] = 
"between 30 and 50 CAD" + map["$"] = "less than 30 CAD" + data["Cost"] = temp.map(map) + temp = data["Cluster"] + map = dict() + map["A"] = "Full-bodied, sweet, pronounced sherry, fruity, honey, spicy" + map["B"] = "Full-bodied, sweet, pronounced sherry, fruity, floral, malty" + map["C"] = "Full-bodied, sweet, pronounced sherry, fruity, floral, nutty, spicy" + map["E"] = "Medium-bodied, medium-sweet, fruity, honey, malty, winey" + map["F"] = "Full-bodied, sweet, malty, fruity, spicy, smoky" + map["G"] = "Light-bodied, sweet, apéritif-style, honey, floral, fruity, spicy" + map["H"] = "Very light-bodied, sweet, apéritif-style, malty, fruity, floral" + map["I"] = "Medium-bodied, medium-sweet, smoky, medicinal, spicy, fruity, nutty" + map["J"] = "Full-bodied, dry, very smoky, pungent" + map["R0"] = "No Rye whisky" + map["R1"] = "Low Rye whisky" + map["R2"] = "Standard Rye whisky" + map["R3"] = "High Rye whisky" + map["R4"] = "Strong Rye whisky" + data["Cluster"] = temp.map(map) + data.drop(columns=["STDEV", "#", "Super_Cluster"], inplace=True) + data.fillna(value=np.nan, inplace=True) + data.reset_index(drop=True, inplace=True) + temp = data["Meta_Critic"].copy() + temp[temp <= 8.6] = 0 + temp[temp != 0] = 1 + data["Meta_Critic"] = temp + data = _drop_high_null(data) + data = _drop_single_unique(data) + elif data_name == "wikiliq_beer": + # basic info + target_name = "Price" + entity_name = "Name" + task = "regression" + repeated = False + # preprocess + data.replace("None", np.nan, inplace=True) + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data[target_name] = data[target_name].str.replace("$", "", regex=False) + data[target_name] = data[target_name].astype(float) + mask = data[target_name].copy() == 0 + data = data[~mask] + data.reset_index(drop=True, inplace=True) + data[target_name] = np.log(data[target_name]) + data = _drop_high_null(data) + data = _drop_single_unique(data) + drop_col = [] + drop_col.append("Unnamed:_0") + drop_col.append("Rating") + data.drop(columns=drop_col, inplace=True) + data["ABV"] = data["ABV"].str[:-1] + data["ABV"] = data["ABV"].astype(float) + data["Rate_Count"] = data["Rate_Count"].astype(float) + elif data_name == "wikiliq_spirit": + # basic info + target_name = "Price" + entity_name = "Name" + task = "regression" + repeated = False + # preprocess + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data[target_name] = data[target_name].str.replace("$", "", regex=False) + data[target_name] = data[target_name].astype(float) + mask = data[target_name].copy() == 0 + data = data[~mask] + data.reset_index(drop=True, inplace=True) + data[target_name] = np.log(data[target_name]) + data = _drop_high_null(data) + data = _drop_single_unique(data) + drop_col = [] + drop_col.append("Unnamed:_0") + drop_col.append("Rating") + data.drop(columns=drop_col, inplace=True) + data["ABV"] = data["ABV"].str[:-1] + data["ABV"] = data["ABV"].astype(float) + data["Rate_Count"] = data["Rate_Count"].astype(float) + data.replace("®", "", regex=True, inplace=True) + data.replace("™", "", regex=True, inplace=True) + elif data_name == "wina_pl": + # basic info + target_name = "price" + entity_name = "name" + task = "regression" + repeated = False + # preprocess + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data[target_name] = np.log10(data[target_name]) + data["vegan"] = data["vegan"].astype(str) + data["natural"] = data["natural"].astype(str) + 
data["vintage"] = data["vintage"].astype(str) + data["vintage"] = data["vintage"].str[:4] + temp = data["vintage"].copy() + temp[temp == "nan"] = np.nan + data["vintage"] = temp + data["volume"] = data["volume"] * 1000 + data = _drop_high_null(data) + data = _drop_single_unique(data) + elif data_name == "wine_dot_com_prices": + # basic info + target_name = "Prices" + entity_name = "Names" + task = "regression" + repeated = False + # preprocess + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + mask = data[target_name] == 0 + data = data[~mask] + data.reset_index(drop=True, inplace=True) + data[target_name] = np.log(data[target_name]) + data = _drop_high_null(data) + data = _drop_single_unique(data) + temp = data["Names"].copy() + data["Year"] = temp.str[-4:] + temp = data["Countrys"].copy() + data["Grapes"] = temp.str.split("from").str[0] + data["Region"] = temp.str.split("from").str[-1] + temp = data["Capacity"].copy() + temp = temp.str.replace("ml", "", regex=False) + temp = temp.astype("float") + data["Capacity"] = temp + drop_col = [] + drop_col.append("Countrys") + data.drop(columns=drop_col, inplace=True) + elif data_name == "wine_dot_com_ratings": + # basic info + target_name = "Ratings" + entity_name = "Names" + task = "regression" + repeated = False + # preprocess + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + mask = data[target_name] == 0 + data = data[~mask] + data.reset_index(drop=True, inplace=True) + data = _drop_high_null(data) + data = _drop_single_unique(data) + temp = data["Names"].copy() + data["Year"] = temp.str[-4:] + temp = data["Countrys"].copy() + data["Grapes"] = temp.str.split("from").str[0] + data["Region"] = temp.str.split("from").str[-1] + temp = data["Capacity"].copy() + temp = temp.str.replace("ml", "", regex=False) + temp = temp.astype("float") + data["Capacity"] = temp + drop_col = [] + drop_col.append("Countrys") + data.drop(columns=drop_col, inplace=True) + elif data_name == "wine_enthusiasts_prices": + # basic info + target_name = "price" + entity_name = "title" + task = "regression" + repeated = False + # preprocess + drop_col = [] + drop_col.append("Unnamed:_0") + drop_col.append("region_1") + drop_col.append("region_2") + drop_col.append("taster_twitter_handle") + data.drop(columns=drop_col, inplace=True) + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data[target_name] = np.log(data[target_name]) + data = _drop_high_null(data) + data = _drop_single_unique(data) + elif data_name == "wine_enthusiasts_ratings": + # basic info + target_name = "points" + entity_name = "title" + task = "regression" + repeated = False + # preprocess + drop_col = [] + drop_col.append("Unnamed:_0") + drop_col.append("region_1") + drop_col.append("region_2") + drop_col.append("taster_twitter_handle") + data.drop(columns=drop_col, inplace=True) + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data[target_name] = np.log(data[target_name]) + data = _drop_high_null(data) + data = _drop_single_unique(data) + elif data_name == "wine_vivino_price": + # basic info + target_name = "Price" + entity_name = "Name" + task = "regression" + repeated = False + # preprocess + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + mask = data[target_name] == 0 + data = data[~mask] + data.reset_index(drop=True, inplace=True) + data[target_name] = np.log(data[target_name]) + data = 
_drop_high_null(data) + data = _drop_single_unique(data) + data["Number_Of_Ratings"] = data["Number_Of_Ratings"].astype(float) + data["Region"] = data["Region"] + ", " + data["Country"] + drop_col = [] + drop_col.append("Country") + data.drop(columns=drop_col, inplace=True) + elif data_name == "wine_vivino_rating": + # basic info + target_name = "Rating" + entity_name = "Name" + task = "regression" + repeated = False + # preprocess + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data = _drop_high_null(data) + data = _drop_single_unique(data) + data["Number_Of_Ratings"] = data["Number_Of_Ratings"].astype(float) + elif data_name == "yelp": + # Exception with different file_type + data = _load_raw_data(data_name, file_type="json") + # basic info + target_name = "stars" + entity_name = "name" + task = "classification" + repeated = False + # preprocess + temp = data["categories"].copy() + mask = temp.str.contains("Restaurants") | temp.str.contains("Food") + data = data[mask].copy() + data.reset_index(drop=True, inplace=True) + data.dropna(subset=[target_name], inplace=True) + data.reset_index(drop=True, inplace=True) + temp = data["stars"].copy() + temp[temp <= 3.5] = 0 + temp[temp != 0] = 1 + data["stars"] = temp + temp = data["attributes"].copy() + temp = temp.to_list() + temp1 = [{} if x is None else x for x in temp] + attributes_df = pd.DataFrame(temp1) + attribute_extract_cols = [] + attribute_extract_cols.append(("RestaurantsPriceRange2", "price_range")) + for col in attribute_extract_cols: + data[col[1]] = attributes_df[col[0]] + temp = data[col[1]].copy() + temp[temp.isnull()] = np.nan + temp[temp == "None"] = np.nan + data[col[1]] = temp + temp = data["hours"].copy() + temp = temp.astype("str") + temp = temp.str.extractall(r"([A-Z]+)") + temp = temp.groupby(level=0).sum()[0] + temp = temp.str.replace("N", "") + temp = temp.str.len() + temp = temp.astype("float") + temp[temp == 0] = np.nan + data["number_of_days_open"] = temp + temp = data["is_open"].copy() + temp[temp == 1] = "open" + temp[temp == 0] = "closed" + temp = temp.astype("str") + data["is_open"] = temp + data["review_count"] = data["review_count"].astype("float") + data.drop(columns="hours", inplace=True) + data.drop(columns="attributes", inplace=True) + data.drop(columns="business_id", inplace=True) + data = _drop_high_null(data) + data = _drop_single_unique(data) + elif data_name == "zomato": + # basic info + target_name = "rating" + entity_name = "name" + task = "classification" + repeated = False + # preprocess + data[target_name].replace("--", np.nan, inplace=True) + data.dropna(subset=target_name, inplace=True) + data.reset_index(drop=True, inplace=True) + data[target_name] = data[target_name].astype("float") + temp = data[target_name].copy() + temp[temp < 4] = 0 + temp[temp != 0] = 1 + data[target_name] = temp + data = _drop_high_null(data) + data = _drop_single_unique(data) + data["cost"] = data["cost"].str[1:] + data["cost"] = data["cost"].astype("float") + drop_col = [] + drop_col.append("Unnamed:_0") + drop_col.append("id") + drop_col.append("menu") + data.drop(columns=drop_col, inplace=True) + + # Save data + _save_processed_data(data_name, data, target_name, entity_name, task, repeated) + + return None + + +# Main +def main(data_name_list): + + if "all" in data_name_list: + data_name_list = carte_datalist + else: + if isinstance(data_name_list, list) == False: + data_name_list = [data_name_list] + + for data_name in data_name_list: + preprocess_data(data_name) + 
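+ # preprocess_data writes the processed table and its metadata (target_name, entity_name, task, repeated) to disk via _save_processed_data and returns None; no exceptions are caught here, so a failure on one dataset aborts the whole run.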
print(f"{data_name} complete!") + + return None + + +if __name__ == "__main__": + + # Set parser + import argparse + + parser = argparse.ArgumentParser(description="Preprocess raw data.") + parser.add_argument( + "-dt", + "--data_name_list", + nargs="+", + type=str, + help="data_name to preprocess", + ) + args = parser.parse_args() + + main(args.data_name_list) diff --git a/carte/src/__init__.py b/carte/src/__init__.py new file mode 100644 index 0000000..fcda92d --- /dev/null +++ b/carte/src/__init__.py @@ -0,0 +1,9 @@ +from carte.src.baseline_multitable import * +from carte.src.baseline_singletable_nn import * +from carte.src.carte_estimator import * +from carte.src.carte_model import * +from carte.src.carte_gridsearch import * +from carte.src.carte_table_to_graph import * +from carte.src.evaluate_utils import * +from carte.src.visualization_utils import * +from carte.src.preprocess_utils import * diff --git a/carte/src/baseline_multitable.py b/carte/src/baseline_multitable.py new file mode 100644 index 0000000..c03271b --- /dev/null +++ b/carte/src/baseline_multitable.py @@ -0,0 +1,634 @@ +"""Baselines for multitable problem.""" + +import pandas as pd +import numpy as np + +from typing import Union +from sklearn.model_selection import train_test_split +from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin +from sklearn.utils.validation import check_is_fitted, check_random_state +from sklearn.metrics import r2_score, roc_auc_score +from joblib import Parallel, delayed + +from catboost import CatBoostRegressor, CatBoostClassifier +from xgboost import XGBRegressor, XGBClassifier +from sklearn.ensemble import ( + HistGradientBoostingRegressor, + HistGradientBoostingClassifier, +) + +class GradientBoostingMultitableBase(BaseEstimator): + """Base class for Gradient Boosting Multitable Estimator.""" + + def __init__( + self, + *, + source_data, + source_fraction, + num_model, + val_size, + random_state, + n_jobs, + ): + self.source_data = source_data + self.source_fraction = source_fraction + self.num_model = num_model + self.val_size = val_size + self.random_state = random_state + self.n_jobs = n_jobs + + def fit(self, X, y): + """Fit the model. + + Parameters + ---------- + X : Pandas dataframe of the target dataset (n_samples) + The input samples. + + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + self : object + Fitted estimator. + """ + + # Preliminary settings + self.is_fitted_ = False + self.X_ = X + self.y_ = y + self._set_gb_method() + + # Set random_state + random_state = check_random_state(self.random_state) + random_state_list = [random_state.randint(10000) for _ in range(self.num_model)] + + # Run parallel for different train/validation split + result_fit = Parallel(n_jobs=self.n_jobs)( + delayed(self._run_fit_with_source_split)(X, y, rs) + for rs in random_state_list + ) + + # Store the required results that may be used later + self.estimator_list_ = [model for (model, _) in result_fit] + self.valid_loss_ = [valid_loss for (_, valid_loss) in result_fit] + + self.is_fitted_ = True + + return self + + + def _run_fit_with_source_split(self, X, y, random_state): + """Train each model corresponding to the random_state with the split on Source and train/validtion on Target. + + Returns the trained estimator, and the validation loss of the train model. 
+ """ + + # Set validation by val_size + stratify = None + if self._estimator_type == "classifier": + stratify = self.y_ + dx_train, dx_valid, dy_train, dy_valid = train_test_split( + X, + y, + test_size=self.val_size, + shuffle=True, + stratify=stratify, + random_state=random_state, + ) + + # Set source data + X_train_source, y_train_source = self._load_source_data(random_state) + + # Total dataset + X_train = pd.concat([dx_train, X_train_source], axis=0) + y_train = pd.concat([dy_train, y_train_source], axis=0) + + # Set estimator, run fit/predict to obtain validation loss + estimator = self._set_estimator() + estimator.fit(X_train, y_train) + y_pred = self._generate_output(estimator, dx_valid) + valid_loss = self._return_score(dy_valid, y_pred) + + return estimator, valid_loss + + def _load_source_data(self, random_state): + """Loads the Source data and extract based on the defined fraction of Source. + + Applies stratification on the Source data based on their sizes. + The max. size of the source data is set at 10,000 to prevent overfitting on the Source. + """ + # Set train_size (max = 10000) + if len(self.source_data["X"]) > 10000: + train_size = 10000 / len(self.source_data["X"]) * self.source_fraction + else: + train_size = self.source_fraction + # Set split for source data + if self._estimator_type == "regressor": + stratify = self.source_data["domain_indicator"] + if self._estimator_type == "classifier": + y_source_temp = self.source_data["y"].copy() + y_source_temp = y_source_temp.astype(str) + stratify = self.source_data["domain_indicator"] + "_" + y_source_temp + X_train_source, _, y_train_source, _ = train_test_split( + self.source_data["X"], + self.source_data["y"], + train_size=train_size, + random_state=random_state, + shuffle=True, + stratify=stratify, + ) + return X_train_source, y_train_source + + def _generate_output(self, estimator, X): + """Generate output on the given estimator and X.""" + + # Predict + if self._estimator_type == "regressor": + y_pred = estimator.predict(X) + else: + y_pred = estimator.predict_proba(X) + # Reshape prediction + if self._estimator_type == "classifier": + num_pred = len(y_pred) + if y_pred.shape == (num_pred, 2): + y_pred = y_pred[:, 1] + elif y_pred.shape == (num_pred, 1): + y_pred = y_pred.ravel() + else: + pass + # Control for nan in prediction + if np.isnan(y_pred).sum() > 0: + mean_pred = np.mean(self.y_) + y_pred[np.isnan(y_pred)] = mean_pred + return y_pred + + def _return_score(self, y, y_pred): + """Return the score based on the task.""" + if self._estimator_type == "regressor": + score = r2_score(y, y_pred) + else: + score = roc_auc_score(y, y_pred) + return score + + def _set_estimator(self): + """Set the estimator according to the model of Gradient-Boosted Trees.""" + + fixed_params = dict() + if self.gb_method_ == "catboost": + fixed_params["cat_features"] = self.cat_features_ + fixed_params["verbose"] = False + fixed_params["allow_writing_files"] = False + fixed_params["thread_count"] = self.thread_count + fixed_params["max_ctr_complexity"] = 2 + catboost_params = dict() + catboost_params["max_depth"] = self.max_depth + catboost_params["learning_rate"] = self.learning_rate + catboost_params["bagging_temperature"] = self.bagging_temperature + catboost_params["l2_leaf_reg"] = self.l2_leaf_reg + catboost_params["one_hot_max_size"] = self.one_hot_max_size + catboost_params["iterations"] = self.iterations + if self._estimator_type == "regressor": + estimator_ = CatBoostRegressor(**fixed_params, **catboost_params) + else: + 
estimator_ = CatBoostClassifier(**fixed_params, **catboost_params) + elif self.gb_method_ == "xgboost": + fixed_params["booster"] = "gbtree" + fixed_params["tree_method"] = "exact" # exact approx hist + xgb_params = dict() + xgb_params["n_estimators"] = self.n_estimators + xgb_params["max_depth"] = self.max_depth + xgb_params["min_child_weight"] = self.min_child_weight + xgb_params["subsample"] = self.subsample + xgb_params["learning_rate"] = self.learning_rate + xgb_params["colsample_bylevel"] = self.colsample_bylevel + xgb_params["colsample_bytree"] = self.colsample_bytree + xgb_params["gamma"] = self.reg_gamma + xgb_params["lambda"] = self.reg_lambda + xgb_params["alpha"] = self.reg_alpha + if self._estimator_type == "regressor": + estimator_ = XGBRegressor(**fixed_params, **xgb_params) + else: + estimator_ = XGBClassifier(**fixed_params, **xgb_params) + elif self.gb_method_ == "histgb": + histgb_params = dict() + histgb_params["learning_rate"] = self.learning_rate + histgb_params["max_depth"] = self.max_depth + histgb_params["max_leaf_nodes"] = self.max_leaf_nodes + histgb_params["min_samples_leaf"] = self.min_samples_leaf + histgb_params["l2_regularization"] = self.l2_regularization + if self._estimator_type == "regressor": + estimator_ = HistGradientBoostingRegressor(**fixed_params, **histgb_params) + else: + estimator_ = HistGradientBoostingClassifier(**fixed_params, **histgb_params) + return estimator_ + + def _set_gb_method(self,): + self.gb_method_ = None + return None + +class GradientBoostingRegressorBase(RegressorMixin, GradientBoostingMultitableBase): + """Base class for Gradient Boosting Multitable Regressor.""" + + def __init__( + self, + *, + source_data, + source_fraction, + num_model, + val_size, + random_state, + n_jobs, + ): + super(GradientBoostingRegressorBase, self).__init__( + source_data=source_data, + source_fraction = source_fraction, + num_model = num_model, + val_size = val_size, + random_state = random_state, + n_jobs = n_jobs, + ) + + def predict(self, X): + """Predict values for X. Returns the average of predicted values over all the models. + + Parameters + ---------- + X : list of graph objects with size (n_samples) + The input samples. + + Returns + ------- + y : ndarray, shape (n_samples,) + The predicted values. + """ + check_is_fitted(self, "is_fitted_") + # Obtain output + X_test = X.copy() + out = [estimator.predict(X_test) for estimator in self.estimator_list_] + if self.num_model == 1: + out = np.array(out).squeeze().transpose() + else: + out = np.array(out).squeeze().transpose() + out = np.mean(out, axis=1) + # Control for nan in prediction + if np.isnan(out).sum() > 0: + mean_pred = np.mean(self.y_) + out[np.isnan(out)] = mean_pred + return out + + +class GradientBoostingClassifierBase(ClassifierMixin, GradientBoostingMultitableBase): + """Base class for Gradient Boosting Multitable Classifier.""" + + def __init__( + self, + *, + source_data, + source_fraction, + num_model, + val_size, + random_state, + n_jobs, + ): + super(GradientBoostingClassifierBase, self).__init__( + source_data=source_data, + source_fraction = source_fraction, + num_model = num_model, + val_size = val_size, + random_state = random_state, + n_jobs = n_jobs, + ) + + def predict(self, X): + """Predict classes for X. + + Parameters + ---------- + X : list of graph objects with size (n_samples) + The input samples. + + Returns + ------- + y : ndarray, shape (n_samples,) + The predicted classes. 
+ """ + check_is_fitted(self, "is_fitted_") + return np.round(self.predict_proba(X)) + + def predict_proba(self, X): + """Predict class probabilities for X. + + Parameters + ---------- + X : list of graph objects with size (n_samples) + The input samples. + + Returns + ------- + p : ndarray, shape (n_samples,) for binary classification or (n_samples, n_classes) + The class probabilities of the input samples. + """ + + check_is_fitted(self, "is_fitted_") + # Obtain output + out = [estimator.predict_proba(X)[:, 1] for estimator in self.estimator_list_] + if self.num_model == 1: + out = np.array(out).transpose() + else: + out = np.array(out).squeeze().transpose() + out = np.mean(out, axis=1) + # Control for nan in prediction + if np.isnan(out).sum() > 0: + mean_pred = np.mean(self.y_) + out[np.isnan(out)] = mean_pred + return out + + def decision_function(self, X): + """Compute the decision function of X.""" + decision = self.predict_proba(X) + return decision + + +class CatBoostMultitableRegressor(GradientBoostingRegressorBase): + """Base class for CatBoost Multitable Regressor.""" + + def __init__( + self, + *, + source_data: dict = {}, + max_depth: int = 6, + learning_rate: float = 0.03, + bagging_temperature: float = 1, + l2_leaf_reg: float = 3.0, + one_hot_max_size: int = 2, + iterations: int = 1000, + thread_count: int = 1, + source_fraction: float = 0.5, + num_model: int = 1, + val_size: float = 0.1, + random_state: int = 0, + n_jobs: int = 1, + ): + super(CatBoostMultitableRegressor, self).__init__( + source_data=source_data, + source_fraction = source_fraction, + num_model = num_model, + val_size = val_size, + random_state = random_state, + n_jobs = n_jobs, + ) + + self.max_depth = max_depth + self.learning_rate = learning_rate + self.bagging_temperature = bagging_temperature + self.l2_leaf_reg = l2_leaf_reg + self.one_hot_max_size = one_hot_max_size + self.iterations = iterations + self.thread_count = thread_count + + def _set_gb_method(self,): + """Set the Gradient-Boosting method. + + For CatBoost, it sets the required indicators of categorical columns. + """ + self.gb_method_ = "catboost" + # Set column names + X_total_train = pd.concat([self.X_, self.source_data["X"]], axis=0) + self.cat_col_names_ = X_total_train.select_dtypes( + include="object" + ).columns.tolist() + self.cat_features_ = [ + X_total_train.columns.get_loc(col) for col in self.cat_col_names_ + ] + return None + + +class CatBoostMultitableClassifier(GradientBoostingClassifierBase): + """Base class for CatBoost Multitable Classifier.""" + + def __init__( + self, + *, + source_data: dict = {}, + max_depth: int = 6, + learning_rate: float = 0.03, + bagging_temperature: float = 1, + l2_leaf_reg: float = 3.0, + one_hot_max_size: int = 2, + iterations: int = 1000, + thread_count: int = 1, + source_fraction: float = 0.5, + num_model: int = 1, + val_size: float = 0.1, + random_state: int = 0, + n_jobs: int = 1, + ): + super(CatBoostMultitableClassifier, self).__init__( + source_data=source_data, + source_fraction = source_fraction, + num_model = num_model, + val_size = val_size, + random_state = random_state, + n_jobs = n_jobs, + ) + + self.max_depth = max_depth + self.learning_rate = learning_rate + self.bagging_temperature = bagging_temperature + self.l2_leaf_reg = l2_leaf_reg + self.one_hot_max_size = one_hot_max_size + self.iterations = iterations + self.thread_count = thread_count + + def _set_gb_method(self,): + """Set the Gradient-Boosting method. 
+ + For CatBoost, it sets the required indicators of categorical columns. + """ + self.gb_method_ = "catboost" + # Set column names + X_total_train = pd.concat([self.X_, self.source_data["X"]], axis=0) + self.cat_col_names_ = X_total_train.select_dtypes( + include="object" + ).columns.tolist() + self.cat_features_ = [ + X_total_train.columns.get_loc(col) for col in self.cat_col_names_ + ] + return None + +class HistGBMultitableRegressor(GradientBoostingRegressorBase): + """Base class for Historgram Gradient Boosting Multitable Regressor.""" + + def __init__( + self, + *, + source_data: dict = {}, + learning_rate: float = 0.1, + max_depth: Union[None, int] = None, + max_leaf_nodes: int = 31, + min_samples_leaf: int = 20, + l2_regularization: float = 0, + source_fraction: float = 0.5, + num_model: int = 1, + val_size: float = 0.1, + random_state: int = 0, + n_jobs: int = 1, + ): + super(HistGBMultitableRegressor, self).__init__( + source_data=source_data, + source_fraction = source_fraction, + num_model = num_model, + val_size = val_size, + random_state = random_state, + n_jobs = n_jobs, + ) + + self.learning_rate = learning_rate + self.max_depth = max_depth + self.max_leaf_nodes = max_leaf_nodes + self.min_samples_leaf = min_samples_leaf + self.l2_regularization = l2_regularization + + def _set_gb_method(self,): + """Set the Gradient-Boosting method.""" + self.gb_method_ = "histgb" + return None + + +class HistGBMultitableClassifier(GradientBoostingClassifierBase): + """Base class for Historgram Gradient Boosting Multitable Classifier.""" + + def __init__( + self, + *, + source_data: dict = {}, + learning_rate: float = 0.1, + max_depth: Union[None, int] = None, + max_leaf_nodes: int = 31, + min_samples_leaf: int = 20, + l2_regularization: float = 0, + source_fraction: float = 0.5, + num_model: int = 1, + val_size: float = 0.1, + random_state: int = 0, + n_jobs: int = 1, + ): + super(HistGBMultitableClassifier, self).__init__( + source_data=source_data, + source_fraction = source_fraction, + num_model = num_model, + val_size = val_size, + random_state = random_state, + n_jobs = n_jobs, + ) + + self.learning_rate = learning_rate + self.max_depth = max_depth + self.max_leaf_nodes = max_leaf_nodes + self.min_samples_leaf = min_samples_leaf + self.l2_regularization = l2_regularization + + def _set_gb_method(self,): + """Set the Gradient-Boosting method.""" + self.gb_method_ = "histgb" + return None + + +class XGBoostMultitableRegressor(GradientBoostingRegressorBase): + """Base class for XGBoost Multitable Regressor.""" + + def __init__( + self, + *, + source_data: dict = {}, + n_estimators: int = 100, + max_depth: int = 6, + min_child_weight: float = 1, + subsample: float = 1, + learning_rate: float = 0.3, + colsample_bylevel: float = 1, + colsample_bytree: float = 1, + reg_gamma: float = 0, + reg_lambda: float = 1, + reg_alpha: float = 0, + source_fraction: float = 0.5, + num_model: int = 1, + val_size: float = 0.1, + random_state: int = 0, + n_jobs: int = 1, + ): + super(XGBoostMultitableRegressor, self).__init__( + source_data=source_data, + source_fraction = source_fraction, + num_model = num_model, + val_size = val_size, + random_state = random_state, + n_jobs = n_jobs, + ) + + self.n_estimators = n_estimators + self.max_depth = max_depth + self.min_child_weight = min_child_weight + self.subsample = subsample + self.learning_rate = learning_rate + self.colsample_bylevel = colsample_bylevel + self.colsample_bytree = colsample_bytree + self.reg_gamma = reg_gamma + self.reg_lambda = reg_lambda 
+ self.reg_alpha = reg_alpha + + def _set_gb_method(self,): + """Set the Gradient-Boosting method.""" + self.gb_method_ = "xgboost" + return None + + +class XGBoostMultitableClassifier(GradientBoostingClassifierBase): + """Base class for XGBoost Multitable Classifier.""" + + def __init__( + self, + *, + source_data: dict = {}, + n_estimators: int = 100, + max_depth: int = 6, + min_child_weight: float = 1, + subsample: float = 1, + learning_rate: float = 0.3, + colsample_bylevel: float = 1, + colsample_bytree: float = 1, + reg_gamma: float = 0, + reg_lambda: float = 1, + reg_alpha: float = 0, + source_fraction: float = 0.5, + num_model: int = 1, + val_size: float = 0.1, + random_state: int = 0, + n_jobs: int = 1, + ): + super(XGBoostMultitableClassifier, self).__init__( + source_data=source_data, + source_fraction = source_fraction, + num_model = num_model, + val_size = val_size, + random_state = random_state, + n_jobs = n_jobs, + ) + + self.n_estimators = n_estimators + self.max_depth = max_depth + self.min_child_weight = min_child_weight + self.subsample = subsample + self.learning_rate = learning_rate + self.colsample_bylevel = colsample_bylevel + self.colsample_bytree = colsample_bytree + self.reg_gamma = reg_gamma + self.reg_lambda = reg_lambda + self.reg_alpha = reg_alpha + + def _set_gb_method(self,): + """Set the Gradient-Boosting method.""" + self.gb_method_ = "xgboost" + return None \ No newline at end of file diff --git a/carte/src/baseline_singletable_nn.py b/carte/src/baseline_singletable_nn.py new file mode 100644 index 0000000..3e7e920 --- /dev/null +++ b/carte/src/baseline_singletable_nn.py @@ -0,0 +1,775 @@ +"""Neural network baseline for comparison.""" + +import torch +import torch.nn as nn +import numpy as np +import copy +from typing import Union +from torch import Tensor +from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin +from sklearn.model_selection import train_test_split +from sklearn.utils.validation import check_is_fitted, check_random_state +from torch.utils.data import Dataset, DataLoader +from tqdm import tqdm +from joblib import Parallel, delayed + + +## Simple MLP model +class MLP_Model(nn.Module): + def __init__( + self, + input_dim: int, + hidden_dim: int, + output_dim: int, + dropout_prob: float, + num_layers: int, + ): + super().__init__() + + self.initial = nn.Linear(input_dim, hidden_dim) + + self.mlp_block = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim), + nn.LayerNorm(hidden_dim), + nn.ReLU(), + nn.Dropout(dropout_prob), + ) + self.layers = nn.Sequential(*[self.mlp_block for _ in range(num_layers)]) + + self.classifier = nn.Linear(hidden_dim, output_dim) + + def forward(self, X): + X = self.initial(X) + X = self.layers(X) + X = self.classifier(X) + return X + + +## Residual Block +class Residual_Block(nn.Module): + def __init__( + self, + input_dim: int, + output_dim: int, + hidden_factor: int, + normalization: Union[str, None] = "layernorm", + hidden_dropout_prob: float = 0.2, + residual_dropout_prob: float = 0.2, + ): + super().__init__() + + self.lin1 = nn.Linear(input_dim, output_dim * hidden_factor) + self.lin2 = nn.Linear(output_dim * hidden_factor, output_dim) + self.relu = nn.ReLU() + self.dropout_hidden = nn.Dropout(hidden_dropout_prob) + self.dropout_residual = nn.Dropout(residual_dropout_prob) + + self.norm1: Union[nn.BatchNorm1d, nn.LayerNorm, None] + self.norm2: Union[nn.BatchNorm1d, nn.LayerNorm, None] + if normalization == "batchnorm": + self.norm1 = nn.BatchNorm1d(output_dim * hidden_factor) + self.norm2 = 
nn.BatchNorm1d(output_dim) + elif normalization == "layernorm": + self.norm1 = nn.LayerNorm(output_dim * hidden_factor) + self.norm2 = nn.LayerNorm(output_dim) + else: + self.norm1 = self.norm2 = None + + def reset_parameters(self) -> None: + self.lin1.reset_parameters() + self.lin2.reset_parameters() + if self.norm1 is not None: + self.norm1.reset_parameters() + if self.norm2 is not None: + self.norm2.reset_parameters() + + def forward(self, x: Tensor): + out = self.lin1(x) + out = self.norm1(out) if self.norm1 else out + out = self.relu(out) + out = self.dropout_hidden(out) + + out = self.lin2(out) + out = self.norm2(out) if self.norm2 else out + out = self.relu(out) + out = self.dropout_residual(out) + + out = out + x + out = self.relu(out) + + return out + + +## Resnet model +class RESNET_Model(nn.Module): + def __init__( + self, + input_dim: int, + hidden_dim: int, + output_dim: int, + num_layers: int, + **block_args + ): + super(RESNET_Model, self).__init__() + + self.initial = nn.Linear(input_dim, hidden_dim) + + self.layers = nn.ModuleList( + [ + Residual_Block( + input_dim=hidden_dim, output_dim=hidden_dim, **block_args + ) + for _ in range(num_layers) + ] + ) + + self.classifer = nn.Linear(hidden_dim, output_dim) + + def forward(self, X): + X = self.initial(X) + + for l in self.layers: + X = l(X) + + X = self.classifer(X) + return X + + +class TabularDataset(Dataset): + def __init__(self, X, y): + self.X = X + self.y = y + + def __len__(self): + return self.X.size(0) + + def __getitem__(self, idx): + return self.X[idx], self.y[idx] + + +class MLPBase(BaseEstimator): + """Base class for MLP.""" + + def __init__( + self, + *, + hidden_dim, + learning_rate, + weight_decay, + batch_size, + val_size, + num_model, + max_epoch, + early_stopping_patience, + n_jobs, + device, + random_state, + disable_pbar, + ): + self.hidden_dim = hidden_dim + self.learning_rate = learning_rate + self.weight_decay = weight_decay + self.batch_size = batch_size + self.val_size = val_size + self.num_model = num_model + self.max_epoch = max_epoch + self.early_stopping_patience = early_stopping_patience + self.n_jobs = n_jobs + self.device = device + self.random_state = random_state + self.disable_pbar = disable_pbar + + def fit(self, X, y): + # Preliminary settings + self.is_fitted_ = False + self.device_ = torch.device(self.device) + self.X_ = X + self.y_ = y + self._set_task_specific_settings() + + if isinstance(X, Tensor) == False: + X = torch.tensor(X, dtype=torch.float32) + if isinstance(y, Tensor) == False: + y = torch.tensor(y, dtype=torch.float32) + + # Set random_state + random_state = check_random_state(self.random_state) + random_state_list = [random_state.randint(1000) for _ in range(self.num_model)] + + # Fit model + result_fit = Parallel(n_jobs=self.n_jobs)( + delayed(self._run_train_with_early_stopping)(X, y, rs) + for rs in random_state_list + ) + + # Store the required results that may be used later + self.model_list_ = [model for (model, _, _) in result_fit] + self.valid_loss_ = [valid_loss for (_, valid_loss, _) in result_fit] + self.random_state_list_ = [rs for (_, _, rs) in result_fit] + self.is_fitted_ = True + + return self + + def _run_train_with_early_stopping(self, X, y, random_state): + """Train each model corresponding to the random_state with the early_stopping patience. + + This mode of training sets train/valid set for the early stopping criterion. + Returns the trained model, train and validation loss at the best epoch, and the random_state. 
+ """ + # Set validation by val_size + stratify = None + if self.model_task_ == "classification": + stratify = self.y_ + X_train, X_valid, y_train, y_valid = train_test_split( + X, + y, + test_size=self.val_size, + shuffle=True, + random_state=random_state, + stratify=stratify, + ) + + ds_train = TabularDataset(X_train, y_train) + + # Load model and optimizer + input_dim = X.size(1) + model_run_train = self._load_model(input_dim) + model_run_train.to(self.device_) + optimizer = torch.optim.AdamW( + model_run_train.parameters(), + lr=self.learning_rate, + weight_decay=self.weight_decay, + ) + + # Train model + train_loader = DataLoader(ds_train, batch_size=self.batch_size, shuffle=False) + valid_loss_best = 9e15 + + es_counter = 0 + model_best_ = copy.deepcopy(model_run_train) + for _ in tqdm( + range(1, self.max_epoch + 1), + desc=f"Model No. {random_state}", + disable=self.disable_pbar, + ): + self._run_epoch(model_run_train, optimizer, train_loader) + valid_loss = self._eval(model_run_train, X_valid, y_valid) + if valid_loss < valid_loss_best: + valid_loss_best = valid_loss + model_best_ = copy.deepcopy(model_run_train) + es_counter = 0 + else: + es_counter += 1 + if es_counter > self.early_stopping_patience: + break + model_best_.eval() + return model_best_, valid_loss_best, random_state + + def _run_epoch(self, model, optimizer, train_loader): + """Run an epoch of the input model. + + With each epoch, it updates the model and the optimizer. + """ + model.train() + for data_X, data_y in train_loader: + optimizer.zero_grad() # Clear gradients. + data_X = data_X.to(self.device_) + data_y = data_y.to(self.device_) + out = model(data_X) # Perform a single forward pass. + target = data_y + out = out.view(-1).to(torch.float64) + target = target.to(torch.float64) + loss = self.criterion_(out, target) # Compute the loss. + loss.backward() # Derive gradients. + optimizer.step() # Update parameters based on gradients. + + def _eval(self, model, X, y): + """Run an evaluation of the input data on the input model. + + Returns the selected loss of the input data from the input model. 
+ """ + X = X.to(self.device_) + y = y.to(self.device_) + with torch.no_grad(): + model.eval() + out = model(X) + target = y + out = out.view(-1).to(torch.float64) + target = target.to(torch.float64) + loss_eval = self.criterion_(out, target) + loss_eval = round(loss_eval.detach().item(), 4) + return loss_eval + + def _set_task_specific_settings(self): + self.criterion_ = None + self.output_dim_ = None + self.model_task_ = None + + def _load_model(self, input_dim): + return None + + +class BaseMLPEstimator(MLPBase): + """Base class for MLP Estimator.""" + + def __init__( + self, + *, + hidden_dim: int = 256, + num_layers: int = 2, + dropout_prob: float = 0.2, + learning_rate: float = 1e-3, + weight_decay: float = 1e-2, + batch_size: int = 128, + val_size: float = 0.1, + num_model: int = 1, + max_epoch: int = 200, + early_stopping_patience: Union[None, int] = 10, + n_jobs: int = 1, + device: str = "cpu", + random_state: int = 0, + disable_pbar: bool = True, + ): + super(BaseMLPEstimator, self).__init__( + hidden_dim=hidden_dim, + learning_rate=learning_rate, + weight_decay=weight_decay, + batch_size=batch_size, + val_size=val_size, + num_model=num_model, + max_epoch=max_epoch, + early_stopping_patience=early_stopping_patience, + n_jobs=n_jobs, + device=device, + random_state=random_state, + disable_pbar=disable_pbar, + ) + + self.num_layers = num_layers + self.dropout_prob = dropout_prob + + def _load_model(self, input_dim): + """Load the MLP model for training. + + Returns the model that can be used for training. + """ + + # Set seed for torch - for reproducibility + random_state = check_random_state(self.random_state) + model_seed = random_state.randint(10000) + torch.manual_seed(model_seed) + + model_config = dict() + model_config["input_dim"] = input_dim + model_config["hidden_dim"] = self.hidden_dim + model_config["output_dim"] = self.output_dim_ + model_config["dropout_prob"] = self.dropout_prob + model_config["num_layers"] = self.num_layers + model = MLP_Model(**model_config) + return model + + +class MLPRegressor(RegressorMixin, BaseMLPEstimator): + """ """ + + def __init__( + self, + *, + loss: str = "squared_error", + hidden_dim: int = 256, + num_layers: int = 2, + dropout_prob: float = 0.2, + learning_rate: float = 1e-3, + weight_decay: float = 1e-2, + batch_size: int = 128, + val_size: float = 0.1, + num_model: int = 1, + max_epoch: int = 200, + early_stopping_patience: Union[None, int] = 10, + n_jobs: int = 1, + device: str = "cpu", + random_state: int = 0, + disable_pbar: bool = True, + ): + super(MLPRegressor, self).__init__( + hidden_dim=hidden_dim, + num_layers=num_layers, + dropout_prob=dropout_prob, + learning_rate=learning_rate, + weight_decay=weight_decay, + batch_size=batch_size, + val_size=val_size, + num_model=num_model, + max_epoch=max_epoch, + early_stopping_patience=early_stopping_patience, + n_jobs=n_jobs, + device=device, + random_state=random_state, + disable_pbar=disable_pbar, + ) + + self.loss = loss + + def predict(self, X): + check_is_fitted(self, "is_fitted_") + if isinstance(X, Tensor) == False: + X = torch.tensor(X, dtype=torch.float32) + X = X.to(self.device_) + + # Obtain the predicitve output + with torch.no_grad(): + out = [model(X).cpu().detach().numpy() for model in self.model_list_] + + if self.num_model == 1: + out = np.array(out).squeeze().transpose() + else: + out = np.array(out).squeeze().transpose() + out = np.mean(out, axis=1) + + if np.isnan(out).sum() > 0: + mean_pred = np.mean(self.y_) + out[np.isnan(out)] = mean_pred + + return out + + 
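+    # Illustrative usage sketch (comments only, so nothing runs at import time).
+    # `MLPRegressor` takes dense numeric arrays and averages the predictions of
+    # `num_model` independently seeded networks; names below are placeholders:
+    #
+    #     import numpy as np
+    #     from sklearn.datasets import make_regression
+    #
+    #     X, y = make_regression(n_samples=200, n_features=10, random_state=0)
+    #     reg = MLPRegressor(num_model=3, max_epoch=50, random_state=0)
+    #     reg.fit(X.astype(np.float32), y.astype(np.float32))
+    #     y_pred = reg.predict(X.astype(np.float32))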
def _set_task_specific_settings(self): + if self.loss == "squared_error": + self.criterion_ = torch.nn.MSELoss() + elif self.loss == "absolute_error": + self.criterion_ = torch.nn.L1Loss() + + self.output_dim_ = 1 + self.model_task_ = "regression" + + +class MLPClassifier(ClassifierMixin, BaseMLPEstimator): + """ """ + + def __init__( + self, + *, + loss: str = "binary_crossentropy", + hidden_dim: int = 256, + num_layers: int = 2, + dropout_prob: float = 0.2, + learning_rate: float = 1e-3, + weight_decay: float = 1e-2, + batch_size: int = 128, + val_size: float = 0.1, + num_model: int = 1, + max_epoch: int = 200, + early_stopping_patience: Union[None, int] = 10, + n_jobs: int = 1, + device: str = "cpu", + random_state: int = 0, + disable_pbar: bool = True, + ): + super(MLPClassifier, self).__init__( + hidden_dim=hidden_dim, + num_layers=num_layers, + dropout_prob=dropout_prob, + learning_rate=learning_rate, + weight_decay=weight_decay, + batch_size=batch_size, + val_size=val_size, + num_model=num_model, + max_epoch=max_epoch, + early_stopping_patience=early_stopping_patience, + n_jobs=n_jobs, + device=device, + random_state=random_state, + disable_pbar=disable_pbar, + ) + + self.loss = loss + + def predict(self, X): + check_is_fitted(self, "is_fitted_") + if isinstance(X, Tensor) == False: + X = torch.tensor(X, dtype=torch.float32) + + if self.loss == "binary_crossentropy": + return np.round(self.predict_proba(X)) + elif self.loss == "categorical_crossentropy": + return np.argmax(self.predict_proba(X), axis=1) + + def predict_proba(self, X): + check_is_fitted(self, "is_fitted_") + if isinstance(X, Tensor) == False: + X = torch.tensor(X, dtype=torch.float32) + X = X.to(self.device_) + return self._get_predict_prob(X) + + def decision_function(self, X): + decision = self.predict_proba(X) + if decision.shape[1] == 1: + decision = decision.ravel() + return decision + + def _get_predict_prob(self, X): + # Obtain the predicitve output + with torch.no_grad(): + out = [model(X).cpu().detach().numpy() for model in self.model_list_] + out = np.mean(out, axis=0) + if self.loss == "binary_crossentropy": + out = 1 / (1 + np.exp(-out)) + elif self.loss == "categorical_crossentropy": + out = np.exp(out) / sum(np.exp(out)) + return out + + def _set_task_specific_settings(self): + if self.loss == "binary_crossentropy": + self.criterion_ = torch.nn.BCEWithLogitsLoss() + elif self.loss == "categorical_crossentropy": + self.criterion_ = torch.nn.CrossEntropyLoss() + + self.output_dim_ = len(np.unique(self.y_)) + if self.output_dim_ == 2: + self.output_dim_ -= 1 + self.criterion_ = torch.nn.BCEWithLogitsLoss() + + self.model_task_ = "classification" + + +class BaseRESNETEstimator(MLPBase): + """Base class for RESNET Estimator.""" + + def __init__( + self, + *, + normalization: Union[str, None] = "layernorm", + num_layers: int = 4, + hidden_dim: int = 256, + hidden_factor: int = 2, + hidden_dropout_prob: float = 0.2, + residual_dropout_prob: float = 0.2, + learning_rate: float = 1e-3, + weight_decay: float = 1e-2, + batch_size: int = 128, + val_size: float = 0.1, + num_model: int = 1, + max_epoch: int = 200, + early_stopping_patience: Union[None, int] = 10, + n_jobs: int = 1, + device: str = "cpu", + random_state: int = 0, + disable_pbar: bool = True, + ): + super(BaseRESNETEstimator, self).__init__( + hidden_dim=hidden_dim, + learning_rate=learning_rate, + weight_decay=weight_decay, + batch_size=batch_size, + val_size=val_size, + num_model=num_model, + max_epoch=max_epoch, + 
early_stopping_patience=early_stopping_patience, + n_jobs=n_jobs, + device=device, + random_state=random_state, + disable_pbar=disable_pbar, + ) + + self.normalization = normalization + self.num_layers = num_layers + self.hidden_factor = hidden_factor + self.hidden_dropout_prob = hidden_dropout_prob + self.residual_dropout_prob = residual_dropout_prob + + def _load_model(self, input_dim): + """Load the RESNET model for training. + + Returns the model that can be used for training. + """ + + # Set seed for torch - for reproducibility + random_state = check_random_state(self.random_state) + model_seed = random_state.randint(10000) + torch.manual_seed(model_seed) + + model_config = dict() + model_config["input_dim"] = input_dim + model_config["hidden_dim"] = self.hidden_dim + model_config["output_dim"] = self.output_dim_ + model_config["hidden_factor"] = self.hidden_factor + model_config["hidden_dropout_prob"] = self.hidden_dropout_prob + model_config["residual_dropout_prob"] = self.residual_dropout_prob + model_config["normalization"] = self.normalization + model_config["num_layers"] = self.num_layers + + model = RESNET_Model(**model_config) + return model + + +class RESNETRegressor(RegressorMixin, BaseRESNETEstimator): + """ """ + + def __init__( + self, + *, + loss: str = "squared_error", + normalization: Union[str, None] = "layernorm", + num_layers: int = 4, + hidden_dim: int = 256, + hidden_factor: int = 2, + hidden_dropout_prob: float = 0.2, + residual_dropout_prob: float = 0.2, + learning_rate: float = 1e-3, + weight_decay: float = 1e-2, + batch_size: int = 128, + val_size: float = 0.1, + num_model: int = 1, + max_epoch: int = 200, + early_stopping_patience: Union[None, int] = 10, + n_jobs: int = 1, + device: str = "cpu", + random_state: int = 0, + disable_pbar: bool = True, + ): + super(RESNETRegressor, self).__init__( + normalization=normalization, + num_layers=num_layers, + hidden_dim=hidden_dim, + hidden_factor=hidden_factor, + hidden_dropout_prob=hidden_dropout_prob, + residual_dropout_prob=residual_dropout_prob, + learning_rate=learning_rate, + weight_decay=weight_decay, + batch_size=batch_size, + val_size=val_size, + num_model=num_model, + max_epoch=max_epoch, + early_stopping_patience=early_stopping_patience, + n_jobs=n_jobs, + device=device, + random_state=random_state, + disable_pbar=disable_pbar, + ) + + self.loss = loss + + def predict(self, X): + check_is_fitted(self, "is_fitted_") + if isinstance(X, Tensor) == False: + X = torch.tensor(X, dtype=torch.float32) + X = X.to(self.device_) + + # Obtain the predicitve output + with torch.no_grad(): + out = [model(X).cpu().detach().numpy() for model in self.model_list_] + + if self.num_model == 1: + out = np.array(out).squeeze().transpose() + else: + out = np.array(out).squeeze().transpose() + out = np.mean(out, axis=1) + + if np.isnan(out).sum() > 0: + mean_pred = np.mean(self.y_) + out[np.isnan(out)] = mean_pred + + return out + + def _set_task_specific_settings(self): + if self.loss == "squared_error": + self.criterion_ = torch.nn.MSELoss() + elif self.loss == "absolute_error": + self.criterion_ = torch.nn.L1Loss() + + self.output_dim_ = 1 + self.model_task_ = "regression" + + +class RESNETClassifier(ClassifierMixin, BaseRESNETEstimator): + """ """ + + def __init__( + self, + *, + loss: str = "binary_crossentropy", + normalization: Union[str, None] = "layernorm", + num_layers: int = 4, + hidden_dim: int = 256, + hidden_factor: int = 2, + hidden_dropout_prob: float = 0.2, + residual_dropout_prob: float = 0.2, + learning_rate: 
float = 1e-3, + weight_decay: float = 1e-2, + batch_size: int = 128, + val_size: float = 0.1, + num_model: int = 1, + max_epoch: int = 200, + early_stopping_patience: Union[None, int] = 10, + n_jobs: int = 1, + device: str = "cpu", + random_state: int = 0, + disable_pbar: bool = True, + ): + super(RESNETClassifier, self).__init__( + normalization=normalization, + num_layers=num_layers, + hidden_dim=hidden_dim, + hidden_factor=hidden_factor, + hidden_dropout_prob=hidden_dropout_prob, + residual_dropout_prob=residual_dropout_prob, + learning_rate=learning_rate, + weight_decay=weight_decay, + batch_size=batch_size, + val_size=val_size, + num_model=num_model, + max_epoch=max_epoch, + early_stopping_patience=early_stopping_patience, + n_jobs=n_jobs, + device=device, + random_state=random_state, + disable_pbar=disable_pbar, + ) + + self.loss = loss + + def predict(self, X): + check_is_fitted(self, "is_fitted_") + if isinstance(X, Tensor) == False: + X = torch.tensor(X, dtype=torch.float32) + + if self.loss == "binary_crossentropy": + return np.round(self.predict_proba(X)) + elif self.loss == "categorical_crossentropy": + return np.argmax(self.predict_proba(X), axis=1) + + def predict_proba(self, X): + check_is_fitted(self, "is_fitted_") + if isinstance(X, Tensor) == False: + X = torch.tensor(X, dtype=torch.float32) + X = X.to(self.device_) + return self._get_predict_prob(X) + + def decision_function(self, X): + decision = self.predict_proba(X) + if decision.shape[1] == 1: + decision = decision.ravel() + return decision + + def _get_predict_prob(self, X): + # Obtain the predicitve output + with torch.no_grad(): + out = [model(X).cpu().detach().numpy() for model in self.model_list_] + out = np.mean(out, axis=0) + if self.loss == "binary_crossentropy": + out = 1 / (1 + np.exp(-out)) + elif self.loss == "categorical_crossentropy": + out = np.exp(out) / sum(np.exp(out)) + return out + + def _set_task_specific_settings(self): + if self.loss == "binary_crossentropy": + self.criterion_ = torch.nn.BCEWithLogitsLoss() + elif self.loss == "categorical_crossentropy": + self.criterion_ = torch.nn.CrossEntropyLoss() + + self.output_dim_ = len(np.unique(self.y_)) + if self.output_dim_ == 2: + self.output_dim_ -= 1 + self.criterion_ = torch.nn.BCEWithLogitsLoss() + + self.model_task_ = "classification" diff --git a/carte/src/carte_estimator.py b/carte/src/carte_estimator.py new file mode 100644 index 0000000..ac67ad4 --- /dev/null +++ b/carte/src/carte_estimator.py @@ -0,0 +1,1541 @@ +"""CARTE estimators for regression and classification.""" + +import torch +import numpy as np +import pandas as pd +import copy +import math +from typing import Union +from torcheval.metrics import ( + MeanSquaredError, + R2Score, + BinaryAUROC, + BinaryNormalizedEntropy, + BinaryAUPRC, + MulticlassAUROC, +) +from torch import Tensor +from torch_geometric.loader import DataLoader +from torch_geometric.data import Batch +from sklearn.model_selection import RepeatedKFold, RepeatedStratifiedKFold, ShuffleSplit, StratifiedShuffleSplit, ParameterGrid, train_test_split +from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin +from sklearn.utils.validation import check_is_fitted, check_random_state +from joblib import Parallel, delayed +from tqdm import tqdm +from scipy.special import softmax +from carte.src.carte_model import CARTE_NN_Model, CARTE_NN_Model_Ablation +from carte.configs.directory import config_directory + + +class BaseCARTEEstimator(BaseEstimator): + """Base class for CARTE Estimator.""" + + def __init__( 
+ self, + *, + num_layers, + load_pretrain, + freeze_pretrain, + learning_rate, + batch_size, + max_epoch, + dropout, + val_size, + cross_validate, + early_stopping_patience, + num_model, + random_state, + n_jobs, + device, + disable_pbar, + ): + self.num_layers = num_layers + self.load_pretrain = load_pretrain + self.freeze_pretrain = freeze_pretrain + self.learning_rate = learning_rate + self.batch_size = batch_size + self.max_epoch = max_epoch + self.dropout = dropout + self.val_size = val_size + self.cross_validate = cross_validate + self.early_stopping_patience = early_stopping_patience + self.num_model = num_model + self.random_state = random_state + self.n_jobs = n_jobs + self.device = device + self.disable_pbar = disable_pbar + + def fit(self, X, y): + """Fit the CARTE model. + + Parameters + ---------- + X : list of graph objects with size (n_samples) + The input samples. + + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + self : object + Fitted estimator. + """ + # Preliminary settings + self.is_fitted_ = False + self.device_ = torch.device(self.device) + self.X_ = X + self.y_ = y + self._set_task_specific_settings() + + # Set the cv-splits + splits = self._set_train_valid_split() + + # Fit model + result_fit = Parallel(n_jobs=self.n_jobs)( + delayed(self._run_train_with_early_stopping)(X, split_index) + for split_index in splits + ) + + # Store the required results that may be used later + self.model_list_ = [model for (model, _) in result_fit] + self.valid_loss_ = [valid_loss for (_, valid_loss) in result_fit] + self.weights_ = np.array([1/self.num_model]*self.num_model) + self.is_fitted_ = True + + return self + + def _run_train_with_early_stopping(self, X, split_index): + """Train each model corresponding to the random_state with the early_stopping patience. + + This mode of training sets train/valid set for the early stopping criterion. + Returns the trained model, and the validation loss at the best epoch. + """ + + # Set datasets + ds_train = [X[i] for i in split_index[0]] + ds_valid = [X[i] for i in split_index[1]] + + # Set validation batch for evaluation + ds_valid_eval = self._set_data_eval(data=ds_valid) + + # Load model and optimizer + model_run_train = self._load_model() + model_run_train.to(self.device_) + optimizer = torch.optim.AdamW( + model_run_train.parameters(), lr=self.learning_rate + ) + + # Train model + train_loader = DataLoader(ds_train, batch_size=self.batch_size, shuffle=False) + valid_loss_best = 9e15 + es_counter = 0 + model_best_ = copy.deepcopy(model_run_train) + for _ in tqdm( + range(1, self.max_epoch + 1), + desc=f"Model No. xx", + disable=self.disable_pbar, + ): + self._run_epoch(model_run_train, optimizer, train_loader) + valid_loss = self._eval(model_run_train, ds_valid_eval) + if valid_loss < valid_loss_best: + valid_loss_best = valid_loss + model_best_ = copy.deepcopy(model_run_train) + es_counter = 0 + else: + es_counter += 1 + if es_counter > self.early_stopping_patience: + break + model_best_.eval() + return model_best_, valid_loss_best + + def _run_epoch(self, model, optimizer, train_loader): + """Run an epoch of the input model. + + Each epoch consists of steps that update the model and the optimizer. + """ + model.train() + for data in train_loader: # Iterate in batches over the training dataset. + self._run_step(model, data, optimizer) + + def _run_step(self, model, data, optimizer): + """Run a step of the training. + + With each step, it updates the model and the optimizer. 
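+
+        Notes
+        -----
+        For single-output tasks (``output_dim_ == 1``) the logits are flattened so
+        that their shape matches the targets expected by ``MSELoss`` and
+        ``BCEWithLogitsLoss``. A minimal, self-contained illustration::
+
+            import torch
+
+            out = torch.randn(8, 1)                      # raw model output
+            target = torch.randint(0, 2, (8,)).float()   # targets of shape (8,)
+            loss = torch.nn.BCEWithLogitsLoss()(out.view(-1), target)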
+ """ + optimizer.zero_grad() # Clear gradients. + data.to(self.device_) # Send to device + out = model(data) # Perform a single forward pass. + target = data.y # Set target + if self.output_dim_ == 1: + out = out.view(-1).to(torch.float32) # Reshape outputSet head index + target = target.to(torch.float32) # Reshape target + loss = self.criterion_(out, target) # Compute the loss. + loss.backward() # Derive gradients. + optimizer.step() # Update parameters based on gradients. + + def _eval(self, model, ds_eval): + """Run an evaluation of the input data on the input model. + + Returns the selected loss of the input data from the input model. + """ + with torch.no_grad(): + model.eval() + out = model(ds_eval) + target = ds_eval.y + if self.output_dim_ == 1: + out = out.view(-1).to(torch.float32) + target = target.to(torch.float32) + self.valid_loss_metric_.update(out, target) + loss_eval = self.valid_loss_metric_.compute() + loss_eval = loss_eval.detach().item() + if self.valid_loss_flag_ == "neg": + loss_eval = -1 * loss_eval + self.valid_loss_metric_.reset() + return loss_eval + + def _set_train_valid_split(self): + """Train/validation split for the bagging strategy. + + The style of split depends on the cross_validate parameter. + Reuturns the train/validation split with KFold cross-validation. + """ + + if self._estimator_type == "regressor": + if self.cross_validate: + n_splits = int(1 / self.val_size) + n_repeats = int(self.num_model / n_splits) + splitter = RepeatedKFold( + n_splits=n_splits, n_repeats=n_repeats, random_state=self.random_state, + ) + else: + splitter = ShuffleSplit(n_splits = self.num_model, test_size=self.val_size, random_state=self.random_state) + splits = [ + (train_index, test_index) + for train_index, test_index in splitter.split(np.arange(0, len(self.X_))) + ] + else: + if self.cross_validate: + n_splits = int(1 / self.val_size) + n_repeats = int(self.num_model / n_splits) + splitter = RepeatedStratifiedKFold( + n_splits=n_splits, n_repeats=n_repeats, random_state=self.random_state, + ) + else: + splitter = StratifiedShuffleSplit(n_splits = self.num_model, test_size=self.val_size, random_state=self.random_state) + splits = [ + (train_index, test_index) + for train_index, test_index in splitter.split( + np.arange(0, len(self.X_)), self.y_ + ) + ] + + return splits + + def _set_data_eval(self, data): + """Constructs the aggregated graph object from the list of data. + + This is consistent with the graph object from torch_geometric. + Returns the aggregated graph object. + """ + make_batch = Batch() + with torch.no_grad(): + ds_eval = make_batch.from_data_list(data, follow_batch=["edge_index"]) + ds_eval.to(self.device_) + return ds_eval + + def _generate_output(self, X, model_list, weights): + """Generate the output from the trained model. + + Returns the output (prediction) of input X. 
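+
+        Notes
+        -----
+        Self-contained sketch of the aggregation performed here (illustrative
+        numbers): per-model outputs are averaged with the bagging weights and, for
+        classifiers, passed through the link function::
+
+            import numpy as np
+
+            outputs = np.array([[0.2, 1.3, -0.4],        # model 1, three samples
+                                [0.6, 0.9, -0.2]])       # model 2, three samples
+            avg = np.average(outputs, weights=[0.5, 0.5], axis=0)
+            proba = 1 / (1 + np.exp(-avg))               # 'binary_crossentropy' case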
+ """ + + # Obtain the batch to feed into the network + ds_predict_eval = self._set_data_eval(data=X) + with torch.no_grad(): + out = [ + model(ds_predict_eval).cpu().detach().numpy() for model in model_list + ] + out = np.array(out).squeeze().transpose() + if len(model_list) != 1: + out = np.average(out, weights=weights, axis=1) + + # Change if the task is classification + if self.loss == "binary_crossentropy": + out = 1 / (1 + np.exp(-out)) + elif self.loss == "categorical_crossentropy": + out = softmax(out, axis=1) + + # Control for nulls in prediction + if np.isnan(out).sum() > 0: + mean_pred = np.mean(self.y_) + out[np.isnan(out)] = mean_pred + return out + + def _set_task_specific_settings(self): + """Set task specific settings for regression and classfication. + """ + + if self._estimator_type == "regressor": + if self.loss == "squared_error": + self.criterion_ = torch.nn.MSELoss() + elif self.loss == "absolute_error": + self.criterion_ = torch.nn.L1Loss() + if self.scoring == "squared_error": + self.valid_loss_metric_ = MeanSquaredError() + self.valid_loss_flag_ = "pos" + elif self.scoring == "r2_score": + self.valid_loss_metric_ = R2Score() + self.valid_loss_flag_ = "neg" + self.output_dim_ = 1 + elif self._estimator_type == "classifier": + if self.loss == "binary_crossentropy": + self.criterion_ = torch.nn.BCEWithLogitsLoss() + elif self.loss == "categorical_crossentropy": + self.criterion_ = torch.nn.CrossEntropyLoss() + self.output_dim_ = len(np.unique(self.y_)) + if self.output_dim_ == 2: + self.output_dim_ -= 1 + self.criterion_ = torch.nn.BCEWithLogitsLoss() + if self.scoring == "auroc": + self.valid_loss_metric_ = BinaryAUROC() + self.valid_loss_flag_ = "neg" + elif self.scoring == "binary_entropy": + self.valid_loss_metric_ = BinaryNormalizedEntropy(from_logits = True) + self.valid_loss_flag_ = "neg" + elif self.scoring == "auprc": + self.valid_loss_metric_ = BinaryAUPRC() + self.valid_loss_flag_ = "neg" + if self.loss == "categorical_crossentropy": + self.valid_loss_metric_ = MulticlassAUROC(num_classes=self.output_dim_) + self.valid_loss_flag_ = "neg" + self.classes_ = np.unique(self.y_) + self.valid_loss_metric_.to(self.device_) + + def _load_model(self): + """Load the CARTE model for training. + + This loads the pretrained weights if the parameter load_pretrain is set to True. + The freeze of the pretrained weights are controlled by the freeze_pretrain parameter. + + Returns the model that can be used for training. 
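+
+        Notes
+        -----
+        ``strict=False`` lets the freshly initialized task head keep its weights while
+        only the matching pretrained keys are loaded. Generic illustration with toy
+        modules (unrelated to the actual CARTE architecture)::
+
+            import torch
+
+            src = torch.nn.Linear(4, 4)                     # "pretrained" block
+            dst = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.Linear(4, 1))
+            keys = {"0." + k: v for k, v in src.state_dict().items()}
+            result = dst.load_state_dict(keys, strict=False)
+            # result.missing_keys -> ['1.weight', '1.bias']; the head stays untouched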
+ """ + # Model configuration + model_config = dict() + model_config["input_dim_x"] = self.X_[0].x.size(1) + model_config["input_dim_e"] = self.X_[0].x.size(1) + model_config["hidden_dim"] = self.X_[0].x.size(1) + model_config["ff_dim"] = self.X_[0].x.size(1) + model_config["num_heads"] = 12 + model_config["num_layers"] = self.num_layers-1 + model_config["output_dim"] = self.output_dim_ + model_config["dropout"] = self.dropout + + # Set seed for torch - for reproducibility + random_state = check_random_state(self.random_state) + model_seed = random_state.randint(10000) + torch.manual_seed(model_seed) + + # Set model architecture + model = CARTE_NN_Model(**model_config) + + # Load the pretrained weights if specified + if self.load_pretrain: + dir_model = config_directory["pretrained_model"] + pretrain_model_dict = torch.load(dir_model, map_location=self.device_) + initial_x_keys = [ + key for key in pretrain_model_dict.keys() if "initial_x" in key + ] + for key in initial_x_keys: + pretrain_model_dict[key + "_pretrain"] = pretrain_model_dict.pop(key) + model.load_state_dict(pretrain_model_dict, strict=False) + + # Freeze the pretrained weights if specified + if self.freeze_pretrain: + for param in model.ft_base.read_out_block.parameters(): + param.requires_grad = False + for param in model.ft_base.layers.parameters(): + param.requires_grad = False + + return model + + +class CARTERegressor(RegressorMixin, BaseCARTEEstimator): + """CARTE Regressor for Regression tasks. + + This estimator is GNN-based model compatible with the CARTE pretrained model. + + Parameters + ---------- + loss : {'squared_error', 'absolute_error'}, default='squared_error' + The loss function used for backpropagation. + scoring : {'r2_score', 'squared_error'}, default='r2_score' + The scoring function used for validation. + num_layers : int, default=1 + The number of layers for the NN model + load_pretrain : bool, default=True + Indicates whether to load pretrained weights or not + freeze_pretrain : bool, default=True + Indicates whether to freeze the pretrained weights in the training or not + learning_rate : float, default=1e-3 + The learning rate of the model. The model uses AdamW as the optimizer + batch_size : int, default=16 + The batch size used for training + max_epoch : int or None, default=500 + The maximum number of epoch for training + dropout : float, default=0 + The dropout rate for training + val_size : float, default=0.1 + The size of the validation set used for early stopping + cross_validate : bool, default=False + Indicates whether to use cross-validation strategy for train/validation split + early_stopping_patience : int or None, default=40 + The early stopping patience when early stopping is used. + If set to None, no early stopping is employed + num_model : int, default=1 + The total number of models used for Bagging strategy + random_state : int or None, default=0 + Pseudo-random number generator to control the train/validation data split + if early stoppingis enabled, the weight initialization, and the dropout. + Pass an int for reproducible output across multiple function calls. + n_jobs : int, default=1 + Number of jobs to run in parallel. Training the estimator the score are parallelized + over the number of models. + device : {"cpu", "gpu"}, default="cpu", + The device used for the estimator. + disable_pbar : bool, default=True + Indicates whether to show progress bars for the training process. 
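+
+    Examples
+    --------
+    Minimal illustrative sketch, not a tested snippet. It assumes the table has
+    already been converted into a list of graph objects with the package's
+    table-to-graph transformer (assumed here to be importable as
+    ``Table2GraphTransformer``; adapt the import and preprocessing to your setup)::
+
+        from carte.src.carte_table_to_graph import Table2GraphTransformer  # assumed path
+        from carte.src.carte_estimator import CARTERegressor
+
+        preprocessor = Table2GraphTransformer()
+        graphs_train = preprocessor.fit_transform(X_train_df, y=y_train)
+        graphs_test = preprocessor.transform(X_test_df)
+
+        estimator = CARTERegressor(num_model=5, n_jobs=5, disable_pbar=False)
+        estimator.fit(X=graphs_train, y=y_train)
+        y_pred = estimator.predict(graphs_test)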
+ """ + + def __init__( + self, + *, + loss: str = "squared_error", + scoring: str = "r2_score", + num_layers: int = 1, + load_pretrain: bool = True, + freeze_pretrain: bool = True, + learning_rate: float = 1e-3, + batch_size: int = 16, + max_epoch: int = 500, + dropout: float = 0, + val_size: float = 0.2, + cross_validate: bool = False, + early_stopping_patience: Union[None, int] = 40, + num_model: int = 1, + random_state: int = 0, + n_jobs: int = 1, + device: str = "cpu", + disable_pbar: bool = True, + ): + super(CARTERegressor, self).__init__( + num_layers=num_layers, + load_pretrain=load_pretrain, + freeze_pretrain=freeze_pretrain, + learning_rate=learning_rate, + batch_size=batch_size, + max_epoch=max_epoch, + dropout=dropout, + val_size=val_size, + cross_validate=cross_validate, + early_stopping_patience=early_stopping_patience, + num_model=num_model, + random_state=random_state, + n_jobs=n_jobs, + device=device, + disable_pbar=disable_pbar, + ) + + self.loss = loss + self.scoring = scoring + + def predict(self, X): + """Predict values for X. Returns the average of predicted values over all the models. + + Parameters + ---------- + X : list of graph objects with size (n_samples) + The input samples. + + Returns + ------- + y : ndarray, shape (n_samples,) + The predicted values. + """ + + check_is_fitted(self, "is_fitted_") + + out = self._generate_output(X=X, model_list = self.model_list_, weights=None) + + return out + + +class CARTEClassifier(ClassifierMixin, BaseCARTEEstimator): + """CARTE Classifier for Classification tasks. + + This estimator is GNN-based model compatible with the CARTE pretrained model. + + Parameters + ---------- + loss : {'binary_crossentropy', 'categorical_crossentropy'}, default='binary_crossentropy' + The loss function used for backpropagation. + scoring : {'auroc', 'auprc', 'binary_entropy'}, default='auroc' + The scoring function used for validation. + num_layers : int, default=1 + The number of layers for the NN model + load_pretrain : bool, default=True + Indicates whether to load pretrained weights or not + freeze_pretrain : bool, default=True + Indicates whether to freeze the pretrained weights in the training or not + learning_rate : float, default=1e-3 + The learning rate of the model. The model uses AdamW as the optimizer + batch_size : int, default=16 + The batch size used for training + max_epoch : int or None, default=500 + The maximum number of epoch for training + dropout : float, default=0 + The dropout rate for training + val_size : float, default=0.1 + The size of the validation set used for early stopping + cross_validate : bool, default=False + Indicates whether to use cross-validation strategy for train/validation split + early_stopping_patience : int or None, default=40 + The early stopping patience when early stopping is used. + If set to None, no early stopping is employed + num_model : int, default=1 + The total number of models used for Bagging strategy + random_state : int or None, default=0 + Pseudo-random number generator to control the train/validation data split + if early stoppingis enabled, the weight initialization, and the dropout. + Pass an int for reproducible output across multiple function calls. + n_jobs : int, default=1 + Number of jobs to run in parallel. Training the estimator the score are parallelized + over the number of models. + device : {"cpu", "gpu"}, default="cpu", + The device used for the estimator. + disable_pbar : bool, default=True + Indicates whether to show progress bars for the training process. 
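+
+    Examples
+    --------
+    Minimal illustrative sketch, mirroring ``CARTERegressor`` above and assuming the
+    inputs were already converted into graph objects (placeholder variable names).
+    For a multiclass target, switch to the categorical loss::
+
+        from carte.src.carte_estimator import CARTEClassifier
+
+        clf = CARTEClassifier(loss="categorical_crossentropy", num_model=5, n_jobs=5)
+        clf.fit(X=graphs_train, y=y_train)
+        proba = clf.predict_proba(graphs_test)
+        labels = clf.predict(graphs_test)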
+ """ + + def __init__( + self, + *, + loss: str = "binary_crossentropy", + scoring: str = "auroc", + num_layers: int = 1, + load_pretrain: bool = True, + freeze_pretrain: bool = True, + learning_rate: float = 1e-3, + batch_size: int = 16, + max_epoch: int = 500, + dropout: float = 0, + val_size: float = 0.2, + cross_validate: bool = False, + early_stopping_patience: Union[None, int] = 40, + num_model: int = 1, + random_state: int = 0, + n_jobs: int = 1, + device: str = "cpu", + disable_pbar: bool = True, + ): + super(CARTEClassifier, self).__init__( + num_layers=num_layers, + load_pretrain=load_pretrain, + freeze_pretrain=freeze_pretrain, + learning_rate=learning_rate, + batch_size=batch_size, + max_epoch=max_epoch, + dropout=dropout, + val_size=val_size, + cross_validate=cross_validate, + early_stopping_patience=early_stopping_patience, + num_model=num_model, + random_state=random_state, + n_jobs=n_jobs, + device=device, + disable_pbar=disable_pbar, + ) + + self.loss = loss + self.scoring = scoring + + def predict(self, X): + """Predict classes for X. + + Parameters + ---------- + X : list of graph objects with size (n_samples) + The input samples. + + Returns + ------- + y : ndarray, shape (n_samples,) + The predicted classes. + """ + check_is_fitted(self, "is_fitted_") + + if self.loss == "binary_crossentropy": + return np.round(self.predict_proba(X)) + elif self.loss == "categorical_crossentropy": + return np.argmax(self.predict_proba(X), axis=1) + + def predict_proba(self, X): + """Predict class probabilities for X. + + Parameters + ---------- + X : list of graph objects with size (n_samples) + The input samples. + + Returns + ------- + p : ndarray, shape (n_samples,) for binary classification or (n_samples, n_classes) + The class probabilities of the input samples. + """ + check_is_fitted(self, "is_fitted_") + return self._get_predict_prob(X) + + def decision_function(self, X): + """Compute the decision function of X. + + Parameters + ---------- + X : list of graph objects with size (n_samples) + The input samples. + + Returns + ------- + decision : ndarray, shape (n_samples,) + """ + decision = self.predict_proba(X) + if decision.shape[1] == 1: + decision = decision.ravel() + return decision + + def _get_predict_prob(self, X): + """Return the average of the outputs over all the models. + + Parameters + ---------- + X : list of graph objects with size (n_samples) + The input samples. + + Returns + ------- + raw_predictions : array, shape (n_samples,) + The raw predicted values. 
+ """ + + out = self._generate_output(X=X, model_list = self.model_list_, weights=None) + + return out + + +class IdxIterator: + """Class for iterating indices to set up the batch for CARTE Multitables""" + + def __init__( + self, + n_batch: int, + domain_indicator: Tensor, + target_fraction: float, + ): + self.n_batch = n_batch + self.target_fraction = target_fraction + self.domain_indicator = domain_indicator + + # Number of samples for target and source + self.num_t = (domain_indicator == 0).sum().item() + self.count_t = torch.ones(self.num_t) + + self.num_source_domain = domain_indicator.unique().size(0) - 1 + + domain_list = domain_indicator.unique() + source_domain_list = domain_list[domain_list != 0] + + self.num_s = [(domain_indicator == x).sum().item() for x in source_domain_list] + + count_s_ = [torch.ones(x) for x in self.num_s] + self.count_s = count_s_[0] + for x in range(1, self.num_source_domain): + self.count_s = torch.block_diag(self.count_s, count_s_[x]) + if self.num_source_domain == 1: + self.count_s = self.count_s.reshape(1, -1) + self.count_s_fixed = copy.deepcopy(self.count_s) + + self.train_flag = None + + self.set_num_samples() + + def set_num_samples(self): + self.num_samples_t = math.ceil(self.n_batch * self.target_fraction) + n_batch_source_total = int((self.n_batch - self.num_samples_t)) + num_samples_s = [ + int(n_batch_source_total / self.num_source_domain) + for _ in range(self.num_source_domain) + ] + if sum(num_samples_s) != n_batch_source_total: + num_samples_s[ + torch.randint(0, self.num_source_domain, (1,)) + ] += n_batch_source_total - sum(num_samples_s) + self.num_samples_s = num_samples_s + + def sample(self): + idx_batch_t = torch.multinomial( + self.count_t, num_samples=self.num_samples_t, replacement=False + ) + self.count_t[idx_batch_t] -= 1 + + idx_batch_s = torch.tensor([]).to(dtype=torch.long) + for x in range(self.num_source_domain): + idx_batch_s_ = torch.multinomial( + self.count_s[x], num_samples=self.num_samples_s[x], replacement=False + ) + self.count_s[x, idx_batch_s_] -= 1 + idx_batch_s = torch.hstack([idx_batch_s, idx_batch_s_]) + if torch.sum(self.count_s[x, :]) < self.num_samples_s[x]: + self.count_s[x] = self.count_s_fixed[x, :] + + if torch.sum(self.count_t) < self.num_samples_t: + self.count_t = torch.ones(self.num_t) + self.train_flag = False + + return idx_batch_t, idx_batch_s + + +class BaseCARTEMultitableEstimator(BaseCARTEEstimator): + """Base class for CARTE Multitable Estimator.""" + + def __init__( + self, + *, + source_data, + num_layers, + load_pretrain, + freeze_pretrain, + learning_rate, + batch_size, + max_epoch, + dropout, + val_size, + target_fraction, + early_stopping_patience, + num_model, + random_state, + n_jobs, + device, + disable_pbar, + ): + + super(BaseCARTEMultitableEstimator, self).__init__( + num_layers=num_layers, + load_pretrain=load_pretrain, + freeze_pretrain=freeze_pretrain, + learning_rate=learning_rate, + batch_size=batch_size, + max_epoch=max_epoch, + dropout=dropout, + val_size=val_size, + early_stopping_patience=early_stopping_patience, + num_model=num_model, + random_state=random_state, + n_jobs=n_jobs, + device=device, + disable_pbar=disable_pbar, + cross_validate=False, # overridden + ) + + self.source_data = source_data + self.target_fraction = target_fraction + + def fit(self, X, y): + """Fit the CARTE Multitable model. + + Parameters + ---------- + X : list of graph objects with size (n_samples) + The input samples of the target data. 
+ + y : array-like of shape (n_samples,) + Target values. + + Returns + ------- + self : object + Fitted estimator. + """ + + # Preliminary settings + self.is_fitted_ = False + self.device_ = torch.device(self.device) + self.X_ = X + self.y_ = y + self._set_task_specific_settings() + + # Set random_state, source list, and grid for parallelism + random_state = check_random_state(self.random_state) + random_state_list = [random_state.randint(1000) for _ in range(self.num_model)] + self.source_list_total_ = list(self.source_data.keys()) + ["target"] + grid = {"source": self.source_list_total_, "random_state": random_state_list} + model_space_total = list(ParameterGrid(grid)) + + # Fit model + result_fit = Parallel(n_jobs=self.n_jobs)( + delayed(self._run_train_with_early_stopping)(model_space) + for model_space in model_space_total + ) + + self.result_fit_ = result_fit + + # Store the required results that may be used later + self.model_list_ = [model for (model, _, _, _) in result_fit] + self.valid_loss_ = [valid_loss for (_, valid_loss, _, _) in result_fit] + self.source_list_ = [sl for (_, _, sl, _) in result_fit] + self.random_state_list_ = [rs for (_, _, _, rs) in result_fit] + self.is_fitted_ = True + + val_loss_mean_ = [] + val_loss_total_ = [] + for source_name in self.source_list_total_: + idx_ = np.where(np.array(self.source_list_) == source_name)[0] + val_loss_total_ += [self.valid_loss_[idx] for idx in idx_] + val_loss_mean_ += [np.array(val_loss_total_).mean()] + val_loss_mean_ = -1 * np.array(val_loss_mean_) + val_loss_total_ = -1 * np.array(val_loss_total_) + weights = val_loss_mean_ / val_loss_total_.std() + self.weights_ = np.exp(weights) / sum(np.exp(weights)) + + return self + + def _run_train_with_early_stopping(self, model_space): + """Train each model corresponding to the random_state with the early_stopping patience. + + This mode of training sets train/valid set for the early stopping criterion. + Returns the trained model, train and validation loss at the best epoch, and the random_state. 
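+
+        Notes
+        -----
+        ``fit`` trains one model per (source, random_state) pair produced by a
+        parameter grid, with the target-only configuration included under the
+        source name ``"target"``. Self-contained illustration of that expansion
+        (placeholder source name)::
+
+            from sklearn.model_selection import ParameterGrid
+
+            grid = {"source": ["source_A", "target"], "random_state": [3, 7]}
+            list(ParameterGrid(grid))   # 4 combinations, one call to this method each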
+ """ + + # Set random_state and source data + random_state = model_space["random_state"] + if model_space["source"] == "target": + target_only_flag = True + source_data = None + else: + source_data = self.source_data[model_space["source"]] + target_only_flag = False + + # Target dataset + y_target = [data.y.cpu().detach().numpy() for data in self.X_] + stratify = None + if self._estimator_type == "classifier": + stratify = y_target + ds_train_target, ds_valid_target = train_test_split( + self.X_, + test_size=self.val_size, + shuffle=True, + stratify=stratify, + random_state=random_state, + ) + + # Source dataset + ds_train_source, ds_valid_source = self._set_source_data( + source_data, + ds_valid_target, + random_state, + ) + + # Set validation batch for evaluation + ds_valid = ds_valid_target + ds_valid_source + ds_train = ds_train_target + ds_train_source + ds_valid_eval = self._set_data_eval(data=ds_valid) + + # Load model and optimizer + model_run_train = self._load_model() + model_run_train.to(self.device_) + optimizer = torch.optim.AdamW( + model_run_train.parameters(), lr=self.learning_rate + ) + + # Train model + valid_loss_best = 9e15 + es_counter = 0 + model_best_ = copy.deepcopy(model_run_train) + + if target_only_flag: + train_loader = DataLoader( + ds_train, batch_size=self.batch_size, shuffle=False + ) + else: + domain_indicator = torch.tensor([data.domain for data in ds_train]) + idx_iterator = IdxIterator( + n_batch=self.batch_size, + domain_indicator=domain_indicator, + target_fraction=self.target_fraction, + ) + + for _ in tqdm( + range(1, self.max_epoch + 1), + desc=f"Model No. xx", + disable=self.disable_pbar, + ): + + # Run epoch + if target_only_flag: + self._run_epoch(model_run_train, optimizer, train_loader) + else: + self._run_epoch_multitable( + ds_train_source, + ds_train_target, + model_run_train, + optimizer, + idx_iterator, + ) + + # Obtain validation losses + valid_loss = self._eval(model_run_train, ds_valid_eval) + + # Update model + if valid_loss < valid_loss_best: + valid_loss_best = valid_loss + model_best_ = copy.deepcopy(model_run_train) + es_counter = 0 + else: + es_counter += 1 + if es_counter > self.early_stopping_patience: + break + model_best_.eval() + return model_best_, valid_loss_best, model_space["source"], random_state + + def _run_epoch_multitable( + self, + ds_source, + ds_target, + model, + optimizer, + idx_iterator, + ): + """Run an epoch for multitable of the input model.""" + model.train() + idx_iterator.train_flag = True + while idx_iterator.train_flag: + idx_batch_target, idx_batch_source = idx_iterator.sample() + ds_source_batch = [ds_source[idx] for idx in idx_batch_source] + ds_target_batch = [ds_target[idx] for idx in idx_batch_target] + ds_batch = ds_source_batch + ds_target_batch + ds_train = self._set_data_eval(data=ds_batch) + self._run_step(data=ds_train, model=model, optimizer=optimizer) + + def _set_source_data(self, source_data, ds_valid_target, random_state): + """Prepare the source data for training.""" + if source_data is None: + return [], [] + else: + y_source = [data.y.cpu().detach().numpy() for data in source_data] + stratify = [data.domain for data in source_data] + stratify = np.array(stratify) + if self._estimator_type == "classifier": + y_source = [data.y.cpu().detach().numpy() for data in source_data] + y_source = pd.Series(y_source) + y_source = y_source.astype(str) + stratify = pd.Series(stratify) + stratify = stratify.astype(str) + stratify = stratify + "_" + y_source + ds_train_source, ds_valid_source = 
train_test_split( + source_data, + test_size=len(ds_valid_target), + shuffle=True, + stratify=stratify, + random_state=random_state, + ) + return ds_train_source, ds_valid_source + + +class CARTEMultitableRegressor(RegressorMixin, BaseCARTEMultitableEstimator): + """CARTE Multitable Regressor for Regression tasks. + + This estimator is GNN-based model compatible with the CARTE pretrained model. + + Parameters + ---------- + loss : {'squared_error', 'absolute_error'}, default='squared_error' + The loss function used for backpropagation. + scoring : {'r2_score', 'squared_error'}, default='r2_score' + The scoring function used for validation. + source_date : dict, default={} + The source data used in multitable estimator. + num_layers : int, default=1 + The number of layers for the NN model + load_pretrain : bool, default=True + Indicates whether to load pretrained weights or not + freeze_pretrain : bool, default=True + Indicates whether to freeze the pretrained weights in the training or not + learning_rate : float, default=1e-3 + The learning rate of the model. The model uses AdamW as the optimizer + batch_size : int, default=16 + The batch size used for training + max_epoch : int or None, default=500 + The maximum number of epoch for training + dropout : float, default=0 + The dropout rate for training + val_size : float, default=0.1 + The size of the validation set used for early stopping + target_fraction : float, default=0.125 + The fraction of target data inside of a batch when training + early_stopping_patience : int or None, default=40 + The early stopping patience when early stopping is used. + If set to None, no early stopping is employed + num_model : int, default=1 + The total number of models used for Bagging strategy + random_state : int or None, default=0 + Pseudo-random number generator to control the train/validation data split + if early stoppingis enabled, the weight initialization, and the dropout. + Pass an int for reproducible output across multiple function calls. + n_jobs : int, default=1 + Number of jobs to run in parallel. Training the estimator the score are parallelized + over the number of models. + device : {"cpu", "gpu"}, default="cpu", + The device used for the estimator. + disable_pbar : bool, default=True + Indicates whether to show progress bars for the training process. + """ + + def __init__( + self, + *, + loss: str = "squared_error", + scoring: str = "r2_score", + source_data: dict = {}, + num_layers: int = 1, + load_pretrain: bool = True, + freeze_pretrain: bool = True, + learning_rate: float = 1e-3, + batch_size: int = 16, + max_epoch: int = 500, + dropout: float = 0, + val_size: float = 0.2, + target_fraction: float = 0.125, + early_stopping_patience: Union[None, int] = 40, + num_model: int = 1, + random_state: int = 0, + n_jobs: int = 1, + device: str = "cpu", + disable_pbar: bool = True, + ): + super(CARTEMultitableRegressor, self).__init__( + num_layers=num_layers, + load_pretrain=load_pretrain, + freeze_pretrain=freeze_pretrain, + learning_rate=learning_rate, + batch_size=batch_size, + max_epoch=max_epoch, + dropout=dropout, + val_size=val_size, + early_stopping_patience=early_stopping_patience, + num_model=num_model, + random_state=random_state, + n_jobs=n_jobs, + device=device, + disable_pbar=disable_pbar, + source_data=source_data, + target_fraction=target_fraction, + ) + + self.loss = loss + self.scoring = scoring + + def predict(self, X): + """Predict values for X. 
+ + Returns the weighted average of the singletable model and all pairwise model with 1-source. + + Parameters + ---------- + X : list of graph objects with size (n_samples) + The input samples. + + Returns + ------- + y : ndarray, shape (n_samples,) + The predicted values. + """ + + out = [] + for source_name in self.source_list_total_: + idx_ = np.where(np.array(self.source_list_) == source_name)[0] + model_list = [self.model_list_[idx] for idx in idx_] + out += [self._generate_output(X, model_list=model_list, weights=None)] + out = np.array(out).squeeze().transpose() + out = np.average(out, weights=self.weights_, axis=1) + if np.isnan(out).sum() > 0: + mean_pred = np.mean(self.y_) + out[np.isnan(out)] = mean_pred + return out + + +class CARTEMultitableClassifer(ClassifierMixin, BaseCARTEMultitableEstimator): + """CARTE Multitable Classifier for Classification tasks. + + This estimator is GNN-based model compatible with the CARTE pretrained model. + + Parameters + ---------- + loss : {'binary_crossentropy', 'categorical_crossentropy'}, default='binary_crossentropy' + The loss function used for backpropagation. + scoring : {'auroc', 'auprc', 'binary_entropy'}, default='auroc' + The scoring function used for validation. + source_date : dict, default={} + The source data used in multitable estimator. + num_layers : int, default=1 + The number of layers for the NN model + load_pretrain : bool, default=True + Indicates whether to load pretrained weights or not + freeze_pretrain : bool, default=True + Indicates whether to freeze the pretrained weights in the training or not + learning_rate : float, default=1e-3 + The learning rate of the model. The model uses AdamW as the optimizer + batch_size : int, default=16 + The batch size used for training + max_epoch : int or None, default=500 + The maximum number of epoch for training + dropout : float, default=0 + The dropout rate for training + val_size : float, default=0.1 + The size of the validation set used for early stopping + target_fraction : float, default=0.125 + The fraction of target data inside of a batch when training + early_stopping_patience : int or None, default=40 + The early stopping patience when early stopping is used. + If set to None, no early stopping is employed + num_model : int, default=1 + The total number of models used for Bagging strategy + random_state : int or None, default=0 + Pseudo-random number generator to control the train/validation data split + if early stoppingis enabled, the weight initialization, and the dropout. + Pass an int for reproducible output across multiple function calls. + n_jobs : int, default=1 + Number of jobs to run in parallel. Training the estimator the score are parallelized + over the number of models. + device : {"cpu", "gpu"}, default="cpu", + The device used for the estimator. + disable_pbar : bool, default=True + Indicates whether to show progress bars for the training process. 
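+
+    Examples
+    --------
+    Minimal illustrative sketch with placeholder names. Both the target and the
+    source tables are assumed to have been converted to lists of graph objects
+    beforehand, with ``data.domain`` set to ``0`` for target graphs and to a
+    distinct non-zero value per source::
+
+        from carte.src.carte_estimator import CARTEMultitableClassifer
+
+        source_data = {"source_A": source_graphs}   # placeholder source table
+        clf = CARTEMultitableClassifer(source_data=source_data, num_model=2, n_jobs=2)
+        clf.fit(X=target_graphs_train, y=y_train)
+        proba = clf.predict_proba(target_graphs_test)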
+ """ + + def __init__( + self, + *, + loss: str = "binary_crossentropy", + scoring: str = "auroc", + source_data: dict = {}, + num_layers: int = 1, + load_pretrain: bool = True, + freeze_pretrain: bool = True, + learning_rate: float = 1e-3, + batch_size: int = 16, + max_epoch: int = 500, + dropout: float = 0, + val_size: float = 0.2, + target_fraction: float = 0.125, + early_stopping_patience: Union[None, int] = 40, + num_model: int = 1, + random_state: int = 0, + n_jobs: int = 1, + device: str = "cpu", + disable_pbar: bool = True, + ): + super(CARTEMultitableClassifer, self).__init__( + num_layers=num_layers, + load_pretrain=load_pretrain, + freeze_pretrain=freeze_pretrain, + learning_rate=learning_rate, + batch_size=batch_size, + max_epoch=max_epoch, + dropout=dropout, + val_size=val_size, + early_stopping_patience=early_stopping_patience, + num_model=num_model, + random_state=random_state, + n_jobs=n_jobs, + device=device, + disable_pbar=disable_pbar, + source_data=source_data, + target_fraction=target_fraction, + ) + + self.loss = loss + self.scoring = scoring + + def predict(self, X): + """Predict classes for X. + + Parameters + ---------- + X : list of graph objects with size (n_samples) + The input samples. + + Returns + ------- + y : ndarray, shape (n_samples,) + The predicted classes. + """ + check_is_fitted(self, "is_fitted_") + + if self.loss == "binary_crossentropy": + return np.round(self.predict_proba(X)) + elif self.loss == "categorical_crossentropy": + return np.argmax(self.predict_proba(X), axis=1) + + def predict_proba(self, X): + """Predict class probabilities for X. + + Parameters + ---------- + X : list of graph objects with size (n_samples) + The input samples. + + Returns + ------- + p : ndarray, shape (n_samples,) for binary classification or (n_samples, n_classes) + The class probabilities of the input samples. + """ + check_is_fitted(self, "is_fitted_") + return self._get_predict_prob(X) + + def decision_function(self, X): + """Compute the decision function of ``X``. + + Parameters + ---------- + X : list of graph objects with size (n_samples) + The input samples. + + Returns + ------- + decision : ndarray, shape (n_samples,) + """ + decision = self.predict_proba(X) + if decision.shape[1] == 1: + decision = decision.ravel() + return decision + + def _get_predict_prob(self, X): + """Returns the weighted average of the singletable model and all pairwise model with 1-source. + + Parameters + ---------- + X : list of graph objects with size (n_samples) + The input samples. + + Returns + ------- + raw_predictions : array, shape (n_samples,) + The raw predicted values. + """ + + out = [] + for source_name in self.source_list_total_: + idx_ = np.where(np.array(self.source_list_) == source_name)[0] + model_list = [self.model_list_[idx] for idx in idx_] + out += [self._generate_output(X, model_list=model_list, weights=None)] + out = np.array(out).squeeze().transpose() + out = np.average(out, weights=self.weights_, axis=1) + # Transform according to loss + if self.loss == "binary_crossentropy": + out = 1 / (1 + np.exp(-out)) + elif self.loss == "categorical_crossentropy": + out = softmax(out, axis=1) + # Control for nulls in prediction + if np.isnan(out).sum() > 0: + mean_pred = np.mean(self.y_) + out[np.isnan(out)] = mean_pred + return out + + +class CARTE_AblationRegressor(CARTERegressor): + """CARTE Ablation Regressor for Regression tasks. + + This estimator is GNN-based model compatible with the CARTE pretrained model. 
+ Note that this is an implementation for the ablation study of CARTE + + Parameters + ---------- + ablation_method : {'exclude-edge', 'exclude-attention', 'exclude-attention-edge'}, default='exclude-edge' + The ablation method for CARTE Estimators. + loss : {'squared_error', 'absolute_error'}, default='squared_error' + The loss function used for backpropagation. + scoring : {'r2_score', 'squared_error'}, default='r2_score' + The scoring function used for validation. + num_layers : int, default=1 + The number of layers for the NN model + load_pretrain : bool, default=True + Indicates whether to load pretrained weights or not + freeze_pretrain : bool, default=True + Indicates whether to freeze the pretrained weights in the training or not + learning_rate : float, default=1e-3 + The learning rate of the model. The model uses AdamW as the optimizer + batch_size : int, default=16 + The batch size used for training + max_epoch : int or None, default=500 + The maximum number of epoch for training + dropout : float, default=0 + The dropout rate for training + val_size : float, default=0.1 + The size of the validation set used for early stopping + cross_validate : bool, default=False + Indicates whether to use cross-validation strategy for train/validation split + early_stopping_patience : int or None, default=40 + The early stopping patience when early stopping is used. + If set to None, no early stopping is employed + num_model : int, default=1 + The total number of models used for Bagging strategy + random_state : int or None, default=0 + Pseudo-random number generator to control the train/validation data split + if early stoppingis enabled, the weight initialization, and the dropout. + Pass an int for reproducible output across multiple function calls. + n_jobs : int, default=1 + Number of jobs to run in parallel. Training the estimator the score are parallelized + over the number of models. + device : {"cpu", "gpu"}, default="cpu", + The device used for the estimator. + disable_pbar : bool, default=True + Indicates whether to show progress bars for the training process. + """ + def __init__( + self, + *, + ablation_method: str = "exclude-edge", + loss: str = "squared_error", + scoring: str = "r2_score", + num_layers: int = 1, + load_pretrain: bool = True, + freeze_pretrain: bool = True, + learning_rate: float = 1e-3, + batch_size: int = 16, + max_epoch: int = 500, + dropout: float = 0, + val_size: float = 0.2, + cross_validate: bool = False, + early_stopping_patience: Union[None, int] = 40, + num_model: int = 1, + random_state: int = 0, + n_jobs: int = 1, + device: str = "cpu", + disable_pbar: bool = True, + ): + + super(CARTE_AblationRegressor, self).__init__( + loss=loss, + scoring=scoring, + num_layers=num_layers, + load_pretrain=load_pretrain, + freeze_pretrain=freeze_pretrain, + learning_rate=learning_rate, + batch_size=batch_size, + max_epoch=max_epoch, + dropout=dropout, + val_size=val_size, + cross_validate=cross_validate, + early_stopping_patience=early_stopping_patience, + num_model=num_model, + random_state=random_state, + n_jobs=n_jobs, + device=device, + disable_pbar=disable_pbar, + ) + + self.ablation_method = ablation_method + + def _load_model(self): + """Load the CARTE Ablation model for training. + + This loads the pretrained weights if the parameter load_pretrain is set to True. + The freeze of the pretrained weights are controlled by the freeze_pretrain parameter. + + Returns the model depending on the ablation method that can be used for training. 
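+
+        Notes
+        -----
+        Only ``'exclude-edge'`` changes the architecture in this method, by replacing
+        the initial edge encoder with an identity mapping; the attention-related
+        ablations are assumed to be handled inside ``CARTE_NN_Model_Ablation`` through
+        the ``ablation_method`` argument. Self-contained sketch of what the identity
+        swap means for edge features::
+
+            import torch
+
+            edge_encoder = torch.nn.Identity()           # stands in for ft_base.initial_e
+            feats = torch.randn(5, 300)
+            assert torch.equal(edge_encoder(feats), feats)   # features pass through unchanged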
+ """ + + # Model configuration + model_config = dict() + model_config["ablation_method"] = self.ablation_method + model_config["input_dim_x"] = self.X_[0].x.size(1) + model_config["input_dim_e"] = self.X_[0].x.size(1) + model_config["hidden_dim"] = self.X_[0].x.size(1) + model_config["ff_dim"] = self.X_[0].x.size(1) + model_config["num_heads"] = 12 + model_config["num_layers"] = self.num_layers-1 + model_config["output_dim"] = self.output_dim_ + model_config["dropout"] = self.dropout + + # Set seed for torch - for reproducibility + random_state = check_random_state(self.random_state) + model_seed = random_state.randint(10000) + torch.manual_seed(model_seed) + + # Set model architecture + model = CARTE_NN_Model_Ablation(**model_config) + + # Load the pretrained weights if specified + if self.load_pretrain: + dir_model = config_directory["pretrained_model"] + model.load_state_dict( + torch.load(dir_model, map_location=self.device_), strict=False + ) + # Freeze the pretrained weights if specified + if self.freeze_pretrain: + for param in model.ft_base.read_out_block.parameters(): + param.requires_grad = False + for param in model.ft_base.layers.parameters(): + param.requires_grad = False + + # Set architecture for ablation + if self.ablation_method == "exclude-edge": + model.ft_base.initial_e = torch.nn.Identity() + + return model + + +class CARTE_AblationClassifier(CARTEClassifier): + """CARTE Ablation Classifier for Classification tasks. + + This estimator is GNN-based model compatible with the CARTE pretrained model. + Note that this is an implementation for the ablation study of CARTE + + Parameters + ---------- + ablation_method : {'exclude-edge', 'exclude-attention', 'exclude-attention-edge'}, default='exclude-edge' + The ablation method for CARTE Estimators. + loss : {'binary_crossentropy', 'categorical_crossentropy'}, default='binary_crossentropy' + The loss function used for backpropagation. + scoring : {'auroc', 'auprc', 'binary_entropy'}, default='auroc' + The scoring function used for validation. + num_layers : int, default=1 + The number of layers for the NN model + load_pretrain : bool, default=True + Indicates whether to load pretrained weights or not + freeze_pretrain : bool, default=True + Indicates whether to freeze the pretrained weights in the training or not + learning_rate : float, default=1e-3 + The learning rate of the model. The model uses AdamW as the optimizer + batch_size : int, default=16 + The batch size used for training + max_epoch : int or None, default=500 + The maximum number of epoch for training + dropout : float, default=0 + The dropout rate for training + val_size : float, default=0.1 + The size of the validation set used for early stopping + cross_validate : bool, default=False + Indicates whether to use cross-validation strategy for train/validation split + early_stopping_patience : int or None, default=40 + The early stopping patience when early stopping is used. + If set to None, no early stopping is employed + num_model : int, default=1 + The total number of models used for Bagging strategy + random_state : int or None, default=0 + Pseudo-random number generator to control the train/validation data split + if early stoppingis enabled, the weight initialization, and the dropout. + Pass an int for reproducible output across multiple function calls. + n_jobs : int, default=1 + Number of jobs to run in parallel. Training the estimator the score are parallelized + over the number of models. 
+ device : {"cpu", "gpu"}, default="cpu", + The device used for the estimator. + disable_pbar : bool, default=True + Indicates whether to show progress bars for the training process. + """ + def __init__( + self, + *, + ablation_method: str = "exclude-edge", + loss: str = "binary_crossentropy", + scoring: str = "auroc", + num_layers: int = 1, + load_pretrain: bool = False, + freeze_pretrain: bool = False, + learning_rate: float = 1e-3, + batch_size: int = 16, + max_epoch: int = 500, + dropout: float = 0, + val_size: float = 0.2, + cross_validate: bool = False, + early_stopping_patience: Union[None, int] = 40, + num_model: int = 1, + random_state: int = 0, + n_jobs: int = 1, + device: str = "cpu", + disable_pbar: bool = True, + ): + + super(CARTE_AblationClassifier, self).__init__( + loss=loss, + scoring=scoring, + num_layers=num_layers, + load_pretrain=load_pretrain, + freeze_pretrain=freeze_pretrain, + learning_rate=learning_rate, + batch_size=batch_size, + max_epoch=max_epoch, + dropout=dropout, + val_size=val_size, + cross_validate=cross_validate, + early_stopping_patience=early_stopping_patience, + num_model=num_model, + random_state=random_state, + n_jobs=n_jobs, + device=device, + disable_pbar=disable_pbar, + ) + + self.ablation_method = ablation_method + + def _load_model(self): + """Load the CARTE Ablation model for training. + + This loads the pretrained weights if the parameter load_pretrain is set to True. + The freeze of the pretrained weights are controlled by the freeze_pretrain parameter. + + Returns the model depending on the ablation method that can be used for training. + """ + + # Model configuration + model_config = dict() + model_config["ablation_method"] = self.ablation_method + model_config["input_dim_x"] = self.X_[0].x.size(1) + model_config["input_dim_e"] = self.X_[0].x.size(1) + model_config["hidden_dim"] = self.X_[0].x.size(1) + model_config["ff_dim"] = self.X_[0].x.size(1) + model_config["num_heads"] = 12 + model_config["num_layers"] = self.num_layers-1 + model_config["output_dim"] = self.output_dim_ + model_config["dropout"] = self.dropout + + # Set seed for torch - for reproducibility + random_state = check_random_state(self.random_state) + model_seed = random_state.randint(10000) + torch.manual_seed(model_seed) + + # Set model architecture + model = CARTE_NN_Model_Ablation(**model_config) + + # Load the pretrained weights if specified + if self.load_pretrain: + dir_model = config_directory["pretrained_model"] + pretrain_model_dict = torch.load(dir_model, map_location=self.device_) + initial_x_keys = [ + key for key in pretrain_model_dict.keys() if "initial_x" in key + ] + for key in initial_x_keys: + pretrain_model_dict[key + "_pretrain"] = pretrain_model_dict.pop(key) + model.load_state_dict(pretrain_model_dict, strict=False) + + # Freeze the pretrained weights if specified + if self.freeze_pretrain: + for param in model.ft_base.read_out_block.parameters(): + param.requires_grad = False + for param in model.ft_base.layers.parameters(): + param.requires_grad = False + + # Set architecture for ablation + if self.ablation_method == "exclude-edge": + model.ft_base.initial_e = torch.nn.Identity() + + return model diff --git a/carte/src/carte_gridsearch.py b/carte/src/carte_gridsearch.py new file mode 100644 index 0000000..e5dad54 --- /dev/null +++ b/carte/src/carte_gridsearch.py @@ -0,0 +1,110 @@ +"""Custom grid search used for CARTE-GNN model""" + +import ast +import copy +import pandas as pd +import numpy as np +from joblib import Parallel, delayed +from time 
import perf_counter
+from sklearn.model_selection import ParameterGrid
+
+
+def carte_gridsearch(
+    estimator,
+    X_train: list,
+    y_train: np.array,
+    param_distributions: dict,
+    refit: bool = True,
+    n_jobs: int = 1,
+):
+    """CARTE grid search.
+
+    This function runs grid search for CARTE GNN models.
+
+    Parameters
+    ----------
+    estimator : CARTE estimator
+        The CARTE estimator used for grid search
+    X_train : list
+        The list of graph objects for the train data transformed using Table2GraphTransformer
+    y_train : numpy array of shape (n_samples,)
+        The target variable of the train data.
+    param_distributions: dict
+        The dictionary of parameter grids to search for the optimal parameter.
+    refit: bool, default=True
+        Indicates whether to return a refitted estimator with the best parameter.
+    n_jobs: int, default=1
+        Number of jobs to run in parallel. Training the estimator in the grid search is parallelized
+        over the parameter grid.
+
+    Returns
+    -------
+    Result : Pandas DataFrame
+        The results for each parameter setting in the grid.
+    best_params : dict
+        The dictionary of best parameters obtained through grid search.
+    best_estimator : CARTE estimator
+        The CARTE estimator trained using the best_params if refit is set to True.
+    """
+    # Set parameter list
+    param_distributions_ = param_distributions.copy()
+    param_list = list(ParameterGrid(param_distributions_))
+
+    # Run Gridsearch
+    gridsearch_result = Parallel(n_jobs=n_jobs)(
+        delayed(_run_search_carte)(estimator, X_train, y_train, params)
+        for params in param_list
+    )
+    gridsearch_result = pd.concat(gridsearch_result, axis=0)
+
+    # Add rank
+    rank = gridsearch_result["score"].rank(method="min").astype(int).copy()
+    rank = pd.DataFrame(rank)
+    rank.rename(columns={"score": "rank"}, inplace=True)
+    gridsearch_result = pd.concat([gridsearch_result, rank], axis=1)
+
+    # Best params
+    params_ = gridsearch_result["params"]
+    best_params_ = params_[gridsearch_result["rank"] == 1].iloc[0]
+    best_params = ast.literal_eval(best_params_)
+
+    # Refit
+    best_estimator = None
+    if refit:
+        best_estimator = copy.deepcopy(estimator)
+        best_estimator.__dict__.update(best_params)
+        best_estimator.fit(X=X_train, y=y_train)
+
+    return gridsearch_result, best_params, best_estimator
+
+
+def _run_search_carte(estimator, X_train, y_train, params):
+    """Run fit/predict for one parameter setting in the parameter grid."""
+    # Measure time
+    start_time = perf_counter()
+
+    # Run estimator
+    estimator_ = copy.deepcopy(estimator)
+    estimator_.__dict__.update(params)
+    estimator_.fit(X=X_train, y=y_train)
+
+    # Measure time
+    end_time = perf_counter()
+    duration = round(end_time - start_time, 4)
+
+    # Statistics
+    vl = np.array(estimator_.valid_loss_)
+
+    # Obtain results
+    result_run = {
+        f"cv-run_{i}_valid_loss": estimator_.valid_loss_[i]
+        for i in range(estimator_.num_model)
+    }
+    result_run["params"] = str(params)
+    result_run["score"] = np.mean(vl)
+    result_run["fit_time"] = duration
+
+    result_df = pd.DataFrame([result_run])
+    result_df = result_df.reindex(sorted(result_df.columns), axis=1)
+
+    return result_df
diff --git a/carte/src/carte_model.py b/carte/src/carte_model.py
new file mode 100644
index 0000000..cebcaf3
--- /dev/null
+++ b/carte/src/carte_model.py
@@ -0,0 +1,420 @@
+"""
+CARTE neural network model used for pretraining and downstream tasks.
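+
+The main building blocks are ``CARTE_Attention`` (edge-conditioned graph
+attention), ``CARTE_Block`` (attention plus feed-forward encoder block),
+``CARTE_Base`` (a stack of blocks followed by a read-out block), and the heads
+``CARTE_Pretrain``, ``CARTE_NN_Model`` and ``CARTE_NN_Model_Ablation``.
+
+Illustrative instantiation (the dimensions below are made up for the example;
+the estimators derive them from the input graphs)::
+
+    model = CARTE_NN_Model(
+        input_dim_x=300, input_dim_e=300, hidden_dim=300, output_dim=1,
+        num_layers=1, ff_dim=300, num_heads=12,
+    )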
+ +""" + +import math +import torch +import torch.nn as nn +from typing import Tuple +from torch import Tensor +from torch_geometric.utils import softmax +from torch_scatter import scatter + + +## CARTE - Attention and output calculation +def _carte_calculate_attention( + edge_index: Tensor, query: Tensor, key: Tensor, value: Tensor +): + # Calculate the scaled-dot product attention + attention = torch.sum(torch.mul(query[edge_index[0], :], key), dim=1) + attention = attention / math.sqrt(query.size(1)) + attention = softmax(attention, edge_index[0]) + # Generate the output + src = torch.mul(attention, value.t()).t() + output = scatter(src, edge_index[0], dim=0, reduce="sum") + return output, attention + + +## CARTE - output calculation with multi-head (message passing) +def _carte_calculate_multihead_output( + edge_index: Tensor, + query: Tensor, + key: Tensor, + value: Tensor, + num_heads: int = 1, + concat: bool = True, +): + if concat: + H, C = num_heads, query.size(1) // num_heads + for i in range(H): + O, A = _carte_calculate_attention( + edge_index, + query[:, i * C : (i + 1) * C], + key[:, i * C : (i + 1) * C], + value[:, i * C : (i + 1) * C], + ) + if i == 0: + output, attention = O, A + else: + output = torch.cat((output, O), dim=1) + attention = torch.cat((attention, A), dim=0) + else: + H, C = num_heads, query.size(1) + for i in range(H): + O, A = _carte_calculate_attention( + edge_index, + query[:, i * C : (i + 1) * C], + key[:, i * C : (i + 1) * C], + value[:, i * C : (i + 1) * C], + ) + if i == 0: + output, attention = O, A + else: + output = torch.cat((output, O), dim=0) + attention = torch.cat((attention, A), dim=0) + output = output / H + attention = attention / H + return output, attention + + +## CARTE - Attention Layer +class CARTE_Attention(nn.Module): + def __init__( + self, + input_dim: int, + output_dim: int, + num_heads: int = 1, + concat: bool = True, + read_out: bool = False, + ): + super(CARTE_Attention, self).__init__() + + if concat: + assert output_dim % num_heads == 0 + self.lin_query = nn.Linear(input_dim, output_dim, bias=False) + self.lin_key = nn.Linear(input_dim, output_dim, bias=False) + self.lin_value = nn.Linear(input_dim, output_dim, bias=False) + else: + self.lin_query = nn.Linear(input_dim, num_heads * output_dim, bias=False) + self.lin_key = nn.Linear(input_dim, num_heads * output_dim, bias=False) + self.lin_value = nn.Linear(input_dim, num_heads * output_dim, bias=False) + + if read_out == False: + self.lin_edge = nn.Linear(input_dim, output_dim) + + self.input_dim = input_dim + self.output_dim = output_dim + self.num_heads = num_heads + self.concat = concat + self.readout = read_out + + self.reset_parameters() + + def reset_parameters(self): + self.lin_query.reset_parameters() + self.lin_key.reset_parameters() + self.lin_value.reset_parameters() + if self.readout == False: + self.lin_edge.reset_parameters() + + def forward( + self, + x: Tensor, + edge_index: Tensor, + edge_attr: Tensor, + return_attention: bool = False, + ): + Z = torch.mul(edge_attr, x[edge_index[1]]) + + query = self.lin_query(x) + key = self.lin_key(Z) + value = self.lin_value(Z) + + output, attention = _carte_calculate_multihead_output( + edge_index=edge_index, + query=query, + key=key, + value=value, + num_heads=self.num_heads, + concat=self.concat, + ) + + if self.readout == False: + edge_attr = self.lin_edge(edge_attr) + + if return_attention: + return output, edge_attr, attention + else: + return output, edge_attr + + +## CARTE - single encoding block +class 
CARTE_Block(nn.Module): + def __init__( + self, + input_dim: int, + ff_dim: int, + num_heads: int = 1, + concat: bool = True, + dropout: float = 0.1, + read_out: bool = False, + ): + super().__init__() + + # Graph Attention Layer + self.g_attn = CARTE_Attention( + input_dim, input_dim, num_heads, concat, read_out=read_out + ) + + # Two-layer MLP + Layers to apply in between the main layers for x and edges + self.linear_net_x = nn.Sequential( + nn.Linear(input_dim, ff_dim), + nn.Dropout(dropout), + nn.GELU(), + nn.Linear(ff_dim, input_dim), + ) + self.norm1_x = nn.LayerNorm(input_dim) + self.norm2_x = nn.LayerNorm(input_dim) + + self.read_out = read_out + if self.read_out == False: + self.linear_net_e = nn.Sequential( + nn.Linear(input_dim, ff_dim), + nn.Dropout(dropout), + nn.GELU(), + nn.Linear(ff_dim, input_dim), + ) + self.norm1_e = nn.LayerNorm(input_dim) + + self.dropout = nn.Dropout(dropout) + self.gelu = nn.GELU() + + def forward( + self, + x: Tensor, + edge_index: Tensor, + edge_attr: Tensor, + ): + # Attention part + attn_out_x, attn_out_e = self.g_attn(x, edge_index, edge_attr) + x = self.dropout(attn_out_x) + # x = self.gelu(x) + # x = x + self.dropout(attn_out_x) + x = self.norm1_x(x) + + # MLP part - Node + linear_out_x = self.linear_net_x(x) + x = self.dropout(linear_out_x) + # x = self.gelu(x) + # x = x + self.dropout(linear_out_x) + x = self.norm2_x(x) + + # MLP part - Edge + if self.read_out == False: + edge_attr = self.linear_net_e(attn_out_e) + edge_attr = edge_attr + self.dropout(edge_attr) + edge_attr = self.norm1_e(edge_attr) + return x, edge_attr + else: + return x + + +## CARTE - contrast block +class CARTE_Contrast(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x: Tensor): + x = nn.functional.normalize(x, dim=1) + + # Cosine similarity + x = 1 - (torch.cdist(x, x) / 2) + + # RBF kernel (Gaussian similarity) + # sig = torch.median(torch.cdist(x, x)) + # x = torch.exp(-(torch.cdist(x, x) / (2 * sig))) + + return x + + +## CARTE - finetune base block +class CARTE_Base(nn.Module): + def __init__( + self, + input_dim_x: int, + input_dim_e: int, + hidden_dim: int, + num_layers: int, + **block_args + ): + super(CARTE_Base, self).__init__() + + self.initial_x = nn.Sequential( + nn.Linear(input_dim_x, hidden_dim), + nn.GELU(), + nn.LayerNorm(hidden_dim), + ) + + self.initial_e = nn.Sequential( + nn.Linear(input_dim_e, hidden_dim), + nn.GELU(), + nn.LayerNorm(hidden_dim), + ) + + self.layers = nn.ModuleList( + [CARTE_Block(input_dim=hidden_dim, **block_args) for _ in range(num_layers)] + ) + + self.read_out_block = CARTE_Block( + input_dim=hidden_dim, read_out=True, **block_args + ) + + def forward(self, x, edge_index, edge_attr, return_attention=False): + # Initial layer for the node/edge features + x = self.initial_x(x) + edge_attr = self.initial_e(edge_attr) + + for l in self.layers: + x, edge_attr = l(x, edge_index, edge_attr) + + x = self.read_out_block(x, edge_index, edge_attr) + + if return_attention: + attention_maps = [] + for l in self.layers: + _, _, attention = l.g_attn(x, edge_index, edge_attr, return_attention) + attention_maps.append(attention) + return x, attention_maps + elif return_attention == False: + return x + + +## CARTE - Pretrain Model +class CARTE_Pretrain(nn.Module): + def __init__( + self, + input_dim_x: int, + input_dim_e: int, + hidden_dim: int, + num_layers: int, + **block_args + ): + super(CARTE_Pretrain, self).__init__() + + self.ft_base = CARTE_Base( + input_dim_x=input_dim_x, + input_dim_e=input_dim_e, + 
hidden_dim=hidden_dim, + num_layers=num_layers, + **block_args + ) + + self.pretrain_classifier = nn.Sequential( + nn.Linear(hidden_dim, 4 * hidden_dim), + nn.GELU(), + nn.Linear(4 * hidden_dim, hidden_dim), + nn.GELU(), + nn.LayerNorm(hidden_dim, elementwise_affine=False), + CARTE_Contrast(), + ) + + def forward(self, input): + x, edge_index, edge_attr, head_idx = ( + input.x.clone(), + input.edge_index, + input.edge_attr.clone(), + input.head_idx, + ) + + x = self.ft_base(x, edge_index, edge_attr) + x = x[head_idx, :] + x = self.pretrain_classifier(x) + + return x + + +## CARTE - Downstream Model +class CARTE_NN_Model(nn.Module): + def __init__( + self, + input_dim_x: int, + input_dim_e: int, + hidden_dim: int, + output_dim: int, + num_layers: int, + **block_args + ): + super(CARTE_NN_Model, self).__init__() + + self.ft_base = CARTE_Base( + input_dim_x=input_dim_x, + input_dim_e=input_dim_e, + hidden_dim=hidden_dim, + num_layers=num_layers, + **block_args + ) + + self.ft_classifier = nn.Sequential( + nn.Linear(hidden_dim, int(hidden_dim / 2)), + nn.ReLU(), + nn.LayerNorm(int(hidden_dim / 2)), + nn.Linear(int(hidden_dim / 2), int(hidden_dim / 4)), + nn.ReLU(), + nn.LayerNorm(int(hidden_dim / 4)), + nn.Linear(int(hidden_dim / 4), output_dim), + ) + + def forward(self, input): + x, edge_index, edge_attr, head_idx = ( + input.x.clone(), + input.edge_index.clone(), + input.edge_attr.clone(), + input.ptr[:-1], + ) + + x = self.ft_base(x, edge_index, edge_attr) + x = x[head_idx, :] + x = self.ft_classifier(x) + + return x + + +## CARTE - Downstream Ablation model +class CARTE_NN_Model_Ablation(nn.Module): + def __init__( + self, + ablation_method: str, + input_dim_x: int, + input_dim_e: int, + hidden_dim: int, + output_dim: int, + num_layers: int, + **block_args, + ): + super(CARTE_NN_Model_Ablation, self).__init__() + + self.ablation_method = ablation_method + + self.ft_base = CARTE_Base( + input_dim_x=input_dim_x, + input_dim_e=input_dim_e, + hidden_dim=hidden_dim, + num_layers=num_layers, + **block_args, + ) + + self.ft_classifier = nn.Sequential( + nn.Linear(hidden_dim, int(hidden_dim / 2)), + nn.ReLU(), + nn.LayerNorm(int(hidden_dim / 2)), + nn.Linear(int(hidden_dim / 2), int(hidden_dim / 4)), + nn.ReLU(), + nn.LayerNorm(int(hidden_dim / 4)), + nn.Linear(int(hidden_dim / 4), output_dim), + ) + + def forward(self, input): + x, edge_index, edge_attr, head_idx = ( + input.x.clone(), + input.edge_index.clone(), + input.edge_attr.clone(), + input.ptr[:-1], + ) + + if "exclude-attention" not in self.ablation_method: + x = self.ft_base(x, edge_index, edge_attr) + x = x[head_idx, :] + x = self.ft_classifier(x) + + return x diff --git a/carte/src/carte_table_to_graph.py b/carte/src/carte_table_to_graph.py new file mode 100644 index 0000000..87e1fce --- /dev/null +++ b/carte/src/carte_table_to_graph.py @@ -0,0 +1,277 @@ +import torch +import numpy as np +import pandas as pd +import fasttext +import fasttext.util +import gc # Import the garbage collector module +from typing import Union +from torch_geometric.data import Data +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.preprocessing import PowerTransformer +from sklearn.pipeline import make_pipeline +from carte.configs.directory import config_directory +from skrub import MinHashEncoder # change to skrub + + +def _create_edge_index(num_nodes: int, edge_attr: torch.Tensor, undirected: bool = True, self_loop: bool = True): + """ + Sets the edge_index and edge_attr for graphs. 
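+
+    For example, with ``num_nodes=3`` the base directed edges are ``(0, 1)``
+    and ``(0, 2)``; ``undirected=True`` adds the reversed pairs, and
+    ``self_loop=True`` adds ``(1, 1)`` and ``(2, 2)`` with all-ones edge
+    attributes (the center node 0 gets no self-loop).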
+ + Parameters + ---------- + num_nodes : int + Number of nodes in the graph. + edge_attr : torch.Tensor + Edge attributes tensor. + undirected : bool, optional + Whether the graph is undirected, by default True. + self_loop : bool, optional + Whether to add self-loops, by default True. + + Returns + ------- + edge_index : torch.Tensor + Edge indices tensor. + edge_attr : torch.Tensor + Edge attributes tensor. + """ + edge_index_ = torch.triu_indices(num_nodes, num_nodes, offset=1) + edge_index_ = edge_index_[:, (edge_index_[0] == 0)] + edge_index = edge_index_.clone() + edge_attr_ = edge_attr.clone() + + if undirected: + edge_index = torch.cat((edge_index, torch.flip(edge_index, [0]))) + edge_attr_ = torch.cat((edge_attr_, edge_attr_)) + + if self_loop: + unique_nodes = edge_index_[1].unique() + edge_index_self_loop = torch.stack((unique_nodes, unique_nodes)) + edge_index = torch.cat((edge_index, edge_index_self_loop), dim=1) + edge_attr_ = torch.cat((edge_attr_, torch.ones(unique_nodes.size(0), edge_attr_.size(1), dtype=edge_attr_.dtype))) + + return edge_index, edge_attr_ + + +class Table2GraphTransformer(TransformerMixin, BaseEstimator): + """ + Transformer from tables to a list of graphs. + + Parameters + ---------- + include_edge_attr : bool, optional + Whether to include edge attributes, by default True. + lm_model : str, optional + Language model to use, by default "fasttext". + n_components : int, optional + Number of components for MinHash encoder, by default 300. + n_jobs : int, optional + Number of jobs for parallel processing, by default 1. + """ + + def __init__(self, *, include_edge_attr: bool = True, lm_model: str = "fasttext", n_components: int = 300, n_jobs: int = 1): + super().__init__() + self.include_edge_attr = include_edge_attr + self.lm_model = lm_model + self.n_components = n_components + self.n_jobs = n_jobs + self.is_fitted_ = False + + def fit(self, X, y=None): + """ + Fit function used for the Table2GraphTransformer. + + Parameters + ---------- + X : pandas.DataFrame + Input data to fit. + y : array-like, optional + Target values, by default None. + + Returns + ------- + self : Table2GraphTransformer + Fitted transformer. + """ + self.y_ = y + + if not hasattr(self, "lm_model_"): + self._load_lm_model() + + cat_col_names = X.select_dtypes(include="object").columns.str.replace("\n", " ", regex=True).str.lower() + self.cat_col_names = list(cat_col_names) + num_col_names = X.select_dtypes(exclude="object").columns.str.replace("\n", " ", regex=True).str.lower() + self.num_col_names = list(num_col_names) + self.col_names = self.cat_col_names + self.num_col_names + + self.num_transformer_ = PowerTransformer().set_output(transform="pandas") + if self.lm_model == "minhash": + self.name_transformer = make_pipeline( + MinHashEncoder(n_components=self.n_components, n_jobs=self.n_jobs), + PowerTransformer(), + ) + + # Ensure numerical columns exist before fitting the transformer + if self.num_col_names: + num_cols_exist = [col for col in self.num_col_names if col in X.columns] + if num_cols_exist: + self.num_transformer_.fit(X[num_cols_exist]) + + self.is_fitted_ = True + return self + + def transform(self, X, y=None): + """ + Apply Table2GraphTransformer to each row of the data. + + Parameters + ---------- + X : pandas.DataFrame + Input data to transform. + y : array-like, optional + Target values, by default None. + + Returns + ------- + data_graph : list + List of transformed graph objects. 
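+
+        A minimal usage sketch (``df`` and ``y`` are placeholder names for a
+        pandas DataFrame and its target array)::
+
+            >>> t2g = Table2GraphTransformer(lm_model="fasttext")
+            >>> graphs = t2g.fit(df, y=y).transform(df)   # doctest: +SKIP
+            >>> len(graphs) == len(df)                    # doctest: +SKIP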
+ """ + X_ = X.replace("\n", " ", regex=True) + num_data = X_.shape[0] + + y_ = torch.tensor(self.y_, dtype=torch.float32).reshape((num_data, 1)) if self.y_ is not None else None + + X_categorical = X_.select_dtypes(include="object").copy() + X_categorical.columns = self.cat_col_names + X_numerical = X_.select_dtypes(exclude="object").copy() + X_numerical.columns = self.num_col_names + + cat_names = pd.melt(X_categorical)["value"].dropna().astype(str).str.lower().unique() + names_total = np.unique(np.hstack([self.col_names, cat_names])) + name_dict = {name: idx for idx, name in enumerate(names_total)} + + name_attr_total = self._transform_names(names_total) + if self.num_col_names: + num_cols_exist = [col for col in self.num_col_names if col in X.columns] + if num_cols_exist: + X_numerical = self._transform_numerical(X_numerical[num_cols_exist]) + + data_graph = [ + self._graph_construct(X_categorical.iloc[idx], X_numerical.iloc[idx], name_attr_total, name_dict, y_, idx) + for idx in range(num_data) + ] + + self.y_ = None + + # Manually trigger garbage collection after transforming data + gc.collect() + + return data_graph + + def _load_lm_model(self): + """ + Load the language model for features of nodes and edges. + """ + if self.lm_model == "fasttext": + self.lm_model_ = fasttext.load_model(config_directory["fasttext"]) + if self.n_components != 300: + fasttext.util.reduce_model(self.lm_model_, self.n_components) + elif self.lm_model == "minhash": + self.lm_model_ = MinHashEncoder(n_components=self.n_components, n_jobs=self.n_jobs) + + def _transform_numerical(self, X): + """ + Transform numerical columns using power transformer. + + Parameters + ---------- + X : pandas.DataFrame + Input numerical data. + + Returns + ------- + transformed_X : pandas.DataFrame + Transformed numerical data. + """ + return self.num_transformer_.transform(X) + + def _transform_names(self, names_total): + """ + Obtain the feature for a given list of string values. + + Parameters + ---------- + names_total : array-like + List of string values. + + Returns + ------- + name_features : np.ndarray + Transformed features for names. + """ + if self.lm_model == "fasttext": + return np.array([self.lm_model_.get_sentence_vector(name) for name in names_total], dtype=np.float32) + elif self.lm_model == "minhash": + return self.name_transformer.fit_transform(names_total.reshape(-1, 1)).astype(np.float32) + + def _graph_construct(self, data_cat, data_num, name_attr_total, name_dict, y, idx): + """ + Transform to graph objects. + + Parameters + ---------- + data_cat : pandas.Series + Categorical data for a single instance. + data_num : pandas.Series + Numerical data for a single instance. + name_attr_total : np.ndarray + Transformed features for names. + name_dict : dict + Dictionary mapping names to indices. + y : torch.Tensor or None + Target values. + idx : int + Index of the instance. + + Returns + ------- + data : torch_geometric.data.Data + Graph data object. 
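+
+        Notes
+        -----
+        Node 0 is a virtual center node: its feature vector is set to the mean
+        of its incoming messages (each message being the element-wise product
+        of a cell node's features with its column/edge embedding). Categorical
+        cells contribute their value embedding as node features, while
+        numerical cells contribute the column embedding scaled by the
+        power-transformed value.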
+ """ + data_cat = data_cat.dropna().str.lower() + data_num = data_num.dropna() + num_cat = len(data_cat) + num_num = len(data_num) + + edge_attr_cat = np.array([name_attr_total[name_dict[col]] for col in data_cat.index], dtype=np.float32) + edge_attr_num = np.array([name_attr_total[name_dict[col]] for col in data_num.index], dtype=np.float32) + + x_cat = torch.tensor(np.array([name_attr_total[name_dict[val]] for val in data_cat]), dtype=torch.float32) + x_num = torch.tensor(data_num.values[:, None] * edge_attr_num, dtype=torch.float32) + + if x_cat.size(0) == 0: + x_cat = torch.empty((0, self.n_components), dtype=torch.float32) + edge_attr_cat = torch.empty((0, self.n_components), dtype=torch.float32) + if x_num.size(0) == 0: + x_num = torch.empty((0, self.n_components), dtype=torch.float32) + edge_attr_num = torch.empty((0, self.n_components), dtype=torch.float32) + + x = torch.cat((x_cat, x_num)) + x = torch.cat((torch.ones((1, x.size(1))), x)) + edge_attr = torch.tensor(np.vstack((edge_attr_cat, edge_attr_num)), dtype=torch.float32) + + num_nodes = num_cat + num_num + 1 + edge_index, edge_attr = _create_edge_index(num_nodes, edge_attr, False, True) + + Z = torch.mul(edge_attr, x[edge_index[1]]) + x[0, :] = Z[edge_index[0] == 0].mean(dim=0) + + y_ = y[idx].clone() if y is not None else torch.tensor([]) + + return Data( + x=x, + edge_index=edge_index, + edge_attr=edge_attr, + y=y_, + g_idx=idx, + ) diff --git a/carte/src/evaluate_utils.py b/carte/src/evaluate_utils.py new file mode 100644 index 0000000..99f6743 --- /dev/null +++ b/carte/src/evaluate_utils.py @@ -0,0 +1,145 @@ + +import numpy as np +import pandas as pd + +from ast import literal_eval +from sklearn.metrics import ( + r2_score, + root_mean_squared_error, + roc_auc_score, + average_precision_score, +) +from sklearn.model_selection import GroupShuffleSplit +from carte.configs.directory import config_directory + + +def set_split(data, data_config, num_train, random_state): + """Set train/test split given the random state.""" + + target_name = data_config["target_name"] + X = data.drop(columns=target_name) + y = data[target_name] + y = np.array(y) + + if data_config["repeated"]: + entity_name = data_config["entity_name"] + else: + entity_name = np.arange(len(y)) + + groups = np.array(data.groupby(entity_name).ngroup()) + num_groups = len(np.unique(groups)) + gss = GroupShuffleSplit( + n_splits=1, + test_size=int(num_groups - num_train), + random_state=random_state, + ) + idx_train, idx_test = next(iter(gss.split(X=y, groups=groups))) + + X_train, X_test = X.iloc[idx_train], X.iloc[idx_test] + y_train, y_test = y[idx_train], y[idx_test] + return X_train, X_test, y_train, y_test + + +def extract_best_params(data_name, method, num_train, random_state): + """Extract the best parameters in the CARTE paper.""" + + if "tabpfn" in method: + return dict() + else: + # Load compiled log + df_log_dir = f"{config_directory['results']}/compiled_results/results_carte_baseline_bestparams.csv" + df_log = pd.read_csv(df_log_dir) + + # Obtain the mask + mask = df_log["data_name"] != data_name + mask += df_log["model"] != method + mask += df_log["num_train"] != num_train + mask += df_log["random_state"] != random_state + + # Extract the best paramameters + best_params_ = df_log["best_param"].copy() + best_params = literal_eval(best_params_[~mask].iloc[0]) + return best_params + + +def set_score_criterion(task): + """Set scoring method for CV and score criterion in final result.""" + + if task == "regression": + scoring = "r2" + score_criterion = 
["r2", "rmse"] + else: + scoring = "roc_auc" + score_criterion = ["roc_auc", "avg_precision"] + score_criterion += ["preprocess_time"] + score_criterion += ["inference_time"] + score_criterion += ["run_time"] + return scoring, score_criterion + +def shorten_param(param_name): + """Shorten the param_names for column names in search results.""" + + if "__" in param_name: + return param_name.rsplit("__", 1)[1] + return param_name + + +def check_pred_output(y_train, y_pred): + """Set the output as the mean of train data if it is nan.""" + + if np.isnan(y_pred).sum() > 0: + mean_pred = np.mean(y_train) + y_pred[np.isnan(y_pred)] = mean_pred + return y_pred + + +def reshape_pred_output(y_pred): + """Reshape the predictive output accordingly.""" + + num_pred = len(y_pred) + if y_pred.shape == (num_pred, 2): + y_pred = y_pred[:, 1] + elif y_pred.shape == (num_pred, 1): + y_pred = y_pred.ravel() + else: + pass + return y_pred + + +def set_score_criterion(task): + """Set scoring method for CV and score criterion in final result.""" + + if task == "regression": + scoring = "r2" + score_criterion = ["r2", "rmse"] + else: + scoring = "roc_auc" + score_criterion = ["roc_auc", "avg_precision"] + score_criterion += ["preprocess_time"] + score_criterion += ["inference_time"] + score_criterion += ["run_time"] + return scoring, score_criterion + + +def return_score(y_target, y_pred, task): + """Return score results for given task.""" + + if task == "regression": + score_r2 = r2_score(y_target, y_pred) + score_rmse = root_mean_squared_error(y_target, y_pred) + return score_r2, score_rmse + else: + score_auc = roc_auc_score(y_target, y_pred) + score_avg_precision = average_precision_score(y_target, y_pred) + return score_auc, score_avg_precision + + +def col_names_per_type(data, target_name): + """Extract column names per type.""" + num_col_names = data.select_dtypes(exclude="object").columns.tolist() + if target_name in num_col_names: + num_col_names.remove(target_name) + cat_col_names = data.select_dtypes(include="object").columns.tolist() + if target_name in cat_col_names: + cat_col_names.remove(target_name) + return num_col_names, cat_col_names \ No newline at end of file diff --git a/carte/src/preprocess_utils.py b/carte/src/preprocess_utils.py new file mode 100644 index 0000000..f22d823 --- /dev/null +++ b/carte/src/preprocess_utils.py @@ -0,0 +1,162 @@ +""" Functions used for preprocessing the data. """ + +import numpy as np +import pandas as pd +from carte.configs.directory import config_directory + + +def _clean_entity_names(data_entity_name): + data_entity_name = ( + data_entity_name.str.replace("<", "") + .str.replace(">", "") + .str.replace("\n", "") + .str.replace("_", " ") + .str.lower() + ) + return data_entity_name + + +def _serialize_instance(data): + data_temp = data.copy() + data_temp = data_temp.dropna() # Exclude cells with Null values + data_temp = _clean_entity_names(data_temp) + serialization = np.array(data_temp.index) + " " + np.array(data_temp) + ". 
" + sentence = "" + for i in range(len(data_temp)): + sentence += serialization[i] + sentence = sentence[:-1] + return sentence + + +def extract_fasttext_features(data: pd.DataFrame, extract_col_name: str): + import fasttext + + # Preliminary Settings + lm_model = fasttext.load_model(config_directory["fasttext"]) + + # Original data + data_ = data.copy() + data_.replace("\n", " ", regex=True, inplace=True) + data_ = data.copy() + + # Entity Names + ent_names = _clean_entity_names(data[extract_col_name]) + ent_names = list(ent_names) + + # Data Fasttext for entity names + data_fasttext = [lm_model.get_sentence_vector(str(x)) for x in ent_names] + data_fasttext = np.array(data_fasttext) + data_fasttext = pd.DataFrame(data_fasttext) + col_names = [f"X{i}" for i in range(data_fasttext.shape[1])] + data_fasttext = data_fasttext.set_axis(col_names, axis="columns") + data_fasttext = pd.concat([data_fasttext, data[extract_col_name]], axis=1) + # data_fasttext.drop_duplicates(inplace=True) + data_fasttext = data_fasttext.reset_index(drop=True) + + return data_fasttext + + +def extract_llm_features( + data: pd.DataFrame, + extract_col_name: str, + device: str = "cuda:0", +): + # Load LLM Model + from sentence_transformers import SentenceTransformer + + lm_model = SentenceTransformer("intfloat/e5-large-v2", device=device) + + # Original data + data_ = data.copy() + data_.replace("\n", " ", regex=True, inplace=True) + + # Entity Names + ent_names = _clean_entity_names(data_[extract_col_name].copy()) + ent_names = ent_names.astype(str) + ent_names = ( + "query: " + ent_names + ) # following the outlined procedure using "query: " + ent_names = list(ent_names) + + # Data for entity names + embedding = lm_model.encode(ent_names, convert_to_numpy=True) + embedding = pd.DataFrame(embedding) + col_names = [f"X{i}" for i in range(embedding.shape[1])] + embedding = embedding.set_axis(col_names, axis="columns") + embedding = pd.concat([embedding, data[extract_col_name]], axis=1) + # data_fasttext.drop_duplicates(inplace=True) + embedding = embedding.reset_index(drop=True) + + return embedding + + +def extract_ken_features( + data: pd.DataFrame, + extract_col_name: str, +): + # KEN embeddings + ken_emb = pd.read_parquet(config_directory["ken_embed"]) + ken_ent = ken_emb["Entity"].str.lower() + ken_embed_ent2idx = {ken_ent[i]: i for i in range(len(ken_emb))} + + # Original data + data_ = data.copy() + data_.replace("\n", " ", regex=True, inplace=True) + data_ = data.copy() + data_[extract_col_name] = data_[extract_col_name].str.lower() + + # Mapping + mapping = data_[extract_col_name].map(ken_embed_ent2idx) + mapping = mapping.dropna() + mapping = mapping.astype(np.int64) + mapping = np.array(mapping) + + # KEN data + data_ken = ken_emb.iloc[mapping] + data_ken.rename(columns={"Entity": "name"}, inplace=True) + data_ken.drop_duplicates(inplace=True) + data_ken = data_ken.reset_index(drop=True) + + return data_ken + + +def table2llmfeatures( + data: pd.DataFrame, + embed_numeric: bool, + device: str = "cuda:0", +): + # Load LLM Model + from sentence_transformers import SentenceTransformer + + lm_model = SentenceTransformer("intfloat/e5-large-v2", device=device) + + # Preprocessing for the strings (subject to specifics of the data) + data = data.replace("\n", " ", regex=True) + num_data = len(data) + data_x = data.copy() + + if embed_numeric: + num_cols = data_x.select_dtypes(exclude="object").columns.tolist() + data_x[num_cols] = data_x[num_cols].astype("str") + + data_x_cat = 
data_x.select_dtypes(include="object") + data_x_num = data_x.select_dtypes(exclude="object") + + sentences = [] + for idx in range(num_data): + data_ = data_x_cat.iloc[idx] + sentence = _serialize_instance(data_) + sentence = ( + "query: " + sentence + ) # following the outlined procedure using "query: " + sentences.append(sentence) + + X_categorical = lm_model.encode(sentences, convert_to_numpy=True) + X_categorical = pd.DataFrame(X_categorical) + + col_names = [f"X{i}" for i in range(X_categorical.shape[1])] + X_categorical = X_categorical.set_axis(col_names, axis="columns") + + data_total = pd.concat([X_categorical, data_x_num], axis=1) + + return data_total diff --git a/carte/src/visualization_utils.py b/carte/src/visualization_utils.py new file mode 100644 index 0000000..9dcf71a --- /dev/null +++ b/carte/src/visualization_utils.py @@ -0,0 +1,488 @@ +""" +Functions that can be utilized for visualization. +For Critical difference diagram, it modifies some of the codes from scikit-posthocs. +""" + +import pandas as pd +import numpy as np +from typing import Union, List, Tuple, Dict, Set +from matplotlib import colors +from matplotlib.axes import SubplotBase +from matplotlib.colorbar import ColorbarBase, Colorbar +from matplotlib.colors import ListedColormap +from matplotlib import pyplot +from pandas import DataFrame, Series +from seaborn import heatmap +from carte.configs.carte_configs import carte_singletable_baseline_mapping +from carte.configs.directory import config_directory + + +# Normalization function of the results +def _normalize(group): + min_score = group["score"].min() + max_score = group["score"].max() + group["normalized_score"] = (group["score"] - min_score) / (max_score - min_score) + return group + + +# Prepare dataframe suitable for the learning curves +def prepare_result(task, models="all", rank_at=2048): + + # load result + result_dir = f"{config_directory['results']}/compiled_results/results_carte_baseline_singletable.csv" + df_score = pd.read_csv(result_dir) + + # control for not important values + mask_cls = df_score["task"] == "classification" + temp = df_score["score"].copy() + temp[np.logical_and(mask_cls, temp < 0.5)] = 0.5 + temp[np.logical_and(~mask_cls, temp < 0)] = 0 + df_score["score"] = temp + + # select results based on task + mask = df_score["task"] == task + df_score = df_score[mask].reset_index(drop=True) + + # select result with model of interest + if models == "all": + pass + else: + mask = df_score["model"].isin(models) + df_score = df_score[mask] + df_score.reset_index(drop=True, inplace=True) + + # Change the names of models for clarity + temp = df_score["model"].copy() + for key in carte_singletable_baseline_mapping: + temp = temp.str.replace(key, carte_singletable_baseline_mapping[key]) + df_score["model"] = temp.copy() + + # Apply normalization on scores + df_normalized = df_score.groupby(["data_name"], group_keys=True).apply(_normalize) + df_normalized.reset_index(drop=True, inplace=True) + + # Ranking + if rank_at == "all": + temp = df_normalized["num_train"].astype(float) + mask = temp <= max(temp) + df_normalized_ = df_normalized[mask].copy() + avg_rank = ( + df_normalized_.groupby("model") + .normalized_score.mean() + .rank(ascending=False) + ) + avg_rank = avg_rank.sort_values() + rank_order = avg_rank.index.tolist() + else: + mask = df_normalized["num_train"] == rank_at + df_normalized_ = df_normalized[mask].copy() + avg_rank = ( + df_normalized_.groupby("model") + .normalized_score.mean() + .rank(ascending=False) + ) + avg_rank = 
avg_rank.sort_values() + rank_order = avg_rank.index.tolist() + + df_normalized = df_normalized.sort_values(by="num_train", ascending=True) + df_normalized["num_train"] = df_normalized["num_train"].astype(str) + df_normalized.reset_index(drop=True, inplace=True) + + return df_normalized, rank_order + + +# Generate dataframe suitable for creating critical difference diagram +def generate_df_cdd(df_normalized, train_size="all"): + + # Set the base df + df_cdd = df_normalized.copy() + df_cdd["case"] = ( + df_normalized["data_name"] + + "_" + + df_normalized["num_train"].astype(str) + + "_" + + df_normalized["random_state"].astype(str) + ) + + # select the train_size for comparison + if train_size == "all": + return df_cdd + else: + mask = df_cdd["num_train"].str.contains(f"{train_size}") + df_cdd = df_cdd[mask].copy() + df_cdd.reset_index(drop=True, inplace=True) + return df_cdd + + +# Sign array for scikit-posthoc +def sign_array(p_values: Union[List, np.ndarray], alpha: float = 0.05) -> np.ndarray: + + p_values = np.array(p_values) + p_values[p_values > alpha] = 0 + p_values[(p_values < alpha) & (p_values > 0)] = 1 + np.fill_diagonal(p_values, 1) + + return p_values + + +# Sign table for scikit-posthoc +def sign_table( + p_values: Union[List, np.ndarray, DataFrame], lower: bool = True, upper: bool = True +) -> Union[DataFrame, np.ndarray]: + + if not any([lower, upper]): + raise ValueError("Either lower or upper triangle must be returned") + + pv = ( + DataFrame(p_values, copy=True) + if not isinstance(p_values, DataFrame) + else p_values.copy() + ) + + ns = pv > 0.05 + three = (pv < 0.001) & (pv >= 0) + two = (pv < 0.01) & (pv >= 0.001) + one = (pv < 0.05) & (pv >= 0.01) + + pv = pv.astype(str) + pv[ns] = "NS" + pv[three] = "***" + pv[two] = "**" + pv[one] = "*" + + np.fill_diagonal(pv.values, "-") + if not lower: + pv.values[np.tril_indices(pv.shape[0], -1)] = "" + elif not upper: + pv.values[np.triu_indices(pv.shape[0], 1)] = "" + + return pv + + +# Sign plot for scikit-posthoc +def sign_plot( + x: Union[List, np.ndarray, DataFrame], + g: Union[List, np.ndarray] = None, + flat: bool = False, + labels: bool = True, + cmap: List = None, + cbar_ax_bbox: List = None, + ax: SubplotBase = None, + **kwargs, +) -> Union[SubplotBase, Tuple[SubplotBase, Colorbar]]: + + for key in ["cbar", "vmin", "vmax", "center"]: + if key in kwargs: + del kwargs[key] + + if isinstance(x, DataFrame): + df = x.copy() + else: + x = np.array(x) + g = g or np.arange(x.shape[0]) + df = DataFrame(np.copy(x), index=g, columns=g) + + dtype = df.values.dtype + + if not np.issubdtype(dtype, np.integer) and flat: + raise ValueError("X should be a sign_array or DataFrame of integers") + elif not np.issubdtype(dtype, np.floating) and not flat: + raise ValueError("X should be an array or DataFrame of float p values") + + if not cmap and flat: + # format: diagonal, non-significant, significant + cmap = ["1", "#fbd7d4", "#1a9641"] + elif not cmap and not flat: + # format: diagonal, non-significant, p<0.001, p<0.01, p<0.05 + cmap = ["1", "#fbd7d4", "#005a32", "#238b45", "#a1d99b"] + + if flat: + np.fill_diagonal(df.values, -1) + hax = heatmap( + df, vmin=-1, vmax=1, cmap=ListedColormap(cmap), cbar=False, ax=ax, **kwargs + ) + if not labels: + hax.set_xlabel("") + hax.set_ylabel("") + return hax + + else: + df[(x < 0.001) & (x >= 0)] = 1 + df[(x < 0.01) & (x >= 0.001)] = 2 + df[(x < 0.05) & (x >= 0.01)] = 3 + df[(x >= 0.05)] = 0 + + np.fill_diagonal(df.values, -1) + + if len(cmap) != 5: + raise ValueError("Cmap list must contain 5 
items")
+
+        hax = heatmap(
+            df,
+            vmin=-1,
+            vmax=3,
+            cmap=ListedColormap(cmap),
+            center=1,
+            cbar=False,
+            ax=ax,
+            **kwargs,
+        )
+        if not labels:
+            hax.set_xlabel("")
+            hax.set_ylabel("")
+
+        cbar_ax = hax.figure.add_axes(cbar_ax_bbox or [0.95, 0.35, 0.04, 0.3])
+        cbar = ColorbarBase(
+            cbar_ax,
+            cmap=(ListedColormap(cmap[2:] + [cmap[1]])),
+            norm=colors.NoNorm(),
+            boundaries=[0, 1, 2, 3, 4],
+        )
+        cbar.set_ticks(
+            list(np.linspace(0, 3, 4)),
+            labels=["p < 0.001", "p < 0.01", "p < 0.05", "NS"],
+        )
+
+        cbar.outline.set_linewidth(1)
+        cbar.outline.set_edgecolor("0.5")
+        cbar.ax.tick_params(size=0)
+
+        return hax, cbar
+
+
+def _find_maximal_cliques(adj_matrix: DataFrame) -> List[Set]:
+
+    if (adj_matrix.index != adj_matrix.columns).any():
+        raise ValueError("adj_matrix must be symmetric, indices do not match")
+    if not adj_matrix.isin((0, 1)).values.all():
+        raise ValueError("Input matrix must be binary")
+    if adj_matrix.empty or not (adj_matrix.T == adj_matrix).values.all():
+        raise ValueError("Input matrix must be non-empty and symmetric")
+
+    result = []
+    _bron_kerbosch(
+        current_clique=set(),
+        candidates=set(adj_matrix.index),
+        visited=set(),
+        adj_matrix=adj_matrix,
+        result=result,
+    )
+    return result
+
+
+def _bron_kerbosch(
+    current_clique: Set,
+    candidates: Set,
+    visited: Set,
+    adj_matrix: DataFrame,
+    result: List[Set],
+) -> None:
+
+    while candidates:
+        v = candidates.pop()
+        _bron_kerbosch(
+            current_clique | {v},
+            # Restrict candidate vertices to the neighbors of v
+            {n for n in candidates if adj_matrix.loc[v, n]},
+            # Restrict visited vertices to the neighbors of v
+            {n for n in visited if adj_matrix.loc[v, n]},
+            adj_matrix,
+            result,
+        )
+        visited.add(v)
+
+    # We do not need to report a clique if a child call already did it.
+    if not visited:
+        # Report ``current_clique`` only when it could not be extended further.
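+        # ``visited`` is empty only when this call started with no candidate
+        # and no previously visited neighbors, so ``current_clique`` cannot be
+        # extended and is maximal.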
+ result.append(current_clique) + + +def critical_difference_diagram( + ranks: Union[dict, Series], + sig_matrix: DataFrame, + *, + ax: SubplotBase = None, + label_fmt_left: str = "{label} ({rank:.2g})", + label_fmt_right: str = "({rank:.2g}) {label}", + label_props: dict = None, + marker_props: dict = None, + elbow_props: dict = None, + crossbar_props: dict = None, + color_palette: Union[Dict[str, str], List] = {}, + line_style: Union[Dict[str, str], List] = {}, + text_h_margin: float = 0.01, +) -> Dict[str, list]: + + ## check color_palette consistency + if len(color_palette) == 0: + pass + elif isinstance(color_palette, Dict) and ( + (len(set(ranks.keys()) & set(color_palette.keys()))) == len(ranks) + ): + pass + elif isinstance(color_palette, List) and (len(ranks) <= len(color_palette)): + pass + else: + raise ValueError( + "color_palette keys are not consistent, or list size too small" + ) + + elbow_props = elbow_props or {} + marker_props = {"zorder": 3, **(marker_props or {})} + label_props = {"va": "center", **(label_props or {})} + crossbar_props = { + "color": "k", + "zorder": 3, + "linewidth": 2, + **(crossbar_props or {}), + } + + ax = ax or pyplot.gca() + ax.yaxis.set_visible(False) + ax.spines["right"].set_visible(False) + ax.spines["left"].set_visible(False) + ax.spines["bottom"].set_visible(False) + ax.xaxis.set_ticks_position("top") + ax.spines["top"].set_position("zero") + + # lists of artists to be returned + markers = [] + elbows = [] + labels = [] + crossbars = [] + + # True if pairwise comparison is NOT significant + adj_matrix = DataFrame( + 1 - sign_array(sig_matrix), + index=sig_matrix.index, + columns=sig_matrix.columns, + dtype=bool, + ) + + ranks = Series(ranks) # Standardize if ranks is dict + points_left, points_right = np.array_split(ranks.sort_values(), 2) + + # Sets of points under the same crossbar + crossbar_sets = _find_maximal_cliques(adj_matrix) + + # Sort by lowest rank and filter single-valued sets + crossbar_sets = sorted( + (x for x in crossbar_sets if len(x) > 1), key=lambda x: ranks[list(x)].min() + ) + + # Create stacking of crossbars: for each level, try to fit the crossbar, + # so that it does not intersect with any other in the level. If it does not + # fit in any level, create a new level for it. + crossbar_levels: list[list[set]] = [] + for bar in crossbar_sets: + for level, bars_in_level in enumerate(crossbar_levels): + if not any(bool(bar & bar_in_lvl) for bar_in_lvl in bars_in_level): + ypos = -level - 1 + bars_in_level.append(bar) + break + else: + ypos = -len(crossbar_levels) - 1 + crossbar_levels.append([bar]) + + crossbars.append( + ax.plot( + # Adding a separate line between each pair enables showing a + # marker over each elbow with crossbar_props={'marker': 'o'}. 
+ [ranks[i] for i in bar], + [ypos] * len(bar), + **crossbar_props, + ) + ) + + lowest_crossbar_ypos = -len(crossbar_levels) + + # def _change_label(label): + # label_ = label.split("-") + # label_ = [rf"$\bf{x}$" for x in label_] + # label_ = ("-").join(label_) + # return label_ + + def _change_label(label): + label_temp = label.split("-") + label_ = [] + for x in label_temp: + if len(x.split(" ")) != 1: + temp = x.split(" ") + temp = (" ").join([r"$\bf\{" + f"{x}" + r"}$" for x in temp]) + label_.append(temp) + else: + label_.append(r"$\bf\{" + f"{x}" + r"}$") + label_ = ("-").join(label_) + label_ = label_.replace("\\{", "{") + return label_ + + def plot_items(points, xpos, label_fmt, color_palette, line_style, label_props): + """Plot each marker + elbow + label.""" + ypos = lowest_crossbar_ypos - 1 + for idx, (label, rank) in enumerate(points.items()): + if len(color_palette) == 0: + elbow, *_ = ax.plot( + [xpos, rank, rank], + [ypos, ypos, 0], + **elbow_props, + ) + label_ = label + else: + elbow, *_ = ax.plot( + [xpos, rank, rank], + [ypos, ypos, 0], + c=( + color_palette[label] + if isinstance(color_palette, Dict) + else color_palette[idx] + ), + ls=( + line_style[label] + if isinstance(line_style, Dict) + else line_style[idx] + ), + **elbow_props, + ) + if color_palette[label] != "black": # darkgrey black + label_ = _change_label(label) + else: + label_ = label + elbows.append(elbow) + curr_color = elbow.get_color() + markers.append(ax.scatter(rank, 0, **{"color": curr_color, **marker_props})) + labels.append( + ax.text( + xpos, + ypos, + label_fmt.format(label=label_, rank=-1 * rank), + **{"color": curr_color, **label_props}, + ) + ) + ypos -= 1.5 + + plot_items( + points_left, + xpos=points_left.iloc[0] - text_h_margin, + label_fmt=label_fmt_left, + color_palette=color_palette, + line_style=line_style, + label_props={ + "ha": "right", + **label_props, + }, + ) + plot_items( + points_right[::-1], + xpos=points_right.iloc[-1] + text_h_margin, + label_fmt=label_fmt_right, + color_palette=color_palette, + line_style=line_style, + label_props={"ha": "left", **label_props}, + ) + + return { + "markers": markers, + "elbows": elbows, + "labels": labels, + "crossbars": crossbars, + } diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..58193a9 --- /dev/null +++ b/setup.py @@ -0,0 +1,53 @@ +"""The setup script.""" + +from setuptools import setup, find_packages + +with open('History.rst') as history_file: + history = history_file.read() + +requirements = [] +test_requirements = [] + +setup( + author="""Myung Jun Kim, Léo Grinsztajn, Gaël Varoquaux""", + author_email='test@gmail.com', + python_requires='>=3.10.12', + classifiers=[ + 'Development Status :: 2 - Pre-Alpha', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: MIT License', + 'Environment :: Console', + 'Operating System :: OS Independent', + 'Operating System :: POSIX :: Linux', + 'Operating System :: MacOS', + 'Operating System :: POSIX', + 'Operating System :: Microsoft :: Windows', + 'Natural Language :: English', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + ], + description="""Pretrained deep-learning models are the go-to solution for images or text. However, for tabular data the standard is still to train tree-based models. 
+ Indeed, transfer learning on tables hits the challenge of data integration: finding correspondences, + correspondences in the entries (entity matching) where different words may denote the same entity, correspondences across columns (schema matching), + which may come in different orders, names... We propose a neural architecture that does not need such correspondences. + As a result, we can pretrain it on background data that has not been matched. + The architecture -- CARTE for Context Aware Representation of Table Entries -- uses a graph representation of tabular (or relational) data to process tables with different columns, + string embedding of entries and columns names to model an open vocabulary, and a graph-attentional network to contextualize entries with column names and neighboring entries. + An extensive benchmark shows that CARTE facilitates learning, outperforming a solid set of baselines including the best tree-based models. + CARTE also enables joint learning across tables with unmatched columns, enhancing a small table with bigger ones. CARTE opens the door to large pretrained models for tabular data.""", + install_requires=["numpy", "pandas", "scipy", "scikit-learn", "skrub","torch","torch-geometric","torcheval","torch_scatter"], + license="MIT license", + keywords='carte', + name='carte', + packages=find_packages(include=['carte', 'carte.*']), + include_package_data=True, + #package_data = { + # '': ['*.csv'], + # 'carte': ['data/data_singletable/',"data/etc"], + #}, + test_suite='tests', + tests_require=test_requirements, + url='https://github.com/soda-inria/carte', + version='0.0.9', + zip_safe=False, +) \ No newline at end of file diff --git a/src/carte_table_to_graph.py b/src/carte_table_to_graph.py index 3fbcacd..c3d4a88 100644 --- a/src/carte_table_to_graph.py +++ b/src/carte_table_to_graph.py @@ -1,9 +1,9 @@ -"""The Table2GraphTransformer Class""" - import torch import numpy as np import pandas as pd import fasttext +import fasttext.util +import gc # Import the garbage collector module from typing import Union from torch_geometric.data import Data from sklearn.base import BaseEstimator, TransformerMixin @@ -13,102 +13,97 @@ from skrub import MinHashEncoder # change to skrub -def _create_edge_index( - num_nodes: int, - edge_attr: torch.tensor, - undirected: bool = True, - self_loop: bool = True, -): - """Sets the edge_index and edge_attr for graphs.""" +def _create_edge_index(num_nodes: int, edge_attr: torch.Tensor, undirected: bool = True, self_loop: bool = True): + """ + Sets the edge_index and edge_attr for graphs. - # the list of possible edge_index (directed with the numbering) - edge_index_ = torch.combinations(torch.arange(num_nodes), 2).transpose(0, 1) + Parameters + ---------- + num_nodes : int + Number of nodes in the graph. + edge_attr : torch.Tensor + Edge attributes tensor. + undirected : bool, optional + Whether the graph is undirected, by default True. + self_loop : bool, optional + Whether to add self-loops, by default True. + + Returns + ------- + edge_index : torch.Tensor + Edge indices tensor. + edge_attr : torch.Tensor + Edge attributes tensor. 
+ """ + edge_index_ = torch.triu_indices(num_nodes, num_nodes, offset=1) edge_index_ = edge_index_[:, (edge_index_[0] == 0)] edge_index = edge_index_.clone() edge_attr_ = edge_attr.clone() - # undirected + if undirected: - edge_index = torch.hstack((edge_index, torch.flipud(edge_index))) - edge_attr_ = torch.vstack((edge_attr_, edge_attr_)) - # self-loop + edge_index = torch.cat((edge_index, torch.flip(edge_index, [0]))) + edge_attr_ = torch.cat((edge_attr_, edge_attr_)) + if self_loop: - edge_index_self_loop = torch.vstack( - (edge_index_[1].unique(), edge_index_[1].unique()) - ) - edge_index = torch.hstack((edge_index, edge_index_self_loop)) - edge_attr_ = torch.vstack( - (edge_attr_, torch.ones(num_nodes - 1, edge_attr_.size(1))) - ) + unique_nodes = edge_index_[1].unique() + edge_index_self_loop = torch.stack((unique_nodes, unique_nodes)) + edge_index = torch.cat((edge_index, edge_index_self_loop), dim=1) + edge_attr_ = torch.cat((edge_attr_, torch.ones(unique_nodes.size(0), edge_attr_.size(1), dtype=edge_attr_.dtype))) + return edge_index, edge_attr_ class Table2GraphTransformer(TransformerMixin, BaseEstimator): - """Transformer from tables to a list of graphs. - - The list of graphs are generated in a row-wise fashion. + """ + Transformer from tables to a list of graphs. Parameters ---------- - include_edge_attr : bool, default = True - Indicates whether to include the edge features or not. - lm_model : {'fasttext', 'minhash'}, default = 'fasttext' - The lm_model used to initialize the features of nodes and edges. - n_components : int, default = 300 - The number of components for the minhash encoder. Ignored for lm_model='fasttext' - n_jobs : : int, default=1 - Number of jobs to run in parallel for minhash encoder. + include_edge_attr : bool, optional + Whether to include edge attributes, by default True. + lm_model : str, optional + Language model to use, by default "fasttext". + n_components : int, optional + Number of components for MinHash encoder, by default 300. + n_jobs : int, optional + Number of jobs for parallel processing, by default 1. """ - def __init__( - self, - *, - include_edge_attr: bool = True, - lm_model: str = "fasttext", - n_components: float = 300, - n_jobs: int = 1, - ): - super(Table2GraphTransformer, self).__init__() - + def __init__(self, *, include_edge_attr: bool = True, lm_model: str = "fasttext", n_components: int = 300, n_jobs: int = 1): + super().__init__() self.include_edge_attr = include_edge_attr self.lm_model = lm_model self.n_components = n_components self.n_jobs = n_jobs + self.is_fitted_ = False def fit(self, X, y=None): - """Fit function used for the Table2GraphTransformer + """ + Fit function used for the Table2GraphTransformer. Parameters ---------- - X : pandas DataFrame (n_samples, n_features) - The input data used to transform to graphs. - - y : None - Ignored. + X : pandas.DataFrame + Input data to fit. + y : array-like, optional + Target values, by default None. Returns ------- - self : object + self : Table2GraphTransformer Fitted transformer. 
""" - self.y_ = y - self.is_fitted_ = False - - # Load language_model - if hasattr(self, "lm_model_") == False: + if not hasattr(self, "lm_model_"): self._load_lm_model() - # Relations - cat_col_names = X.select_dtypes(include="object").columns - cat_col_names = cat_col_names.str.replace("\n", " ", regex=True).str.lower() + cat_col_names = X.select_dtypes(include="object").columns.str.replace("\n", " ", regex=True).str.lower() self.cat_col_names = list(cat_col_names) - num_col_names = X.select_dtypes(exclude="object").columns - num_col_names = num_col_names.str.replace("\n", " ", regex=True).str.lower() + num_col_names = X.select_dtypes(exclude="object").columns.str.replace("\n", " ", regex=True).str.lower() self.num_col_names = list(num_col_names) self.col_names = self.cat_col_names + self.num_col_names - # Numerical transformer - Powertransformer self.num_transformer_ = PowerTransformer().set_output(transform="pandas") if self.lm_model == "minhash": self.name_transformer = make_pipeline( @@ -116,212 +111,167 @@ def fit(self, X, y=None): PowerTransformer(), ) + # Ensure numerical columns exist before fitting the transformer + if self.num_col_names: + num_cols_exist = [col for col in self.num_col_names if col in X.columns] + if num_cols_exist: + self.num_transformer_.fit(X[num_cols_exist]) + + self.is_fitted_ = True return self def transform(self, X, y=None): - """Apply Table2GraphTransformer to each row of the data + """ + Apply Table2GraphTransformer to each row of the data. Parameters ---------- - X : Pandas DataFrame. (n_samples, n_features) - The input data used to transform to graphs. - - y : None - Ignored. + X : pandas.DataFrame + Input data to transform. + y : array-like, optional + Target values, by default None. Returns ------- - Graph Data : list of size (n_samples). - The list of transformed graph data. + data_graph : list + List of transformed graph objects. 
""" - - # Preprocess the features - X_ = X.copy() - X_ = X_.replace("\n", " ", regex=True) + X_ = X.replace("\n", " ", regex=True) num_data = X_.shape[0] - # Preprocess the target - y_ = None - if self.y_ is not None: - y_ = np.array(self.y_) - y_ = torch.tensor(y_).reshape((num_data, 1)) + y_ = torch.tensor(self.y_, dtype=torch.float32).reshape((num_data, 1)) if self.y_ is not None else None - # Separate categorical and numerical columns X_categorical = X_.select_dtypes(include="object").copy() X_categorical.columns = self.cat_col_names X_numerical = X_.select_dtypes(exclude="object").copy() X_numerical.columns = self.num_col_names - # Features for names - cat_names = pd.melt(X_categorical)["value"] - cat_names = cat_names.dropna() - cat_names = cat_names.astype(str) - cat_names = cat_names.str.replace("\n", " ", regex=True).str.lower() - cat_names = cat_names.unique() - names_total = np.hstack([self.col_names, cat_names]) - names_total = np.unique(names_total) - name_dict = {names_total[i]: i for i in range(names_total.shape[0])} - - # preprocess values + cat_names = pd.melt(X_categorical)["value"].dropna().astype(str).str.lower().unique() + names_total = np.unique(np.hstack([self.col_names, cat_names])) + name_dict = {name: idx for idx, name in enumerate(names_total)} + name_attr_total = self._transform_names(names_total) - if len(self.num_col_names) != 0: - X_numerical = self._transform_numerical(X_numerical) - if self.is_fitted_ == False: - self.is_fitted_ = True + if self.num_col_names: + num_cols_exist = [col for col in self.num_col_names if col in X.columns] + if num_cols_exist: + X_numerical = self._transform_numerical(X_numerical[num_cols_exist]) data_graph = [ - self._graph_construct( - X_categorical, - X_numerical, - name_attr_total, - name_dict, - y_, - idx=i, - ) - for i in range(num_data) + self._graph_construct(X_categorical.iloc[idx], X_numerical.iloc[idx], name_attr_total, name_dict, y_, idx) + for idx in range(num_data) ] - if self.y_ is not None: - self.y_ = None + self.y_ = None + + # Manually trigger garbage collection after transforming data + gc.collect() return data_graph def _load_lm_model(self): - """Load the language model for features of nodes and edges.""" - + """ + Load the language model for features of nodes and edges. + """ if self.lm_model == "fasttext": - # Loading fasttext self.lm_model_ = fasttext.load_model(config_directory["fasttext"]) if self.n_components != 300: fasttext.util.reduce_model(self.lm_model_, self.n_components) elif self.lm_model == "minhash": - self.lm_model_ = MinHashEncoder( - n_components=self.n_components, - n_jobs=self.n_jobs, - ) + self.lm_model_ = MinHashEncoder(n_components=self.n_components, n_jobs=self.n_jobs) def _transform_numerical(self, X): - """Transform numerical columns using powertransformer""" + """ + Transform numerical columns using power transformer. - X_num = X.copy() - if self.is_fitted_ == False: - X_num = self.num_transformer_.fit_transform(X_num) - else: - X_num = self.num_transformer_.transform(X_num) - return X_num + Parameters + ---------- + X : pandas.DataFrame + Input numerical data. + + Returns + ------- + transformed_X : pandas.DataFrame + Transformed numerical data. + """ + return self.num_transformer_.transform(X) def _transform_names(self, names_total): - """Obtain the feature for a given list of string values""" + """ + Obtain the feature for a given list of string values. + Parameters + ---------- + names_total : array-like + List of string values. 
+
+        Returns
+        -------
+        name_features : np.ndarray
+            Transformed features for names.
+        """
         if self.lm_model == "fasttext":
-            name_attr_total = [
-                self.lm_model_.get_sentence_vector(i) for i in names_total
-            ]
-            name_attr_total = np.array(name_attr_total).astype(np.float32)
-            pass
+            return np.array([self.lm_model_.get_sentence_vector(name) for name in names_total], dtype=np.float32)
         elif self.lm_model == "minhash":
-            name_attr_total = self.name_transformer.fit_transform(
-                names_total.reshape(-1, 1)
-            )
-            name_attr_total = name_attr_total.astype(np.float32)
-        return name_attr_total
-
-    def _graph_construct(
-        self,
-        X_categorical,
-        X_numerical,
-        name_attr_total,
-        name_dict,
-        y,
-        idx,
-    ):
-        """Transform to graph objects.
+            return self.name_transformer.fit_transform(names_total.reshape(-1, 1)).astype(np.float32)
+
+    def _graph_construct(self, data_cat, data_num, name_attr_total, name_dict, y, idx):
+        """
+        Transform to graph objects.

         Parameters
         ----------
-        X_categorical : Pandas DataFrame of shape (n_samples, n_categorical_features)
-            The input pandas DataFrame containing only the categorical features.
-        X_numerical : Pandas DataFrame of shape (n_samples, n_numerical_features)
-            The input pandas DataFrame containing only the numerical features.
-        name_attr_total : Numpy array of shape (n_words, n_dim_fasttext)
-            The features of each word (or sentence) in the name_dict.
-        name_dict : List of shape (n_words,)
-            Total list of words (or sentences) that the data contains.
-        y : array-like of shape (n_samples,)
-            The target variable to try to predict.
-        idx: int
-            The index of a particular data point used to transform into graphs
+        data_cat : pandas.Series
+            Categorical data for a single instance.
+        data_num : pandas.Series
+            Numerical data for a single instance.
+        name_attr_total : np.ndarray
+            Transformed features for names.
+        name_dict : dict
+            Dictionary mapping names to indices.
+        y : torch.Tensor or None
+            Target values.
+        idx : int
+            Index of the instance.

         Returns
         -------
-        Graph : Graph object
-            The graph object from torch_geometric
+        data : torch_geometric.data.Data
+            Graph data object.
""" - - # Obtain the data for a 'idx'-th row - data_cat = X_categorical.iloc[idx] - data_cat = data_cat.dropna() - num_cat = len(data_cat) - if num_cat != 0: - data_cat = data_cat.str.replace("\n", " ", regex=True).str.lower() - data_num = X_numerical.iloc[idx] + data_cat = data_cat.dropna().str.lower() data_num = data_num.dropna() + num_cat = len(data_cat) num_num = len(data_num) - # edge_attributes - if self.include_edge_attr: - edge_attr_cat = [name_attr_total[name_dict[x]] for x in data_cat.index] - edge_attr_cat = np.array(edge_attr_cat).astype(np.float32) - edge_attr_num = [name_attr_total[name_dict[x]] for x in data_num.index] - edge_attr_num = np.array(edge_attr_num).astype(np.float32) - else: - edge_attr_cat = np.ones((num_cat, self.n_components)).astype(np.float32) - edge_attr_num = np.ones((num_num, self.n_components)).astype(np.float32) - - # node_attributes - x_cat = [name_attr_total[name_dict[x]] for x in data_cat] - x_cat = np.array(x_cat).astype(np.float32) - x_cat = torch.tensor(x_cat) - if x_cat.size(0) == 0: - x_cat = x_cat.reshape(0, self.n_components) - edge_attr_cat = edge_attr_cat.reshape(0, self.n_components) + edge_attr_cat = np.array([name_attr_total[name_dict[col]] for col in data_cat.index], dtype=np.float32) + edge_attr_num = np.array([name_attr_total[name_dict[col]] for col in data_num.index], dtype=np.float32) + + x_cat = torch.tensor(np.array([name_attr_total[name_dict[val]] for val in data_cat]), dtype=torch.float32) + x_num = torch.tensor(data_num.values[:, None] * edge_attr_num, dtype=torch.float32) - x_num_ = np.array(data_num).astype("float32") - x_num = x_num_.reshape(-1, 1) * edge_attr_num - x_num = torch.tensor(x_num) + if x_cat.size(0) == 0: + x_cat = torch.empty((0, self.n_components), dtype=torch.float32) + edge_attr_cat = torch.empty((0, self.n_components), dtype=torch.float32) if x_num.size(0) == 0: - x_num = x_num.reshape(0, self.n_components) - edge_attr_num = edge_attr_num.reshape(0, self.n_components) + x_num = torch.empty((0, self.n_components), dtype=torch.float32) + edge_attr_num = torch.empty((0, self.n_components), dtype=torch.float32) - # combined node/edge attributes - x = torch.vstack((x_cat, x_num)) - x = torch.vstack((torch.ones((1, x.size(1))), x)) - edge_attr = np.vstack((edge_attr_cat, edge_attr_num)) - edge_attr = torch.tensor(edge_attr) + x = torch.cat((x_cat, x_num)) + x = torch.cat((torch.ones((1, x.size(1))), x)) + edge_attr = torch.tensor(np.vstack((edge_attr_cat, edge_attr_num)), dtype=torch.float32) - # edge_index num_nodes = num_cat + num_num + 1 edge_index, edge_attr = _create_edge_index(num_nodes, edge_attr, False, True) - # Set the center node Z = torch.mul(edge_attr, x[edge_index[1]]) - x[0, :] = Z[(edge_index[0] == 0), :].mean(dim=0) - - # Target - if y is not None: - y_ = y[idx].clone() - else: - y_ = torch.tensor([]) + x[0, :] = Z[edge_index[0] == 0].mean(dim=0) - # graph index (g_idx) - g_idx = idx + y_ = y[idx].clone() if y is not None else torch.tensor([]) - data = Data( + return Data( x=x, edge_index=edge_index, edge_attr=edge_attr, y=y_, - g_idx=g_idx, + g_idx=idx, ) - - return data