From d1a90eba8c82fd3254ab46948575f1f9892bc31f Mon Sep 17 00:00:00 2001
From: jinpeng
Date: Mon, 26 Aug 2013 12:15:41 +0200
Subject: [PATCH] ENH: add save_datasets to save dictionary data for
 soma-workflow #13

---
 epac/map_reduce/engine.py    |  1 +
 epac/utils.py                | 33 ++++++++++++++++++++++++++++++++-
 examples/run_a_big_matrix.py | 32 ++++++++++++++++++++++++++++++--
 3 files changed, 63 insertions(+), 3 deletions(-)

diff --git a/epac/map_reduce/engine.py b/epac/map_reduce/engine.py
index 39af303..b3b7091 100644
--- a/epac/map_reduce/engine.py
+++ b/epac/map_reduce/engine.py
@@ -272,6 +272,7 @@ def run(self, **Xy):
                 name="working directory")
         else:
             ft_working_directory = tmp_work_dir_path
+
         ## Save the database and tree to working directory
         ## ===============================================
         np.savez(os.path.join(tmp_work_dir_path,
diff --git a/epac/utils.py b/epac/utils.py
index 6b2613c..c4e94b7 100644
--- a/epac/utils.py
+++ b/epac/utils.py
@@ -325,4 +325,35 @@ def train_test_merge(Xy_train, Xy_test):
     Xy_train = {key_push(k, conf.TRAIN): Xy_train[k] for k in Xy_train}
     Xy_test = {key_push(k, conf.TEST) : Xy_test[k] for k in Xy_test}
     Xy_train.update(Xy_test)
-    return Xy_train
\ No newline at end of file
+    return Xy_train
+
+
+def save_datasets(dataset_dir, **Xy):
+    '''Save a dictionary of arrays to a directory.
+
+    The dictionary values may be numpy arrays or numpy.memmap objects.
+    Each value is saved as "<key>.npy" and listed in an index file.
+
+    Example
+    -------
+    from sklearn import datasets
+    from epac.utils import save_datasets
+    X, y = datasets.make_classification(n_samples=50,
+                                        n_features=10000,
+                                        n_informative=2,
+                                        random_state=1)
+    Xy = dict(X=X, y=y)
+    save_datasets("/tmp/save_datasets_data", **Xy)
+    '''
+    if not os.path.exists(dataset_dir):
+        os.makedirs(dataset_dir)
+    # Write an index file: the first line is the number of arrays,
+    # each following line is the path of one saved .npy file
+    index_filepath = os.path.join(dataset_dir, "db_index.txt")
+    file_db_index = open(index_filepath, "w")
+    file_db_index.write(str(len(Xy)) + "\n")
+    for key in Xy:
+        filepath = os.path.join(dataset_dir, key + ".npy")
+        file_db_index.write(filepath + "\n")
+        np.save(filepath, Xy[key])
+    file_db_index.close()
diff --git a/examples/run_a_big_matrix.py b/examples/run_a_big_matrix.py
index e1639e1..69020e9 100644
--- a/examples/run_a_big_matrix.py
+++ b/examples/run_a_big_matrix.py
@@ -21,20 +21,48 @@ def convert2memmap(np_mat):
     return mem_mat
 
 
-X, y = datasets.make_classification(n_samples=12,
-                                    n_features=10,
+X, y = datasets.make_classification(n_samples=50,
+                                    n_features=10000,
                                     n_informative=2,
                                     random_state=1)
 X = convert2memmap(X)
 y = convert2memmap(y)
 
+Xy = dict(X=X, y=y)
+
+# Check that both arrays are memory-mapped
+for k in Xy:
+    print k
+    print isinstance(Xy[k], np.memmap)
+    print Xy[k]
+
+# np.savez appends ".npz" to the given filename
+np.savez("/tmp/data.dat", **Xy)
+
+
+def load_datasets(datasets_filepath):
+    Xy = np.load(datasets_filepath)
+    return {k: Xy[k] for k in Xy.keys()}
+
+
+# Reload the saved arrays to check the round-trip
+Xy = load_datasets("/tmp/data.dat.npz")
+
 from sklearn.svm import SVC
 from epac import CV, Methods
 
 cv_svm = CV(Methods(*[SVC(kernel="linear"),
                       SVC(kernel="rbf")]),
             n_folds=3)
+
 from epac import LocalEngine
 local_engine = LocalEngine(cv_svm, num_processes=2)
 cv_svm = local_engine.run(X=X, y=y)
 print cv_svm.reduce()
+
+
+# The same workflow can be distributed through soma-workflow
+from epac import SomaWorkflowEngine
+swf_engine = SomaWorkflowEngine(cv_svm, num_processes=2)
+cv_svm = swf_engine.run(X=X, y=y)
+print cv_svm.reduce()
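
The patch adds only the write path for the db_index.txt format. Below is a
minimal sketch of the matching read path, assuming the layout produced by
save_datasets above (first line: number of arrays; each following line: one
.npy filepath). The name load_saved_datasets is hypothetical, not part of
EPAC:

import os
import numpy as np


def load_saved_datasets(dataset_dir, mmap_mode=None):
    # Read db_index.txt: the first line gives the number of arrays,
    # each following line is the path of one .npy file
    index_filepath = os.path.join(dataset_dir, "db_index.txt")
    file_db_index = open(index_filepath, "r")
    n_arrays = int(file_db_index.readline())
    Xy = {}
    for _ in range(n_arrays):
        filepath = file_db_index.readline().strip()
        # The key is the filename without the ".npy" extension
        key = os.path.splitext(os.path.basename(filepath))[0]
        # mmap_mode="r" maps each array back as a read-only memmap
        Xy[key] = np.load(filepath, mmap_mode=mmap_mode)
    file_db_index.close()
    return Xy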
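
A quick round-trip check under the same assumptions, once the patch is
applied (save_datasets comes from epac.utils; load_saved_datasets is the
hypothetical reader sketched above):

import numpy as np
from epac.utils import save_datasets

X = np.arange(20.0).reshape(4, 5)
y = np.array([0, 1, 0, 1])
save_datasets("/tmp/save_datasets_data", X=X, y=y)

Xy = load_saved_datasets("/tmp/save_datasets_data", mmap_mode="r")
assert sorted(Xy.keys()) == ["X", "y"]
assert np.allclose(Xy["X"], X)
assert np.allclose(Xy["y"], y)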