From d1a90eba8c82fd3254ab46948575f1f9892bc31f Mon Sep 17 00:00:00 2001
From: jinpeng
Date: Mon, 26 Aug 2013 12:15:41 +0200
Subject: [PATCH] ENH: add save_datasets to save dictionary data for
 soma-workflow #13

---
 epac/map_reduce/engine.py    |  1 +
 epac/utils.py                | 33 ++++++++++++++++++++++++++++++++-
 examples/run_a_big_matrix.py | 32 ++++++++++++++++++++++++++++++--
 3 files changed, 63 insertions(+), 3 deletions(-)

diff --git a/epac/map_reduce/engine.py b/epac/map_reduce/engine.py
index 39af303..b3b7091 100644
--- a/epac/map_reduce/engine.py
+++ b/epac/map_reduce/engine.py
@@ -272,6 +272,7 @@ def run(self, **Xy):
                 name="working directory")
         else:
             ft_working_directory = tmp_work_dir_path
+
         ## Save the database and tree to working directory
         ## ===============================================
         np.savez(os.path.join(tmp_work_dir_path,
diff --git a/epac/utils.py b/epac/utils.py
index 6b2613c..c4e94b7 100644
--- a/epac/utils.py
+++ b/epac/utils.py
@@ -325,4 +325,35 @@ def train_test_merge(Xy_train, Xy_test):
     Xy_train = {key_push(k, conf.TRAIN): Xy_train[k] for k in Xy_train}
     Xy_test = {key_push(k, conf.TEST) : Xy_test[k] for k in Xy_test}
     Xy_train.update(Xy_test)
-    return Xy_train
\ No newline at end of file
+    return Xy_train
+
+
+def save_datasets(dataset_dir, **Xy):
+    '''Save a dictionary of arrays to a directory.
+
+    The dictionary values may be numpy arrays or numpy.memmap objects.
+    Each value is saved as "<key>.npy" and listed in an index file.
+
+    Example
+    -------
+    from sklearn import datasets
+    from epac.utils import save_datasets
+    X, y = datasets.make_classification(n_samples=50,
+                                        n_features=10000,
+                                        n_informative=2,
+                                        random_state=1)
+    Xy = dict(X=X, y=y)
+    save_datasets("/tmp/save_datasets_data", **Xy)
+    '''
+    if not os.path.exists(dataset_dir):
+        os.makedirs(dataset_dir)
+    # Write an index file: the first line is the number of arrays,
+    # each following line is the path of one saved .npy file
+    index_filepath = os.path.join(dataset_dir, "db_index.txt")
+    file_db_index = open(index_filepath, "w")
+    file_db_index.write(str(len(Xy)) + "\n")
+    for key in Xy:
+        filepath = os.path.join(dataset_dir, key + ".npy")
+        file_db_index.write(filepath + "\n")
+        np.save(filepath, Xy[key])
+    file_db_index.close()
diff --git a/examples/run_a_big_matrix.py b/examples/run_a_big_matrix.py
index e1639e1..69020e9 100644
--- a/examples/run_a_big_matrix.py
+++ b/examples/run_a_big_matrix.py
@@ -21,20 +21,48 @@ def convert2memmap(np_mat):
     return mem_mat
 
 
-X, y = datasets.make_classification(n_samples=12,
-                                    n_features=10,
+X, y = datasets.make_classification(n_samples=50,
+                                    n_features=10000,
                                     n_informative=2,
                                     random_state=1)
 X = convert2memmap(X)
 y = convert2memmap(y)
 
+Xy = dict(X=X, y=y)
+
+# Check that both arrays are memory-mapped
+for k in Xy:
+    print k
+    print isinstance(Xy[k], np.memmap)
+    print Xy[k]
+
+# np.savez appends ".npz" to the given filename
+np.savez("/tmp/data.dat", **Xy)
+
+
+def load_datasets(datasets_filepath):
+    Xy = np.load(datasets_filepath)
+    return {k: Xy[k] for k in Xy.keys()}
+
+
+# Reload the saved arrays to check the round-trip
+Xy = load_datasets("/tmp/data.dat.npz")
+
 from sklearn.svm import SVC
 from epac import CV, Methods
 
 cv_svm = CV(Methods(*[SVC(kernel="linear"),
                       SVC(kernel="rbf")]),
             n_folds=3)
+
 from epac import LocalEngine
 local_engine = LocalEngine(cv_svm, num_processes=2)
 cv_svm = local_engine.run(X=X, y=y)
 print cv_svm.reduce()
+
+
+# The same workflow can be distributed through soma-workflow
+from epac import SomaWorkflowEngine
+swf_engine = SomaWorkflowEngine(cv_svm, num_processes=2)
+cv_svm = swf_engine.run(X=X, y=y)
+print cv_svm.reduce()
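
The patch adds only the write path for the db_index.txt format. Below is a
minimal sketch of the matching read path, assuming the layout produced by
save_datasets above (first line: number of arrays; each following line: one
.npy filepath). The name load_saved_datasets is hypothetical, not part of
EPAC:

import os
import numpy as np


def load_saved_datasets(dataset_dir, mmap_mode=None):
    # Read db_index.txt: the first line gives the number of arrays,
    # each following line is the path of one .npy file
    index_filepath = os.path.join(dataset_dir, "db_index.txt")
    file_db_index = open(index_filepath, "r")
    n_arrays = int(file_db_index.readline())
    Xy = {}
    for _ in range(n_arrays):
        filepath = file_db_index.readline().strip()
        # The key is the filename without the ".npy" extension
        key = os.path.splitext(os.path.basename(filepath))[0]
        # mmap_mode="r" maps each array back as a read-only memmap
        Xy[key] = np.load(filepath, mmap_mode=mmap_mode)
    file_db_index.close()
    return Xy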
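
A quick round-trip check under the same assumptions, once the patch is
applied (save_datasets comes from epac.utils; load_saved_datasets is the
hypothetical reader sketched above):

import numpy as np
from epac.utils import save_datasets

X = np.arange(20.0).reshape(4, 5)
y = np.array([0, 1, 0, 1])
save_datasets("/tmp/save_datasets_data", X=X, y=y)

Xy = load_saved_datasets("/tmp/save_datasets_data", mmap_mode="r")
assert sorted(Xy.keys()) == ["X", "y"]
assert np.allclose(Xy["X"], X)
assert np.allclose(Xy["y"], y)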