Skip to content

Commit

Permalink
ENH: add save_datasets to save dictionary data for soma-workflow #13
Browse files Browse the repository at this point in the history
  • Loading branch information
JinpengLI committed Aug 26, 2013
1 parent 01fa6e9 commit d1a90eb
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 3 deletions.
1 change: 1 addition & 0 deletions epac/map_reduce/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,7 @@ def run(self, **Xy):
name="working directory")
else:
ft_working_directory = tmp_work_dir_path

## Save the database and tree to working directory
## ===============================================
np.savez(os.path.join(tmp_work_dir_path,
Expand Down
33 changes: 32 additions & 1 deletion epac/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,4 +325,35 @@ def train_test_merge(Xy_train, Xy_test):
Xy_train = {key_push(k, conf.TRAIN): Xy_train[k] for k in Xy_train}
Xy_test = {key_push(k, conf.TEST) : Xy_test[k] for k in Xy_test}
Xy_train.update(Xy_test)
return Xy_train
return Xy_train


def save_datasets(dataset_dir, **Xy):
    '''Save a dictionary of arrays to a directory.

    Each value in ``Xy`` is written to ``<dataset_dir>/<key>.npy`` with
    ``np.save`` (works for numpy arrays and numpy.memmap).  An index file
    ``db_index.txt`` is also written: its first line is the number of
    entries, followed by one saved-file path per line.

    Parameters
    ----------
    dataset_dir : str
        Target directory; created if it does not exist.
    **Xy : arrays
        Named datasets to save (key becomes the file name).

    Example
    -------
    from sklearn import datasets
    from epac.utils import save_datasets
    X, y = datasets.make_classification(n_samples=50,
                                        n_features=10000,
                                        n_informative=2,
                                        random_state=1)
    Xy = dict(X=X, y=y)
    save_datasets("/tmp/save_datasets_data", **Xy)
    '''
    if not os.path.exists(dataset_dir):
        os.makedirs(dataset_dir)
    index_filepath = os.path.join(dataset_dir, "db_index.txt")
    # "with" guarantees the index file is closed even if np.save raises;
    # the original open/close pair leaked the handle on error.
    with open(index_filepath, "w") as file_db_index:
        file_db_index.write(str(len(Xy)) + "\n")
        # Single pass: record the path in the index and save the array.
        for key in Xy:
            filepath = os.path.join(dataset_dir, key + ".npy")
            file_db_index.write(filepath + "\n")
            np.save(filepath, Xy[key])
33 changes: 31 additions & 2 deletions examples/run_a_big_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"""

import numpy as np
import os
import os.path as path
from tempfile import mkdtemp
from sklearn import datasets
Expand All @@ -21,20 +22,48 @@ def convert2memmap(np_mat):
return mem_mat


X, y = datasets.make_classification(n_samples=12,
n_features=10,
X, y = datasets.make_classification(n_samples=50,
n_features=10000,
n_informative=2,
random_state=1)
X = convert2memmap(X)
y = convert2memmap(y)

Xy = dict(X=X, y=y)

for k in Xy:
print k
print type(Xy[k]) is np.core.memmap
print Xy[k]


np.savez("/tmp/data.dat", **Xy)





def load_datasets(datasets_filepath):
    """Load an ``.npz`` archive into a plain dict of in-memory arrays.

    Parameters
    ----------
    datasets_filepath : str
        Path to a file written by ``np.savez``
        (e.g. "/tmp/data.dat.npz").

    Returns
    -------
    dict mapping each archive key to its numpy array.
    """
    # np.load on an .npz returns an NpzFile that keeps the underlying
    # file open; materialize the arrays inside "with" so the handle is
    # released (the original never closed it).
    with np.load(datasets_filepath) as Xy:
        return {k: Xy[k] for k in Xy}


# Reload the datasets that were written with np.savez earlier in the
# script (round-trip sanity check of the archive).
Xy = load_datasets("/tmp/data.dat.npz")

from sklearn.svm import SVC
from epac import CV, Methods
# 3-fold cross-validation comparing two SVM kernels (linear vs RBF).
cv_svm = CV(Methods(*[SVC(kernel="linear"),
                      SVC(kernel="rbf")]),
            n_folds=3)


# Run the workflow locally with 2 worker processes, then reduce
# (aggregate) the per-fold results.  NOTE(review): Python 2 print
# statements — this example predates Python 3.
from epac import LocalEngine
local_engine = LocalEngine(cv_svm, num_processes=2)
cv_svm = local_engine.run(X=X, y=y)
print cv_svm.reduce()


# Re-run the same workflow through soma-workflow for distributed
# execution; interface mirrors LocalEngine.
from epac import SomaWorkflowEngine
swf_engine = SomaWorkflowEngine(cv_svm, num_processes=2)
cv_svm = swf_engine.run(X=X, y=y)
print cv_svm.reduce()

0 comments on commit d1a90eb

Please sign in to comment.