Skip to content

Commit

Permalink
ENH: add save_datasets to save dictionary data for soma-workflow #13
Browse files Browse the repository at this point in the history
  • Loading branch information
JinpengLI committed Aug 26, 2013
1 parent 01fa6e9 commit d1a90eb
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 3 deletions.
1 change: 1 addition & 0 deletions epac/map_reduce/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,7 @@ def run(self, **Xy):
name="working directory")
else:
ft_working_directory = tmp_work_dir_path

## Save the database and tree to working directory
## ===============================================
np.savez(os.path.join(tmp_work_dir_path,
Expand Down
33 changes: 32 additions & 1 deletion epac/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,4 +325,35 @@ def train_test_merge(Xy_train, Xy_test):
Xy_train = {key_push(k, conf.TRAIN): Xy_train[k] for k in Xy_train}
Xy_test = {key_push(k, conf.TEST) : Xy_test[k] for k in Xy_test}
Xy_train.update(Xy_test)
return Xy_train
return Xy_train


def save_datasets(dataset_dir, **Xy):
    '''Save a dictionary of arrays to a directory.

    Each value in ``Xy`` is written to ``<dataset_dir>/<key>.npy`` with
    ``np.save`` (works for numpy arrays and numpy.memmap).  An index file
    ``db_index.txt`` is also written: its first line is the number of
    entries, followed by one saved-file path per line.

    Parameters
    ----------
    dataset_dir : str
        Target directory; created if it does not exist.
    **Xy : arrays
        Named datasets to save (key becomes the file name).

    Example
    -------
    from sklearn import datasets
    from epac.utils import save_datasets
    X, y = datasets.make_classification(n_samples=50,
                                        n_features=10000,
                                        n_informative=2,
                                        random_state=1)
    Xy = dict(X=X, y=y)
    save_datasets("/tmp/save_datasets_data", **Xy)
    '''
    if not os.path.exists(dataset_dir):
        os.makedirs(dataset_dir)
    index_filepath = os.path.join(dataset_dir, "db_index.txt")
    # "with" guarantees the index file is closed even if np.save raises;
    # the original open/close pair leaked the handle on error.
    with open(index_filepath, "w") as file_db_index:
        file_db_index.write(str(len(Xy)) + "\n")
        # Single pass: record the path in the index and save the array.
        for key in Xy:
            filepath = os.path.join(dataset_dir, key + ".npy")
            file_db_index.write(filepath + "\n")
            np.save(filepath, Xy[key])
33 changes: 31 additions & 2 deletions examples/run_a_big_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"""

import numpy as np
import os
import os.path as path
from tempfile import mkdtemp
from sklearn import datasets
Expand All @@ -21,20 +22,48 @@ def convert2memmap(np_mat):
return mem_mat


X, y = datasets.make_classification(n_samples=12,
n_features=10,
X, y = datasets.make_classification(n_samples=50,
n_features=10000,
n_informative=2,
random_state=1)
X = convert2memmap(X)
y = convert2memmap(y)

Xy = dict(X=X, y=y)

for k in Xy:
print k
print type(Xy[k]) is np.core.memmap
print Xy[k]


np.savez("/tmp/data.dat", **Xy)





def load_datasets(datasets_filepath):
    """Load an ``.npz`` archive into a plain dict of in-memory arrays.

    Parameters
    ----------
    datasets_filepath : str
        Path to a file written by ``np.savez``
        (e.g. "/tmp/data.dat.npz").

    Returns
    -------
    dict mapping each archive key to its numpy array.
    """
    # np.load on an .npz returns an NpzFile that keeps the underlying
    # file open; materialize the arrays inside "with" so the handle is
    # released (the original never closed it).
    with np.load(datasets_filepath) as Xy:
        return {k: Xy[k] for k in Xy}


# Reload the datasets that were written with np.savez earlier in the
# script (round-trip sanity check of the archive).
Xy = load_datasets("/tmp/data.dat.npz")

from sklearn.svm import SVC
from epac import CV, Methods
# 3-fold cross-validation comparing two SVM kernels (linear vs RBF).
cv_svm = CV(Methods(*[SVC(kernel="linear"),
                      SVC(kernel="rbf")]),
            n_folds=3)


# Run the workflow locally with 2 worker processes, then reduce
# (aggregate) the per-fold results.  NOTE(review): Python 2 print
# statements — this example predates Python 3.
from epac import LocalEngine
local_engine = LocalEngine(cv_svm, num_processes=2)
cv_svm = local_engine.run(X=X, y=y)
print cv_svm.reduce()


# Re-run the same workflow through soma-workflow for distributed
# execution; interface mirrors LocalEngine.
from epac import SomaWorkflowEngine
swf_engine = SomaWorkflowEngine(cv_svm, num_processes=2)
cv_svm = swf_engine.run(X=X, y=y)
print cv_svm.reduce()

0 comments on commit d1a90eb

Please sign in to comment.