Merge pull request #75 from petered/the_little_things
The little things
petered committed Feb 25, 2016
2 parents f38de9e + 52e47ce commit ba346ec
Showing 10 changed files with 319 additions and 10 deletions.
44 changes: 38 additions & 6 deletions fileman/experiment_record.py
@@ -20,6 +20,19 @@
__author__ = 'peter'


class _ExpLibClass(object):

def __setattr__(self, experiment_name, experiment):
assert isinstance(experiment, Experiment), "Your experiment must be an experiment!"
if experiment.name is None:
experiment.name = experiment_name
assert experiment_name not in GLOBAL_EXPERIMENT_LIBRARY, "Experiment %s is already in the library" % (experiment_name, )
self.__dict__[experiment_name] = experiment
GLOBAL_EXPERIMENT_LIBRARY[experiment_name] = experiment


ExperimentLibrary = _ExpLibClass()

GLOBAL_EXPERIMENT_LIBRARY = {}
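
For orientation, a minimal usage sketch of the attribute-registration pattern above (the demo function is hypothetical; Experiment is defined further down this file):

def _my_demo():
    print 'running the demo...'

ExperimentLibrary.my_demo = Experiment(
    function = _my_demo,
    description = 'A toy experiment',
    )   # name is None here, so __setattr__ fills it in from the attribute name
assert GLOBAL_EXPERIMENT_LIBRARY['my_demo'] is ExperimentLibrary.my_demo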


@@ -368,28 +381,47 @@ def get_experiment_info(name):

class Experiment(object):

def __init__(self, name, function, description, conclusion = ''):
def __init__(self, function, description, conclusion = '', name = None, versions = None, current_version = None):
if versions is not None:
assert isinstance(versions, (list, dict))
assert current_version is not None, 'If you specify multiple versions, you have to pick a current version'
if isinstance(versions, list):
assert isinstance(current_version, int)
self.name = name
self.function = function
self.description = description
self.conclusion = conclusion
self.versions = versions
self.current_version = current_version

def __str__(self):
return 'Experiment: %s\n Defined in: %s\n Description: %s\n Conclusion: %s' % \
(self.name, inspect.getmodule(self.function).__name__, self.description, self.conclusion)

def run(self, **experiment_record_kwargs):
def run(self, print_to_console = True, show_figs = None, **experiment_record_kwargs):
"""
Run the experiment, and return the ExperimentRecord that is generated.
Note, if you want the output of the function, you should just run the function directly.
:param experiment_record_kwargs: See ExperimentRecord for kwargs
"""
print '%s Running Experiment: %s %s' % ('='*10, self.name, '='*10)
with ExperimentRecord(name = self.name, **experiment_record_kwargs) as exp_rec:
self.function()
print '%s Done Experiment: %s %s' % ('-'*11, self.name, '-'*12)
if self.versions is not None:
kwargs = self.versions[self.current_version]
name = self.name+'-'+(self.current_version if isinstance(self.current_version, str) else str(self.versions[self.current_version]))
else:
kwargs = {}
name = self.name

print '%s Running Experiment: %s %s' % ('='*10, name, '='*10)
with ExperimentRecord(name = name, print_to_console=print_to_console, show_figs=show_figs, **experiment_record_kwargs) as exp_rec:
self.function(**kwargs)
print '%s Done Experiment: %s %s' % ('-'*11, name, '-'*12)
return exp_rec

def run_all(self):
for v in (self.versions.keys() if isinstance(self.versions, dict) else xrange(len(self.versions))):
self.current_version = v
self.run()
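
A hedged sketch of the new versioning mechanism (the function and version names are hypothetical):

exp = Experiment(
    function = lambda n_epochs = 20: None,   # stand-in for a real training function
    description = 'Compare training lengths',
    name = 'training-length',
    versions = {'short': dict(n_epochs = 10), 'long': dict(n_epochs = 100)},
    current_version = 'short',
    )
exp.run()      # Runs the 'short' version; the record is named 'training-length-short'
exp.run_all()  # Runs every version in turn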


if __name__ == '__main__':
browse_experiment_records()
20 changes: 20 additions & 0 deletions plato/interfaces/helpers.py
@@ -134,3 +134,23 @@ def __call__(self, x):
add_update(running_mean_sq, new_running_mean_sq)
running_std = tt.sqrt((new_running_mean_sq - new_running_mean**2))
return (x - running_mean)/(running_std+1e-7)


@symbolic_simple
class SlowBatchCenter(object):
"""
Keeps a running mean and standard deviation, and normalizes the incoming data according to these.
This can be useful if you want to do something similar to minibatch-normalization, but without having
the batch-size tied to the normalization range.
"""

def __init__(self, half_life):
self.decay_constant = np.exp(-np.log(2)/half_life).astype(theano.config.floatX)

def __call__(self, x):
        # x must be a single-sample minibatch, of shape (1, n_dims, ...)
        assert x.ishape[0]==1, "This method only works for minibatches of size 1, but you used a minibatch of size: %s" % (x.tag.test_value.shape[0])
running_mean = create_shared_variable(np.zeros(x.tag.test_value.shape[1:]))
new_running_mean = running_mean * self.decay_constant + x[0] * (1-self.decay_constant).astype(theano.config.floatX)
add_update(running_mean, new_running_mean)
return x - running_mean
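
The decay constant is parameterized by a half-life: after half_life updates, the weight of an old sample has halved. A quick numpy-only sanity check (hypothetical half-life value):

import numpy as np
half_life = 100.
decay = np.exp(-np.log(2) / half_life)
assert abs(decay ** half_life - 0.5) < 1e-12   # an old sample's weight halves after half_life steps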
2 changes: 1 addition & 1 deletion plato/tools/va/demo_gaussian_vae.py
@@ -108,4 +108,4 @@ def demo_simple_vae_on_mnist(

if __name__ == '__main__':

run_experiment('mnist-vae-20d-continuous_in')
run_experiment('mnist-vae-2latent')
8 changes: 8 additions & 0 deletions plotting/data_conversion.py
@@ -23,6 +23,14 @@ def vector_length_to_tile_dims(vector_length):
return grid_shape


def put_vector_in_grid(vec):
n_rows, n_cols = vector_length_to_tile_dims(len(vec))
    grid = np.zeros(n_rows*n_cols, dtype = vec.dtype)
    grid[:len(vec)] = vec
    grid = grid.reshape(n_rows, n_cols)
return grid
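
As a hedged illustration (assuming vector_length_to_tile_dims picks a 2x3 grid for a length-5 vector):

# put_vector_in_grid(np.array([1, 2, 3, 4, 5]))
# -> array([[1, 2, 3],
#           [4, 5, 0]])   # cells beyond len(vec) stay zero-padded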


@memoize
def _data_shape_and_boundary_width_to_grid_slices(shape, grid_shape, boundary_width):

4 changes: 2 additions & 2 deletions utils/benchmarks/plot_learning_curves.py
@@ -10,7 +10,7 @@
__author__ = 'peter'


def plot_learning_curves(learning_curves, xscale = 'sqrt', yscale = 'linear', hang = None, title = None, figure_name = None):
def plot_learning_curves(learning_curves, xscale = 'sqrt', yscale = 'linear', hang = None, title = None, figure_name = None, y_title = 'Score'):
"""
Plot a set of PredictionResults. These can be obtained by running compare_predictors.
See module test_compare_predictors for an example.
@@ -53,7 +53,7 @@ def plot_learning_curves(learning_curves, xscale = 'sqrt', yscale = 'linear', ha
legend.append('%s-test' % record_name)

plt.xlabel('Epoch')
plt.ylabel('Score')
plt.ylabel(y_title)
plt.legend(legend, loc = 'best')
if title is not None:
plt.title(title)
2 changes: 1 addition & 1 deletion utils/benchmarks/predictor_comparison.py
@@ -277,4 +277,4 @@ def get_scores(self, which_test_set = None):
else:
assert which_test_set in results, 'You asked for results for the test set %s, but we only have test sets %s' \
% (which_test_set, results.keys())
return results[which_test_set]
return results[which_test_set]
5 changes: 5 additions & 0 deletions utils/benchmarks/train_and_test.py
@@ -41,6 +41,7 @@ def get_evaluation_function(name):
'mse': mean_squared_error,
'mean_squared_error': mean_squared_error,
'percent_argmax_correct': percent_argmax_correct,
'percent_argmax_incorrect': percent_argmax_incorrect,
'percent_correct': percent_correct,
}[name]

@@ -68,6 +69,10 @@ def percent_argmax_correct(actual, target):
return 100*fraction_correct(actual, target)


def percent_argmax_incorrect(actual, target):
return 100 - percent_argmax_correct(actual, target)
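
A hedged sketch of the new metric (hypothetical arrays; assumes integer-label targets are accepted, as collapse_onehot_if_necessary suggests):

import numpy as np
actual = np.array([[.9, .1], [.2, .8]])   # argmax predictions: [0, 1]
target = np.array([0, 0])                 # the second prediction is wrong
error_fcn = get_evaluation_function('percent_argmax_incorrect')   # the new dict entry above
assert error_fcn(actual, target) == 100 - percent_argmax_correct(actual, target)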


def collapse_onehot_if_necessary(output_data):
"""
Given an input that could either be in onehot encoding or not, return it in onehot encoding.
18 changes: 18 additions & 0 deletions utils/bureaucracy.py
@@ -19,6 +19,24 @@ def multichannel(fcn):
return lambda args: (fcn(*args), )


def single_to_batch(fcn, *batch_inputs, **batch_kwargs):
"""
:param fcn: A function
:param batch_inputs: A collection of batch-form (n_samples, input_dims_i) inputs
:return: batch_outputs, an (n_samples, output_dims) array
"""
n_samples = len(batch_inputs[0])
assert all(len(b) == n_samples for b in batch_inputs)
first_out = fcn(*[b[0] for b in batch_inputs], **{k: b[0] for k, b in batch_kwargs.iteritems()})
if n_samples==1:
return first_out[None]
out = np.empty((n_samples, )+first_out.shape)
    out[0] = first_out
for i in xrange(1, n_samples):
out[i] = fcn(*[b[i] for b in batch_inputs], **{k: b[i] for k, b in batch_kwargs.iteritems()})
return out
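
A hedged usage sketch (the single-sample function here is hypothetical):

import numpy as np
single_fcn = lambda v: v / np.linalg.norm(v)   # maps one (n_dims, ) sample to one output
batch = np.random.randn(16, 4)
out = single_to_batch(single_fcn, batch)       # applies single_fcn row by row
assert out.shape == (16, 4)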


def minibatch_iterate(data, minibatch_size, n_epochs=1):
"""
Yields minibatches in sequence.
144 changes: 144 additions & 0 deletions utils/datasets/newsgroups.py
@@ -0,0 +1,144 @@
from fileman.file_getter import get_file
from general.should_be_builtins import memoize
import numpy as np
from utils.datasets.datasets import DataSet

__author__ = 'peter'


@memoize
def get_20_newsgroups_dataset(filter_most_common = 2000, numeric = False, shuffling_seed = 1234, bag_of_words = False, count_scaling = None):
"""
    The 20 newsgroups dataset. The task is to predict which newsgroup a post came from, given the words
    the post contains.
Words have been preprocessed to the "stemmed" version, as explained on the website:
http://ana.cachopo.org/datasets-for-single-label-text-categorization
:param filter_most_common: Can be:
None: Don't filter out words
        int N: Filter out words that are not in the N most common words
(int N, int M): Filter out words that are not between the Nth and Mth most common words.
:param numeric: Convert everything from words to numbers
:param shuffling_seed: Random seed for shuffling (you want to shuffle, because everything's sorted by topic)
:param bag_of_words: Return count vectors for each word
    :param count_scaling: If using bag_of_words, apply the transformation:
        vector = log(1+word_counts)
        to generate the input data (this scaling makes it more suitable for some types of classifiers).
:return: A DataSet object
"""

training_set_file = get_file(
relative_name = 'data/20ng-train-stemmed.txt',
url = 'http://ana.cachopo.org/datasets-for-single-label-text-categorization/20ng-train-stemmed.txt'
)

test_set_file = get_file(
relative_name = 'data/20ng-test-stemmed.txt',
url = 'http://ana.cachopo.org/datasets-for-single-label-text-categorization/20ng-test-stemmed.txt'
)

train_words, train_labels = _read_formatted_file(training_set_file)
test_words, test_labels = _read_formatted_file(test_set_file)

# Shuffle it up...
rng = np.random.RandomState(shuffling_seed)
train_words, train_labels =_shuffle((train_words, train_labels), rng)
test_words, test_labels =_shuffle((test_words, test_labels), rng)

# Filter out most-common-but-not-too-common-words
all_train_words = np.concatenate(train_words)
filtered_vocab, counts = _find_most_common(all_train_words, filter_most_common)
train_words = _filter_lists_of_words(train_words, filtered_vocab)
test_words = _filter_lists_of_words(test_words, filtered_vocab)

if numeric or bag_of_words:
train_ixs_list = _list_of_posts_to_list_of_ixs(train_words, filtered_vocab)
test_ixs_list = _list_of_posts_to_list_of_ixs(test_words, filtered_vocab)
label_vocab = {lab: i for i, lab in enumerate(np.unique(train_labels))}
train_labels = _words_to_ints(train_labels, label_vocab)
test_labels = _words_to_ints(test_labels, label_vocab)

if bag_of_words:
train_counts = _list_of_ixs_to_count_matrix(train_ixs_list, n_words=len(filtered_vocab))
test_counts = _list_of_ixs_to_count_matrix(test_ixs_list, n_words=len(filtered_vocab))
if count_scaling == 'log':
train_counts = np.log(1+train_counts)
test_counts = np.log(1+test_counts)
return DataSet.from_xyxy(training_inputs = train_counts, training_targets = train_labels, test_inputs = test_counts, test_targets = test_labels)
else:
return DataSet.from_xyxy(training_inputs = train_ixs_list, training_targets = train_labels, test_inputs = test_ixs_list, test_targets = test_labels)
else:
return DataSet.from_xyxy(training_inputs = train_words, training_targets = train_labels, test_inputs = test_words, test_targets = test_labels)



def _read_formatted_file(file_relative_path):

with open(get_file(file_relative_path)) as f:
text = f.read()
pairs = [line.split('\t') for line in text.split('\n')[:-1]]
labels = [group for group, _ in pairs]
words = [sentence.split(' ') for _, sentence in pairs]
return words, labels


def _find_most_common(elements, n_most_common):

unique_elements, counts = np.unique(elements, return_counts=True)
# ixs = np.argpartition(-counts, kth = n_most_common)
ixs = np.argsort(counts)[::-1]
if isinstance(n_most_common, int):
most_common_element_ixs = ixs[:n_most_common]
else:
        assert isinstance(n_most_common, tuple) and len(n_most_common) == 2, \
            'n_most_common must be an int or a (start, stop) tuple, got: %s' % (n_most_common, )
start, stop = n_most_common
most_common_element_ixs = ixs[start:stop]
most_common_elements = unique_elements[most_common_element_ixs]
return most_common_elements, counts[most_common_element_ixs]


def _filter_words(word_list, filter_set):
if not isinstance(filter_set, set):
filter_set = set(filter_set)
return [w for w in word_list if w in filter_set]


def _filter_lists_of_words(lists_of_words, filter_set):
return np.array([_filter_words(word_list, filter_set) for word_list in lists_of_words])


def _words_to_ints(word_list, lookup):
return np.array([lookup[w] for w in word_list])


def _list_of_posts_to_list_of_ixs(list_of_posts, vocabulary):
div_ixs = np.cumsum([len(post) for post in list_of_posts])[:-1]
all_filtered_words = np.concatenate(list_of_posts)
ixs = np.zeros(len(all_filtered_words), dtype = int)
for i, w in enumerate(vocabulary):
ixs[all_filtered_words==w] = i
list_of_ixs = np.split(ixs, div_ixs)
return np.array(list_of_ixs)


def _list_of_ixs_to_count_matrix(list_of_ixs, n_words):
n_samples = len(list_of_ixs)
counts = np.zeros((n_samples, n_words), dtype = int)
for c, ixs in zip(counts, list_of_ixs):
np.add.at(c, ixs, 1)
return counts
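
Note that np.add.at accumulates over repeated indices, so a word appearing twice in a post is counted twice. A quick check (hypothetical values):

row = np.zeros(4, dtype = int)
np.add.at(row, [1, 1, 3], 1)   # index 1 appears twice, so it accumulates to 2
assert row.tolist() == [0, 2, 0, 1]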


def _shuffle(arrays, rng):
n_samples = len(arrays[0])
assert all(n_samples == len(arr) for arr in arrays)
ixs = np.array(rng.permutation(n_samples))
return tuple(np.array(arr)[ixs] for arr in arrays)


if __name__ == '__main__':

data = get_20_newsgroups_dataset(numeric=False, filter_most_common = (100, 500), bag_of_words=False)
for _, (inputs, ), (targets, ) in data.training_set.shorten(20).minibatch_iterator(minibatch_size = 1):
print '%s: %s' % (targets, inputs)
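
    # A hedged sketch of the bag-of-words path described in the docstring
    # (assumes the same iterator API as the demo above):
    data = get_20_newsgroups_dataset(filter_most_common = 2000, bag_of_words = True, count_scaling = 'log')
    for _, (count_vec, ), (label, ) in data.training_set.shorten(3).minibatch_iterator(minibatch_size = 1):
        print '%s: %s of %s words present' % (label, (count_vec > 0).sum(), count_vec.shape[-1])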