Merge pull request #75 from petered/the_little_things
The little things
Showing 10 changed files with 319 additions and 10 deletions.
@@ -0,0 +1,144 @@
from fileman.file_getter import get_file
from general.should_be_builtins import memoize
import numpy as np
from utils.datasets.datasets import DataSet

__author__ = 'peter'

@memoize
def get_20_newsgroups_dataset(filter_most_common = 2000, numeric = False, shuffling_seed = 1234, bag_of_words = False, count_scaling = None):
    """
    The 20 newsgroups dataset.  In this dataset, you try to predict the topic of a forum from the words contained in
    posts in the forums.

    Words have been preprocessed to the "stemmed" version, as explained on the website:
    http://ana.cachopo.org/datasets-for-single-label-text-categorization

    :param filter_most_common: Can be:
        None: Don't filter out words
        int N: Filter out words that are not among the N most common words
        (int N, int M): Filter out words that are not between the Nth and Mth most common words.
    :param numeric: Convert everything from words to numbers
    :param shuffling_seed: Random seed for shuffling (you want to shuffle, because everything's sorted by topic)
    :param bag_of_words: Return count vectors for each word
    :param count_scaling: If using bag_of_words, apply the transformation vector = log(1 + word_counts)
        to generate the input data (this scaling makes it more suitable for some types of classifiers).
    :return: A DataSet object
    """

    training_set_file = get_file(
        relative_name = 'data/20ng-train-stemmed.txt',
        url = 'http://ana.cachopo.org/datasets-for-single-label-text-categorization/20ng-train-stemmed.txt'
        )

    test_set_file = get_file(
        relative_name = 'data/20ng-test-stemmed.txt',
        url = 'http://ana.cachopo.org/datasets-for-single-label-text-categorization/20ng-test-stemmed.txt'
        )

    train_words, train_labels = _read_formatted_file(training_set_file)
    test_words, test_labels = _read_formatted_file(test_set_file)

    # Shuffle it up (the raw files are sorted by topic)...
    rng = np.random.RandomState(shuffling_seed)
    train_words, train_labels = _shuffle((train_words, train_labels), rng)
    test_words, test_labels = _shuffle((test_words, test_labels), rng)

    # Filter out the most-common-but-not-too-common words
    all_train_words = np.concatenate(train_words)
    filtered_vocab, counts = _find_most_common(all_train_words, filter_most_common)
    train_words = _filter_lists_of_words(train_words, filtered_vocab)
    test_words = _filter_lists_of_words(test_words, filtered_vocab)

    if numeric or bag_of_words:
        train_ixs_list = _list_of_posts_to_list_of_ixs(train_words, filtered_vocab)
        test_ixs_list = _list_of_posts_to_list_of_ixs(test_words, filtered_vocab)
        label_vocab = {lab: i for i, lab in enumerate(np.unique(train_labels))}
        train_labels = _words_to_ints(train_labels, label_vocab)
        test_labels = _words_to_ints(test_labels, label_vocab)

        if bag_of_words:
            train_counts = _list_of_ixs_to_count_matrix(train_ixs_list, n_words=len(filtered_vocab))
            test_counts = _list_of_ixs_to_count_matrix(test_ixs_list, n_words=len(filtered_vocab))
            if count_scaling == 'log':
                train_counts = np.log(1 + train_counts)
                test_counts = np.log(1 + test_counts)
            return DataSet.from_xyxy(training_inputs = train_counts, training_targets = train_labels, test_inputs = test_counts, test_targets = test_labels)
        else:
            return DataSet.from_xyxy(training_inputs = train_ixs_list, training_targets = train_labels, test_inputs = test_ixs_list, test_targets = test_labels)
    else:
        return DataSet.from_xyxy(training_inputs = train_words, training_targets = train_labels, test_inputs = test_words, test_targets = test_labels)


def _read_formatted_file(file_relative_path):
    # Each line of the data files has the format: "<topic-label>\t<space-separated stemmed words>"
    with open(get_file(file_relative_path)) as f:
        text = f.read()
    pairs = [line.split('\t') for line in text.split('\n')[:-1]]
    labels = [group for group, _ in pairs]
    words = [sentence.split(' ') for _, sentence in pairs]
    return words, labels


def _find_most_common(elements, n_most_common):
    # Return the most common unique elements, along with their counts.
    unique_elements, counts = np.unique(elements, return_counts=True)
    ixs = np.argsort(counts)[::-1]  # Indices of unique elements, from most to least common
    if isinstance(n_most_common, int):
        most_common_element_ixs = ixs[:n_most_common]
    else:
        assert isinstance(n_most_common, tuple) and len(n_most_common) == 2, \
            'n_most_common must be an int or a (start, stop) tuple, got %s' % (n_most_common, )
        start, stop = n_most_common
        most_common_element_ixs = ixs[start:stop]
    most_common_elements = unique_elements[most_common_element_ixs]
    return most_common_elements, counts[most_common_element_ixs]
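
# A quick sanity check of the ranking logic above (hypothetical data, not from
# the dataset): _find_most_common(np.array(['a', 'b', 'a', 'c', 'a', 'b']), 2)
# returns the two most frequent elements with their counts:
# (array(['a', 'b'], ...), array([3, 2])).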


def _filter_words(word_list, filter_set):
    if not isinstance(filter_set, set):
        filter_set = set(filter_set)  # Sets give O(1) membership tests
    return [w for w in word_list if w in filter_set]


def _filter_lists_of_words(lists_of_words, filter_set):
    filter_set = set(filter_set)  # Convert once here, rather than once per post inside _filter_words
    # dtype=object because posts have different lengths after filtering (a ragged array)
    return np.array([_filter_words(word_list, filter_set) for word_list in lists_of_words], dtype = object)


def _words_to_ints(word_list, lookup):
    return np.array([lookup[w] for w in word_list])


def _list_of_posts_to_list_of_ixs(list_of_posts, vocabulary):
    # Concatenate all posts into one word-array, map words to vocabulary
    # indices in a single pass, then split back into per-post index arrays.
    div_ixs = np.cumsum([len(post) for post in list_of_posts])[:-1]
    all_filtered_words = np.concatenate(list_of_posts)
    ixs = np.zeros(len(all_filtered_words), dtype = int)
    for i, w in enumerate(vocabulary):
        ixs[all_filtered_words == w] = i
    list_of_ixs = np.split(ixs, div_ixs)
    return np.array(list_of_ixs, dtype = object)  # dtype=object: posts have different lengths
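
# E.g. with vocabulary ['hello', 'world'] and posts [['hello', 'world'], ['world']]
# (hypothetical data), this returns the per-post index arrays
# array([0, 1]) and array([1]).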


def _list_of_ixs_to_count_matrix(list_of_ixs, n_words):
    n_samples = len(list_of_ixs)
    counts = np.zeros((n_samples, n_words), dtype = int)
    # np.add.at does unbuffered in-place addition, so a word appearing k times in
    # a post contributes k to its count (plain fancy-indexed += would count it once).
    for c, ixs in zip(counts, list_of_ixs):
        np.add.at(c, ixs, 1)
    return counts
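
# E.g. _list_of_ixs_to_count_matrix([np.array([0, 1, 1])], n_words=3)
# (hypothetical data) returns array([[1, 2, 0]]): word 1 occurred twice.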


def _shuffle(arrays, rng):
    # Shuffle several same-length arrays/lists with one shared permutation,
    # so that inputs stay aligned with their labels.
    n_samples = len(arrays[0])
    assert all(n_samples == len(arr) for arr in arrays)
    ixs = rng.permutation(n_samples)  # Already an ndarray; no need to re-wrap it
    return tuple(np.array(arr, dtype = object)[ixs] for arr in arrays)  # dtype=object preserves ragged word-lists


if __name__ == '__main__':

    # Demo: print the topic and remaining words of 20 training posts, keeping
    # only words ranked between the 100th and 500th most common.
    data = get_20_newsgroups_dataset(numeric=False, filter_most_common = (100, 500), bag_of_words=False)
    for _, (inputs, ), (targets, ) in data.training_set.shorten(20).minibatch_iterator(minibatch_size = 1):
        print('%s: %s' % (targets, inputs))
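
A minimal usage sketch of the bag-of-words variant (hypothetical: it reuses only the functions and DataSet methods that appear above, and assumes each inputs is one row of the count matrix built by _list_of_ixs_to_count_matrix):

# Load log-scaled word-count vectors with integer topic labels, then
# iterate over a few single-sample minibatches.
data = get_20_newsgroups_dataset(filter_most_common=2000, bag_of_words=True, count_scaling='log')
for _, (inputs, ), (targets, ) in data.training_set.shorten(5).minibatch_iterator(minibatch_size=1):
    print('Topic #%s: %s distinct words' % (targets, np.sum(inputs > 0)))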