From 99e9e37f232c48a05d927542dc25bd010aa0c48c Mon Sep 17 00:00:00 2001
From: nicholas-leonard
Date: Fri, 27 Mar 2015 11:25:19 -0400
Subject: [PATCH 1/6] added back option of softmaxforest to RNN

---
 examples/recurrentlanguagemodel.lua | 23 ++++++++++++++++++++++-
 nn/Print.lua                        |  2 +-
 nn/PrintSize.lua                    |  2 +-
 3 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/examples/recurrentlanguagemodel.lua b/examples/recurrentlanguagemodel.lua
index 2b97158..4eb97db 100644
--- a/examples/recurrentlanguagemodel.lua
+++ b/examples/recurrentlanguagemodel.lua
@@ -33,6 +33,8 @@ cmd:option('--dropout', false, 'apply dropout on hidden neurons (not recommended
 
 --[[ output layer ]]--
 cmd:option('--softmaxtree', false, 'use SoftmaxTree instead of the inefficient (full) softmax')
+cmd:option('--softmaxforest', false, 'use SoftmaxForest instead of SoftmaxTree (uses more memory)')
+cmd:option('--forestGaterSize', '{}', 'size of hidden layers used for forest gater (trees are experts)')
 --cmd:option('--accUpdate', false, 'accumulate output layer updates inplace. Note that this will cause BPTT instability, but will cost less memory.')
 
 --[[ data ]]--
@@ -58,6 +60,10 @@ if opt.xpPath ~= '' then
    assert(paths.filep(opt.xpPath), opt.xpPath..' does not exist')
 end
 
+if not opt.forceForget then
+   print"Warning : you should probably use --forceForget"
+end
+
 --[[data]]--
 local train_file = 'train_data.th7'
 if opt.small then
@@ -93,7 +99,22 @@ end
 
 -- build the last layer first:
 local softmax
-if opt.softmaxtree then
+if opt.softmaxforest then
+   softmax = dp.SoftmaxForest{
+      input_size = opt.hiddenSize,
+      hierarchy = {
+         datasource:hierarchy('word_tree1.th7'),
+         datasource:hierarchy('word_tree2.th7'),
+         datasource:hierarchy('word_tree3.th7')
+      },
+      gater_size = table.fromString(opt.forestGaterSize),
+      gater_act = nn.Tanh(),
+      root_id = {880542,880542,880542},
+      dropout = opt.dropout and nn.Dropout() or nil,
+      acc_update = opt.accUpdate
+   }
+   opt.softmaxtree = true
+elseif opt.softmaxtree then
    softmax = dp.SoftmaxTree{
       input_size = opt.hiddenSize,
       hierarchy = datasource:hierarchy(),
diff --git a/nn/Print.lua b/nn/Print.lua
index ceab659..80bf706 100644
--- a/nn/Print.lua
+++ b/nn/Print.lua
@@ -13,7 +13,7 @@ end
 
 function Print:updateGradInput(input, gradOutput)
-   print(self.prefix.."gradOuput\n", gradOutput)
+   print(self.prefix.."gradOutput\n", gradOutput)
    self.gradInput = gradOutput
    return self.gradInput
 end
diff --git a/nn/PrintSize.lua b/nn/PrintSize.lua
index 320698d..66a21e2 100644
--- a/nn/PrintSize.lua
+++ b/nn/PrintSize.lua
@@ -29,7 +29,7 @@ function PrintSize:updateGradInput(input, gradOutput)
    else
       size = gradOutput:size()
    end
-   print(self.prefix..":gradOuput\n", size)
+   print(self.prefix..":gradOutput\n", size)
    self.gradInput = gradOutput
    return self.gradInput
 end

From e06241215a252f76463fe6c996bb0e446e6b526b Mon Sep 17 00:00:00 2001
From: nicholas-leonard
Date: Fri, 27 Mar 2015 12:48:36 -0400
Subject: [PATCH 2/6] ListView:sub/index

---
 test/test.lua     | 31 +++++++++++++++++++++
 view/listview.lua | 68 +++++++++++++++++++++++++++++++----------------
 view/view.lua     |  4 +--
 3 files changed, 78 insertions(+), 25 deletions(-)

diff --git a/test/test.lua b/test/test.lua
index 859f961..d5ae62e 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -176,6 +176,37 @@ function dptest.listview()
    list:forwardPut('bhwc',{torch.randn(1,3,4,2),torch.randn(1,3,4,2)})
    local t = list:forwardGet('bchw')
    mytester:assertTableEq(t[1]:size():totable(), {1,2,3,4}, 0.00001)
+   -- indexing
+   local data1, data2 = torch.randn(5,2,3,4), torch.randn(5,2,3,4)
+   local v = dp.ListView{dp.ImageView('bchw',data1), dp.ImageView('bchw',data2)}
+   local indices = torch.LongTensor{2,3}
+   local v2 = v:index(indices)
+   local tbl = v2:forward('bchw', 'torch.DoubleTensor')
+   local tbl2 = {data1, data2}
+   mytester:assert(#tbl == 2)
+   for i, d in ipairs(tbl) do
+      mytester:assertTensorEq(d, tbl2[i]:index(1, indices), 0.000001)
+   end
+   local v3 = dp.ListView{dp.ImageView('bchw',torch.randn(1,2,3,4)), dp.ImageView('bchw',torch.randn(1,2,3,4))}
+   local v4 = v:index(v3, indices)
+   local tbl = v4:forward('bchw', 'torch.DoubleTensor')
+   mytester:assert(#tbl == 2)
+   for i, d in ipairs(tbl) do
+      mytester:assertTensorEq(d, tbl2[i]:index(1, indices), 0.000001)
+   end
+   -- sub
+   local v5 = v:sub(2,3)
+   local tbl = v5:forward('bchw', 'torch.DoubleTensor')
+   mytester:assert(#tbl == 2)
+   for i, d in ipairs(tbl) do
+      mytester:assertTensorEq(d, tbl2[i]:index(1, indices), 0.000001)
+   end
+   local v6 = v:sub(nil, 2,3)
+   local tbl = v6:forward('bchw', 'torch.DoubleTensor')
+   mytester:assert(#tbl == 2)
+   for i, d in ipairs(tbl) do
+      mytester:assertTensorEq(d, tbl2[i]:index(1, indices), 0.000001)
+   end
 end
 function dptest.carry()
    local data = torch.rand(3,4)
diff --git a/view/listview.lua b/view/listview.lua
index b8706df..fbc9c68 100644
--- a/view/listview.lua
+++ b/view/listview.lua
@@ -8,8 +8,8 @@ local ListView, parent = torch.class("dp.ListView", "dp.View")
 ListView.isListView = true
 
 function ListView:__init(components)
-   parent.assertInstances(components)
-   self._components = components
+   self._components = components or {}
+   parent.assertInstances(self._components)
    self._modules = {}
    parent.__init(self)
 end
@@ -60,37 +60,59 @@ function ListView:nSample()
 end
 
 function ListView:index(v, indices)
-   error"Not Implemented"
    if indices then
-      assert(v.isListView, "Expecting ListView as first argument")
-      return torch.protoClone(self, 
+      if not torch.isTypeOf(v, self) then
+         error("Expecting "..torch.type(self).." at arg 1 "..
+            "got "..torch.type(v).." instead")
+      end
+      if v:size() ~= self:size() then
+         error("Expecting "..torch.type(self).." at arg 1 "..
+            "having same number of components as self")
+      end
+      for i, component in self:pairs() do
+         component:index(v:components()[i], indices)
+      end
+   else
+      indices = v
+      v = self.new(
         _.map(self._components, 
            function(key, component) 
-              return component:index(v, indices)
+              return component:index(indices)
            end
         )
      )
-   else
-      indices = v
    end
-   return torch.protoClone(self, 
-      _.map(self._components, 
-         function(key, component) 
-            return component:index(v, indices)
-         end
-      )
-   )
+   return v
 end
 
-function ListView:sub(start, stop)
-   error"Not Implemented"
-   return torch.protoClone(self, 
-      _.map(self._components, 
-         function(key, component) 
-            return component:sub(start, stop)
-         end
+function ListView:sub(v, start, stop, inplace)
+   if v and stop then
+      if not torch.isTypeOf(v, self) then
+         error("Expecting "..torch.type(self).." at arg 1 "..
+            "got "..torch.type(v).." instead")
+      end
+      if v:size() ~= self:size() then
+         error("Expecting "..torch.type(self).." at arg 1 "..
+            "having same number of components as self")
+      end
+      for i, component in self:pairs() do
+         component:sub(v:components()[i], start, stop, inplace)
+      end
+   else
+      if v then
+         inplace = stop
+         stop = start
+         start = v
+      end
+      v = self.new(
+         _.map(self._components, 
+            function(key, component) 
+               return component:sub(start, stop, inplace)
+            end
+         )
       )
-   )
+   end
+   return v
 end
 
 function ListView:size()
diff --git a/view/view.lua b/view/view.lua
index 09bc399..1c25301 100644
--- a/view/view.lua
+++ b/view/view.lua
@@ -140,7 +140,7 @@ function View.areInstances(obj_table)
    local map = _.values(
       _.map(obj_table, 
          function(key, obj)
-            return obj.isView
+            return torch.isTypeOf(obj, 'dp.View')
         end
      )
   )
@@ -151,5 +151,5 @@ function View.assertInstances(obj_table)
    local areInstances, index = View.areInstances(obj_table)
    index = index or 0
    assert(areInstances, "Error : object at index " .. index .. 
-      " is of wrong type. Expecting type dp.DataView.")
+      " is of wrong type. Expecting type dp.View.")
 end
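The reworked `ListView:index` and `ListView:sub` now support two calling conventions: an allocating form, and a destination form that recycles an existing `ListView`. A minimal usage sketch, distilled from the unit test above (the tensor shapes are illustrative, not part of the patch):

```lua
require 'dp'

-- a composite view over two image batches of 5 examples each
local v = dp.ListView{
   dp.ImageView('bchw', torch.randn(5,2,3,4)),
   dp.ImageView('bchw', torch.randn(5,2,3,4))
}
local indices = torch.LongTensor{2,3}

-- convention 1 : index(indices) allocates and returns a new ListView
local v2 = v:index(indices)

-- convention 2 : index(v, indices) recycles the components of an
-- existing ListView with the same number of components
local dest = dp.ListView{
   dp.ImageView('bchw', torch.randn(1,2,3,4)),
   dp.ImageView('bchw', torch.randn(1,2,3,4))
}
v:index(dest, indices)

-- sub follows the same pattern : sub(start, stop) allocates,
-- sub(v, start, stop [, inplace]) recycles v
local v3 = v:sub(2, 3)
v:sub(dest, 2, 3)
```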
+ "having same number of components as self") + end + for i, component in self:pairs() do + component:sub(v:components()[i], start, stop, inplace) + end + else + if v then + inplace = stop + stop = start + start = v + end + v = self.new( + _.map(self._components, + function(key, component) + return component:sub(start, stop, inplace) + end + ) ) - ) + end + return v end function ListView:size() diff --git a/view/view.lua b/view/view.lua index 09bc399..1c25301 100644 --- a/view/view.lua +++ b/view/view.lua @@ -140,7 +140,7 @@ function View.areInstances(obj_table) local map = _.values( _.map(obj_table, function(key, obj) - return obj.isView + return torch.isTypeOf(obj, 'dp.View') end ) ) @@ -151,5 +151,5 @@ function View.assertInstances(obj_table) local areInstances, index = View.areInstances(obj_table) index = index or 0 assert(areInstances, "Error : object at index " .. index .. - " is of wrong type. Expecting type dp.DataView.") + " is of wrong type. Expecting type dp.View.") end From 616aec279b2e8fbdb8fd9f12825501545ccc3b17 Mon Sep 17 00:00:00 2001 From: nicholas-leonard Date: Fri, 27 Mar 2015 14:00:15 -0400 Subject: [PATCH 3/6] removed legacy CompositeTensor --- data/baseset.lua | 8 -------- 1 file changed, 8 deletions(-) diff --git a/data/baseset.lua b/data/baseset.lua index fbd9f1d..7f1fe7f 100644 --- a/data/baseset.lua +++ b/data/baseset.lua @@ -46,20 +46,12 @@ function BaseSet:isTrain() end function BaseSet:setInputs(inputs) - if not torch.typename(inputs) and type(inputs) == 'table' then - --if list, make CompositeTensor - inputs = dp.CompositeTensor{components=inputs} - end assert(inputs.isView, "Error : invalid inputs. Expecting type dp.View") self._inputs = inputs end function BaseSet:setTargets(targets) - if not torch.typename(targets) and type(targets) == 'table' then - --if list, make CompositeTensor - targets = dp.CompositeTensor{components=targets} - end assert(targets.isView, "Error : invalid targets. 
From 5d37f264330079d3baabaf5693e1ebb35a0caea4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicholas=20L=C3=A9onard?=
Date: Sun, 29 Mar 2015 13:42:55 -0400
Subject: [PATCH 4/6] Update convolution1D.lua

---
 model/convolution1D.lua | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/model/convolution1D.lua b/model/convolution1D.lua
index d456321..6ada313 100644
--- a/model/convolution1D.lua
+++ b/model/convolution1D.lua
@@ -80,7 +80,7 @@ function Convolution1D:outputSize(width, view)
    local input = torch.Tensor(2, width, self._input_size)
    local inputView = dp.SequenceView('bwc', input)
    -- just propagate this dummy input through to know the output size
-   local output = self:forward(input,{nSample=2}):forward(view or 'bwc')
+   local output = self:forward(inputView,{nSample=2}):forward(view or 'bwc')
    self:zeroStatistics()
    return output:size(2), output:size(3)
 end

From 53ed1aaa1a95015b3cc9664c1d9d696e36dd9a42 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicholas=20L=C3=A9onard?=
Date: Sun, 29 Mar 2015 13:43:33 -0400
Subject: [PATCH 5/6] Update convolution1D.lua

---
 model/convolution1D.lua | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/model/convolution1D.lua b/model/convolution1D.lua
index 6ada313..1a81c5e 100644
--- a/model/convolution1D.lua
+++ b/model/convolution1D.lua
@@ -80,7 +80,7 @@ function Convolution1D:outputSize(width, view)
    local input = torch.Tensor(2, width, self._input_size)
    local inputView = dp.SequenceView('bwc', input)
    -- just propagate this dummy input through to know the output size
-   local output = self:forward(inputView,{nSample=2}):forward(view or 'bwc')
+   local output = self:forward(inputView,dp.Carry{nSample=2}):forward(view or 'bwc')
    self:zeroStatistics()
    return output:size(2), output:size(3)
 end

From db69e22a89ce464b4bc318716983646ae857de9b Mon Sep 17 00:00:00 2001
From: Kate Silverstein
Date: Thu, 2 Apr 2015 21:49:57 -0400
Subject: [PATCH 6/6] Fix some small typos

---
 doc/neuralnetworktutorial.md | 48 ++++++++++++++++++------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/doc/neuralnetworktutorial.md b/doc/neuralnetworktutorial.md
index 0d25cae..63a5846 100644
--- a/doc/neuralnetworktutorial.md
+++ b/doc/neuralnetworktutorial.md
@@ -28,28 +28,28 @@ which we encapsulate in a [DataSource](data.md#dp.DataSource) object.
__dp__ provides the option of training on different datasets,
notably [MNIST](data.md#dp.Mnist), [NotMNIST](data.md#dp.NotMnist),
[CIFAR-10](data.md#dp.Cifar10) or [CIFAR-100](data.md#dp.Cifar100), but for this
tutorial we will be using the archetypal MNIST (don't leave home without it):
```lua
--[[data]]--
datasource = dp.Mnist{input_preprocess = dp.Standardize()}
```
A DataSource contains up to three [DataSets](data.md#dp.DataSet):
`train`, `valid` and `test`. The first is for training the model.
The second is used for [early-stopping](observer.md#dp.EarlyStopper) and cross-validation.
The third is used for publishing papers and comparing results across different models.

Although not really necessary, we [Standardize](preprocess.md#dp.Standardize)
the datasource, which subtracts the mean and divides by the standard deviation.
Both statistics (mean and standard deviation) are measured on the `train` set only.
This is a common pattern when preprocessing data.
When statistics need to be measured across different examples
(as in [ZCA](preprocess.md#dp.ZCA) and [LecunLCN](preprocess.md#dp.LeCunLCN) preprocesses),
we fit the preprocessor on the `train` set and apply it to all sets
(`train`, `valid` and `test`). However, some preprocesses require that
statistics be measured only on each example (as in
[global contrast normalization](preprocess.md#dp.GCN)).

## Model of Modules ##
Ok so we have a DataSource, now we need a [Model](model.md#dp.Model). Let's build a
multi-layer perceptron (MLP) with two parameterized non-linear
[Neural](model.md#dp.Neural) [Layers](model.md#dp.Layer):
```lua
--[[Model]]--
@@ -90,7 +90,7 @@ If you construct it with argument `sparse_init=false`, it will delegate paramete
which is what Neural uses internally for its parameters.
These two Neural [Models](model.md#dp.Model) are combined to form an MLP
using [Sequential](model.md#dp.Sequential), which is not to be confused with the
[Sequential](https://github.com/torch/nn/blob/master/containers.md#nn.Sequential) Module.
It differs in that it can be constructed from a list of [Models](model.md#dp.Model)
instead of [Modules](https://github.com/torch/nn/blob/master/doc/module.md#nn.Module).
Models have extra
@@ -148,21 +148,21 @@ which makes the training algorithm more stochastic.
### Loss ###
Each Propagator must also specify a [Loss](loss.md#dp.Loss) for training or evaluation.
If you have previously used the [nn](https://github.com/torch/nn/blob/master/README.md) package,
there is nothing new here. A [Loss](loss.md#dp.Loss) is simply an adapter of
[Criterions](https://github.com/torch/nn/blob/master/doc/criterion.md#nn.Criterion).
Each example has a single target class and our Model output is LogSoftMax so we use a
[NLL](loss.md#dp.NLL), which wraps a
[ClassNLLCriterion](https://github.com/torch/nn/blob/master/doc/criterion.md#nn.ClassNLLCriterion).

### Feedback ###
The `feedback` parameter is used to provide us with, you guessed it, feedback
(like performance measures and statistics after each epoch).
We use [Confusion](feedback.md#dp.Confusion), which is a wrapper
for the [optim](https://github.com/torch/optim/blob/master/README.md) package's
[ConfusionMatrix](https://github.com/torch/optim/blob/master/ConfusionMatrix.lua).
While our Loss measures the Negative Log-Likelihood (NLL) of the Model on
different DataSets, our [Feedback](feedback.md#feedback) measures classification accuracy
(which is what we will use for early-stopping and comparing our model to the state-of-the-art).

### Visitor ###
Since the [Optimizer](propagator.md#dp.Optimizer) is used to train the Model on a DataSet,
we want to update the Model by sequentially applying the following visitors:

 1. [Momentum](visitor.md#dp.Momentum) : updates parameter gradients using a factored mixture of current and previous gradients.
 2. [Learn](visitor.md#dp.Learn) : updates the parameters using the gradients and a learning rate.
 3. [MaxNorm](visitor.md#dp.MaxNorm) : updates output or input neuron weights (in this case, output) so that they have a norm less than or equal to a specified value.

The only mandatory Visitor is the second one (Learn), which does the actual parameter updates.
The first is the well-known Momentum.
The last (MaxNorm) is the lesser-known hard constraint on the norm of output or input neuron weights
(see [Hinton 2012](http://arxiv.org/pdf/1207.0580v1.pdf)), which acts as a regularizer.
You could also replace it with a more classic regularizer like [WeightDecay](visitor.md#dp.WeightDecay),
in which case you would have to put it *before* the Learn visitor, as in the sketch below.
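Assembled as code, the visitor list described above might look like this (a sketch; the constructor field names follow the dp visitor docs linked above, and the numeric values are illustrative):

```lua
-- applied in order : momentum, then the update itself, then max-norm
visitor = {
   dp.Momentum{momentum_factor = 0.9},
   dp.Learn{learning_rate = 0.1},   -- mandatory : the actual update
   dp.MaxNorm{max_out_norm = 1}     -- hard constraint after the update
}

-- the WeightDecay alternative goes *before* Learn, since it modifies
-- the gradients that Learn consumes :
-- visitor = {
--    dp.WeightDecay{wd_factor = 1e-5},
--    dp.Learn{learning_rate = 0.1}
-- }
```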
@@ -210,26 +210,26 @@ order is not important. Observers listen to mediator [Channels](mediator.md#dp.C
alls them back when certain events occur. In particular, they may listen to the
_doneEpoch_ Channel to receive a report from the Experiment after each epoch.
A report is nothing more than a hierarchy of tables. After each epoch,
the component objects of the Experiment (except Observers)
can each submit a report to its composite parent thereby forming a tree of reports.
The Observers can analyse these and modify the components which they are assigned to
(in this case, Experiment).
Observers may be attached to Experiments, Propagators, Visitors, etc.

#### FileLogger ####
Here we use a simple [FileLogger](observer.md#dp.FileLogger) which will
store serialized reports in a simple text file for later use. Each experiment has a unique ID which is
included in the corresponding reports, thus allowing the FileLogger to name its file appropriately.

#### EarlyStopper ####
The [EarlyStopper](observer.md#dp.EarlyStopper) is used for stopping the Experiment
when error has not decreased, or accuracy has not been maximized.
It also saves to disk the best version of the Experiment when it finds a new one.
It is initialized with a channel to `maximize` or minimize (the default is to minimize).
In this case, we intend to early-stop the experiment on a field of the report,
in particular the _accuracy_ field of the _confusion_ table of the _feedback_ table of the `validator`.
This `{'validator','feedback','confusion','accuracy'}` happens to measure the accuracy
of the Model on the validation DataSet after each training epoch.
So by early-stopping on this measure, we hope to find a Model that generalizes well.
The parameter `max_epochs` indicates how many consecutive
epochs of training can occur without finding a new best model before the experiment is signaled to stop
by the _doneExperiment_ Mediator Channel.
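In code, this early-stopping setup might be declared as follows (a sketch; the report channel is taken verbatim from the text above, while the other values are illustrative):

```lua
observer = {
   dp.FileLogger(),  -- serializes the per-epoch reports to a text file
   dp.EarlyStopper{
      -- path into the report tree : validator accuracy after each epoch
      error_report = {'validator','feedback','confusion','accuracy'},
      maximize = true, -- accuracy is maximized, not minimized
      max_epochs = 20  -- give up after 20 epochs without a new best model
   }
}
```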
## Running the Experiment ##
Once we have initialized the experiment, we need only run it on the `datasource` to begin training.
@@ -282,8 +282,8 @@ xps:25044:1398320864:1:tester:confusion accuracy = 0.92548076923077

## Hyperoptimizing ##
Hyper-optimization is the hardest part of deep learning.
In many ways, it can feel more like an art than a science.
[Momentum](visitor.md#dp.Momentum) can help convergence, but it requires much more memory.
The same is true of weight decay, as both methods require a copy of parameter gradients,
which can almost double the memory footprint of the model.
Using [MaxNorm](visitor.md#dp.MaxNorm) and
[AdaptiveLearningRate](observer.md#dp.AdaptiveLearningRate) is often better as

and only try 1000000000 when out of ideas.
You can vary the epoch sizes to divide processing time between evaluation and training.
It's often best to keep the evaluation sets small when you can
(like 10% of all data). The more training data, the better.
But these are all arbitrary guidelines.
No one can tell you how to hyper-optimize.
You need to try optimizing a dataset for yourself to find your own methodology and tricks.