Commit

Merge branch 'master' of github.com:nicholas-leonard/dp
nicholas-leonard committed Apr 6, 2015
2 parents 93f4a4a + e9541be commit f66f947
Showing 9 changed files with 127 additions and 61 deletions.
8 changes: 0 additions & 8 deletions data/baseset.lua
@@ -46,20 +46,12 @@ function BaseSet:isTrain()
end

function BaseSet:setInputs(inputs)
if not torch.typename(inputs) and type(inputs) == 'table' then
--if list, make CompositeTensor
inputs = dp.CompositeTensor{components=inputs}
end
assert(inputs.isView,
"Error : invalid inputs. Expecting type dp.View")
self._inputs = inputs
end

function BaseSet:setTargets(targets)
if not torch.typename(targets) and type(targets) == 'table' then
--if list, make CompositeTensor
targets = dp.CompositeTensor{components=targets}
end
assert(targets.isView,
"Error : invalid targets. Expecting type dp.View")
self._targets = targets
48 changes: 24 additions & 24 deletions doc/neuralnetworktutorial.md
@@ -28,28 +28,28 @@ which we encapsulate in a [DataSource](data.md#dp.DataSource)
object. __dp__ provides the option of training on different datasets,
notably [MNIST](data.md#dp.Mnist), [NotMNIST](data.md#dp.NotMnist),
[CIFAR-10](data.md#dp.Cifar10) or [CIFAR-100](data.md#dp.Cifar100), but for this
tutorial we will be using the archtypical MNIST (don't leave home without it):
tutorial we will be using the archetypal MNIST (don't leave home without it):
```lua
--[[data]]--
datasource = dp.Mnist{input_preprocess = dp.Standardize()}
```
A DataSource contains up to three [DataSets](data.md#dp.DataSet):
`train`, `valid` and `test`. The first if for training the model.
`train`, `valid` and `test`. The first is for training the model.
The second is used for [early-stopping](observer.md#dp.EarlyStopper) and cross-validation.
The third is used for publishing papers and comparing different models.
The third is used for publishing papers and comparing results across different models.
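For instance, the three DataSets can be retrieved from the datasource (a minimal sketch; the `trainSet`/`validSet`/`testSet` and `nSample` accessors are assumptions, not shown in this excerpt):
```lua
-- minimal sketch; trainSet/validSet/testSet and nSample are assumed accessors
local train = datasource:trainSet() -- fits the model parameters
local valid = datasource:validSet() -- early-stopping and cross-validation
local test = datasource:testSet()   -- final reporting only
print(train:nSample(), valid:nSample(), test:nSample())
```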

Although not really necessary, we [Standardize](preprocess.md#dp.Standardize)
the datasource, which subtracts the mean and divides
by the standard deviation. Both statistics (mean and standard deviation) are
measured on the `train` set only. This is a common pattern when preprocessing data.
When statistics need to be measured accross different examples
When statistics need to be measured across different examples
(as in [ZCA](preprocess.md#dp.ZCA) and [LecunLCN](preprocess.md#dp.LeCunLCN) preprocesses),
we fit the preprocessor on the `train` set and apply it to all sets (`train`, `valid` and `test`).
However, some preprocesses require that statistics be measured
only on each example (as in [global contrast normalization](preprocess.md#dp.GCN)).
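
To make the fit-on-train, apply-everywhere pattern concrete, here is a plain-torch sketch of what standardization amounts to (hypothetical tensors, not the dp preprocessor API):
```lua
-- plain-torch sketch; trainX and testX are hypothetical tensors
local trainX = torch.randn(1000, 784)
local testX = torch.randn(200, 784)
local mean = trainX:mean(1)         -- statistics come from the train set only
local std = trainX:std(1):add(1e-8) -- small epsilon avoids division by zero
trainX:add(-1, mean:expandAs(trainX)):cdiv(std:expandAs(trainX))
testX:add(-1, mean:expandAs(testX)):cdiv(std:expandAs(testX))
```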

## Model of Modules ##
Ok so we have a DataSource, now we need a [Model](model.md#dp.Model). Lets build a
Ok so we have a DataSource, now we need a [Model](model.md#dp.Model). Let's build a
multi-layer perceptron (MLP) with two parameterized non-linear [Neural](model.md#dp.Neural) [Layers](model.md#dp.Layer):
```lua
--[[Model]]--
@@ -90,7 +90,7 @@ If you construct it with argument `sparse_init=false`, it will delegate paramete
which is what Neural uses internally for its parameters.
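
For reference, a single such Layer might be declared as follows (a sketch; the constructor argument names are assumptions based on dp conventions):
```lua
-- sketch of one hidden Layer; argument names are assumptions based on dp conventions
local hidden = dp.Neural{
   input_size = 28*28,   -- flattened MNIST image
   output_size = 200,    -- hypothetical number of hidden units
   transfer = nn.Tanh(), -- the non-linearity
   sparse_init = false   -- delegate initialization as discussed above
}
```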

These two Neural [Models](model.md#dp.Model) are combined to form an MLP using [Sequential](model.md#dp.Sequential),
which is not to be confused with (yet very similar to) the
which is not to be confused with the
[Sequential](https://github.com/torch/nn/blob/master/containers.md#nn.Sequential) Module. It differs in that
it can be constructed from a list of [Models](model.md#dp.Model) instead of
[Modules](https://github.com/torch/nn/blob/master/doc/module.md#nn.Module). Models have extra
@@ -148,21 +148,21 @@ which makes the training algorithm more stochastic.
### Loss ###
Each Propagator must also specify a [Loss](loss.md#dp.Loss) for training or evaluation.
If you have previously used the [nn](https://github.com/torch/nn/blob/master/README.md) package,
there is nothing new here, a [Loss](loss.md#dp.Loss) is simply an adapter of
there is nothing new here. A [Loss](loss.md#dp.Loss) is simply an adapter of
[Criterions](https://github.com/torch/nn/blob/master/doc/criterion.md#nn.Criterion).
Each example has a single target class and our Model output is LogSoftMax so
we use a [NLL](loss.md#dp.NLL), which wraps a
[ClassNLLCriterion](https://github.com/torch/nn/blob/master/doc/criterion.md#nn.ClassNLLCriterion).
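Under the hood this is just nn's negative log-likelihood criterion applied to LogSoftMax outputs; a self-contained sketch:
```lua
require 'nn'
-- sketch of what the NLL Loss wraps
local criterion = nn.ClassNLLCriterion()
local logprobs = nn.LogSoftMax():forward(torch.randn(4, 10)) -- batch of 4, 10 classes
local targets = torch.Tensor{1, 5, 3, 10}                    -- one target class per example
print(criterion:forward(logprobs, targets))                  -- mean negative log-likelihood
```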

### Feedback ###
The `feedback` parameter is used to provide us with, you guessed it, feedback; like performance measures and
statistics after each epoch. We use [Confusion](feedback.md#dp.Confusion), which is a wrapper
The `feedback` parameter is used to provide us with, you guessed it, feedback (like performance measures and
statistics after each epoch). We use [Confusion](feedback.md#dp.Confusion), which is a wrapper
for the [optim](https://github.com/torch/optim/blob/master/README.md) package's
[ConfusionMatrix](https://github.com/torch/optim/blob/master/ConfusionMatrix.lua).
While our Loss measures the Negative Log-Likelihood (NLL) of the Model
on different DataSets, our [Feedback](feedback.md#feedback)
measures classification accuracy (which is what we will use for
early-stopping and comparing our model to the state of the art).
early-stopping and comparing our model to the state-of-the-art).
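For a feel of what Confusion wraps, here is the underlying optim ConfusionMatrix used directly (a sketch with made-up outputs):
```lua
require 'optim'
-- sketch of the underlying optim.ConfusionMatrix
local cm = optim.ConfusionMatrix(10)          -- 10 MNIST classes
local outputs = torch.randn(8, 10)            -- made-up model outputs for a batch of 8
local targets = torch.Tensor{1,2,3,4,5,6,7,8} -- made-up target classes
cm:batchAdd(outputs, targets)                 -- accumulate predictions vs. targets
cm:updateValids()
print(cm.totalValid)                          -- overall classification accuracy
```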

### Visitor ###
Since the [Optimizer](propagator.md#dp.Optimizer) is used to train the Model on a DataSet,
@@ -171,11 +171,11 @@ We want to update the Model by sequentially applying the following visitors:

1. [Momentum](visitor.md#dp.Momentum) : updates parameter gradients using a factored mixture of current and previous gradients.
2. [Learn](visitor.md#dp.Learn) : updates the parameters using the gradients and a learning rate.
3. [MaxNorm](visitor.md#dp.MaxNorm) : updates output or input neuron weights (in this case, output) so that they have a norm less or equal to a specified value.
3. [MaxNorm](visitor.md#dp.MaxNorm) : updates output or input neuron weights (in this case, output) so that they have a norm less than or equal to a specified value.

The only mandatory Visitor is the second one (Learn), which does the actual parameter updates.
The first is the well known momentum.
The last is the lesser known hard constraint on the norm of output or input neuron weights
The first is the well-known Momentum.
The last (MaxNorm) is the lesser-known hard constraint on the norm of output or input neuron weights
(see [Hinton 2012](http://arxiv.org/pdf/1207.0580v1.pdf)), which acts as a regularizer. You could also
replace it with a more classic regularizer like [WeightDecay](visitor.md#dp.WeightDecay), in which case you
would have to put it *before* the Learn visitor.
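
Put together, the visitor chain described above might be declared like this (a sketch; the constructor argument names are assumptions):
```lua
-- sketch of the visitor chain; constructor argument names are assumptions
visitor = {
   dp.Momentum{momentum_factor = 0.9}, -- mix of current and previous gradients
   dp.Learn{learning_rate = 0.1},      -- the mandatory parameter update
   dp.MaxNorm{max_out_norm = 1}        -- hard constraint on output weight norms
}
```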
@@ -210,26 +210,26 @@ order is not important. Observers listen to mediator [Channels](mediator.md#dp.C
calls them back when certain events occur. In particular, they may listen to the _doneEpoch_
Channel to receive a report from the Experiment after each epoch. A report is nothing more than
a hierarchy of tables. After each epoch, the component objects of the Experiment (except Observers)
can submit a report to its composite parent thereby forming a tree of reports. The Observers can analyse
these and modify the component which they are assigned to (in this case, Experiment).
can each submit a report to its composite parent thereby forming a tree of reports. The Observers can analyse
these and modify the components which they are assigned to (in this case, Experiment).
Observers may be attached to Experiments, Propagators, Visitors, etc.
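Since a report is just nested Lua tables, a (purely hypothetical) excerpt could look like this:
```lua
-- purely hypothetical excerpt of an epoch report (a plain tree of Lua tables)
local report = {
   epoch = 3,
   validator = {
      feedback = {
         confusion = { accuracy = 0.92 }
      }
   }
}
-- an Observer can then follow a path like {'validator','feedback','confusion','accuracy'}
```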

#### FileLogger ####
Here we use a simple [FileLogger](observer.md#dp.FileLogger) which will
store serialized reports in a simple text file for later use. Each experiment has a unique ID which are
included in reports, thus allowing the FileLogger to name its file appropriately.
store serialized reports in a simple text file for later use. Each experiment has a unique ID which is
included in the corresponding reports, thus allowing the FileLogger to name its file appropriately.

#### EarlyStopper ####
The [EarlyStopper](observer.md#dp.EarlyStopper) is used for stopping the Experiment when error has not decreased, or accuracy has not
be maximized. It also saves onto disk the best version of the Experiment when it finds a new one.
It is initialized with a channel to `maximize` or minimize (default is to minimize). In this case, we intend
been maximized. It also saves to disk the best version of the Experiment when it finds a new one.
It is initialized with a channel to `maximize` or minimize (the default is to minimize). In this case, we intend
to early-stop the experiment on a field of the report, in particular the _accuracy_ field of the
_confusion_ table of the _feedback_ table of the `validator`.
This `{'validator','feedback','confusion','accuracy'}` happens to measure the accuracy of the Model on the
validation DataSet after each training epoch. So by early-stopping on this measure, we hope to find a
Model that generalizes well. The parameter `max_epochs` indicates how much consecutive
Model that generalizes well. The parameter `max_epochs` indicates how many consecutive
epochs of training can occur without finding a new best model before the experiment is signaled to stop
on the _doneExperiment_ Mediator Channel.
by the _doneExperiment_ Mediator Channel.
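
A sketch of how these Observers might be wired together (the constructor argument names are assumptions inferred from the description above):
```lua
-- sketch; constructor argument names are assumptions based on the text above
observer = {
   dp.FileLogger(), -- logs serialized reports, named after the experiment ID
   dp.EarlyStopper{
      error_report = {'validator','feedback','confusion','accuracy'},
      maximize = true, -- accuracy should be maximized, not minimized
      max_epochs = 20  -- hypothetical patience, in epochs without improvement
   }
}
```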

## Running the Experiment ##
Once we have initialized the experiment, we need only run it on the `datasource` to begin training.
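In code, that is a single call (a sketch, assuming the Experiment built above is held in a variable `xp`):
```lua
-- sketch: launch training, evaluation and early-stopping on the datasource
xp:run(datasource)
```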
@@ -282,8 +282,8 @@ xps:25044:1398320864:1:tester:confusion accuracy = 0.92548076923077
## Hyperoptimizing ##

Hyper-optimization is the hardest part of deep learning.
In many ways, it feels more like an art than a science.
[Momentum](visitor.md#dp.Momentum) can help convergence, but it requires so much more memory.
In many ways, it can feel more like an art than a science.
[Momentum](visitor.md#dp.Momentum) can help convergence, but it requires much more memory.
The same is true of weight decay, as both methods require a
copy of parameter gradients which often almost double the memory footprint of the model.
Using [MaxNorm](visitor.md#dp.MaxNorm) and [AdaptiveLearningRate](observer.md#dp.AdaptiveLearningRate) is often better as
@@ -305,7 +305,7 @@ and only try 1000000000 when out of ideas.
You can vary the epoch sizes to divide processing time
between evaluation and training.
It's often best to keep the evaluation sets small when you can
(like 10% of all data). The more training data the better.
(like 10% of all data). The more training data, the better.

But these are all arbitrary guidelines. No one can tell you how to hyper-optimize.
You need to try optimizing a dataset for yourself to find your own methodology and tricks.
23 changes: 22 additions & 1 deletion examples/recurrentlanguagemodel.lua
@@ -33,6 +33,8 @@ cmd:option('--dropout', false, 'apply dropout on hidden neurons (not recommended

--[[ output layer ]]--
cmd:option('--softmaxtree', false, 'use SoftmaxTree instead of the inefficient (full) softmax')
cmd:option('--softmaxforest', false, 'use SoftmaxForest instead of SoftmaxTree (uses more memory)')
cmd:option('--forestGaterSize', '{}', 'size of hidden layers used for forest gater (trees are experts)')
--cmd:option('--accUpdate', false, 'accumulate output layer updates inplace. Note that this will cause BPTT instability, but will cost less memory.')

--[[ data ]]--
@@ -58,6 +60,10 @@ if opt.xpPath ~= '' then
assert(paths.filep(opt.xpPath), opt.xpPath..' does not exist')
end

if not opt.forceForget then
print"Warning : you should probably use --forceForget"
end

--[[data]]--
local train_file = 'train_data.th7'
if opt.small then
@@ -93,7 +99,22 @@ end

-- build the last layer first:
local softmax
if opt.softmaxtree then
if opt.softmaxforest then
softmax = dp.SoftmaxForest{
input_size = opt.hiddenSize,
hierarchy = {
datasource:hierarchy('word_tree1.th7'),
datasource:hierarchy('word_tree2.th7'),
datasource:hierarchy('word_tree3.th7')
},
gater_size = table.fromString(opt.forestGaterSize),
gater_act = nn.Tanh(),
root_id = {880542,880542,880542},
dropout = opt.dropout and nn.Dropout() or nil,
acc_update = opt.accUpdate
}
opt.softmaxtree = true
elseif opt.softmaxtree then
softmax = dp.SoftmaxTree{
input_size = opt.hiddenSize,
hierarchy = datasource:hierarchy(),
2 changes: 1 addition & 1 deletion model/convolution1D.lua
@@ -80,7 +80,7 @@ function Convolution1D:outputSize(width, view)
local input = torch.Tensor(2, width, self._input_size)
local inputView = dp.SequenceView('bwc', input)
-- just propagate this dummy input through to know the output size
local output = self:forward(input,{nSample=2}):forward(view or 'bwc')
local output = self:forward(inputView,dp.Carry{nSample=2}):forward(view or 'bwc')
self:zeroStatistics()
return output:size(2), output:size(3)
end
2 changes: 1 addition & 1 deletion nn/Print.lua
@@ -13,7 +13,7 @@ end


function Print:updateGradInput(input, gradOutput)
print(self.prefix.."gradOuput\n", gradOutput)
print(self.prefix.."gradOutput\n", gradOutput)
self.gradInput = gradOutput
return self.gradInput
end
2 changes: 1 addition & 1 deletion nn/PrintSize.lua
@@ -29,7 +29,7 @@ function PrintSize:updateGradInput(input, gradOutput)
else
size = gradOutput:size()
end
print(self.prefix..":gradOuput\n", size)
print(self.prefix..":gradOutput\n", size)
self.gradInput = gradOutput
return self.gradInput
end
31 changes: 31 additions & 0 deletions test/test.lua
@@ -176,6 +176,37 @@ function dptest.listview()
list:forwardPut('bhwc',{torch.randn(1,3,4,2),torch.randn(1,3,4,2)})
local t = list:forwardGet('bchw')
mytester:assertTableEq(t[1]:size():totable(), {1,2,3,4}, 0.00001)
-- indexing
local data1, data2 = torch.randn(5,2,3,4), torch.randn(5,2,3,4)
local v = dp.ListView{dp.ImageView('bchw',data1), dp.ImageView('bchw',data2)}
local indices = torch.LongTensor{2,3}
local v2 = v:index(indices)
local tbl = v2:forward('bchw', 'torch.DoubleTensor')
local tbl2 = {data1, data2}
mytester:assert(#tbl == 2)
for i, d in ipairs(tbl) do
mytester:assertTensorEq(d, tbl2[i]:index(1, indices), 0.000001)
end
local v3 = dp.ListView{dp.ImageView('bchw',torch.randn(1,2,3,4)), dp.ImageView('bchw',torch.randn(1,2,3,4))}
local v4 = v:index(v3, indices)
local tbl = v4:forward('bchw', 'torch.DoubleTensor')
mytester:assert(#tbl == 2)
for i, d in ipairs(tbl) do
mytester:assertTensorEq(d, tbl2[i]:index(1, indices), 0.000001)
end
-- sub
local v5 = v:sub(2,3)
local tbl = v5:forward('bchw', 'torch.DoubleTensor')
mytester:assert(#tbl == 2)
for i, d in ipairs(tbl) do
mytester:assertTensorEq(d, tbl2[i]:index(1, indices), 0.000001)
end
local v6 = v:sub(nil, 2,3)
local tbl = v6:forward('bchw', 'torch.DoubleTensor')
mytester:assert(#tbl == 2)
for i, d in ipairs(tbl) do
mytester:assertTensorEq(d, tbl2[i]:index(1, indices), 0.000001)
end
end
function dptest.carry()
local data = torch.rand(3,4)
68 changes: 45 additions & 23 deletions view/listview.lua
@@ -8,8 +8,8 @@ local ListView, parent = torch.class("dp.ListView", "dp.View")
ListView.isListView = true

function ListView:__init(components)
parent.assertInstances(components)
self._components = components
self._components = components or {}
parent.assertInstances(self._components)
self._modules = {}
parent.__init(self)
end
@@ -60,37 +60,59 @@ function ListView:nSample()
end

function ListView:index(v, indices)
error"Not Implemented"
if indices then
assert(v.isListView, "Expecting ListView as first argument")
return torch.protoClone(self,
if not torch.isTypeOf(v, self) then
error("Expecting "..torch.type(self).." at arg 1 "..
"got "..torch.type(v).." instead")
end
if v:size() ~= self:size() then
error("Expecting "..torch.type(self).." ar arg 1 " ..
"having same number of components as self")
end
for i, component in self:pairs() do
component:index(v:components()[i], indices)
end
else
indices = v
v = self.new(
_.map(self._components,
function(key, component)
return component:index(v, indices)
return component:index(indices)
end
)
)
else
indices = v
end
return torch.protoClone(self,
_.map(self._components,
function(key, component)
return component:index(v, indices)
end
)
)
return v
end

function ListView:sub(start, stop)
error"Not Implemented"
return torch.protoClone(self,
_.map(self._components,
function(key, component)
return component:sub(start, stop)
end
function ListView:sub(v, start, stop, inplace)
if v and stop then
if not torch.isTypeOf(v, self) then
error("Expecting "..torch.type(self).." at arg 1 "..
"got "..torch.type(v).." instead")
end
if v:size() ~= self:size() then
error("Expecting "..torch.type(self).." ar arg 1 " ..
"having same number of components as self")
end
for i, component in self:pairs() do
component:sub(v:components()[i], start, stop, inplace)
end
else
if v then
inplace = stop
stop = start
start = v
end
v = self.new(
_.map(self._components,
function(key, component)
return component:sub(start, stop, inplace)
end
)
)
)
end
return v
end

function ListView:size()
4 changes: 2 additions & 2 deletions view/view.lua
@@ -140,7 +140,7 @@ function View.areInstances(obj_table)
local map = _.values(
_.map(obj_table,
function(key, obj)
return obj.isView
return torch.isTypeOf(obj, 'dp.View')
end
)
)
@@ -151,5 +151,5 @@ function View.assertInstances(obj_table)
local areInstances, index = View.areInstances(obj_table)
index = index or 0
assert(areInstances, "Error : object at index " .. index ..
" is of wrong type. Expecting type dp.DataView.")
" is of wrong type. Expecting type dp.View.")
end
