From b8fba784ef18378fa8717045d61ac0a735ed16b0 Mon Sep 17 00:00:00 2001
From: Nicholas Leonard
Date: Thu, 9 Apr 2015 11:50:12 -0400
Subject: [PATCH] GradClip

---
 init.lua             |  1 +
 model/layer.lua      | 18 +++++++++++++
 model/sequential.lua | 14 ----------
 node.lua             | 10 -------
 visitor/gradclip.lua | 63 ++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 82 insertions(+), 24 deletions(-)
 create mode 100644 visitor/gradclip.lua

diff --git a/init.lua b/init.lua
index e0f179b..fadf9fe 100644
--- a/init.lua
+++ b/init.lua
@@ -117,6 +117,7 @@ torch.include('dp', 'visitor/maxnorm.lua')
 torch.include('dp', 'visitor/weightdecay.lua')
 torch.include('dp', 'visitor/learn.lua')
 torch.include('dp', 'visitor/momentum.lua')
+torch.include('dp', 'visitor/gradclip.lua')
 
 --[[ observer ]]--
 torch.include('dp', 'observer/observer.lua')
diff --git a/model/layer.lua b/model/layer.lua
index b8f74bb..8e74dd9 100644
--- a/model/layer.lua
+++ b/model/layer.lua
@@ -180,6 +180,24 @@ function Layer:maxNorm(max_out_norm, max_in_norm)
    end
 end
 
+function Layer:gradClip(cutoff)
+   assert(self.backwarded, "Should call gradClip after a backward pass")
+   cutoff = self.mvstate.cutoff or cutoff
+   local params, gradParams = self:parameters()
+   local norm = 0
+   for k,gradParam in pairs(gradParams) do
+      norm = norm + math.pow(gradParam:norm(),2)
+   end
+   norm = math.sqrt(norm)
+   if norm > cutoff then
+      -- rescale gradParams to obtain desired norm
+      for k,gradParam in pairs(gradParams) do
+         gradParam:mul(cutoff/norm)
+      end
+   end
+   return norm
+end
+
 function Layer:share(layer, ...)
    assert(layer.isLayer)
    local arg = {...}
diff --git a/model/sequential.lua b/model/sequential.lua
index e501f11..ca5ac8f 100644
--- a/model/sequential.lua
+++ b/model/sequential.lua
@@ -82,17 +82,3 @@ function Sequential:_toModule()
       self._models[i]:_toModule()
    end
 end
-
---[[
--- experimental
-function Sequential:flux(state)
-   local output = self.output
-   -- setup
-   for i=1,#self._models-1 do
-      self._models[i]:setSuccessor(self._models[i+1])
-   end
-   return self._model[1]:flux()
-   self.input = output
-   return carry
-end
---]]
diff --git a/node.lua b/node.lua
index 4a172f9..3955d4b 100644
--- a/node.lua
+++ b/node.lua
@@ -210,13 +210,3 @@ end
 function Node:_evaluate(carry)
    return self:_forward(carry)
 end
-
---[[
--- experimental (would allow for one chained RPC call for both backward forward)
-function Node:flux(carry)
-   local output, carry = self:forward()
-   local input, carry = self._successor:flux{output, carry}
-   local input, carry = self:backward{input, carry}
-   return input, carry
-end
---]]
diff --git a/visitor/gradclip.lua b/visitor/gradclip.lua
new file mode 100644
index 0000000..3f9334b
--- /dev/null
+++ b/visitor/gradclip.lua
@@ -0,0 +1,63 @@
+------------------------------------------------------------------------
+--[[ GradClip ]]--
+-- Ref.: A. http://goo.gl/Zxza8m
+-- B. http://jmlr.org/proceedings/papers/v28/pascanu13.pdf
+-- Visitor
+-- Hard constraint on the upper bound of the norm of the gradient with
+-- respect to parameters (gradParams). Unlike refs A and B, which apply
+-- the constraint to the norm of all parameters taken together, the
+-- constraint here is applied to the norm of each Layer's parameters.
+-- Should occur before Learn in VisitorChain
+------------------------------------------------------------------------
+local GradClip, parent = torch.class("dp.GradClip", "dp.Visitor")
+GradClip.isGradClip = true
+
+function GradClip:__init(config)
+   config = config or {}
+   assert(torch.type(config) == 'table' and not config[1],
+      "Constructor requires key-value arguments")
+   local args, cutoff, name = xlua.unpack(
+      {config},
+      'GradClip',
+      'Hard constraint on the upper bound of the norm of gradParams.',
+      {arg='cutoff', type='number', default=1,
+       help="max norm of a Layer's parameters"},
+      {arg='name', type='string', default='gradclip',
+       help='identifies visitor in reports.'}
+   )
+   self._cutoff = cutoff
+   config.include = config.include or {}
+   table.insert(config.include, 'hasParams')
+   config.exclude = config.exclude or {}
+   table.insert(config.exclude, 'no-gradclip')
+   config.name = name
+   parent.__init(self, config)
+   self.norms = {}
+end
+
+function GradClip:_visitModel(model)
+   if model.gradClip then
+      local norm = model:gradClip(self._cutoff)
+      -- keep an exponential moving average of each Layer's gradParam norm
+      self.norms[model:id():toString()] = (self.norms[model:id():toString()] or 0)*0.8 + norm*0.2
+   else
+      if not model.mvstate[self:id():name()].warned then
+         print("Warning: GradClip not implemented for model " ..
+            torch.typename(model) .. ". Ignoring model-visitor pair")
+         model.mvstate[self:id():name()].warned = true
+      end
+   end
+end
+
+function GradClip:report()
+   local norms = _.values(self.norms)
+   if self._verbose then
+      print(self:id():name().." norms: ", unpack(norms))
+   end
+   local report = {
+      [self:name()] = {
+         norms = self.norms
+      }
+   }
+   return report
+end
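
Usage sketch: the header comment requires GradClip to come before Learn in the
VisitorChain. A minimal example of wiring the new visitor into a dp.Optimizer,
assuming the usual dp propagator setup; the cutoff, learning rate, batch size
and MaxNorm values below are illustrative, not taken from this commit:

   require 'dp'

   -- visitors are applied in order on each batch, so gradParam norms are
   -- clipped per Layer before Learn uses them to update the parameters
   train = dp.Optimizer{
      loss = dp.NLL(),
      visitor = {
         dp.GradClip{cutoff = 1},       -- per-Layer norm cutoff (default 1)
         dp.Learn{learning_rate = 0.1},
         dp.MaxNorm{max_out_norm = 1}
      },
      feedback = dp.Confusion(),
      sampler = dp.ShuffleSampler{batch_size = 32},
      progress = true
   }

Because each Layer is clipped separately (rather than the global norm of refs
A and B), the norms table returned by GradClip:report() holds one moving
average per Layer, which is useful for choosing a sensible cutoff.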