https://github.com/torch/cunn
Revision f225bb44c2cbccf90a48ffc4f25ee9056958ac63 authored by Soumith Chintala on 31 March 2017, 18:44:07 UTC, committed by GitHub on 31 March 2017, 18:44:07 UTC
Using temporary variables when performing transpose + addmm
Tip revision: f225bb44c2cbccf90a48ffc4f25ee9056958ac63 authored by Soumith Chintala on 31 March 2017, 18:44:07 UTC
Merge pull request #456 from twitter-forks/addmm-fixes
Merge pull request #456 from twitter-forks/addmm-fixes
Tip revision: f225bb4
test_DataParallelTable.lua
require 'cunn'
require 'optim'
-- Optional profiling target: profileDp is true only when both fbcunn and fbnn
-- can be required (pcall returns false if either require raises).
local profileDp = pcall(function() require 'fbcunn'; require 'fbnn' end)
local baseGpu = 1 -- First GPU to use
local numGpus = cutorch.getDeviceCount() -- device count reported by cutorch
-- Global torch / cutorch state shared by every test in this file
torch.setdefaulttensortype('torch.DoubleTensor')
torch.setnumthreads(8)
cutorch.setDevice(baseGpu)
cutorch.reserveStreams(1)
-- CUDA tensor types each test is run against (checkHalf can append the half
-- type to this list)
local typenames = {
'torch.CudaTensor',
'torch.CudaDoubleTensor',
}
-- CPU tensor type name paired with each CUDA tensor type name
local t2cpu = {
['torch.CudaTensor'] = 'torch.FloatTensor',
['torch.CudaDoubleTensor'] = 'torch.DoubleTensor',
}
-- Adds the half-precision CUDA tensor type to `typenames`, and its CPU
-- counterpart to `t2cpu`, when cutorch.hasHalf is set. Does nothing otherwise.
local function checkHalf()
  if not cutorch.hasHalf then
    return
  end
  local halfType = 'torch.CudaHalfTensor'
  typenames[#typenames + 1] = halfType
  t2cpu[halfType] = 'torch.HalfTensor'
end
-- Magnitude-dependent error allowance used for half-precision comparisons.
-- maxabs: largest absolute value involved in the comparison, or nil.
-- Returns 0 when maxabs is nil/false; otherwise
--   2 * 2^floor(log2(maxabs)) * 2^-10.
local function half_max_error(maxabs)
  if not maxabs then
    return 0
  end
  local exponent = math.floor(math.log(maxabs) / math.log(2))
  -- arbitrarily double the precision limit
  return 2 * ((2 ^ exponent) * (2 ^ (-10)))
end
-- Comparison tolerance for a given CUDA tensor type.
-- typename:  tensor type name, e.g. 'torch.CudaTensor'
-- max_error: magnitude passed to half_max_error; only read for half tensors
-- Returns 1e-5 for every type except 'torch.CudaHalfTensor', which gets
-- 5e-2 plus the allowance computed by half_max_error(max_error).
function precision(typename, max_error)
  local isHalf = (typename == 'torch.CudaHalfTensor')
  if not isHalf then
    return 1e-5
  end
  return 5e-2 + half_max_error(max_error)
end
-- Create the tester that runs the suite, and the suite table that every
-- test.* function below is attached to
local mytester = torch.Tester()
local test = torch.TestSuite()
-- Returns a new table with the same key/value pairs as x. The copy is shallow:
-- values that are tables are shared with x, not duplicated.
local function copyTable(x)
  local duplicate = {}
  for k, v in pairs(x) do
    duplicate[k] = v
  end
  return duplicate
end
-- Builds an nn.ConcatTable containing dimSize nn.Narrow modules; the module at
-- position k narrows dimension `dim` to the single index k.
local function createSplitNetwork(dim, dimSize)
  local splitter = nn.ConcatTable()
  for index = 1, dimSize do
    local slice = nn.Narrow(dim, index, 1)
    splitter:add(slice)
  end
  return splitter
end
-- Builds a small convolutional network that ends in a 2-output linear layer.
-- The batches fed to it are what gets split across GPUs by the tests.
--   width, height: input size, used to size the final Reshape / Linear
--   pool:          both arguments of the single SpatialMaxPooling layer
--   feat:          output planes of every convolution
--   filt:          convolution kernel size; must be odd
--   tableInOut:    when true the net takes a table of tensors (joined along
--                  dimension 2) and returns a table of two tensors
--   numConvs:      number of SpatialConvolutionMM + Threshold pairs
local function buildNet(width, height, pool, feat, filt, tableInOut, numConvs)
  local net = nn.Sequential()
  if tableInOut then
    net:add(nn.JoinTable(2)) -- Join R,G,B tensors into RGB
  end

  assert(math.fmod(filt,2) == 1)
  for layer = 1, numConvs do
    -- The first convolution reads 3 input planes, later ones read `feat`.
    local inputPlanes
    if layer > 1 then
      inputPlanes = feat
    else
      inputPlanes = 3
    end
    local conv = nn.SpatialConvolutionMM(inputPlanes, feat, filt, filt, 1, 1,
                                         (filt-1)/2)
    net:add(conv)
    net:add(nn.Threshold())
  end

  net:add(nn.SpatialMaxPooling(pool, pool))
  local flatSize = width * height * feat / (pool * pool)
  net:add(nn.Reshape(flatSize))
  net:add(nn.Linear(flatSize, 2))
  -- net:add(nn.SoftMax()) -- This is fake anyway, so just do regression :-)

  if tableInOut then
    net:add(createSplitNetwork(2,2))
  end
  return net
end
-- Writes `net` to a temporary file in torch's binary DiskFile format and
-- returns the path of that file. net:clearState() is called before writing.
-- The file is left on disk for the caller; deserialize() deletes it after
-- reading it back.
local function serialize(net)
  net:clearState()
  -- os.tmpname() gives a fresh temporary file name without shelling out to
  -- `date +%s%N`, which is not portable and can repeat within a millisecond.
  local path = os.tmpname()
  local f = torch.DiskFile(path, 'w')
  f:binary()
  f:writeObject(net)
  f:close()
  return path
end
-- Reads one object back from the binary DiskFile at `file`, deletes the file,
-- and returns the object. The handle is closed and the file removed even when
-- reading fails; the read error is then re-raised unchanged.
local function deserialize(file)
  local f = torch.DiskFile(file)
  f:binary()
  local ok, net = pcall(f.readObject, f)
  f:close()
  -- os.remove avoids passing the path through a shell (`rm %s` unquoted).
  os.remove(file)
  if not ok then
    error(net, 0)
  end
  return net
end
-- Suite entry: runs test_DataParallelTable for every type in `typenames`.
function test.DataParallelTable()
  for i = 1, #typenames do
    test_DataParallelTable(typenames[i])
  end
end
-- Trains the same network with SGD twice -- once as a plain module (cNet) and
-- once wrapped in nn.DataParallelTable over all GPUs (gNet) -- and checks after
-- every step that outputs, criterion gradients, parameters and gradients agree
-- to within precision(gtype, ...).
-- The c*/"Cpu" names are historical: both networks are converted to `gtype`.
-- gtype: CUDA tensor type name used for both networks.
function test_DataParallelTable(gtype)
  local width = 16
  local height = 16
  local pool = 4
  local feat = 8
  local filt = 5
  local numConvs = 2
  local numSgdSteps = 10
  local syncGpuCpuParamsEvery = 4
  assert(numGpus > 1)

  -- test for various batchSizes, not necessarily multiples of nGpus:
  for _, batchSize in ipairs {2 * numGpus, 9, 15} do
    collectgarbage()

    -- Build the reference model
    local cpuClassifier = buildNet(width, height, pool, feat, filt, true,
                                   numConvs)

    -- Build a multi-GPU model: one clone of the reference per device
    local gClassifier = nn.DataParallelTable(1):type(gtype)
    for i = 1, numGpus do
      local curGpu = math.fmod(baseGpu+(i-1)-1, cutorch.getDeviceCount()) + 1
      cutorch.setDevice(curGpu)
      gClassifier:add(cpuClassifier:clone():type(gtype), curGpu)
    end
    cutorch.setDevice(baseGpu)

    -- Now wrap them in layers that will split up the input tensor and join the
    -- output tensor (I know this seems stupid - and it is - but we need to test
    -- DataParallelTable under table inputs and when it is embedded in a
    -- network).
    local cNet = nn.Sequential()
    cNet:add(createSplitNetwork(2,3))
    cNet:add(cpuClassifier)
    cNet:add(nn.JoinTable(2))
    cNet:type(gtype)

    local gNet = nn.Sequential()
    gNet:add(createSplitNetwork(2,3))
    gNet:add(gClassifier)
    gNet:add(nn.JoinTable(2):type(gtype))
    gNet:get(1):type(gtype)
    gNet:get(3):type(gtype)

    -- Force in a serialization / deserialization pass ------------
    local file = serialize(gNet)
    gNet = nil
    collectgarbage()
    collectgarbage()
    gNet = deserialize(file)
    ----------------------------------------------------------------

    local cInput = torch.rand(batchSize, 3, height, width):type(gtype)
    local gInput = cInput:type(gtype)
    local cTarget = torch.rand(batchSize, 2):type(gtype)
    local gTarget = cTarget:type(gtype)

    local cParams, cGradParams = cNet:getParameters()
    local gParams, gGradParams = gNet:getParameters()

    assert(cutorch.getDevice() == baseGpu,
           'getParameters: didnt restore GPU state')

    -- Set up an MSE optimizer for both networks
    local optimStateCpu = {
      learningRate = 0.1,  -- Artificially big learning rate
      weightDecay = 0,
      momentum = 0.9,
      dampening = 0,
      learningRateDecay = 0,
      nesterov = true,
    }
    local optimStateGpu = copyTable(optimStateCpu)
    local optimMethod = optim.sgd

    local criterionCpu = nn.MSECriterion():type(gtype)
    local criterionGpu = criterionCpu:clone():type(gtype)

    for i = 1, numSgdSteps do
      collectgarbage()
      local fevalCpu = function(x)
        if x ~= cParams then cParams:copy(x) end
        cNet:zeroGradParameters()
        -- FPROP + BPROP on the reference network
        local output = cNet:forward(cInput)
        local err = criterionCpu:forward(output, cTarget)
        local gradOutput = criterionCpu:backward(output, cTarget)
        local gradInput = cNet:backward(cInput, gradOutput)
        return err, cGradParams
      end

      local fevalGpu = function(x)
        if x ~= gParams then gParams:copy(x) end
        gNet:zeroGradParameters()
        assert(cutorch.getDevice() == baseGpu,
               'zeroGradParameters: didnt restore GPU state')
        -- FPROP + BPROP on the data-parallel network
        local output = gNet:forward(gInput)
        assert(cutorch.getDevice() == baseGpu,
               'DataParallelTable:forward didnt restore GPU state')
        local err = criterionGpu:forward(output, gTarget)
        local gradOutput = criterionGpu:backward(output, gTarget)
        local gradInput = gNet:backward(gInput, gradOutput)
        -- This check follows backward(), so the message names backward.
        assert(cutorch.getDevice() == baseGpu,
               'DataParallelTable:backward didnt restore GPU state')
        return err, gGradParams
      end

      -- Perform an SGD step on both networks
      optimMethod(fevalCpu, cParams, optimStateCpu)
      optimMethod(fevalGpu, gParams, optimStateGpu)
      gNet:findModules('nn.DataParallelTable')[1]:syncParameters()
      assert(cutorch.getDevice() == baseGpu,
             'DataParallelTable:syncParameters didnt restore GPU state')

      -- Now make sure that everything is the same
      local cOutput = cNet.output
      local gOutput = gNet.output
      local cGradInput = cNet.gradInput
      local gGradInput = gNet.gradInput

      mytester:assertlt((cOutput:double() - gOutput:double()):abs():max(),
        precision(gtype, cOutput:clone():double():abs():max()),
        'fprop error ' .. gtype)
      -- Compare the two criteria against each other; the reference criterion
      -- must not be compared with itself or the check can never fail.
      mytester:assertlt((criterionCpu.gradInput:double() -
        criterionGpu.gradInput:double()):abs():max(),
        precision(gtype, criterionGpu.gradInput:clone():double():abs():max()),
        'CRITERION BPROP error ' .. gtype)
      mytester:assertlt((cParams:double() - gParams:double()):abs():max(),
        precision(gtype, cParams:clone():double():abs():max()),
        'parameters error ' .. gtype)
      mytester:assertlt((cGradParams:double() - gGradParams:double()):abs():max(),
        precision(gtype, cGradParams:clone():double():abs():max()),
        'BPROP error (gradParams) ' .. gtype)
      mytester:assertlt((cGradInput:double() - gGradInput:double()):abs():max(),
        precision(gtype, cGradInput:clone():double():abs():max()),
        'BPROP error (gradInput) ' .. gtype)

      -- Sync the reference and data-parallel weights every few "epochs" to
      -- prevent floating point drift between SGD iterations (ie, they will
      -- eventually be divergent after enough iters)
      if math.fmod(i, syncGpuCpuParamsEvery) == 0 then
        local cp = cNet:parameters()
        -- parameters of the first module held by the DataParallelTable
        local gp = gNet:get(2):get(1):parameters()
        assert(#cp == #gp)
        for j = 1, #cp do
          cp[j]:copy(gp[j])
        end
      end
    end
  end
end
-- Suite entry: runs test_DataParallelTable_smallBatch for every type in
-- `typenames`.
function test.DataParallelTable_smallBatch()
  for i = 1, #typenames do
    test_DataParallelTable_smallBatch(typenames[i])
  end
end
-- Compares DataParallelTable with a single SpatialConvolution on batch sizes
-- that are smaller than the GPU count or do not divide evenly by it.
-- gtype: CUDA tensor type name under test.
function test_DataParallelTable_smallBatch(gtype)
  local reference = nn.SpatialConvolution(3, 3, 3, 5):type(gtype)
  local dpt = nn.DataParallelTable(1):type(gtype)
  for gpu = 1, numGpus do
    cutorch.withDevice(gpu, function()
      dpt:add(reference:clone():type(gtype), gpu)
    end)
  end

  -- Tensor constructor matching gtype, e.g. torch.CudaTensor
  local newTensor = torch[gtype:match('torch.(%a+)')]

  -- Check for batches that are smaller than numGpus or don't divide evenly
  local batchSizes = {numGpus-1, 2*numGpus-1}
  for _, batchSize in ipairs(batchSizes) do
    local input = newTensor(batchSize,3,10,10):uniform(-1, 1)

    -- Check that forward works as expected
    local output = dpt:forward(input)
    local expectedOutput = reference:forward(input)
    local outputTol = precision(gtype, expectedOutput:clone():abs():max())
    assert((expectedOutput - output):abs():max() < outputTol,
           'unexpected output')

    -- Check that updateGradInput matches as well
    local gradOutput = output:clone():uniform(-1, 1)
    local gradInput = dpt:updateGradInput(input, gradOutput)
    local expectedGrad = reference:updateGradInput(input, gradOutput)
    local gradTol = precision(gtype, expectedGrad:clone():abs():max())
    assert((expectedGrad - gradInput):abs():max() < gradTol,
           'unexpected gradInput')
  end
end
-- Suite entry: runs test_DataParallelTable_emptyTensor for every type in
-- `typenames`.
function test.DataParallelTable_emptyTensor()
  for i = 1, #typenames do
    test_DataParallelTable_emptyTensor(typenames[i])
  end
end
-- Feeds DataParallelTable a table input whose first element is an empty tensor
-- and checks output and gradInput[2] against the un-wrapped network.
-- gtype: CUDA tensor type name under test.
function test_DataParallelTable_emptyTensor(gtype)
  local reference = nn.Sequential()
    :add(nn.SelectTable(2))
    :add(nn.Linear(10,2))
    :type(gtype)
  local dpt = nn.DataParallelTable(1):type(gtype)
  for gpu = 1, numGpus do
    cutorch.withDevice(gpu, function()
      dpt:add(reference:clone():type(gtype), gpu)
    end)
  end

  -- Tensor constructor matching gtype, e.g. torch.CudaTensor
  local newTensor = torch[gtype:match('torch.(%a+)')]
  local emptyPart = newTensor(0)
  local dataPart = newTensor(numGpus, 10):fill(1)
  local input = {emptyPart, dataPart}

  local output = dpt:forward(input)
  local expectedOutput = reference:forward(input)
  local outputTol = precision(gtype, expectedOutput:clone():abs():max())
  assert((output - expectedOutput):abs():max() < outputTol, 'unexpected output')

  local gradOutput = output:clone():uniform(-1,1)
  local gradInput = dpt:backward(input, gradOutput)
  local expectedGrad = reference:backward(input, gradOutput)
  local gradTol = precision(gtype, expectedGrad[2]:clone():abs():max())
  assert((expectedGrad[2] - gradInput[2]):abs():max() < gradTol,
         'unexpected gradInput')
end
-- Suite entry: runs test_DataParallelTable_type for every type in `typenames`.
function test.DataParallelTable_type()
  for i = 1, #typenames do
    test_DataParallelTable_type(typenames[i])
  end
end
-- Checks that a DataParallelTable can be converted to a CUDA type but that
-- converting it to the matching CPU type raises an error.
-- gtype: CUDA tensor type name under test; its CPU counterpart comes from
--        t2cpu.
function test_DataParallelTable_type(gtype)
  local ctype = t2cpu[gtype]
  local net = nn.SpatialConvolution(3, 3, 3, 5):type(ctype)
  local dpt = nn.DataParallelTable(1):type(gtype)
  for i = 1, numGpus do
    cutorch.withDevice(i, function()
      -- the clones are still CPU-typed here; dpt:type(gtype) converts them
      dpt:add(net:clone(), i)
    end)
  end

  dpt:type(gtype)
  -- `ok` is local so the test does not leak a global variable.
  local ok = pcall(function() dpt:type(ctype) end)
  assert(not ok, 'should not be able to call DataParallelTable:type(' .. ctype .. ')')
end
-- Suite entry: runs test_DataParallelTable_sync for every type in `typenames`.
function test.DataParallelTable_sync()
  for i = 1, #typenames do
    test_DataParallelTable_sync(typenames[i])
  end
end
-- Test that DataParallelTable automatically syncParameters in updateOutput
-- if you forget to call :syncParameters(). Runs the same SGD steps on a
-- DataParallelTable and on the plain network, then compares the parameters.
-- gtype: CUDA tensor type name under test.
function test_DataParallelTable_sync(gtype)
  local nSteps = 10
  local net = nn.Sequential()
    :add(nn.Linear(10, 10))
    :add(nn.ReLU(true))
    :add(nn.Linear(10, 10))
    :type(gtype)

  local dpt = nn.DataParallelTable(1):type(gtype)
  for gpu = 1, numGpus do
    cutorch.withDevice(gpu, function()
      dpt:add(net:clone(), gpu)
    end)
  end

  local criterion = nn.MSECriterion():type(gtype)
  local optimState = {
    learningRate = 1,
    momentum = 0,
  }

  -- Allocated here, filled with fresh random values before every step below.
  local newTensor = torch[gtype:match('torch.(%a+)')]
  local input = newTensor(numGpus,10)
  local target = torch[gtype:match('torch.(%a+)')](numGpus,10)

  -- Returns the flattened parameters of `model` and an optim-style closure
  -- that evaluates loss and gradients on the current input/target.
  local function feval(model)
    local params, gradParams = model:getParameters()
    local function evaluate(x)
      model:zeroGradParameters()
      local output = model:forward(input)
      local err = criterion:forward(output, target)
      local gradOutput = criterion:backward(output, target)
      local gradInput = model:backward(input, gradOutput)
      return err, gradParams
    end
    return params, evaluate
  end

  local paramsDpt, fevalDpt = feval(dpt)
  local paramsBase, fevalBase = feval(net)

  for _ = 1, nSteps do
    input:uniform(-1, 1)
    target:uniform(-1, 1)
    optim.sgd(fevalDpt, paramsDpt, optimState)
    optim.sgd(fevalBase, paramsBase, optimState)
  end

  local tolerance = precision(gtype, paramsDpt:clone():abs():max())
  assert((paramsDpt - paramsBase):abs():max() < tolerance,
         'parameters do not match')
end
-- Suite entry: runs test_DataParallelTable_serialize for every type in
-- `typenames`.
function test.DataParallelTable_serialize()
  for i = 1, #typenames do
    test_DataParallelTable_serialize(typenames[i])
  end
end
-- Test serialization after getParameters(): the table is flattened, written
-- to disk, read back, and must still produce output and share storages.
-- gtype: CUDA tensor type name under test.
function test_DataParallelTable_serialize(gtype)
  local linear = nn.Linear(10, 10):type(gtype)
  local dpt = nn.DataParallelTable(1):type(gtype)
  for gpu = 1, numGpus do
    cutorch.withDevice(gpu, function()
      dpt:add(linear:clone():type(gtype), gpu)
    end)
  end

  dpt:getParameters()
  local path = serialize(dpt)
  dpt = deserialize(path)

  local newTensor = torch[gtype:match('torch.(%a+)')]
  local input = newTensor(numGpus,10):uniform(-1, 1)

  -- Check that forward works as expected
  local output = dpt:forward(input)
  assert(output and output:sum() ~= 0, 'unexpected output')

  -- Zero the weights on the first tower and sync parameters
  -- to check that Tensors are pointing to the proper storages
  local firstTowerParams = dpt.flattenedParams[1][1]
  firstTowerParams:zero()
  dpt:syncParameters()
  output = dpt:forward(input)
  assert(output:sum() == 0, 'weights not zeroed')
end
-- Suite entry: runs test_DataParallelTable_flattenParameters for every type in
-- `typenames`.
function test.DataParallelTable_flattenParameters()
  for i = 1, #typenames do
    test_DataParallelTable_flattenParameters(typenames[i])
  end
end
-- Wrap only a part of a network with data parallel table and
-- check if the correct number of parameters have been copied.
-- gtype: CUDA tensor type name under test.
function test_DataParallelTable_flattenParameters(gtype)
  local layer1 = nn.Linear(10, 10):type(gtype)
  local layer2 = nn.Linear(10, 5):type(gtype)
  local dpt = nn.DataParallelTable(1, true, true):threads():type(gtype)
  local gpuList = torch.range(1, numGpus):totable()
  dpt:add(layer2, gpuList)

  -- Only layer2 sits inside the DataParallelTable; layer1 is outside it.
  local seq = nn.Sequential()
  seq:add(layer1):add(dpt)
  seq:getParameters()

  local input = torch.randn(7, 10):type(gtype)
  seq:forward(input)

  local message = "Incorrect number of parameters copied"
  local flattened = dpt.flattenedParams[1]
  -- There are 55 parameters in layer 2 (50 + 5 bias weights)
  assert(flattened[1]:size(1) == 55, message)
  -- Check grad weights
  assert(dpt.flattenedParams[1][2]:size(1) == 55, message)
end
-- Suite entry: runs test_DataParallelTable_misc for every type in `typenames`.
function test.DataParallelTable_misc()
  for i = 1, #typenames do
    test_DataParallelTable_misc(typenames[i])
  end
end
-- Checks clone(), listModules() and findModules() on a threaded
-- DataParallelTable.
-- gtype: CUDA tensor type name under test.
function test_DataParallelTable_misc(gtype)
  local net = nn.Sequential()
    :add(nn.Linear(3, 10))
    :add(nn.ReLU())
    :add(nn.Linear(10, 7))

  local gpuList = torch.range(1, numGpus):totable()
  local dpt = nn.DataParallelTable(1):type(gtype)
    :add(net, gpuList)
    :threads()
    :type(gtype)

  local input = torch.randn(8, 3):type(gtype)
  local firstOutput = dpt:forward(input)

  -- check that clone works
  dpt = dpt:clone()
  local clonedOutput = dpt:forward(input)
  assert((clonedOutput - firstOutput):abs():max() == 0)

  -- check findModules and listModules
  local modules = dpt:listModules()
  -- one extra entry for the DataParallelTable itself
  assert(#modules == #net:listModules() + 1)
  assert(torch.type(modules[1]) == 'nn.DataParallelTable')
  assert(torch.type(modules[2]) == 'nn.Sequential')

  local relus = dpt:findModules('nn.ReLU')
  assert(#relus == 1)
end
-- Suite entry: runs test_DataParallelTable_noGradInput for every type in
-- `typenames`.
function test.DataParallelTable_noGradInput()
  for i = 1, #typenames do
    test_DataParallelTable_noGradInput(typenames[i])
  end
end
-- Runs a network that starts with nn.LookupTable both directly and through a
-- threaded DataParallelTable; checks that the outputs agree and that both
-- gradInputs have the same number of elements.
-- gtype: CUDA tensor type name under test.
function test_DataParallelTable_noGradInput(gtype)
  local net = nn.Sequential()
    :add(nn.LookupTable(10, 10))
    :add(nn.Linear(10, 7))
    :add(nn.ReLU())
    :type(gtype)

  local gpuList = torch.range(1, numGpus):totable()
  local dpt = nn.DataParallelTable(1)
    :add(net, gpuList)
    :threads()
    :type(gtype)

  local input = torch.Tensor(5):random(10):type(gtype)

  -- Reference pass on the plain network; results are cloned so later
  -- forward/backward calls cannot overwrite them.
  local output1 = net:forward(input):clone()
  local gradOutput = output1:clone():uniform(-1, 1)
  local gradInput1 = net:backward(input, gradOutput):clone()

  -- Same pass through the DataParallelTable
  local output2 = dpt:forward(input)
  local gradInput2 = dpt:backward(input, gradOutput)

  local tolerance = precision(gtype, output1:clone():abs():max())
  mytester:assertlt((output1 - output2):abs():max(), tolerance,
                    'forward prop error')
  mytester:asserteq(gradInput2:nElement(), gradInput1:nElement())
end
-- Suite entry: runs test_DataParallelTable_accGradParameters for every type in
-- `typenames`.
function test.DataParallelTable_accGradParameters()
  for i = 1, #typenames do
    test_DataParallelTable_accGradParameters(typenames[i])
  end
end
-- Accumulates gradients over several forward/backward passes before one
-- parameter update, on the plain network and on DataParallelTable built with
-- each constructor configuration in `configs`, and checks that the resulting
-- outputs agree.
-- gtype: CUDA tensor type name under test.
function test_DataParallelTable_accGradParameters(gtype)
  -- table.unpack does not exist on Lua 5.1 / LuaJIT built without 5.2
  -- compatibility; fall back to the global unpack there.
  local unpackArgs = table.unpack or unpack

  local net = nn.Sequential()
    :add(nn.Linear(3, 10))
    :add(nn.ReLU())
    :add(nn.Linear(10, 7))
    :type(gtype)

  local inputs = {}
  local gradOutputs = {}
  for i = 1, 3 do
    inputs[i] = torch.randn(8, 3):type(gtype)
    gradOutputs[i] = torch.randn(8, 7):type(gtype)
  end

  -- Argument lists passed to the nn.DataParallelTable constructor
  local configs = {
    {1, false, false},
    {1, true,  false},
  }

  -- Zero the gradients once, accumulate over every input, then take a single
  -- update step with learning rate 0.5.
  local function accumulateGradient(m)
    m:zeroGradParameters()
    for i = 1, #inputs do
      m:forward(inputs[i])
      m:backward(inputs[i], gradOutputs[i])
    end
    m:updateParameters(0.5)
  end

  local base = net:clone()
  accumulateGradient(base)
  local expected = base:forward(inputs[1])

  for _, config in ipairs(configs) do
    local dpt = nn.DataParallelTable(unpackArgs(config))
      :add(net:clone(), torch.range(1, numGpus):totable()):type(gtype)
    accumulateGradient(dpt)
    local output = dpt:forward(inputs[1])
    mytester:assertlt((output - expected):abs():max(),
      precision(gtype, expected:clone():abs():max()),
      'invalid output ' .. gtype)
  end
end
-- Suite entry: runs test_DataParallelTable_apply for every type in
-- `typenames`.
function test.DataParallelTable_apply()
  for i = 1, #typenames do
    test_DataParallelTable_apply(typenames[i])
  end
end
-- Interleaves :apply() calls with training steps and checks that a
-- DataParallelTable (with and without threads, for each constructor
-- configuration in `configs`) ends up with the same output as the plain
-- network.
-- gtype: CUDA tensor type name under test.
function test_DataParallelTable_apply(gtype)
  -- table.unpack does not exist on Lua 5.1 / LuaJIT built without 5.2
  -- compatibility; fall back to the global unpack there.
  local unpackArgs = table.unpack or unpack

  local net = nn.Sequential()
    :add(nn.Linear(3, 10))
    :add(nn.ReLU())
    :add(nn.Linear(10, 7))
    :type(gtype)

  local inputs = {}
  local gradOutputs = {}
  for i = 1, 3 do
    inputs[i] = torch.randn(8, 3):type(gtype)
    gradOutputs[i] = torch.randn(8, 7):type(gtype)
  end

  -- Argument lists passed to the nn.DataParallelTable constructor
  local configs = {
    {1, false, false},
    {1, true,  false},
  }

  local function trainNetwork(m)
    -- Test that apply doesn't break everything. This will be very slow
    -- in the training loop, but should still be correct.
    local function emptyFn() end
    m:apply(emptyFn)
    for i = 1, #inputs do
      m:zeroGradParameters()
      m:forward(inputs[i])
      m:backward(inputs[i], gradOutputs[i])
      m:updateParameters(0.1)
      m:apply(emptyFn)
    end
  end

  local base = net:clone()
  trainNetwork(base)
  local expected = base:forward(inputs[1])

  for _, usethreads in ipairs{false, true} do
    for _, config in ipairs(configs) do
      local dpt = nn.DataParallelTable(unpackArgs(config))
        :add(net:clone(), torch.range(1, numGpus):totable()):type(gtype)
      if usethreads then
        dpt:threads()
      end
      trainNetwork(dpt)
      local output = dpt:forward(inputs[1])
      mytester:assertlt((output - expected):abs():max(),
        precision(gtype, expected:clone():abs():max()),
        'invalid output: flatten=' .. tostring(config[2]) ..
        ' threads=' .. tostring(usethreads))
    end
  end
end
-- Suite entry: runs test_DataParallelTable_streams for every type in
-- `typenames`.
function test.DataParallelTable_streams()
  for i = 1, #typenames do
    test_DataParallelTable_streams(typenames[i])
  end
end
-- Checks that DataParallelTable reproduces the plain network's output and
-- gradInput exactly on streams 0 and 1, for each constructor configuration in
-- `configs`, with and without threads. The current stream is set back to 0 at
-- the end.
-- gtype: CUDA tensor type name under test.
function test_DataParallelTable_streams(gtype)
  -- table.unpack does not exist on Lua 5.1 / LuaJIT built without 5.2
  -- compatibility; fall back to the global unpack there.
  local unpackArgs = table.unpack or unpack

  local net = nn.Sequential()
    :add(nn.Linear(3, 10))
    :add(nn.ReLU())
    :add(nn.Linear(10, 7))
    :type(gtype)

  local input = torch.randn(8, 3):type(gtype)
  local gradOutput = torch.randn(8, 7):type(gtype)

  -- Reference results from the plain network, cloned so later passes cannot
  -- overwrite them.
  local gOutput = net:forward(input):clone()
  net:zeroGradParameters()
  local gGradInput = net:backward(input, gradOutput):clone()

  -- Argument lists passed to the nn.DataParallelTable constructor
  local configs = {
    {1, false, false},
    {1, true,  false},
    {1, true,  true},
  }

  -- Named `check` so it does not shadow the file-level `test` suite.
  local function check(dpt)
    local output = dpt:forward(input)
    dpt:zeroGradParameters()
    local gradInput = dpt:backward(input, gradOutput)

    mytester:assert((output - gOutput):abs():max() == 0, 'invalid output')
    mytester:assert((gradInput - gGradInput):abs():max() == 0,
      'invalid gradInput')
  end

  for _, stream in ipairs{0, 1} do
    cutorch.setStream(stream)
    for _, config in ipairs(configs) do
      for _, threads in ipairs{false, true} do
        local dpt = nn.DataParallelTable(unpackArgs(config))
          :add(net, torch.range(1, numGpus):totable())
          :type(gtype)
        if threads then
          -- the function passed to threads() reserves and selects the same
          -- stream
          dpt:threads(function()
            cutorch.reserveStreams(1)
            cutorch.setStream(stream)
          end)
        end
        check(dpt)
      end
    end
  end
  cutorch.setStream(0)
end
-- Suite entry: runs test_DataParallelTable_emptyData for every type in
-- `typenames`.
function test.DataParallelTable_emptyData()
  for i = 1, #typenames do
    test_DataParallelTable_emptyData(typenames[i])
  end
end
-- Pushes batches of numGpus, 1 and 2 rows through an identity Linear layer
-- wrapped in DataParallelTable and checks that output and gradInput equal the
-- input exactly, for tensor inputs and for table inputs.
-- gtype: CUDA tensor type name under test.
function test_DataParallelTable_emptyData(gtype)
  -- Exact equality for tensors, applied element by element to tables.
  local function eq(a, b)
    if torch.isTensor(a) then
      return a:clone():add(-b):abs():max() == 0
    end
    for i = 1, #a do
      if not eq(a[i], b[i]) then
        return false
      end
    end
    return true
  end

  -- Linear layer with identity weights and zero bias
  local identity = nn.Linear(5,5)
  identity.bias:zero()
  identity.weight=torch.eye(5)

  local gpuList = torch.range(1,numGpus):totable()
  local a = nn.DataParallelTable(1)
  a:add(identity, gpuList)
  a:type(gtype)

  local inputs = {
    torch.range(1,numGpus*5):reshape(numGpus,5):type(gtype),
    torch.range(1,5):reshape(1,5):type(gtype),
    torch.range(1,10):reshape(2,5):type(gtype),
  }

  -- Tensor inputs
  for _, input in ipairs(inputs) do
    local output = a:forward(input)
    local gradInput = a:backward(input, output)
    mytester:assert(eq(input, output))
    mytester:assert(eq(input, gradInput))
  end

  -- Table inputs: the same tensor twice, through a ParallelTable of identities
  a = nn.DataParallelTable(1)
  a:add(nn.ParallelTable():add(identity):add(identity),
        torch.range(1,numGpus):totable())
  a:type(gtype)
  for _, tensor in ipairs(inputs) do
    local pair = {tensor, tensor}
    local output = a:forward(pair)
    local gradInput = a:backward(pair, output)
    mytester:assert(eq(pair, output))
    mytester:assert(eq(pair, gradInput))
  end
end
-- Suite entry: runs test_ProfileDataParallelTable for every type in
-- `typenames`.
function test.ProfileDataParallelTable()
  for i = 1, #typenames do
    test_ProfileDataParallelTable(typenames[i])
  end
end
-- Times SGD steps of a larger convolutional network wrapped in
-- nn.DataParallelTable (and in nn.DataParallel when fbcunn/fbnn loaded), for
-- every GPU count from 1 up to the number of devices, and prints the average
-- time per step.
-- gtype: CUDA tensor type name under test.
function test_ProfileDataParallelTable(gtype)
  local width = 32
  local height = 32
  local pool = 4
  local feat = 128
  local filt = 7
  local numConvs = 4
  local numRepeats = 10

  local modulesToTest = {}
  modulesToTest['DataParallelTable'] = nn.DataParallelTable
  if profileDp then
    modulesToTest['DataParallel'] = nn.DataParallel
  end

  local deviceCount = numGpus
  assert(deviceCount > 1)

  for moduleName, module in pairs(modulesToTest) do
    local isTable = (moduleName == 'DataParallelTable')
    local isFbParallel = (moduleName == 'DataParallel')

    for gpusInUse = 1, deviceCount do
      collectgarbage()
      print('Profiling ' .. moduleName .. ' with ' .. gpusInUse .. ' gpus')
      local batchSize = 2 * 3 * 4
      assert(math.fmod(batchSize, gpusInUse) == 0)

      -- Build the network that is cloned onto every device
      local cNet = buildNet(width, height, pool, feat, filt, false, numConvs)

      -- Build a multi-GPU model
      local gNet = module(1)
      if isFbParallel then
        cutorch.setDevice(baseGpu)
        gNet:type(gtype)
      elseif isTable then
        gNet:type(gtype)
      end
      for i = 1, gpusInUse do
        local curGpu = math.fmod(baseGpu+(i-1)-1, cutorch.getDeviceCount())+1
        cutorch.setDevice(curGpu)
        gNet:add(cNet:clone():type(gtype), curGpu)
      end
      cutorch.setDevice(baseGpu)

      local input = torch.rand(batchSize, 3, height, width):type(gtype)
      local target = torch.rand(batchSize, 2):type(gtype)

      local gParams, gGradParams
      if isTable then
        -- Force in a serialization / deserialization pass ------------
        local file = serialize(gNet)
        gNet = nil
        collectgarbage()
        collectgarbage()
        gNet = deserialize(file)
        ----------------------------------------------------------------
        gParams, gGradParams = gNet:getParameters()
      end

      -- Set up an MSE optimizer on the GPU
      local optimState = {
        learningRate = 0.1,
        weightDecay = 0,
        momentum = 0.9,
        dampening = 0,
        learningRateDecay = 0,
        nesterov = true,
      }
      local optimMethod = optim.sgd
      local criterion = nn.MSECriterion():type(gtype)
      local timeGpuNet = 0

      local opt
      if isFbParallel then
        opt = nn.Optim(gNet, optimState)
      end

      -- Call forward and backward once to hide allocations in profile
      do
        local output = gNet:forward(input)
        gNet:backward(input, output)
      end

      -- Loss/gradient closure; only called on the DataParallelTable path.
      local function fevalGpu(x)
        if x ~= gParams then gParams:copy(x) end
        gNet:zeroGradParameters()
        local output = gNet:forward(input)
        local err = criterion:forward(output, target)
        local gradOutput = criterion:backward(output, target)
        local gradInput = gNet:backward(input, gradOutput)
        return err, gGradParams
      end

      for _ = 1, numRepeats do
        collectgarbage()

        -- Perform an SGD step and profile it
        sys.tic()
        if isTable then
          optimMethod(fevalGpu, gParams, optimState)
          gNet:findModules('nn.DataParallelTable')[1]:syncParameters()
        else
          opt:optimize(optim.sgd, input, target, criterion)
        end
        cutorch.synchronize()
        timeGpuNet = timeGpuNet + sys.toc()

        collectgarbage()
      end

      print(' Time per FPROP+BPROP: ' .. timeGpuNet / numRepeats)
    end
  end
end
-- Register the suite with the tester and run every test defined above.
-- checkHalf() is left disabled, so the half tensor type is not tested:
--checkHalf() -- half not enabled yet for DataParallelTable
mytester:add(test)
mytester:run()
Computing file changes ...