https://github.com/Microsoft/CNTK
Tip revision: 66d8088c279cb656f3b1af6dd5685a94e890491e authored by Lingfeng Wu on 05 June 2017, 20:02:04 UTC
Merge remote-tracking branch 'origin/master' into chenzhehuai/lfbmmi-negstream
Merge remote-tracking branch 'origin/master' into chenzhehuai/lfbmmi-negstream
Tip revision: 66d8088
CntkBatchNormalization.cuh
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4100) // 'identifier': unreferenced formal parameter
#pragma warning(disable : 4127) // conditional expression is constant
#pragma warning(disable : 4201) // nonstandard extension used: nameless struct/union
#pragma warning(disable : 4458) // declaration of 'identifier' hides class member
#pragma warning(disable : 4515) // 'namespace': namespace uses itself
#endif
#include <cub/cub.cuh>
#ifdef _MSC_VER
#pragma warning(pop)
#endif
namespace Microsoft { namespace MSR { namespace CNTK {
size_t RoundUpToMultiple(size_t n, size_t blockSize)
{
return (n + blockSize - 1) / blockSize;
}
cudaError_t GetLastCudaError()
{
cudaError_t prelaunchErr = cudaGetLastError();
assert(cudaSuccess == prelaunchErr);
if (prelaunchErr != cudaSuccess)
return prelaunchErr;
#ifndef NO_SYNC
cudaError_t executionErr = cudaStreamSynchronize(GetStream());
assert(cudaSuccess == executionErr);
if (executionErr != cudaSuccess)
return executionErr;
#endif
return cudaSuccess;
}
template <int U, typename T>
__device__ __forceinline__ void LoadValues(const T* src, T dst[U])
{
#pragma unroll
for (int i = 0; i < U; i++)
dst[i] = src[i];
}
template <>
__device__ __forceinline__ void LoadValues<2, float>(const float* src, float dst[2])
{
// src must be aligned at 8 bytes boundary.
assert(reinterpret_cast<uintptr_t>(src) % (sizeof(dst)) == 0);
auto v = *(const float2*)src;
dst[0] = v.x;
dst[1] = v.y;
}
template <>
__device__ __forceinline__ void LoadValues<4, float>(const float* src, float dst[4])
{
// src must be aligned at 16 bytes boundary.
assert(reinterpret_cast<uintptr_t>(src) % (sizeof(dst)) == 0);
// Can do the following instead (use ld.global.nc.* on CC 3.5+):
// asm volatile("ld.global.v4.f32 {%0, %1, %2, %3}, [%4];" : "=f"(v.x), "=f"(v.y), "=f"(v.z), "=f"(v.w) : "l"(src));
// Similar for shared memory (e.g. ld.shared.*)
auto v = *(const float4*)src;
dst[0] = v.x;
dst[1] = v.y;
dst[2] = v.z;
dst[3] = v.w;
}
template <int U, typename T>
__device__ __forceinline__ void StoreValues(const T src[U], T* dst)
{
#pragma unroll
for (int i = 0; i < U; i++)
dst[i] = src[i];
}
template <>
__device__ __forceinline__ void StoreValues<2, float>(const float src[2], float* dst)
{
// dst must be aligned at 8 bytes boundary.
assert(reinterpret_cast<uintptr_t>(dst) % (sizeof(src)) == 0);
float2 v;
v.x = src[0];
v.y = src[1];
*(reinterpret_cast<float2*>(dst)) = v;
}
template <>
__device__ __forceinline__ void StoreValues<4, float>(const float src[4], float* dst)
{
// dst must be aligned at 16 bytes boundary.
assert(reinterpret_cast<uintptr_t>(dst) % (sizeof(src)) == 0);
float4 v;
v.x = src[0];
v.y = src[1];
v.z = src[2];
v.w = src[3];
*(reinterpret_cast<float4*>(dst)) = v;
}
template <typename T>
__device__ __forceinline__ T Shuffle(T input, int srcLane)
{
#ifdef __CUDA_ARCH__
// shfl is supported only on Kepler+
static_assert(__CUDA_ARCH__ >= 300, "CNTK only supports only Kepler GPU architecture or newer.");
return cub::ShuffleIndex(input, srcLane);
#else
assert(false);
return input; // keep compiler happy
#endif
}
namespace Operations
{
__device__ float RSqrt(float a)
{
// REVIEW alexeyk: rsqrtf is just one MUFU.RSQ instruction so it's faster than
// __frsqrt_rn intrinsic which performs round-to-nearest-even rounding which adds ~10 other instructions.
// __frsqrt_rn is unbiased rounding though, need to verify whether it is a better choice for BN implementation.
//return __frsqrt_rn(a);
assert(::isfinite(a) && a > 0);
return rsqrtf(a);
}
__device__ double RSqrt(double a)
{
assert(::isfinite(a) && a > 0);
return rsqrt(a);
}
}
// This function is used to select correct unroll factor.
// REVIEW alexeyk: ask our C++ gurus (Marko/Amit) if there is better way.
template <template <int> class Func, typename T, typename ...Targs>
void Call(size_t vectorSize, Targs... args)
{
if ((vectorSize % 4) == 0)
Func<4>::template Call<T>(args...);
else if ((vectorSize % 2) == 0)
Func<2>::template Call<T>(args...);
else
Func<1>::template Call<T>(args...);
}
//--------------------------------------------------------------------
// Mean and variance computation
//--------------------------------------------------------------------
// The kernel implements online, parallel and numerically stable algorithm
// for computing batch mean and variance (and inverse standard deviation) with one pass over the data.
// It uses algorithms by Knuth/Welford and Chan et al (http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf)
// In short, algorithm has 2 steps:
// 1. Each thread strides over the input and computes mean and
// m2 value (used to compute variance and inverse standard deviation at the end) - Welford algorithm.
// 2. Parallel reduction (Chan algorithm) performed by columns (note that
// thread block and grid X dimensions go along the vector and Y dimension - along the batch).
// As a result, each block has 2 * blockDim.x (mean and inverse stddev) values to write at the end.
//
// Running mean and variance will be averaged according to an exponential
// averaging factor (expAvgFactor), taking the running statistics with weight
// (1 - expAvgFactor).
// Batch mean and inverse standard deviation will be further averaged according
// to a blending factor (blendFactor), taking the running statistics with
// weight blendFactor.
// If (expAvgFactor = 0) && (blendFactor = 1), there is no need to call this
// function, since there no update based on batch data is involved (inference
// mode).
//
// Averaging into running variables (runMean, runVariance):
// expAvgFactor == 0 - use running mean/var instead of the actual batch mean/var.
// 0 < expAvgFactor < 1 - average running mean/var with actual batch mean/var, e.g.,
// new runMean = expAvgFactor * actual batch mean + (1 - expAvgFactor) * runMean
// expAvgFactor == 1 - use actual batch mean/var
//
// Blending into batch variables (based on new running statistics computed above):
// blendFactor == 1 - use (new) running mean/var instead of the current actual batch mean/var.
// 0 < blendFactor < 1 - blend new running mean/var with averaged mean/var of the current minibatch, e.g.,
// new xMean = (1 - blendFactor) * actual batch mean + blendFactor * new runMean
// blendFactor == 0 - use actual batch mean/var
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
const ElemType* x, // (in) input data
double expAvgFactor, // TODO why not ElemType? same for the other parameters, functions?
double blendFactor,
ElemType* runMean, ElemType* runVariance, // (in/out) running mean/variance, gets updated with current minibatch
double epsilon,
ElemType* xMean, ElemType* xInvStdDev) // (out) this minibatch's mean and inverse stddev
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert((vectorSize % U) == 0);
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.y == 1);
assert(gridDim.z == 1);
assert(::isfinite(epsilon) && epsilon > 0);
assert(::isfinite(expAvgFactor) && 0 <= expAvgFactor && expAvgFactor <= 1);
assert(::isfinite(blendFactor) && 0 <= blendFactor && blendFactor <= 1);
assert(batchSize >= 1);
if (expAvgFactor != 0 || blendFactor != 1)
{
int irowSrcBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
if (irowSrcBase >= vectorSize)
return;
assert(irowSrcBase + U <= vectorSize);
// --- estimate this minibatch's mean/variance
// first estimate mean over all data for this thread
int n = 0;
ElemType mean[U]; // this thread's part of the mean vector (stored as a normalized mean also during accumulation)
ElemType m2[U]; // likewise for variance
ElemType im2[U]; // and inverse stddev
#pragma unroll
for (int k = 0; k < U; k++)
{
mean[k] = 0;
m2[k] = 0;
}
int icolSrc = threadIdx.y;
const ElemType* psrc = x + static_cast<size_t>(icolSrc) * vectorSize + irowSrcBase;
// Stride over all vectors in the batch.
for (; icolSrc < batchSize; icolSrc += BlockDimY)
{
n++;
ElemType curVal[U];
LoadValues<U>(psrc, curVal);
// No need for separate unrolling, SASS looks good.
#pragma unroll
for (int k = 0; k < U; k++)
{
ElemType d = curVal[k] - mean[k];
// REVIEW alexeyk: we enabled fast CUDA math in CNTK so division below will be approximate, is this a problem?
// Using precise math slows down the code by about 40%.
mean[k] += d / n; // mean_n = [mean_{n-1} * (n-1) + curVal] / n = mean_{n-1} *n/n - mean_{n-1} / n + curVal / n
m2[k] += d * (curVal[k] - mean[k]);
}
psrc += vectorSize * BlockDimY;
}
// now reduce minibatch mean/variance across threads
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
const int laneId = tid & 0x1f;
// First, reduce within warp using shuffle.
if (n > 0)
{
#pragma unroll
for (int i = 1; i < CUB_PTX_WARP_THREADS / BlockDimX; i *= 2)
{
int srcLane = laneId + BlockDimX * i;
int n2 = Shuffle(n, srcLane);
int nsum = n + n2;
ElemType d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = Shuffle(mean[k], srcLane) - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += Shuffle(m2[k], srcLane) + d[k] * n * dScaled;
}
n = nsum;
}
}
// Storage for each warp in a thread block. First warp ("accumulator") holds
// final results so it does not need shared memory.
const int cwarp = BlockDimX * BlockDimY / CUB_PTX_WARP_THREADS;
__shared__ ElemType meanRes[BlockDimX * U][cwarp - 1];
__shared__ ElemType m2Res[BlockDimX * U][cwarp - 1];
__shared__ int nRes[cwarp - 1];
// Each warp (except warp0) will write accumulated results to shared memory.
const int iwarp = tid / CUB_PTX_WARP_THREADS;
if (iwarp > 0 && laneId < BlockDimX)
{
if (laneId == 0)
nRes[iwarp - 1] = n;
#pragma unroll
for (int k = 0; k < U; k++)
{
meanRes[laneId * U + k][iwarp - 1] = mean[k];
m2Res[laneId * U + k][iwarp - 1] = m2[k];
}
}
__syncthreads();
// --- final reduction and update of running mean/variance
// Accumulate and write final results.
// REVIEW alexeyk: see if atomicAdd can be used instead, do perf comparison.
if (threadIdx.y == 0)
{
// Use simple loop as number of warps is small, 8 at max.
#pragma unroll
for (int i = 0; i < cwarp - 1; i++)
{
int n2 = nRes[i];
int nsum = n + n2;
ElemType d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = meanRes[threadIdx.x * U + k][i] - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += m2Res[threadIdx.x * U + k][i] + d[k] * n * dScaled;
}
n = nsum;
}
size_t idxDstBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
ElemType run[U];
ElemType x[U];
// Compute running mean and batch mean.
LoadValues<U>(runMean + idxDstBase, run);
#pragma unroll
for (int k = 0; k < U; k++)
{
run[k] = expAvgFactor * mean[k] + (1.0 - expAvgFactor) * run[k];
x[k] = blendFactor * run[k] + (1.0 - blendFactor) * mean[k];
}
StoreValues<U>(run, runMean + idxDstBase);
StoreValues<U>(x, xMean + idxDstBase);
// At this point, runMean[] and xMean[] have been updated
// Compute running variance and batch inverse standard deviation
LoadValues<U>(runVariance + idxDstBase, run);
// TODO add back special cases
#pragma unroll
for (int k = 0; k < U; k++)
{
// Compute batch inverse standard deviation and variance
ElemType runVariance = batchSize == 1 ? 0 : m2[k] / (batchSize - 1);
// Average
run[k] = expAvgFactor * runVariance + (1.0 - expAvgFactor) * run[k];
// Blend
im2[k] = Operations::RSqrt(static_cast<ElemType>(m2[k] / batchSize + epsilon));
if (blendFactor != 0)
{
ElemType runInvStdDev = Operations::RSqrt(static_cast<ElemType>(run[k] + epsilon));
im2[k] = blendFactor * runInvStdDev + (1.0 - blendFactor) * im2[k];
}
}
StoreValues<U>(run, runVariance + idxDstBase);
StoreValues<U>(im2, xInvStdDev + idxDstBase);
// at this point, runVariance[] xInvStdDev[] have been updated
}
}
else if (threadIdx.y == 0)
{
size_t idxDstBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
ElemType run[U];
// Copy mean
LoadValues<U>(runMean + idxDstBase, run);
StoreValues<U>(run, xMean + idxDstBase);
// Copy & convert variance
LoadValues<U>(runVariance + idxDstBase, run);
#pragma unroll
for (int k = 0; k < U; k++)
run[k] = Operations::RSqrt(static_cast<ElemType>(run[k] + epsilon));
StoreValues<U>(run, xInvStdDev + idxDstBase);
}
}
// This kernel is very similar to kComputeBatchMeanAndInvStdDev except it reduces not just over N (minibatch)
// but also W and H dimensions.
// REVIEW alexeyk: is it possible to combine this and previous kernel into a single kernel without hurting performance/readability much?
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatialSize, int batchSize, const ElemType* x,
double expAvgFactor, double blendFactor,
ElemType* runMean, ElemType* runVariance,
double epsilon, ElemType* xMean, ElemType* xInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.y == 1);
assert(gridDim.z == 1);
assert((spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
assert(::isfinite(expAvgFactor) && 0 <= expAvgFactor && expAvgFactor <= 1);
assert(::isfinite(blendFactor) && 0 <= blendFactor && blendFactor <= 1);
assert(::isfinite(epsilon) && epsilon > 0);
assert(batchSize >= 1);
if (expAvgFactor != 0 || blendFactor != 1)
{
int irowSrcBase = blockIdx.x * spatialSize + threadIdx.x * U;
if (irowSrcBase >= vectorSize)
return;
assert(irowSrcBase + U <= vectorSize);
int irowSrcLim = (blockIdx.x + 1) * spatialSize;
int n = 0;
ElemType mean[U];
ElemType m2[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
mean[k] = 0;
m2[k] = 0;
}
int icolSrc = threadIdx.y;
const ElemType* psrcBase = x + static_cast<size_t>(icolSrc) * vectorSize + irowSrcBase;
// Stride over all vectors in the batch.
for (; icolSrc < batchSize; icolSrc += BlockDimY)
{
const ElemType* psrc = psrcBase;
// Stride over all values in feature map (W and H dimensions).
for (int irowSrc = irowSrcBase; irowSrc < irowSrcLim; irowSrc += BlockDimX * U, psrc += BlockDimX * U)
{
n++;
ElemType curVal[U];
LoadValues<U>(psrc, curVal);
// No need for separate unrolling, SASS looks good.
#pragma unroll
for (int k = 0; k < U; k++)
{
ElemType d = curVal[k] - mean[k];
// REVIEW alexeyk: we enabled fast CUDA math in CNTK so division below will be approximate, is this a problem?
// Using precise math slows down the code by about 40%.
mean[k] += d / n;
m2[k] += d * (curVal[k] - mean[k]);
}
}
psrcBase += vectorSize * BlockDimY;
}
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
const int laneId = tid & 0x1f;
// First, reduce within warp using shuffle.
if (n > 0)
{
#pragma unroll
for (int i = 1; i < CUB_PTX_WARP_THREADS; i *= 2)
{
int srcLane = laneId + i;
int n2 = Shuffle(n, srcLane);
int nsum = n + n2;
ElemType d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = Shuffle(mean[k], srcLane) - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += Shuffle(m2[k], srcLane) + d[k] * n * dScaled;
}
n = nsum;
}
}
// Storage for each warp in a thread block. First warp ("accumulator") holds
// final results so it does not need shared memory.
const int cwarp = BlockDimX * BlockDimY / CUB_PTX_WARP_THREADS;
__shared__ ElemType meanRes[U][cwarp - 1];
__shared__ ElemType m2Res[U][cwarp - 1];
__shared__ int nRes[cwarp - 1];
// Each warp (except warp0) will write accumulated results to shared memory.
const int iwarp = tid / CUB_PTX_WARP_THREADS;
if (iwarp > 0 && laneId == 0)
{
nRes[iwarp - 1] = n;
#pragma unroll
for (int k = 0; k < U; k++)
{
meanRes[k][iwarp - 1] = mean[k];
m2Res[k][iwarp - 1] = m2[k];
}
}
__syncthreads();
// One thread will accumulate and write final results.
if (tid == 0)
{
// Use simple loop as number of warps is small, 8 at max.
#pragma unroll
for (int i = 0; i < cwarp - 1; i++)
{
int n2 = nRes[i];
int nsum = n + n2;
ElemType d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = meanRes[k][i] - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += m2Res[k][i] + d[k] * n * dScaled;
}
n = nsum;
}
// Final step - accumlate results in mean[0] and m2[0].
// REVIEW alexeyk: move outside of the loop, before storing values to smem.
#pragma unroll
for (int k = 1; k < U; k++)
{
ElemType d = mean[k] - mean[0];
ElemType dScaled = d * n / (n + k * n);
mean[0] += dScaled;
m2[0] += m2[k] + d * k * n * dScaled;
}
// TODO add back special cases
runMean[blockIdx.x] = expAvgFactor * mean[0] + (1.0 - expAvgFactor) * runMean[blockIdx.x];
xMean[blockIdx.x] = blendFactor * runMean[blockIdx.x] + (1.0 - blendFactor) * mean[0];
ElemType runV = batchSize * spatialSize == 1 ? 0 : m2[0] / (batchSize * spatialSize - 1);
runVariance[blockIdx.x] = expAvgFactor * runV + (1.0 - expAvgFactor) * runVariance[blockIdx.x];
xInvStdDev[blockIdx.x] = Operations::RSqrt(static_cast<ElemType>(m2[0] / (batchSize * spatialSize) + epsilon));
if (blendFactor != 0)
{
ElemType runInvStdDev = Operations::RSqrt(static_cast<ElemType>(runVariance[blockIdx.x] + epsilon));
xInvStdDev[blockIdx.x] = blendFactor * runInvStdDev + (1.0 - blendFactor) * xInvStdDev[blockIdx.x];
}
}
}
else if (threadIdx.y == 0 && threadIdx.x == 0)
{
xMean[blockIdx.x] = runMean[blockIdx.x];
xInvStdDev[blockIdx.x] = Operations::RSqrt(static_cast<ElemType>(runVariance[blockIdx.x] + epsilon));
}
}
// The struct is used by Call function to select proper template in runtime based on the size of the vector.
// The same pattern is used in other cases of similar structs.
template <int U>
struct ComputeBatchMeanAndInvStdDev
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t batchSize,
const ElemType* x, // (in) input data
double expAvgFactor,
double blendFactor,
ElemType* runMean, ElemType* runVariance, // (in/out) running mean/variance, gets updated with current minibatch
double epsilon,
ElemType* xMean, ElemType* xInvStdDev, // (out) actual interpolated mean/stddev that are used to normalize. Returned since needed in backprop.
cudaStream_t stream)
{
assert((vectorSize % U) == 0);
assert(batchSize >= 1);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
// Create grid with only one block in y(batch)-dimension as kernel uses striding.
auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)));
kComputeBatchMeanAndInvStdDev<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(batchSize),
x, expAvgFactor, blendFactor, runMean, runVariance, epsilon, xMean, xInvStdDev);
}
};
template <int U>
struct ComputeSpatialBatchMeanAndInvStdDev
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x,
double expAvgFactor, double blendFactor, ElemType* runMean, ElemType* runVariance,
double epsilon, ElemType* xMean, ElemType* xInvStdDev, cudaStream_t stream)
{
assert((vectorSize % spatialSize) == 0);
assert((spatialSize % U) == 0);
assert(batchSize >= 1);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
// Create grid with only one block in y(batch)-dimension as kernel uses striding.
// Each thread block processes a single whole feature map independently (i.e. reduces over W, H and N dimensions).
auto gdim = dim3(static_cast<unsigned int>(vectorSize / spatialSize));
kComputeSpatialBatchMeanAndInvStdDev<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize),
x, expAvgFactor, blendFactor, runMean, runVariance, epsilon, xMean, xInvStdDev);
}
};
//--------------------------------------------------------------------
// Forward propagation
// All functions accept input/outputs tensors in column-major format where each column is a vector of a minibatch.
// In convolutional case (i.e. spatial=true), each vector is in CHW format where W dimension has stride = 1.
// Tensors for biases and inverse stddevs have dimensions that equal to vector dimension in non-convolutional (i.e. spatial=false)
// or Cx1x1 in convolutional case.
//--------------------------------------------------------------------
template <int BlockDimX, int BlockDimY, bool Spatial, bool NormalizeRunningStats, int U, typename ElemType>
__global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int batchSize,
double epsilon,
const ElemType* x, ElemType* y,
const ElemType* bnScale, const ElemType* bnBias,
const ElemType* runningMean, const ElemType* runningVariance,
const ElemType* batchMean, ElemType* batchInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.y == 1);
assert(gridDim.z == 1);
assert((vectorSize % U) == 0);
assert(!Spatial || (spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
int irowBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
if (irowBase >= vectorSize)
return;
assert(irowBase + U <= vectorSize);
__shared__ ElemType meanS[BlockDimX * U];
__shared__ ElemType invStdDevS[BlockDimX * U];
__shared__ ElemType scaleS[BlockDimX * U];
__shared__ ElemType biasS[BlockDimX * U];
int offs = threadIdx.x * U;
// REVIEW alexeyk: optimize smem usage, reduce transaction count (is it worth it?).
if (threadIdx.y == 0)
{
if (Spatial)
{
#pragma unroll
for (int k = 0; k < U; k++)
{
int imap = (irowBase + k) / spatialSize;
meanS[offs + k] = NormalizeRunningStats ? runningMean[imap] : batchMean[imap];
invStdDevS[offs + k] = NormalizeRunningStats
? Operations::RSqrt(static_cast<ElemType>(runningVariance[imap] + epsilon))
: batchInvStdDev[imap];
scaleS[offs + k] = bnScale[imap];
biasS[offs + k] = bnBias[imap];
}
}
else
{
LoadValues<U>((NormalizeRunningStats ? runningMean : batchMean) + irowBase, meanS + offs);
#pragma unroll
for (int k = 0; k < U; k++)
{
invStdDevS[offs + k] = NormalizeRunningStats
? Operations::RSqrt(static_cast<ElemType>(runningVariance[irowBase + k] + epsilon))
: batchInvStdDev[irowBase + k];
}
LoadValues<U>(bnScale + irowBase, scaleS + offs);
LoadValues<U>(bnBias + irowBase, biasS + offs);
}
}
__syncthreads();
ElemType mean[U];
ElemType invStdDev[U];
ElemType scale[U];
ElemType bias[U];
LoadValues<U>(meanS + offs, mean);
LoadValues<U>(invStdDevS + offs, invStdDev);
LoadValues<U>(scaleS + offs, scale);
LoadValues<U>(biasS + offs, bias);
int icol = blockIdx.y * BlockDimY + threadIdx.y;
size_t startOffs = static_cast<size_t>(icol) * vectorSize + irowBase;
const ElemType* psrc = x + startOffs;
ElemType* pdst = y + startOffs;
size_t stride = static_cast<size_t>(gridDim.y * BlockDimY) * vectorSize;
for (; icol < batchSize; icol += gridDim.y * BlockDimY, psrc += stride, pdst += stride)
{
ElemType val[U];
LoadValues<U>(psrc, val);
#pragma unroll
for (int k = 0; k < U; k++)
{
val[k] = scale[k] * (val[k] - mean[k]) * invStdDev[k] + bias[k];
}
StoreValues<U>(val, pdst);
}
}
template <int U>
struct NormalizeBatchTraining
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial,
bool normalizeRunningStats, double epsilon,
const ElemType* x, ElemType* y, // (in, out) data to normalize -> normalized data
const ElemType* bnScale, const ElemType* bnBias, // (in) scale/bias to denormalize with
const ElemType* runningMean, const ElemType* runningVariance, // (in) running mean/variance
const ElemType* batchMean, ElemType* batchInvStdDev, // (in) batch mean/stddev to normalize with
cudaStream_t stream)
{
assert((vectorSize % U) == 0);
assert(batchSize >= 1);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
// Create a grid that has uses striding in y-dimension to cover whole minibatch.
auto gdim = dim3((unsigned int)RoundUpToMultiple(vectorSize, BlockDimX * U));
if (spatial)
{
if (normalizeRunningStats)
kNormalizeBatchTraining<BlockDimX, BlockDimY, true, true, U><<<gdim, bdim, 0, stream>>>(
(int)vectorSize, (int)spatialSize, (int)batchSize,
epsilon,
x, y, bnScale, bnBias,
runningMean, runningVariance,
batchMean, batchInvStdDev);
else
kNormalizeBatchTraining<BlockDimX, BlockDimY, true, false, U><<<gdim, bdim, 0, stream>>>(
(int)vectorSize, (int)spatialSize, (int)batchSize,
epsilon,
x, y, bnScale, bnBias,
runningMean, runningVariance,
batchMean, batchInvStdDev);
}
else
{
if (normalizeRunningStats)
kNormalizeBatchTraining<BlockDimX, BlockDimY, false, true, U><<<gdim, bdim, 0, stream>>>(
(int)vectorSize, (int)spatialSize, (int)batchSize,
epsilon,
x, y, bnScale, bnBias,
runningMean, runningVariance,
batchMean, batchInvStdDev);
else
kNormalizeBatchTraining<BlockDimX, BlockDimY, false, false, U><<<gdim, bdim, 0, stream>>>(
(int)vectorSize, (int)spatialSize, (int)batchSize,
epsilon,
x, y, bnScale, bnBias,
runningMean, runningVariance,
batchMean, batchInvStdDev);
}
}
};
//--------------------------------------------------------------------
// Backpropagation
// BatchNormalizationBackward back-propagates derivatives of batch normalization function
// with respect to the inputs and scale and bias parameters.
// All tensor dimensions and assumptions are the same as in case of forward propagation.
//--------------------------------------------------------------------
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dScale, ElemType* dBias,
const ElemType* savedMean, const ElemType* savedInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
static_assert(((BlockDimY - 1) & BlockDimY) == 0, "BlockDimY must be a power of 2.");
assert((vectorSize % U) == 0);
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.y == 1);
assert(gridDim.z == 1);
// REVIEW alexeyk: first part looks very similar to kComputeBatchMeanAndInvStdDev, any chance to refactor?
int irowSrcBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
if (irowSrcBase >= vectorSize)
return;
assert(irowSrcBase + U <= vectorSize);
ElemType mean[U];
ElemType invStdDev[U];
__shared__ ElemType meanS[BlockDimX * U];
__shared__ ElemType invStdDevS[BlockDimX * U];
// Read mean and inv std dev.
if (threadIdx.y == 0)
{
LoadValues<U>(savedMean + irowSrcBase, mean);
LoadValues<U>(savedInvStdDev + irowSrcBase, invStdDev);
StoreValues<U>(mean, &meanS[threadIdx.x * U]);
StoreValues<U>(invStdDev, &invStdDevS[threadIdx.x * U]);
}
__syncthreads();
if (threadIdx.y != 0)
{
LoadValues<U>(&meanS[threadIdx.x * U], mean);
LoadValues<U>(&invStdDevS[threadIdx.x * U], invStdDev);
}
ElemType ds[U];
ElemType db[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
ds[k] = 0;
db[k] = 0;
}
int icolSrc = threadIdx.y;
size_t startOffs = static_cast<size_t>(icolSrc) * vectorSize + irowSrcBase;
const ElemType* px = x + startOffs;
const ElemType* pdy = dy + startOffs;
size_t stride = static_cast<size_t>(vectorSize) * BlockDimY;
// Stride over all vectors in the batch.
for (; icolSrc < batchSize; icolSrc += BlockDimY, px += stride, pdy += stride)
{
ElemType curX[U];
ElemType curdY[U];
LoadValues<U>(px, curX);
LoadValues<U>(pdy, curdY);
#pragma unroll
for (int k = 0; k < U; k++)
{
ds[k] += pdy[k] * (curX[k] - mean[k]) * invStdDev[k];
db[k] += pdy[k];
}
}
// Final reduction.
__shared__ ElemType dsS[BlockDimY][BlockDimX * U];
__shared__ ElemType dbS[BlockDimY][BlockDimX * U];
StoreValues<U>(ds, &dsS[threadIdx.y][threadIdx.x * U]);
StoreValues<U>(db, &dbS[threadIdx.y][threadIdx.x * U]);
__syncthreads();
// Very simple block reduction. As the block y dim is small (e.g. 16) then the loop
// is executed very few times (e.g. 4) so the performance is good.
// Can be potentially improved by using shuffle instructions (as in kComputeBatchMeanAndInvStdDev).
#pragma unroll
for (int y = BlockDimY / 2; y > 0; y /= 2)
{
if (threadIdx.y < y)
{
#pragma unroll
for (int k = 0; k < U; k++)
{
dsS[threadIdx.y][threadIdx.x * U + k] += dsS[threadIdx.y + y][threadIdx.x * U + k];
dbS[threadIdx.y][threadIdx.x * U + k] += dbS[threadIdx.y + y][threadIdx.x * U + k];
}
__syncthreads();
}
}
// Write results.
if (threadIdx.y == 0)
{
#pragma unroll
for (int k = 0; k < U; k++)
{
dScale[irowSrcBase + k] = dsS[0][threadIdx.x * U + k];
dBias[irowSrcBase + k] = dbS[0][threadIdx.x * U + k];
}
}
}
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.y == 1);
assert(gridDim.z == 1);
assert((spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
int irowBase = blockIdx.x * spatialSize + threadIdx.x * U;
if (irowBase >= vectorSize)
return;
assert(irowBase + U <= vectorSize);
int irowLim = (blockIdx.x + 1) * spatialSize;
ElemType mean;
ElemType invStdDev;
__shared__ ElemType meanS;
__shared__ ElemType invStdDevS;
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
// Read mean and inv std dev.
if (tid == 0)
{
meanS = savedMean[blockIdx.x];
invStdDevS = savedInvStdDev[blockIdx.x];
}
__syncthreads();
if (tid != 0)
{
mean = meanS;
invStdDev = invStdDevS;
}
ElemType ds[U];
ElemType db[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
ds[k] = 0;
db[k] = 0;
}
int icolSrc = threadIdx.y;
size_t startOffs = static_cast<size_t>(icolSrc) * vectorSize + irowBase;
const ElemType* pxBase = x + startOffs;
const ElemType* pdyBase = dy + startOffs;
size_t stride = static_cast<size_t>(vectorSize) * BlockDimY;
// Stride over all vectors in the batch.
for (; icolSrc < batchSize; icolSrc += BlockDimY, pxBase += stride, pdyBase += stride)
{
const ElemType* px = pxBase;
const ElemType* pdy = pdyBase;
// Stride over all values in feature map (W and H dimensions).
for (int irow = irowBase; irow < irowLim; irow += BlockDimX * U, px += BlockDimX * U, pdy += BlockDimX * U)
{
ElemType curX[U];
ElemType curdY[U];
LoadValues<U>(px, curX);
LoadValues<U>(pdy, curdY);
#pragma unroll
for (int k = 0; k < U; k++)
{
ds[k] += pdy[k] * (curX[k] - mean) * invStdDev;
db[k] += pdy[k];
}
}
}
__syncthreads();
using BlockReduce = cub::BlockReduce<ElemType, BlockDimX, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BlockDimY>;
// Note: must use separate temp storages for each reduction.
__shared__ typename BlockReduce::TempStorage tmp1;
ElemType dsRes = BlockReduce(tmp1).Sum(ds);
__shared__ typename BlockReduce::TempStorage tmp2;
ElemType dbRes = BlockReduce(tmp2).Sum(db);
if (tid == 0)
{
dScale[blockIdx.x] = dsRes;
dBias[blockIdx.x] = dbRes;
}
}
template <int U>
struct ComputeScaleAndBiasGradients
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
{
assert((vectorSize % U) == 0);
assert(batchSize >= 1);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
// Create a grid that has uses striding in y-dimension to cover whole minibatch.
auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)));
kComputeScaleAndBiasGradients<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(batchSize), x, dy, dScale, dBias, savedMean, savedInvStdDev);
}
};
template <int U>
struct ComputeSpatialScaleAndBiasGradients
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
{
assert((spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
assert(batchSize >= 1);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
// Create a grid that has uses striding in y-dimension to cover whole minibatch.
auto gdim = dim3(static_cast<unsigned int>(vectorSize / spatialSize));
kComputeSpatialScaleAndBiasGradients<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dScale, dBias, savedMean, savedInvStdDev);
}
};
// mbStatsWeight is the weight with which current MB's stats were used (0 means not at all, locked model).
template <int BlockDimX, int BlockDimY, bool Spatial, int U, typename ElemType>
__global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dx,
const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale, const ElemType* dBias,
const ElemType* savedMean, const ElemType* savedInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.z == 1);
assert((vectorSize % U) == 0);
assert(Spatial || spatialSize == 1);
assert(!Spatial || (spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
int irowBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
if (irowBase >= vectorSize)
return;
assert(irowBase + U <= vectorSize);
ElemType scale[U];
ElemType ds[U];
ElemType db[U];
ElemType mean[U];
ElemType invStdDev[U];
// REVIEW alexeyk: here we're wasting some bandwidth but this might be ok as it's a one-timer.
if (Spatial)
{
#pragma unroll
for (int k = 0; k < U; k++)
{
int imap = (irowBase + k) / spatialSize;
scale[k] = bnScale[imap];
ds[k] = dScale[imap];
db[k] = dBias[imap];
mean[k] = savedMean[imap];
invStdDev[k] = savedInvStdDev[imap];
}
}
else
{
LoadValues<U>(bnScale + irowBase, scale);
LoadValues<U>(dScale + irowBase, ds);
LoadValues<U>(dBias + irowBase, db);
LoadValues<U>(savedMean + irowBase, mean);
LoadValues<U>(savedInvStdDev + irowBase, invStdDev);
}
int icol = blockIdx.y * BlockDimY + threadIdx.y;
size_t startOffs = static_cast<size_t>(icol) * vectorSize + irowBase;
const ElemType* px = x + startOffs;
const ElemType* pdy = dy + startOffs;
ElemType* pdx = dx + startOffs;
size_t stride = static_cast<size_t>(gridDim.y * BlockDimY) * vectorSize;
for (; icol < batchSize; icol += gridDim.y * BlockDimY, px += stride, pdy += stride, pdx += stride)
{
ElemType xCur[U];
ElemType dyCur[U];
ElemType dxCur[U];
LoadValues<U>(px, xCur);
LoadValues<U>(pdy, dyCur);
LoadValues<U>(pdx, dxCur);
// From the BN paper, dL/dxi is a sum of three terms: dL/dxi = t1 + t2 + t3
// The formulas for dBias and dScale happen to occur as subexpressions in this gradient as well.
// Leveraging this, this gradient can be simplified to:
// t1 = scale * dL/dyi * invStdDev
// t2 = mbStatsWeight * (-scale / m) * invStdDev * xHat * dL/dScale
// t3 = mbStatsWeight * (-scale / m) * invStdDev * dL/dBias (for this one note that Sum(xHat) == 0)
// with
// dBias = Reduce(dy)
// dScale = Reduce(dy * xHat)
// Simplifying this a bit more, we get the formula below.
ElemType val[U];
int m = Spatial ? batchSize * spatialSize : batchSize;
#pragma unroll
for (int k = 0; k < U; k++)
{
ElemType xNorm = (xCur[k] - mean[k]) * invStdDev[k]; // xHat
// scale * invStdDev * (
// dL/dyi
// - mbStatsWeight * (xHat * dL/dScale + dL/dBias) / m
// )
val[k] = dxCur[k] // (adding to gradient)
+ (scale[k] * invStdDev[k]) * (
dyCur[k]
- mbStatsWeight * (xNorm * ds[k] + db[k]) / m);
}
StoreValues<U>(val, pdx);
}
}
template <int U>
struct BackpropagateBatchNormGradients
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial, const ElemType* x, const ElemType* dy, ElemType* dx,
const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale,
const ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
{
assert((vectorSize % U) == 0);
assert(batchSize >= 1);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)),
static_cast<unsigned int>(RoundUpToMultiple(batchSize, BlockDimY)));
if (spatial)
{
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, true/*spatial*/, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, savedMean, savedInvStdDev);
}
else
{
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, false/*not spatial*/, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, savedMean, savedInvStdDev);
}
}
};
}}}