https://github.com/Microsoft/CNTK
Tip revision: 16a41cef30894ca92667bd93079cd6fa11b3e92d authored by Sayan Pathak on 02 November 2017, 16:10:10 UTC
Added super resolution tutorial contributed by Borna with added code to minimize test downloads, fix tests, added documentation and small editorial changes to LSGAN tutorial
Added super resolution tutorial contributed by Borna with added code to minimize test downloads, fix tests, added documentation and small editorial changes to LSGAN tutorial
Tip revision: 16a41ce
CntkBatchNormalization.cuh
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4100) // 'identifier': unreferenced formal parameter
#pragma warning(disable : 4127) // conditional expression is constant
#pragma warning(disable : 4201) // nonstandard extension used: nameless struct/union
#pragma warning(disable : 4458) // declaration of 'identifier' hides class member
#pragma warning(disable : 4515) // 'namespace': namespace uses itself
#endif
#include <cub/cub.cuh>
#ifdef _MSC_VER
#pragma warning(pop)
#endif
namespace Microsoft { namespace MSR { namespace CNTK {
size_t RoundUpToMultiple(size_t n, size_t blockSize)
{
return (n + blockSize - 1) / blockSize;
}
cudaError_t GetLastCudaError()
{
cudaError_t prelaunchErr = cudaGetLastError();
assert(cudaSuccess == prelaunchErr);
if (prelaunchErr != cudaSuccess)
return prelaunchErr;
#ifndef NO_SYNC
cudaError_t executionErr = cudaStreamSynchronize(GetStream());
assert(cudaSuccess == executionErr);
if (executionErr != cudaSuccess)
return executionErr;
#endif
return cudaSuccess;
}
template <int U, typename T>
__device__ __forceinline__ void LoadValues(const T* src, T dst[U])
{
#pragma unroll
for (int i = 0; i < U; i++)
dst[i] = src[i];
}
template <>
__device__ __forceinline__ void LoadValues<2, float>(const float* src, float dst[2])
{
// src must be aligned at 8 bytes boundary.
assert(reinterpret_cast<uintptr_t>(src) % (sizeof(dst)) == 0);
auto v = *(const float2*)src;
dst[0] = v.x;
dst[1] = v.y;
}
template <>
__device__ __forceinline__ void LoadValues<4, float>(const float* src, float dst[4])
{
// src must be aligned at 16 bytes boundary.
assert(reinterpret_cast<uintptr_t>(src) % (sizeof(dst)) == 0);
// Can do the following instead (use ld.global.nc.* on CC 3.5+):
// asm volatile("ld.global.v4.f32 {%0, %1, %2, %3}, [%4];" : "=f"(v.x), "=f"(v.y), "=f"(v.z), "=f"(v.w) : "l"(src));
// Similar for shared memory (e.g. ld.shared.*)
auto v = *(const float4*)src;
dst[0] = v.x;
dst[1] = v.y;
dst[2] = v.z;
dst[3] = v.w;
}
template <int U, typename T>
__device__ __forceinline__ void StoreValues(const T src[U], T* dst)
{
#pragma unroll
for (int i = 0; i < U; i++)
dst[i] = src[i];
}
template <>
__device__ __forceinline__ void StoreValues<2, float>(const float src[2], float* dst)
{
// dst must be aligned at 8 bytes boundary.
assert(reinterpret_cast<uintptr_t>(dst) % (sizeof(src)) == 0);
float2 v;
v.x = src[0];
v.y = src[1];
*(reinterpret_cast<float2*>(dst)) = v;
}
template <>
__device__ __forceinline__ void StoreValues<4, float>(const float src[4], float* dst)
{
// dst must be aligned at 16 bytes boundary.
assert(reinterpret_cast<uintptr_t>(dst) % (sizeof(src)) == 0);
float4 v;
v.x = src[0];
v.y = src[1];
v.z = src[2];
v.w = src[3];
*(reinterpret_cast<float4*>(dst)) = v;
}
template <typename T>
__device__ __forceinline__ T Shuffle(T input, int srcLane)
{
#ifdef __CUDA_ARCH__
// shfl is supported only on Kepler+
static_assert(__CUDA_ARCH__ >= 300, "CNTK only supports only Kepler GPU architecture or newer.");
return cub::ShuffleIndex(input, srcLane);
#else
assert(false);
return input; // keep compiler happy
#endif
}
namespace Operations
{
__device__ float RSqrt(float a)
{
// REVIEW alexeyk: rsqrtf is just one MUFU.RSQ instruction so it's faster than
// __frsqrt_rn intrinsic which performs round-to-nearest-even rounding which adds ~10 other instructions.
// __frsqrt_rn is unbiased rounding though, need to verify whether it is a better choice for BN implementation.
//return __frsqrt_rn(a);
assert(::isfinite(a) && a > 0);
return rsqrtf(a);
}
__device__ double RSqrt(double a)
{
assert(::isfinite(a) && a > 0);
return rsqrt(a);
}
}
// This function is used to select correct unroll factor.
// REVIEW alexeyk: ask our C++ gurus (Marko/Amit) if there is better way.
template <template <int> class Func, typename T, typename ...Targs>
void Call(size_t vectorSize, Targs... args)
{
if ((vectorSize % 4) == 0)
Func<4>::template Call<T>(args...);
else if ((vectorSize % 2) == 0)
Func<2>::template Call<T>(args...);
else
Func<1>::template Call<T>(args...);
}
//--------------------------------------------------------------------
// Mean and variance computation
//--------------------------------------------------------------------
// The kernel implements online, parallel and numerically stable algorithm
// for computing batch mean and variance (and inverse standard deviation) with one pass over the data.
// It uses algorithms by Knuth/Welford and Chan et al (http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf)
// In short, algorithm has 2 steps:
// 1. Each thread strides over the input and computes mean and
// m2 value (used to compute variance and inverse standard deviation at the end) - Welford algorithm.
// 2. Parallel reduction (Chan algorithm) performed by columns (note that
// thread block and grid X dimensions go along the vector and Y dimension - along the batch).
// As a result, each block has 2 * blockDim.x (mean and inverse stddev) values to write at the end.
//
// Running mean and variance will be averaged according to an exponential
// averaging factor (expAvgFactor), taking the running statistics with weight
// (1 - expAvgFactor).
// Batch mean and inverse standard deviation will be further averaged according
// to a blending factor (blendFactor), taking the running statistics with
// weight blendFactor.
// If (expAvgFactor = 0) && (blendFactor = 1), there is no need to call this
// function, since there no update based on batch data is involved (inference
// mode).
//
// Averaging into running variables (runMean, runVariance):
// expAvgFactor == 0 - use running mean/var instead of the actual batch mean/var.
// 0 < expAvgFactor < 1 - average running mean/var with actual batch mean/var, e.g.,
// new runMean = expAvgFactor * actual batch mean + (1 - expAvgFactor) * runMean
// expAvgFactor == 1 - use actual batch mean/var
//
// Blending into batch variables (based on new running statistics computed above):
// blendFactor == 1 - use (new) running mean/var instead of the current actual batch mean/var.
// 0 < blendFactor < 1 - blend new running mean/var with averaged mean/var of the current minibatch, e.g.,
// new xMean = (1 - blendFactor) * actual batch mean + blendFactor * new runMean
// blendFactor == 0 - use actual batch mean/var
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
const ElemType* x, // (in) input data
double expAvgFactor, // TODO why not ElemType? same for the other parameters, functions?
double blendFactor,
ElemType* runMean, ElemType* runVariance, // (in/out) running mean/variance, gets updated with current minibatch
double epsilon,
ElemType* xMean, ElemType* xInvStdDev) // (out) this minibatch's mean and inverse stddev
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert((vectorSize % U) == 0);
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.y == 1);
assert(gridDim.z == 1);
assert(::isfinite(epsilon) && epsilon > 0);
assert(::isfinite(expAvgFactor) && 0 <= expAvgFactor && expAvgFactor <= 1);
assert(::isfinite(blendFactor) && 0 <= blendFactor && blendFactor <= 1);
assert(batchSize >= 1);
if (expAvgFactor != 0 || blendFactor != 1)
{
int irowSrcBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
if (irowSrcBase >= vectorSize)
return;
assert(irowSrcBase + U <= vectorSize);
// --- estimate this minibatch's mean/variance
// first estimate mean over all data for this thread
int n = 0;
ElemType mean[U]; // this thread's part of the mean vector (stored as a normalized mean also during accumulation)
ElemType m2[U]; // likewise for variance
ElemType im2[U]; // and inverse stddev
#pragma unroll
for (int k = 0; k < U; k++)
{
mean[k] = 0;
m2[k] = 0;
}
int icolSrc = threadIdx.y;
const ElemType* psrc = x + static_cast<size_t>(icolSrc) * vectorSize + irowSrcBase;
// Stride over all vectors in the batch.
for (; icolSrc < batchSize; icolSrc += BlockDimY)
{
n++;
ElemType curVal[U];
LoadValues<U>(psrc, curVal);
// No need for separate unrolling, SASS looks good.
#pragma unroll
for (int k = 0; k < U; k++)
{
ElemType d = curVal[k] - mean[k];
// REVIEW alexeyk: we enabled fast CUDA math in CNTK so division below will be approximate, is this a problem?
// Using precise math slows down the code by about 40%.
mean[k] += d / n; // mean_n = [mean_{n-1} * (n-1) + curVal] / n = mean_{n-1} *n/n - mean_{n-1} / n + curVal / n
m2[k] += d * (curVal[k] - mean[k]);
}
psrc += vectorSize * BlockDimY;
}
// now reduce minibatch mean/variance across threads
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
const int laneId = tid & 0x1f;
// First, reduce within warp using shuffle.
if (n > 0)
{
#pragma unroll
for (int i = 1; i < CUB_PTX_WARP_THREADS / BlockDimX; i *= 2)
{
int srcLane = laneId + BlockDimX * i;
int n2 = Shuffle(n, srcLane);
int nsum = n + n2;
ElemType d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = Shuffle(mean[k], srcLane) - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += Shuffle(m2[k], srcLane) + d[k] * n * dScaled;
}
n = nsum;
}
}
// Storage for each warp in a thread block. First warp ("accumulator") holds
// final results so it does not need shared memory.
const int cwarp = BlockDimX * BlockDimY / CUB_PTX_WARP_THREADS;
__shared__ ElemType meanRes[BlockDimX * U][cwarp - 1];
__shared__ ElemType m2Res[BlockDimX * U][cwarp - 1];
__shared__ int nRes[cwarp - 1];
// Each warp (except warp0) will write accumulated results to shared memory.
const int iwarp = tid / CUB_PTX_WARP_THREADS;
if (iwarp > 0 && laneId < BlockDimX)
{
if (laneId == 0)
nRes[iwarp - 1] = n;
#pragma unroll
for (int k = 0; k < U; k++)
{
meanRes[laneId * U + k][iwarp - 1] = mean[k];
m2Res[laneId * U + k][iwarp - 1] = m2[k];
}
}
__syncthreads();
// --- final reduction and update of running mean/variance
// Accumulate and write final results.
// REVIEW alexeyk: see if atomicAdd can be used instead, do perf comparison.
if (threadIdx.y == 0)
{
// Use simple loop as number of warps is small, 8 at max.
#pragma unroll
for (int i = 0; i < cwarp - 1; i++)
{
int n2 = nRes[i];
int nsum = n + n2;
ElemType d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = meanRes[threadIdx.x * U + k][i] - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += m2Res[threadIdx.x * U + k][i] + d[k] * n * dScaled;
}
n = nsum;
}
size_t idxDstBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
ElemType run[U];
ElemType x[U];
// Compute running mean and batch mean.
LoadValues<U>(runMean + idxDstBase, run);
#pragma unroll
for (int k = 0; k < U; k++)
{
run[k] = expAvgFactor * mean[k] + (1.0 - expAvgFactor) * run[k];
x[k] = blendFactor * run[k] + (1.0 - blendFactor) * mean[k];
}
StoreValues<U>(run, runMean + idxDstBase);
StoreValues<U>(x, xMean + idxDstBase);
// At this point, runMean[] and xMean[] have been updated
// Compute running variance and batch inverse standard deviation
LoadValues<U>(runVariance + idxDstBase, run);
// TODO add back special cases
#pragma unroll
for (int k = 0; k < U; k++)
{
// Compute batch inverse standard deviation and variance
ElemType runVariance = batchSize == 1 ? 0 : m2[k] / (batchSize - 1);
// Average
run[k] = expAvgFactor * runVariance + (1.0 - expAvgFactor) * run[k];
// Blend
im2[k] = Operations::RSqrt(static_cast<ElemType>(m2[k] / batchSize + epsilon));
if (blendFactor != 0)
{
ElemType runInvStdDev = Operations::RSqrt(static_cast<ElemType>(run[k] + epsilon));
im2[k] = blendFactor * runInvStdDev + (1.0 - blendFactor) * im2[k];
}
}
StoreValues<U>(run, runVariance + idxDstBase);
StoreValues<U>(im2, xInvStdDev + idxDstBase);
// at this point, runVariance[] xInvStdDev[] have been updated
}
}
else if (threadIdx.y == 0)
{
size_t idxDstBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
ElemType run[U];
// Copy mean
LoadValues<U>(runMean + idxDstBase, run);
StoreValues<U>(run, xMean + idxDstBase);
// Copy & convert variance
LoadValues<U>(runVariance + idxDstBase, run);
#pragma unroll
for (int k = 0; k < U; k++)
run[k] = Operations::RSqrt(static_cast<ElemType>(run[k] + epsilon));
StoreValues<U>(run, xInvStdDev + idxDstBase);
}
}
// This kernel is very similar to kComputeBatchMeanAndInvStdDev except it reduces not just over N (minibatch)
// but also W and H dimensions.
// REVIEW alexeyk: is it possible to combine this and previous kernel into a single kernel without hurting performance/readability much?
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatialSize, int batchSize, const ElemType* x,
double expAvgFactor, double blendFactor,
ElemType* runMean, ElemType* runVariance,
double epsilon, ElemType* xMean, ElemType* xInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.y == 1);
assert(gridDim.z == 1);
assert((spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
assert(::isfinite(expAvgFactor) && 0 <= expAvgFactor && expAvgFactor <= 1);
assert(::isfinite(blendFactor) && 0 <= blendFactor && blendFactor <= 1);
assert(::isfinite(epsilon) && epsilon > 0);
assert(batchSize >= 1);
if (expAvgFactor != 0 || blendFactor != 1)
{
int irowSrcBase = blockIdx.x * spatialSize + threadIdx.x * U;
if (irowSrcBase >= vectorSize)
return;
assert(irowSrcBase + U <= vectorSize);
int irowSrcLim = (blockIdx.x + 1) * spatialSize;
int n = 0;
ElemType mean[U];
ElemType m2[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
mean[k] = 0;
m2[k] = 0;
}
int icolSrc = threadIdx.y;
const ElemType* psrcBase = x + static_cast<size_t>(icolSrc) * vectorSize + irowSrcBase;
// Stride over all vectors in the batch.
for (; icolSrc < batchSize; icolSrc += BlockDimY)
{
const ElemType* psrc = psrcBase;
// Stride over all values in feature map (W and H dimensions).
for (int irowSrc = irowSrcBase; irowSrc < irowSrcLim; irowSrc += BlockDimX * U, psrc += BlockDimX * U)
{
n++;
ElemType curVal[U];
LoadValues<U>(psrc, curVal);
// No need for separate unrolling, SASS looks good.
#pragma unroll
for (int k = 0; k < U; k++)
{
ElemType d = curVal[k] - mean[k];
// REVIEW alexeyk: we enabled fast CUDA math in CNTK so division below will be approximate, is this a problem?
// Using precise math slows down the code by about 40%.
mean[k] += d / n;
m2[k] += d * (curVal[k] - mean[k]);
}
}
psrcBase += vectorSize * BlockDimY;
}
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
const int laneId = tid & 0x1f;
// First, reduce within warp using shuffle.
if (n > 0)
{
#pragma unroll
for (int i = 1; i < CUB_PTX_WARP_THREADS; i *= 2)
{
int srcLane = laneId + i;
int n2 = Shuffle(n, srcLane);
int nsum = n + n2;
ElemType d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = Shuffle(mean[k], srcLane) - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += Shuffle(m2[k], srcLane) + d[k] * n * dScaled;
}
n = nsum;
}
}
// Storage for each warp in a thread block. First warp ("accumulator") holds
// final results so it does not need shared memory.
const int cwarp = BlockDimX * BlockDimY / CUB_PTX_WARP_THREADS;
__shared__ ElemType meanRes[U][cwarp - 1];
__shared__ ElemType m2Res[U][cwarp - 1];
__shared__ int nRes[cwarp - 1];
// Each warp (except warp0) will write accumulated results to shared memory.
const int iwarp = tid / CUB_PTX_WARP_THREADS;
if (iwarp > 0 && laneId == 0)
{
nRes[iwarp - 1] = n;
#pragma unroll
for (int k = 0; k < U; k++)
{
meanRes[k][iwarp - 1] = mean[k];
m2Res[k][iwarp - 1] = m2[k];
}
}
__syncthreads();
// One thread will accumulate and write final results.
if (tid == 0)
{
// Use simple loop as number of warps is small, 8 at max.
#pragma unroll
for (int i = 0; i < cwarp - 1; i++)
{
int n2 = nRes[i];
int nsum = n + n2;
ElemType d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = meanRes[k][i] - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += m2Res[k][i] + d[k] * n * dScaled;
}
n = nsum;
}
// Final step - accumlate results in mean[0] and m2[0].
// REVIEW alexeyk: move outside of the loop, before storing values to smem.
#pragma unroll
for (int k = 1; k < U; k++)
{
ElemType d = mean[k] - mean[0];
ElemType dScaled = d * n / (n + k * n);
mean[0] += dScaled;
m2[0] += m2[k] + d * k * n * dScaled;
}
// TODO add back special cases
runMean[blockIdx.x] = expAvgFactor * mean[0] + (1.0 - expAvgFactor) * runMean[blockIdx.x];
xMean[blockIdx.x] = blendFactor * runMean[blockIdx.x] + (1.0 - blendFactor) * mean[0];
ElemType runV = batchSize * spatialSize == 1 ? 0 : m2[0] / (batchSize * spatialSize - 1);
runVariance[blockIdx.x] = expAvgFactor * runV + (1.0 - expAvgFactor) * runVariance[blockIdx.x];
xInvStdDev[blockIdx.x] = Operations::RSqrt(static_cast<ElemType>(m2[0] / (batchSize * spatialSize) + epsilon));
if (blendFactor != 0)
{
ElemType runInvStdDev = Operations::RSqrt(static_cast<ElemType>(runVariance[blockIdx.x] + epsilon));
xInvStdDev[blockIdx.x] = blendFactor * runInvStdDev + (1.0 - blendFactor) * xInvStdDev[blockIdx.x];
}
}
}
else if (threadIdx.y == 0 && threadIdx.x == 0)
{
xMean[blockIdx.x] = runMean[blockIdx.x];
xInvStdDev[blockIdx.x] = Operations::RSqrt(static_cast<ElemType>(runVariance[blockIdx.x] + epsilon));
}
}
// The struct is used by Call function to select proper template in runtime based on the size of the vector.
// The same pattern is used in other cases of similar structs.
template <int U>
struct ComputeBatchMeanAndInvStdDev
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t batchSize,
const ElemType* x, // (in) input data
double expAvgFactor,
double blendFactor,
ElemType* runMean, ElemType* runVariance, // (in/out) running mean/variance, gets updated with current minibatch
double epsilon,
ElemType* xMean, ElemType* xInvStdDev, // (out) actual interpolated mean/stddev that are used to normalize. Returned since needed in backprop.
cudaStream_t stream)
{
assert((vectorSize % U) == 0);
assert(batchSize >= 1);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
// Create grid with only one block in y(batch)-dimension as kernel uses striding.
auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)));
kComputeBatchMeanAndInvStdDev<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(batchSize),
x, expAvgFactor, blendFactor, runMean, runVariance, epsilon, xMean, xInvStdDev);
}
};
template <int U>
struct ComputeSpatialBatchMeanAndInvStdDev
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x,
double expAvgFactor, double blendFactor, ElemType* runMean, ElemType* runVariance,
double epsilon, ElemType* xMean, ElemType* xInvStdDev, cudaStream_t stream)
{
assert((vectorSize % spatialSize) == 0);
assert((spatialSize % U) == 0);
assert(batchSize >= 1);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
// Create grid with only one block in y(batch)-dimension as kernel uses striding.
// Each thread block processes a single whole feature map independently (i.e. reduces over W, H and N dimensions).
auto gdim = dim3(static_cast<unsigned int>(vectorSize / spatialSize));
kComputeSpatialBatchMeanAndInvStdDev<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize),
x, expAvgFactor, blendFactor, runMean, runVariance, epsilon, xMean, xInvStdDev);
}
};
//--------------------------------------------------------------------
// Forward propagation
// All functions accept input/outputs tensors in column-major format where each column is a vector of a minibatch.
// In convolutional case (i.e. spatial=true), each vector is in CHW format where W dimension has stride = 1.
// Tensors for biases and inverse stddevs have dimensions that equal to vector dimension in non-convolutional (i.e. spatial=false)
// or Cx1x1 in convolutional case.
//--------------------------------------------------------------------
template <int BlockDimX, int BlockDimY, bool Spatial, bool NormalizeRunningStats, int U, typename ElemType>
__global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int batchSize,
double epsilon,
const ElemType* x, ElemType* y,
const ElemType* bnScale, const ElemType* bnBias,
const ElemType* runningMean, const ElemType* runningVariance,
const ElemType* batchMean, ElemType* batchInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.y == 1);
assert(gridDim.z == 1);
assert((vectorSize % U) == 0);
assert(!Spatial || (spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
int irowBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
if (irowBase >= vectorSize)
return;
assert(irowBase + U <= vectorSize);
__shared__ ElemType meanS[BlockDimX * U];
__shared__ ElemType invStdDevS[BlockDimX * U];
__shared__ ElemType scaleS[BlockDimX * U];
__shared__ ElemType biasS[BlockDimX * U];
int offs = threadIdx.x * U;
// REVIEW alexeyk: optimize smem usage, reduce transaction count (is it worth it?).
if (threadIdx.y == 0)
{
if (Spatial)
{
#pragma unroll
for (int k = 0; k < U; k++)
{
int imap = (irowBase + k) / spatialSize;
meanS[offs + k] = NormalizeRunningStats ? runningMean[imap] : batchMean[imap];
invStdDevS[offs + k] = NormalizeRunningStats
? Operations::RSqrt(static_cast<ElemType>(runningVariance[imap] + epsilon))
: batchInvStdDev[imap];
scaleS[offs + k] = bnScale[imap];
biasS[offs + k] = bnBias[imap];
}
}
else
{
LoadValues<U>((NormalizeRunningStats ? runningMean : batchMean) + irowBase, meanS + offs);
#pragma unroll
for (int k = 0; k < U; k++)
{
invStdDevS[offs + k] = NormalizeRunningStats
? Operations::RSqrt(static_cast<ElemType>(runningVariance[irowBase + k] + epsilon))
: batchInvStdDev[irowBase + k];
}
LoadValues<U>(bnScale + irowBase, scaleS + offs);
LoadValues<U>(bnBias + irowBase, biasS + offs);
}
}
__syncthreads();
ElemType mean[U];
ElemType invStdDev[U];
ElemType scale[U];
ElemType bias[U];
LoadValues<U>(meanS + offs, mean);
LoadValues<U>(invStdDevS + offs, invStdDev);
LoadValues<U>(scaleS + offs, scale);
LoadValues<U>(biasS + offs, bias);
int icol = blockIdx.y * BlockDimY + threadIdx.y;
size_t startOffs = static_cast<size_t>(icol) * vectorSize + irowBase;
const ElemType* psrc = x + startOffs;
ElemType* pdst = y + startOffs;
size_t stride = static_cast<size_t>(gridDim.y * BlockDimY) * vectorSize;
for (; icol < batchSize; icol += gridDim.y * BlockDimY, psrc += stride, pdst += stride)
{
ElemType val[U];
LoadValues<U>(psrc, val);
#pragma unroll
for (int k = 0; k < U; k++)
{
val[k] = scale[k] * (val[k] - mean[k]) * invStdDev[k] + bias[k];
}
StoreValues<U>(val, pdst);
}
}
template <int U>
struct NormalizeBatchTraining
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial,
bool normalizeRunningStats, double epsilon,
const ElemType* x, ElemType* y, // (in, out) data to normalize -> normalized data
const ElemType* bnScale, const ElemType* bnBias, // (in) scale/bias to denormalize with
const ElemType* runningMean, const ElemType* runningVariance, // (in) running mean/variance
const ElemType* batchMean, ElemType* batchInvStdDev, // (in) batch mean/stddev to normalize with
cudaStream_t stream)
{
assert((vectorSize % U) == 0);
assert(batchSize >= 1);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
// Create a grid that has uses striding in y-dimension to cover whole minibatch.
auto gdim = dim3((unsigned int)RoundUpToMultiple(vectorSize, BlockDimX * U));
if (spatial)
{
if (normalizeRunningStats)
kNormalizeBatchTraining<BlockDimX, BlockDimY, true, true, U><<<gdim, bdim, 0, stream>>>(
(int)vectorSize, (int)spatialSize, (int)batchSize,
epsilon,
x, y, bnScale, bnBias,
runningMean, runningVariance,
batchMean, batchInvStdDev);
else
kNormalizeBatchTraining<BlockDimX, BlockDimY, true, false, U><<<gdim, bdim, 0, stream>>>(
(int)vectorSize, (int)spatialSize, (int)batchSize,
epsilon,
x, y, bnScale, bnBias,
runningMean, runningVariance,
batchMean, batchInvStdDev);
}
else
{
if (normalizeRunningStats)
kNormalizeBatchTraining<BlockDimX, BlockDimY, false, true, U><<<gdim, bdim, 0, stream>>>(
(int)vectorSize, (int)spatialSize, (int)batchSize,
epsilon,
x, y, bnScale, bnBias,
runningMean, runningVariance,
batchMean, batchInvStdDev);
else
kNormalizeBatchTraining<BlockDimX, BlockDimY, false, false, U><<<gdim, bdim, 0, stream>>>(
(int)vectorSize, (int)spatialSize, (int)batchSize,
epsilon,
x, y, bnScale, bnBias,
runningMean, runningVariance,
batchMean, batchInvStdDev);
}
}
};
//--------------------------------------------------------------------
// Backpropagation
// BatchNormalizationBackward back-propagates derivatives of batch normalization function
// with respect to the inputs and scale and bias parameters.
// All tensor dimensions and assumptions are the same as in case of forward propagation.
//--------------------------------------------------------------------
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dScale, ElemType* dBias,
const ElemType* savedMean, const ElemType* savedInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
static_assert(((BlockDimY - 1) & BlockDimY) == 0, "BlockDimY must be a power of 2.");
assert((vectorSize % U) == 0);
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.y == 1);
assert(gridDim.z == 1);
// REVIEW alexeyk: first part looks very similar to kComputeBatchMeanAndInvStdDev, any chance to refactor?
int irowSrcBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
if (irowSrcBase >= vectorSize)
return;
assert(irowSrcBase + U <= vectorSize);
ElemType mean[U];
ElemType invStdDev[U];
__shared__ ElemType meanS[BlockDimX * U];
__shared__ ElemType invStdDevS[BlockDimX * U];
// Read mean and inv std dev.
if (threadIdx.y == 0)
{
LoadValues<U>(savedMean + irowSrcBase, mean);
LoadValues<U>(savedInvStdDev + irowSrcBase, invStdDev);
StoreValues<U>(mean, &meanS[threadIdx.x * U]);
StoreValues<U>(invStdDev, &invStdDevS[threadIdx.x * U]);
}
__syncthreads();
if (threadIdx.y != 0)
{
LoadValues<U>(&meanS[threadIdx.x * U], mean);
LoadValues<U>(&invStdDevS[threadIdx.x * U], invStdDev);
}
ElemType ds[U];
ElemType db[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
ds[k] = 0;
db[k] = 0;
}
int icolSrc = threadIdx.y;
size_t startOffs = static_cast<size_t>(icolSrc) * vectorSize + irowSrcBase;
const ElemType* px = x + startOffs;
const ElemType* pdy = dy + startOffs;
size_t stride = static_cast<size_t>(vectorSize) * BlockDimY;
// Stride over all vectors in the batch.
for (; icolSrc < batchSize; icolSrc += BlockDimY, px += stride, pdy += stride)
{
ElemType curX[U];
ElemType curdY[U];
LoadValues<U>(px, curX);
LoadValues<U>(pdy, curdY);
#pragma unroll
for (int k = 0; k < U; k++)
{
ds[k] += pdy[k] * (curX[k] - mean[k]) * invStdDev[k];
db[k] += pdy[k];
}
}
// Final reduction.
__shared__ ElemType dsS[BlockDimY][BlockDimX * U];
__shared__ ElemType dbS[BlockDimY][BlockDimX * U];
StoreValues<U>(ds, &dsS[threadIdx.y][threadIdx.x * U]);
StoreValues<U>(db, &dbS[threadIdx.y][threadIdx.x * U]);
__syncthreads();
// Very simple block reduction. As the block y dim is small (e.g. 16) then the loop
// is executed very few times (e.g. 4) so the performance is good.
// Can be potentially improved by using shuffle instructions (as in kComputeBatchMeanAndInvStdDev).
#pragma unroll
for (int y = BlockDimY / 2; y > 0; y /= 2)
{
if (threadIdx.y < y)
{
#pragma unroll
for (int k = 0; k < U; k++)
{
dsS[threadIdx.y][threadIdx.x * U + k] += dsS[threadIdx.y + y][threadIdx.x * U + k];
dbS[threadIdx.y][threadIdx.x * U + k] += dbS[threadIdx.y + y][threadIdx.x * U + k];
}
__syncthreads();
}
}
// Write results.
if (threadIdx.y == 0)
{
#pragma unroll
for (int k = 0; k < U; k++)
{
dScale[irowSrcBase + k] = dsS[0][threadIdx.x * U + k];
dBias[irowSrcBase + k] = dbS[0][threadIdx.x * U + k];
}
}
}
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.y == 1);
assert(gridDim.z == 1);
assert((spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
int irowBase = blockIdx.x * spatialSize + threadIdx.x * U;
if (irowBase >= vectorSize)
return;
assert(irowBase + U <= vectorSize);
int irowLim = (blockIdx.x + 1) * spatialSize;
ElemType mean;
ElemType invStdDev;
__shared__ ElemType meanS;
__shared__ ElemType invStdDevS;
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
// Read mean and inv std dev.
if (tid == 0)
{
meanS = savedMean[blockIdx.x];
invStdDevS = savedInvStdDev[blockIdx.x];
}
__syncthreads();
if (tid != 0)
{
mean = meanS;
invStdDev = invStdDevS;
}
ElemType ds[U];
ElemType db[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
ds[k] = 0;
db[k] = 0;
}
int icolSrc = threadIdx.y;
size_t startOffs = static_cast<size_t>(icolSrc) * vectorSize + irowBase;
const ElemType* pxBase = x + startOffs;
const ElemType* pdyBase = dy + startOffs;
size_t stride = static_cast<size_t>(vectorSize) * BlockDimY;
// Stride over all vectors in the batch.
for (; icolSrc < batchSize; icolSrc += BlockDimY, pxBase += stride, pdyBase += stride)
{
const ElemType* px = pxBase;
const ElemType* pdy = pdyBase;
// Stride over all values in feature map (W and H dimensions).
for (int irow = irowBase; irow < irowLim; irow += BlockDimX * U, px += BlockDimX * U, pdy += BlockDimX * U)
{
ElemType curX[U];
ElemType curdY[U];
LoadValues<U>(px, curX);
LoadValues<U>(pdy, curdY);
#pragma unroll
for (int k = 0; k < U; k++)
{
ds[k] += pdy[k] * (curX[k] - mean) * invStdDev;
db[k] += pdy[k];
}
}
}
__syncthreads();
using BlockReduce = cub::BlockReduce<ElemType, BlockDimX, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BlockDimY>;
// Note: must use separate temp storages for each reduction.
__shared__ typename BlockReduce::TempStorage tmp1;
ElemType dsRes = BlockReduce(tmp1).Sum(ds);
__shared__ typename BlockReduce::TempStorage tmp2;
ElemType dbRes = BlockReduce(tmp2).Sum(db);
if (tid == 0)
{
dScale[blockIdx.x] = dsRes;
dBias[blockIdx.x] = dbRes;
}
}
template <int U>
struct ComputeScaleAndBiasGradients
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
{
assert((vectorSize % U) == 0);
assert(batchSize >= 1);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
// Create a grid that has uses striding in y-dimension to cover whole minibatch.
auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)));
kComputeScaleAndBiasGradients<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(batchSize), x, dy, dScale, dBias, savedMean, savedInvStdDev);
}
};
template <int U>
struct ComputeSpatialScaleAndBiasGradients
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
{
assert((spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
assert(batchSize >= 1);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
// Create a grid that has uses striding in y-dimension to cover whole minibatch.
auto gdim = dim3(static_cast<unsigned int>(vectorSize / spatialSize));
kComputeSpatialScaleAndBiasGradients<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dScale, dBias, savedMean, savedInvStdDev);
}
};
// mbStatsWeight is the weight with which current MB's stats were used (0 means not at all, locked model).
template <int BlockDimX, int BlockDimY, bool Spatial, int U, typename ElemType>
__global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dx,
const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale, const ElemType* dBias,
const ElemType* savedMean, const ElemType* savedInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.z == 1);
assert((vectorSize % U) == 0);
assert(Spatial || spatialSize == 1);
assert(!Spatial || (spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
int irowBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
if (irowBase >= vectorSize)
return;
assert(irowBase + U <= vectorSize);
ElemType scale[U];
ElemType ds[U];
ElemType db[U];
ElemType mean[U];
ElemType invStdDev[U];
// REVIEW alexeyk: here we're wasting some bandwidth but this might be ok as it's a one-timer.
if (Spatial)
{
#pragma unroll
for (int k = 0; k < U; k++)
{
int imap = (irowBase + k) / spatialSize;
scale[k] = bnScale[imap];
ds[k] = dScale[imap];
db[k] = dBias[imap];
mean[k] = savedMean[imap];
invStdDev[k] = savedInvStdDev[imap];
}
}
else
{
LoadValues<U>(bnScale + irowBase, scale);
LoadValues<U>(dScale + irowBase, ds);
LoadValues<U>(dBias + irowBase, db);
LoadValues<U>(savedMean + irowBase, mean);
LoadValues<U>(savedInvStdDev + irowBase, invStdDev);
}
int icol = blockIdx.y * BlockDimY + threadIdx.y;
size_t startOffs = static_cast<size_t>(icol) * vectorSize + irowBase;
const ElemType* px = x + startOffs;
const ElemType* pdy = dy + startOffs;
ElemType* pdx = dx + startOffs;
size_t stride = static_cast<size_t>(gridDim.y * BlockDimY) * vectorSize;
for (; icol < batchSize; icol += gridDim.y * BlockDimY, px += stride, pdy += stride, pdx += stride)
{
ElemType xCur[U];
ElemType dyCur[U];
ElemType dxCur[U];
LoadValues<U>(px, xCur);
LoadValues<U>(pdy, dyCur);
LoadValues<U>(pdx, dxCur);
// From the BN paper, dL/dxi is a sum of three terms: dL/dxi = t1 + t2 + t3
// The formulas for dBias and dScale happen to occur as subexpressions in this gradient as well.
// Leveraging this, this gradient can be simplified to:
// t1 = scale * dL/dyi * invStdDev
// t2 = mbStatsWeight * (-scale / m) * invStdDev * xHat * dL/dScale
// t3 = mbStatsWeight * (-scale / m) * invStdDev * dL/dBias (for this one note that Sum(xHat) == 0)
// with
// dBias = Reduce(dy)
// dScale = Reduce(dy * xHat)
// Simplifying this a bit more, we get the formula below.
ElemType val[U];
int m = Spatial ? batchSize * spatialSize : batchSize;
#pragma unroll
for (int k = 0; k < U; k++)
{
ElemType xNorm = (xCur[k] - mean[k]) * invStdDev[k]; // xHat
// scale * invStdDev * (
// dL/dyi
// - mbStatsWeight * (xHat * dL/dScale + dL/dBias) / m
// )
val[k] = dxCur[k] // (adding to gradient)
+ (scale[k] * invStdDev[k]) * (
dyCur[k]
- mbStatsWeight * (xNorm * ds[k] + db[k]) / m);
}
StoreValues<U>(val, pdx);
}
}
template <int U>
struct BackpropagateBatchNormGradients
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial, const ElemType* x, const ElemType* dy, ElemType* dx,
const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale,
const ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
{
assert((vectorSize % U) == 0);
assert(batchSize >= 1);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)),
static_cast<unsigned int>(RoundUpToMultiple(batchSize, BlockDimY)));
if (spatial)
{
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, true/*spatial*/, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, savedMean, savedInvStdDev);
}
else
{
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, false/*not spatial*/, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, savedMean, savedInvStdDev);
}
}
};
}}}