https://github.com/Microsoft/CNTK
Tip revision: a05c3c642648373f4ede0956e4286257c3d59a61 authored by liqfu on 24 August 2018, 17:46:51 UTC
CNTK splice allows broadcast. This case is handled in the change. For noop (identity) ops, its inputs and outputs types shall be set according to upstream ops. ToBatch/ToSequence and Unpack Batch/Sequence ops added during model importing need tp be skipped. Model import need to handle ops with multiple outputs.
CNTK splice allows broadcast. This case is handled in the change. For noop (identity) ops, its inputs and outputs types shall be set according to upstream ops. ToBatch/ToSequence and Unpack Batch/Sequence ops added during model importing need tp be skipped. Model import need to handle ops with multiple outputs.
Tip revision: a05c3c6
Convolution.cuh
//
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <math_constants.h>
#include "half.hpp"
namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// The file contains CUDA kernels that are used in reference convolution
// engine. All these kernels look very similar as they use the same
// idea of precomputed maps described in ConvolveGeometry.h
// That is, 'mpRowCol' maps each convolution output to the start of the
// input. 'mpRowIwht', 'mpRowRun' and 'runs' provide maps that allow
// to get indices of the active weight when applying the convolution.
// See ConvolveGeometry.h (MpRowCol, MpRowIwht etc) for more details.
// -----------------------------------------------------------------------
template <typename ElemType>
__global__ void kConvolutionForward(int batchSize, const ElemType* __restrict__ kernel,
const int* mpRowCol, const int* mpRowIwht,
const int* mpRowRun, const int* __restrict__ runs,
const ElemType* __restrict__ src, int srcVecSize,
ElemType* dst, int dstVecSize)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= dstVecSize)
return;
src += blockIdx.y * srcVecSize;
dst += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
int ivBase = mpRowIwht[row];
assert(0 <= colBase && colBase < srcVecSize);
comp_t sum = 0;
int i0 = mpRowRun[row];
int skip = runs[i0++];
int size = runs[i0++];
int imask = i0 + size;
for (int i = 0; i < size; i++)
{
if (runs[imask + i] == 0)
continue;
int dcol = runs[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < srcVecSize);
sum += (comp_t)kernel[ivBase + skip + i] * (comp_t)src[colBase + dcol];
}
dst[row] = sum;
src += blockDim.y * srcVecSize;
dst += blockDim.y * dstVecSize;
}
}
template <typename ElemType>
__global__ void kConvolutionBackwardData(int batchSize, const ElemType* __restrict__ kernel,
const int* mpRowCol, const int* mpRowIwht,
const int* mpRowRun, const int* __restrict__ runs,
const ElemType* __restrict__ srcGrad, int srcVecSize,
ElemType* grad, int dstVecSize)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= srcVecSize)
return;
srcGrad += blockIdx.y * srcVecSize;
grad += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
int ivBase = mpRowIwht[row];
assert(0 <= colBase && colBase < dstVecSize);
comp_t g = srcGrad[row];
int i0 = mpRowRun[row];
int skip = runs[i0++];
int size = runs[i0++];
int imask = i0 + size;
for (int i = 0; i < size; i++)
{
if (runs[imask + i] == 0)
continue;
int dcol = runs[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
atomicAdd(&grad[colBase + dcol], (ElemType)((comp_t)g * (comp_t)kernel[ivBase + skip + i]));
}
srcGrad += blockDim.y * srcVecSize;
grad += blockDim.y * dstVecSize;
}
}
template <typename ElemType>
__global__ void kConvolutionBackwardKernel(int batchSize, int inVecSize, int outVecSize,
const ElemType* __restrict__ in,
const int* mpRowCol, const int* mpRowIwht,
const int* mpRowRun, const int* __restrict__ runs,
const ElemType* __restrict__ srcGrad,
ElemType* kernelGrad)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= outVecSize)
return;
in += blockIdx.y * inVecSize;
srcGrad += blockIdx.y * outVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
int ivBase = mpRowIwht[row];
assert(0 <= colBase && colBase < inVecSize);
comp_t g = srcGrad[row];
int i0 = mpRowRun[row];
int skip = runs[i0++];
int size = runs[i0++];
int imask = i0 + size;
for (int i = 0; i < size; i++)
{
if (runs[imask + i] == 0)
continue;
int dcol = runs[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < inVecSize);
atomicAdd(&kernelGrad[ivBase + skip + i], (ElemType)((comp_t)g * (comp_t)in[colBase + dcol]));
}
in += blockDim.y * inVecSize;
srcGrad += blockDim.y * outVecSize;
}
}
template <typename ElemType>
__global__ void kMaxPoolingForward(int batchSize, const int* mpRowCol, const int* mpRowIndices, const int* indices,
const ElemType* __restrict__ src, int srcVecSize,
ElemType* dst, int dstVecSize)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= dstVecSize)
return;
src += blockIdx.y * srcVecSize;
dst += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
assert(0 <= colBase && colBase < srcVecSize);
int i0 = mpRowIndices[row];
int size = indices[i0++];
ElemType res = src[colBase + indices[i0]];
for (int i = 1; i < size; i++)
{
int dcol = indices[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < srcVecSize);
res = max(res, src[colBase + dcol]);
}
dst[row] = res;
src += blockDim.y * srcVecSize;
dst += blockDim.y * dstVecSize;
}
}
template <typename ElemType>
__global__ void kMaxPoolingBackward(int batchSize, const ElemType* out, const ElemType* in,
const int* mpRowCol, const int* mpRowIndices, const int* indices,
const ElemType* __restrict__ srcGrad, int srcVecSize,
ElemType* grad, int dstVecSize)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= srcVecSize)
return;
in += blockIdx.y * dstVecSize;
out += blockIdx.y * srcVecSize;
srcGrad += blockIdx.y * srcVecSize;
grad += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
assert(0 <= colBase && colBase < dstVecSize);
int i0 = mpRowIndices[row];
int size = indices[i0++];
assert(size > 0);
ElemType g = srcGrad[row];
ElemType m = out[row];
for (int i = 0; i < size; i++)
{
int dcol = indices[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
if (in[colBase + dcol] >= m)
{
atomicAdd(&grad[colBase + dcol], g);
break;
}
}
in += blockDim.y * dstVecSize;
out += blockDim.y * srcVecSize;
srcGrad += blockDim.y * srcVecSize;
grad += blockDim.y * dstVecSize;
}
}
__device__ float round_(float a)
{
return roundf(a);
}
__device__ double round_(double a)
{
return round(a);
}
// For each image, for each ROI, this function treats that ROI as an image
// and does max pooling so that it has output size pooledHeight x pooledWidth.
// The kernel operates on one location in the output tensor, computes which ROI
// and image should populate that location, computes the subset of the image
// corresponding to the ROI and which pixels in that subset should go into the
// output location, then takes the max value over that window.
// src: Images [W x H x C x N]
// roiData: ROIs [4 x numROIs x N],
// dst: Pooled ROIs [PW x PH x C x numROIs x N]
// argmax: max positions [PW x PH x C x numROIs x N]
// spatialScale ratio of input feature map to the original image.
// where PW = Pooled Width, PH = Pooled Height, C = Channels, N = Batch Size
template <typename ElemType>
__global__ void kMaxROIPoolingForward(const int totalIterations,
const int numROIs, const int numImg,
const int channels, const int width, const int height,
const int pooledWidth, const int pooledHeight, const ElemType* src,
const ElemType* roiData, ElemType* dst, ElemType* argmax, double spatialScale)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
// index loops over all totalRois*c*pooledHeight*pooledWidth output locations.
for (int index = blockIdx.x * blockDim.x + threadIdx.x;
index < (totalIterations); index += blockDim.x * gridDim.x)
{
// output is [W x H x C x N]
// n is the global ROI index (the new batch index)
int pw = index % pooledWidth;
int ph = (index / pooledWidth) % pooledHeight;
int c = (index / pooledWidth / pooledHeight) % channels;
int n = index / pooledWidth / pooledHeight / channels;
// each ROI is 4 elements: (x, y, w, h)
roiData += n * 4;
// roi data is relative to original image size
int roiStartW = (int)(round_(roiData[0] * spatialScale));
int roiStartH = (int)(round_(roiData[1] * spatialScale));
int roiEndW = (int)(round_(roiData[2] * spatialScale));
int roiEndH = (int)(round_(roiData[3] * spatialScale));
int roiWidth = max(roiEndW - roiStartW + 1, (int)1);
int roiHeight = max(roiEndH - roiStartH + 1, (int)1);
comp_t winH = (comp_t)roiHeight / (comp_t)pooledHeight;
comp_t winW = (comp_t)roiWidth / (comp_t)pooledWidth;
// compute window for this output location.
int hstart = (int)(ph * winH);
int wstart = (int)(pw * winW);
int hend = (int)(ceilf((ph + 1) * winH));
int wend = (int)(ceilf((pw + 1) * winW));
// Add ROI offsets and clip to input boundaries
hstart = min(max(hstart + roiStartH, 0), height);
hend = min(max(hend + roiStartH, 0), height);
wstart = min(max(wstart + roiStartW, 0), width);
wend = min(max(wend + roiStartW, 0), width);
bool isempty = (hend <= hstart) || (wend <= wstart);
// Define an empty pooling region to be zero
comp_t maxval = isempty ? (comp_t)0 : (comp_t)-CUDART_INF_F;
int maxidx = -1;
int imgIdx = n / numROIs;
src += (imgIdx * channels + c) * height * width;
for (int h = hstart; h < hend; h++)
{
for (int w = wstart; w < wend; w++)
{
int srcIndex = w + h * width;
if (src[srcIndex] > maxval)
{
maxval = src[srcIndex];
maxidx = srcIndex;
}
}
}
dst[index] = maxval;
argmax[index] = maxidx;
}
}
// The kernel operates on one location in the input to the ROIPoolingNode (one image location).
// It loops over the ROIs corresponding to that image, seeing which ones could contain the location
// in their output. For each ROI, it checks the argmax data to see if that ROI indeed chose
// this pixel location as the maximum. If so, it increments the gradient term for the input location.
template <typename ElemType>
__global__ void kMaxROIPoolingBackward(const int totalIterations,
const int numROIs, const int numImg,
const int channels, const int width, const int height,
const int pooledWidth, const int pooledHeight, const ElemType* pooledGrad,
const ElemType* roiData, ElemType* grad, const ElemType* argmax, double spatialScale)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
// index loops over all input locations (locations in the original input tensor).
for (int index = blockIdx.x * blockDim.x + threadIdx.x;
index < (totalIterations); index += blockDim.x * gridDim.x)
{
// images are laid out [W x H x C x N]
// (n, c, h, w) is an element in the input image
int w = index % width;
int h = (index / width) % height;
int c = (index / width / height) % channels;
int n = index / width / height / channels;
// compute range of ROIs corresponding to this image:
int roiMin = n * numROIs;
int roiMax = (n + 1) * numROIs;
comp_t gradient = 0;
for (int roiN = roiMin; roiN < roiMax; roiN++)
{
// each ROI is 4 elements: (x, y, w, h)
const ElemType* roiOffset = roiData + roiN * 4;
// ROI data is absolute pixel value in the original image size
int roiStartW = (int)(round_(roiOffset[0] * spatialScale));
int roiStartH = (int)(round_(roiOffset[1] * spatialScale));
int roiEndW = (int)(round_(roiOffset[2] * spatialScale));
int roiEndH = (int)(round_(roiOffset[3] * spatialScale));
int roiWidth = max(roiEndW - roiStartW + 1, (int)1);
int roiHeight = max(roiEndH - roiStartH + 1, (int)1);
// skip this ROI if it doesn't contain our input location.
const bool inROI = (w >= roiStartW && w < roiStartW + roiWidth &&
h >= roiStartH && h < roiStartH + roiHeight);
if (!inROI)
continue;
comp_t winH = (comp_t)roiHeight / (comp_t)pooledHeight;
comp_t winW = (comp_t)roiWidth / (comp_t)pooledWidth;
// what pooled nodes in the output for this ROI could have pooled this input location?
// we use int here since the computation can yield a negative result
int phstart = (int)((float)(h - roiStartH) / winH);
int pwstart = (int)((float)(w - roiStartW) / winW);
int phend = (int)(ceilf((float)(h - roiStartH + 1) / winH));
int pwend = (int)(ceilf((float)(w - roiStartW + 1) / winW));
phstart = min(max(phstart, 0), pooledHeight);
pwstart = min(max(pwstart, 0), pooledWidth);
phend = min(max(phend, 0), pooledHeight);
pwend = min(max(pwend, 0), pooledWidth);
// go right up to this channel of this ROI.
int offset = (roiN * channels + c) * pooledWidth * pooledHeight;
const ElemType* offsetPoolGrad = pooledGrad + offset;
const ElemType* offsetArgmax = argmax + offset;
for (int ph = phstart; ph < phend; ph++)
{
for (int pw = pwstart; pw < pwend; pw++)
{
if ((int)offsetArgmax[ph * pooledWidth + pw] == (h * width + w))
{
gradient += (comp_t)offsetPoolGrad[ph * pooledWidth + pw];
}
}
}
}
atomicAdd(&grad[index], (ElemType)gradient);
}
}
template <typename ElemType>
__global__ void kMaxUnpooling(int batchSize, const int* mpRowCol, const int* mpRowIndices, const int* indices,
const ElemType* __restrict__ src, const ElemType* poolIn, int srcVecSize,
ElemType* dst, int dstVecSize)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= srcVecSize)
return;
src += blockIdx.y * srcVecSize;
poolIn += blockIdx.y * dstVecSize;
dst += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
assert(0 <= colBase && colBase < dstVecSize);
int i0 = mpRowIndices[row];
int size = indices[i0++];
ElemType curMax = poolIn[colBase + indices[i0]];
ElemType prevMax = curMax;
int imax = 0;
for (int i = 1; i < size; i++)
{
int dcol = indices[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
curMax = max(curMax, poolIn[colBase + dcol]);
if (curMax > prevMax)
{
prevMax = curMax;
imax = i;
}
}
int dcol = indices[i0 + imax];
assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
dst[colBase + dcol] = src[row];
src += blockIdx.y * srcVecSize;
poolIn += blockIdx.y * dstVecSize;
dst += blockIdx.y * dstVecSize;
}
}
template <typename ElemType>
__global__ void kAveragePoolingForward(int batchSize, const int* mpRowCol, const int* mpRowIndices, const int* indices,
const ElemType* __restrict__ src, int srcVecSize,
ElemType* dst, int dstVecSize)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= dstVecSize)
return;
src += blockIdx.y * srcVecSize;
dst += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
assert(0 <= colBase && colBase < srcVecSize);
int i0 = mpRowIndices[row];
int size = indices[i0++];
comp_t sum = 0;
for (int i = 0; i < size; i++)
{
int dcol = indices[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < srcVecSize);
sum += (comp_t)src[colBase + dcol];
}
dst[row] = sum / (comp_t)size;
src += blockDim.y * srcVecSize;
dst += blockDim.y * dstVecSize;
}
}
template <typename ElemType>
__global__ void kAveragePoolingBackward(int batchSize, const int* mpRowCol, const int* mpRowIndices, const int* indices,
const ElemType* __restrict__ srcGrad, int srcVecSize,
ElemType* grad, int dstVecSize)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= srcVecSize)
return;
srcGrad += blockIdx.y * srcVecSize;
grad += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
assert(0 <= colBase && colBase < dstVecSize);
int i0 = mpRowIndices[row];
int size = indices[i0++];
assert(size > 0);
ElemType g = srcGrad[row] / size;
for (int i = 0; i < size; i++)
{
int dcol = indices[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
atomicAdd(&grad[colBase + dcol], g);
}
srcGrad += blockDim.y * srcVecSize;
grad += blockDim.y * dstVecSize;
}
}
}}}