// Convolution.cuh
//
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <math_constants.h>
#include "half.hpp"
namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// This file contains the CUDA kernels used by the reference convolution
// engine. All of these kernels look very similar, as they rely on the same
// idea of precomputed maps described in ConvolveGeometry.h.
// That is, 'mpRowCol' maps each convolution output to the start of its
// input region, while 'mpRowIwht', 'mpRowRun' and 'runs' provide maps that
// allow retrieving the indices of the active weights when applying the
// convolution. See ConvolveGeometry.h (MpRowCol, MpRowIwht etc.) for details.
// -----------------------------------------------------------------------
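// For reference, the layout of 'runs' as consumed by the kernels below, for
// one output row (with i0 = mpRowRun[row]):
//   runs[i0]                                 - skip: offset of the first active weight
//   runs[i0 + 1]                             - size: number of entries in the run
//   runs[i0 + 2 .. i0 + 1 + size]            - input column deltas relative to mpRowCol[row]
//   runs[i0 + 2 + size .. i0 + 1 + 2 * size] - mask: zero entries are skipped (e.g. padding)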
template <typename ElemType>
__global__ void kConvolutionForward(int batchSize, const ElemType* __restrict__ kernel,
const int* mpRowCol, const int* mpRowIwht,
const int* mpRowRun, const int* __restrict__ runs,
const ElemType* __restrict__ src, int srcVecSize,
ElemType* dst, int dstVecSize)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= dstVecSize)
return;
src += blockIdx.y * srcVecSize;
dst += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
int ivBase = mpRowIwht[row];
assert(0 <= colBase && colBase < srcVecSize);
comp_t sum = 0;
int i0 = mpRowRun[row];
int skip = runs[i0++];
int size = runs[i0++];
int imask = i0 + size;
for (int i = 0; i < size; i++)
{
if (runs[imask + i] == 0)
continue;
int dcol = runs[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < srcVecSize);
sum += (comp_t)kernel[ivBase + skip + i] * (comp_t)src[colBase + dcol];
}
dst[row] = sum;
        src += gridDim.y * srcVecSize;
        dst += gridDim.y * dstVecSize;
}
}
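// A minimal host-side launch sketch for kConvolutionForward above. The sizes
// are illustrative, not the engine's actual configuration: one thread per
// output element along x, samples distributed over gridDim.y, with the
// grid-stride loop covering batches larger than the grid.
//
//   const int BlockSize = 128;
//   dim3 block(BlockSize);
//   dim3 grid((dstVecSize + BlockSize - 1) / BlockSize, std::min(batchSize, 65535));
//   kConvolutionForward<<<grid, block>>>(batchSize, kernel, mpRowCol, mpRowIwht,
//                                        mpRowRun, runs, src, srcVecSize,
//                                        dst, dstVecSize);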
template <typename ElemType>
__global__ void kConvolutionBackwardData(int batchSize, const ElemType* __restrict__ kernel,
const int* mpRowCol, const int* mpRowIwht,
const int* mpRowRun, const int* __restrict__ runs,
const ElemType* __restrict__ srcGrad, int srcVecSize,
ElemType* grad, int dstVecSize)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= srcVecSize)
return;
srcGrad += blockIdx.y * srcVecSize;
grad += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
int ivBase = mpRowIwht[row];
assert(0 <= colBase && colBase < dstVecSize);
comp_t g = srcGrad[row];
int i0 = mpRowRun[row];
int skip = runs[i0++];
int size = runs[i0++];
int imask = i0 + size;
for (int i = 0; i < size; i++)
{
if (runs[imask + i] == 0)
continue;
int dcol = runs[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
atomicAdd(&grad[colBase + dcol], (ElemType)((comp_t)g * (comp_t)kernel[ivBase + skip + i]));
}
        srcGrad += gridDim.y * srcVecSize;
        grad += gridDim.y * dstVecSize;
}
}
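// Computes the gradient with respect to the kernel weights. Like
// kConvolutionBackwardData above, it accumulates into its output with
// atomicAdd, so kernelGrad must already hold valid values (typically zeros)
// before launch.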
template <typename ElemType>
__global__ void kConvolutionBackwardKernel(int batchSize, int inVecSize, int outVecSize,
const ElemType* __restrict__ in,
const int* mpRowCol, const int* mpRowIwht,
const int* mpRowRun, const int* __restrict__ runs,
const ElemType* __restrict__ srcGrad,
ElemType* kernelGrad)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= outVecSize)
return;
in += blockIdx.y * inVecSize;
srcGrad += blockIdx.y * outVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
int ivBase = mpRowIwht[row];
assert(0 <= colBase && colBase < inVecSize);
comp_t g = srcGrad[row];
int i0 = mpRowRun[row];
int skip = runs[i0++];
int size = runs[i0++];
int imask = i0 + size;
for (int i = 0; i < size; i++)
{
if (runs[imask + i] == 0)
continue;
int dcol = runs[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < inVecSize);
atomicAdd(&kernelGrad[ivBase + skip + i], (ElemType)((comp_t)g * (comp_t)in[colBase + dcol]));
}
        in += gridDim.y * inVecSize;
        srcGrad += gridDim.y * outVecSize;
}
}
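// Max-pooling forward pass. 'mpRowIndices' points into 'indices', where, for
// each output row, the first entry is the window size and the following
// entries are the input column deltas relative to mpRowCol[row].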
template <typename ElemType>
__global__ void kMaxPoolingForward(int batchSize, const int* mpRowCol, const int* mpRowIndices, const int* indices,
const ElemType* __restrict__ src, int srcVecSize,
ElemType* dst, int dstVecSize)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= dstVecSize)
return;
src += blockIdx.y * srcVecSize;
dst += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
assert(0 <= colBase && colBase < srcVecSize);
int i0 = mpRowIndices[row];
int size = indices[i0++];
ElemType res = src[colBase + indices[i0]];
for (int i = 1; i < size; i++)
{
int dcol = indices[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < srcVecSize);
res = max(res, src[colBase + dcol]);
}
dst[row] = res;
        src += gridDim.y * srcVecSize;
        dst += gridDim.y * dstVecSize;
}
}
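// Max-pooling backward pass: routes the gradient to the first input in the
// window whose value equals the pooled maximum ('out'), i.e. the element the
// forward pass selected.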
template <typename ElemType>
__global__ void kMaxPoolingBackward(int batchSize, const ElemType* out, const ElemType* in,
const int* mpRowCol, const int* mpRowIndices, const int* indices,
const ElemType* __restrict__ srcGrad, int srcVecSize,
ElemType* grad, int dstVecSize)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= srcVecSize)
return;
in += blockIdx.y * dstVecSize;
out += blockIdx.y * srcVecSize;
srcGrad += blockIdx.y * srcVecSize;
grad += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
assert(0 <= colBase && colBase < dstVecSize);
int i0 = mpRowIndices[row];
int size = indices[i0++];
assert(size > 0);
ElemType g = srcGrad[row];
ElemType m = out[row];
for (int i = 0; i < size; i++)
{
int dcol = indices[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
if (in[colBase + dcol] >= m)
{
atomicAdd(&grad[colBase + dcol], g);
break;
}
}
        in += gridDim.y * dstVecSize;
        out += gridDim.y * srcVecSize;
        srcGrad += gridDim.y * srcVecSize;
        grad += gridDim.y * dstVecSize;
}
}
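// Overloads that dispatch to the correct CUDA rounding intrinsic for each
// precision; used by the ROI pooling kernels below.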
__device__ float round_(float a)
{
return roundf(a);
}
__device__ double round_(double a)
{
return round(a);
}
// For each image, for each ROI, this function treats that ROI as an image
// and does max pooling so that it has output size pooledHeight x pooledWidth.
// The kernel operates on one location in the output tensor, computes which ROI
// and image should populate that location, computes the subset of the image
// corresponding to the ROI and which pixels in that subset should go into the
// output location, then takes the max value over that window.
// src: Images [W x H x C x N]
// roiData: ROIs [4 x numROIs x N],
// dst: Pooled ROIs [PW x PH x C x numROIs x N]
// argmax: max positions [PW x PH x C x numROIs x N]
// spatialScale: the ratio of the input feature-map size to the original image size,
// where PW = Pooled Width, PH = Pooled Height, C = Channels, N = Batch Size
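// Worked example (illustrative numbers): with spatialScale = 1/16, an ROI of
// (x1, y1, x2, y2) = (0, 0, 111, 79) maps to feature-map cells (0, 0)..(7, 5),
// i.e. roiWidth = 8, roiHeight = 6. With pooledWidth = pooledHeight = 2, each
// pooling window is 4 x 3, so output bin (0, 0) takes the max over input
// rows 0..2 and columns 0..3 of that ROI.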
template <typename ElemType>
__global__ void kMaxROIPoolingForward(const int totalIterations,
const int numROIs, const int numImg,
const int channels, const int width, const int height,
const int pooledWidth, const int pooledHeight, const ElemType* src,
const ElemType* roiData, ElemType* dst, ElemType* argmax, double spatialScale)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
    // 'index' loops over all numROIs * numImg * channels * pooledHeight * pooledWidth output locations.
for (int index = blockIdx.x * blockDim.x + threadIdx.x;
index < (totalIterations); index += blockDim.x * gridDim.x)
{
        // output is [PW x PH x C x numROIs x N]
        // n is the global ROI index (the new batch index)
int pw = index % pooledWidth;
int ph = (index / pooledWidth) % pooledHeight;
int c = (index / pooledWidth / pooledHeight) % channels;
int n = index / pooledWidth / pooledHeight / channels;
        // each ROI is 4 elements: (x1, y1, x2, y2)
        // Use a local offset pointer: 'roiData' must stay unmodified across
        // iterations of the grid-stride 'index' loop.
        const ElemType* roiOffset = roiData + n * 4;
        // ROI coordinates are absolute pixel positions in the original image;
        // spatialScale converts them to feature-map coordinates.
        int roiStartW = (int)(round_(roiOffset[0] * spatialScale));
        int roiStartH = (int)(round_(roiOffset[1] * spatialScale));
        int roiEndW = (int)(round_(roiOffset[2] * spatialScale));
        int roiEndH = (int)(round_(roiOffset[3] * spatialScale));
int roiWidth = max(roiEndW - roiStartW + 1, (int)1);
int roiHeight = max(roiEndH - roiStartH + 1, (int)1);
comp_t winH = (comp_t)roiHeight / (comp_t)pooledHeight;
comp_t winW = (comp_t)roiWidth / (comp_t)pooledWidth;
// compute window for this output location.
int hstart = (int)(ph * winH);
int wstart = (int)(pw * winW);
int hend = (int)(ceilf((ph + 1) * winH));
int wend = (int)(ceilf((pw + 1) * winW));
// Add ROI offsets and clip to input boundaries
hstart = min(max(hstart + roiStartH, 0), height);
hend = min(max(hend + roiStartH, 0), height);
wstart = min(max(wstart + roiStartW, 0), width);
wend = min(max(wend + roiStartW, 0), width);
bool isempty = (hend <= hstart) || (wend <= wstart);
// Define an empty pooling region to be zero
comp_t maxval = isempty ? (comp_t)0 : (comp_t)-CUDART_INF_F;
int maxidx = -1;
        int imgIdx = n / numROIs;
        // Likewise, keep 'src' unmodified and index through a local pointer.
        const ElemType* srcOffset = src + (imgIdx * channels + c) * height * width;
        for (int h = hstart; h < hend; h++)
        {
            for (int w = wstart; w < wend; w++)
            {
                int srcIndex = w + h * width;
                if (srcOffset[srcIndex] > maxval)
                {
                    maxval = srcOffset[srcIndex];
                    maxidx = srcIndex;
                }
            }
        }
dst[index] = maxval;
argmax[index] = maxidx;
}
}
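// A minimal host-side launch sketch for kMaxROIPoolingForward above (sizes
// illustrative): totalIterations is the number of output elements, and the
// grid-stride loop makes the result independent of the grid size chosen.
//
//   int totalIterations = pooledWidth * pooledHeight * channels * numROIs * numImg;
//   const int BlockSize = 256;
//   int blocks = (totalIterations + BlockSize - 1) / BlockSize;
//   kMaxROIPoolingForward<<<blocks, BlockSize>>>(totalIterations, numROIs, numImg,
//       channels, width, height, pooledWidth, pooledHeight, src, roiData,
//       dst, argmax, spatialScale);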
// The kernel operates on one location in the input to the ROIPoolingNode (one image location).
// It loops over the ROIs corresponding to that image, seeing which ones could contain the location
// in their output. For each ROI, it checks the argmax data to see if that ROI indeed chose
// this pixel location as the maximum. If so, it increments the gradient term for the input location.
template <typename ElemType>
__global__ void kMaxROIPoolingBackward(const int totalIterations,
const int numROIs, const int numImg,
const int channels, const int width, const int height,
const int pooledWidth, const int pooledHeight, const ElemType* pooledGrad,
const ElemType* roiData, ElemType* grad, const ElemType* argmax, double spatialScale)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
// index loops over all input locations (locations in the original input tensor).
for (int index = blockIdx.x * blockDim.x + threadIdx.x;
index < (totalIterations); index += blockDim.x * gridDim.x)
{
// images are laid out [W x H x C x N]
// (n, c, h, w) is an element in the input image
int w = index % width;
int h = (index / width) % height;
int c = (index / width / height) % channels;
int n = index / width / height / channels;
// compute range of ROIs corresponding to this image:
int roiMin = n * numROIs;
int roiMax = (n + 1) * numROIs;
comp_t gradient = 0;
for (int roiN = roiMin; roiN < roiMax; roiN++)
{
            // each ROI is 4 elements: (x1, y1, x2, y2)
            const ElemType* roiOffset = roiData + roiN * 4;
            // ROI coordinates are absolute pixel positions in the original image;
            // spatialScale converts them to feature-map coordinates.
int roiStartW = (int)(round_(roiOffset[0] * spatialScale));
int roiStartH = (int)(round_(roiOffset[1] * spatialScale));
int roiEndW = (int)(round_(roiOffset[2] * spatialScale));
int roiEndH = (int)(round_(roiOffset[3] * spatialScale));
int roiWidth = max(roiEndW - roiStartW + 1, (int)1);
int roiHeight = max(roiEndH - roiStartH + 1, (int)1);
// skip this ROI if it doesn't contain our input location.
const bool inROI = (w >= roiStartW && w < roiStartW + roiWidth &&
h >= roiStartH && h < roiStartH + roiHeight);
if (!inROI)
continue;
comp_t winH = (comp_t)roiHeight / (comp_t)pooledHeight;
comp_t winW = (comp_t)roiWidth / (comp_t)pooledWidth;
// what pooled nodes in the output for this ROI could have pooled this input location?
// we use int here since the computation can yield a negative result
int phstart = (int)((float)(h - roiStartH) / winH);
int pwstart = (int)((float)(w - roiStartW) / winW);
int phend = (int)(ceilf((float)(h - roiStartH + 1) / winH));
int pwend = (int)(ceilf((float)(w - roiStartW + 1) / winW));
phstart = min(max(phstart, 0), pooledHeight);
pwstart = min(max(pwstart, 0), pooledWidth);
phend = min(max(phend, 0), pooledHeight);
pwend = min(max(pwend, 0), pooledWidth);
// go right up to this channel of this ROI.
int offset = (roiN * channels + c) * pooledWidth * pooledHeight;
const ElemType* offsetPoolGrad = pooledGrad + offset;
const ElemType* offsetArgmax = argmax + offset;
for (int ph = phstart; ph < phend; ph++)
{
for (int pw = pwstart; pw < pwend; pw++)
{
if ((int)offsetArgmax[ph * pooledWidth + pw] == (h * width + w))
{
gradient += (comp_t)offsetPoolGrad[ph * pooledWidth + pw];
}
}
}
}
atomicAdd(&grad[index], (ElemType)gradient);
}
}
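// Max-unpooling: recomputes, from 'poolIn', the argmax position of each
// pooling window and writes the corresponding value of 'src' back to that
// position in 'dst'; all other positions in 'dst' are left untouched.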
template <typename ElemType>
__global__ void kMaxUnpooling(int batchSize, const int* mpRowCol, const int* mpRowIndices, const int* indices,
const ElemType* __restrict__ src, const ElemType* poolIn, int srcVecSize,
ElemType* dst, int dstVecSize)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= srcVecSize)
return;
src += blockIdx.y * srcVecSize;
poolIn += blockIdx.y * dstVecSize;
dst += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
assert(0 <= colBase && colBase < dstVecSize);
int i0 = mpRowIndices[row];
int size = indices[i0++];
ElemType curMax = poolIn[colBase + indices[i0]];
ElemType prevMax = curMax;
int imax = 0;
for (int i = 1; i < size; i++)
{
int dcol = indices[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
curMax = max(curMax, poolIn[colBase + dcol]);
if (curMax > prevMax)
{
prevMax = curMax;
imax = i;
}
}
int dcol = indices[i0 + imax];
assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
dst[colBase + dcol] = src[row];
        src += gridDim.y * srcVecSize;
        poolIn += gridDim.y * dstVecSize;
        dst += gridDim.y * dstVecSize;
}
}
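// Average-pooling forward pass: averages the inputs of each pooling window,
// using the same 'indices' layout as max pooling (window size followed by
// column deltas).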
template <typename ElemType>
__global__ void kAveragePoolingForward(int batchSize, const int* mpRowCol, const int* mpRowIndices, const int* indices,
const ElemType* __restrict__ src, int srcVecSize,
ElemType* dst, int dstVecSize)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= dstVecSize)
return;
src += blockIdx.y * srcVecSize;
dst += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
assert(0 <= colBase && colBase < srcVecSize);
int i0 = mpRowIndices[row];
int size = indices[i0++];
comp_t sum = 0;
for (int i = 0; i < size; i++)
{
int dcol = indices[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < srcVecSize);
sum += (comp_t)src[colBase + dcol];
}
dst[row] = sum / (comp_t)size;
        src += gridDim.y * srcVecSize;
        dst += gridDim.y * dstVecSize;
}
}
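// Average-pooling backward pass: distributes srcGrad[row] / size uniformly
// over the window via atomicAdd, so 'grad' must already hold valid values
// (typically zeros) before launch.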
template <typename ElemType>
__global__ void kAveragePoolingBackward(int batchSize, const int* mpRowCol, const int* mpRowIndices, const int* indices,
const ElemType* __restrict__ srcGrad, int srcVecSize,
ElemType* grad, int dstVecSize)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= srcVecSize)
return;
srcGrad += blockIdx.y * srcVecSize;
grad += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
assert(0 <= colBase && colBase < dstVecSize);
int i0 = mpRowIndices[row];
int size = indices[i0++];
assert(size > 0);
ElemType g = srcGrad[row] / size;
for (int i = 0; i < size; i++)
{
int dcol = indices[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
atomicAdd(&grad[colBase + dcol], g);
}
        srcGrad += gridDim.y * srcVecSize;
        grad += gridDim.y * dstVecSize;
}
}
}}}