Content - 95596948bbe6b9525622180bbc87c2995f724fa2 - a9429f4/Source/Math/GPUMatrixCUDAKernels.cuh

visit type:
Tip revision: a8e76a0261bd86abc2717bed593b592b93b53198 authored by Vadim Mazalov on 30 October 2018, 03:28:55 UTC
Enable NCCL Group start/end
Tip revision: a8e76a0
GPUMatrixCUDAKernels.cuh
//
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#pragma once

#include "BestGpu.h"

#ifndef CPUONLY

#pragma push_macro("TENSOR_OPS_DECL")
#define TENSOR_OPS_DECL __device__ __host__
#include "CommonMatrix.h"
#include "GPUMatrix.h"
#include "TensorOps.h" // for exp_() etc.
#include "device_functions.h"
#include <cuda_runtime.h>
#include <assert.h>
#include <float.h>
#include "half.hpp"
#include "fpgeneric.h"
#pragma pop_macro("TENSOR_OPS_DECL")

// We would like to use 64-bit integer to support large matrices. However, CUDA seems to support only 32-bit integer
// For now, use int32_t to ensure that both Linux and Windows see this as 32 bit integer type.

#ifndef CUDA_LONG
#define CUDA_LONG int32_t
#endif

#define IDX2C(i, j, ld) (((j) * (ld)) + (i)) // 0 based indexing

// On older GPUs, CUDA atomicAdd() only exists for 'float'. This is the 'double' version.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
static __inline__ __device__ double atomicAdd(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*) address;
    unsigned long long int old = *address_as_ull, assumed;
    do
    {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
}
#endif
// overload atomicAdd for half
static __inline__ __device__ half atomicAdd(half* address, half val)
{
    assert(false); // TODO: implement later
    return val;
}


// TODO: replace this with TensorOps.h LogAdd(). It differs in using ElemType throughout, while this one seems to use 'double' versions of exp() and log().
// The 'k' in the name is to avoid naming conflicts with various versions of logadd() that are defined throughout the codebase.
template <class ElemType>
static inline __device__ __host__ ElemType logaddk(ElemType x, ElemType y)
{
    ElemType temp, diff, z;

    if (x < y)
    {
        temp = x;
        x = y;
        y = temp;
    }
    diff = y - x;
    if (diff < MINLOGEXP)
    {
        return (x < LSMALL) ? LZERO : x;
    }
    else
    {
        z = exp(diff);
        return x + log(1.0 + z);
    }
}

namespace Microsoft { namespace MSR { namespace CNTK {

// ---------------------------------------------------------------------------
// GridDim -- helper to choose the CUDA grid dimensions
// ---------------------------------------------------------------------------

template <class INT, class INT2>
static INT CeilDiv(INT a, INT2 b) // ceil(a/b)
{
    return (INT)(((size_t) a + (size_t) b - 1) / (size_t) b); // these size_t casts are necessary since b may be INT_MAX (for maxGridSize[])
}

struct GridDim
{
    enum : CUDA_LONG
    {
        maxThreadsPerBlock = 1024, // use this many threads per block
        maxWarpsPerBlock = 32,     // use this many warps per block. This means 1024 threads for warpSize=32
    };

    // use these for launching
    //   GridDim grid(NN);
    //   kernel<<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, ...>>>(...)
    int m_blocksPerGrid, m_threadsPerBlock; // (these may in the future be extended to multi-dimensional ones)
    CUDA_LONG m_N;

    GridDim(CUDA_LONG N) // linear grid
    {
        m_N = N;
        if (N == 0) // CUDA will fail to launch with 0 blocks
            N = 1;

        // get device information
        const auto& props = GetDeviceProps();
        CUDA_LONG numProcs = props.multiProcessorCount;
        CUDA_LONG warpSize = props.warpSize;

        // distribute warps evenly over processors
        CUDA_LONG warpsPerProc = CeilDiv(N, numProcs * warpSize);

        // if too many warps per block then reduce #warps
        // This limits the number of threads to 512.
        if (warpsPerProc > maxWarpsPerBlock)
        {
            CUDA_LONG overBy = CeilDiv(warpsPerProc, maxWarpsPerBlock); // we are over by this factor
            warpsPerProc = CeilDiv(warpsPerProc, overBy);
        }

        // put it back together
        m_threadsPerBlock = warpsPerProc * warpSize;        // =a multiple of 32 that is as close to 1024 as makes sense given NN
        m_blocksPerGrid = CeilDiv(N, m_threadsPerBlock);
        if (m_blocksPerGrid == 1)
            m_threadsPerBlock = N; // don't launch more than necessary  --TODO: Does this make a difference at all?
        assert(m_blocksPerGrid * m_threadsPerBlock >= N);
    }

    static const std::vector<cudaDeviceProp>& GetCachedDeviceProps()
    {
        std::call_once(s_cachedDevicePropsInitFlag, [=]{
            int numDevices;
            // must wait GPU idle, otherwise cudaGetDeviceProperties might fail
            CUDA_CALL(cudaDeviceSynchronize());
            CUDA_CALL(cudaGetDeviceCount(&numDevices));
            s_cachedDeviceProps.resize(numDevices);
            for (int i = 0; i < numDevices; i++)
                CUDA_CALL(cudaGetDeviceProperties(&s_cachedDeviceProps[i], i));
        });

        return s_cachedDeviceProps;
    }

    static size_t GetCurrentDeviceId()
    {
        int deviceId;
        cudaGetDevice(&deviceId);
        return (size_t)deviceId;
    }


    // get device properties of current device
    static const cudaDeviceProp& GetDeviceProps()
    {
        const auto& cachedDevicesProps = GetCachedDeviceProps();
        return cachedDevicesProps[GetCurrentDeviceId()];
    }

    // compute our location on the grid
    static __device__ CUDA_LONG GetLinearThreadId()
    {
        return blockDim.x * blockIdx.x + threadIdx.x;
    }

private:
    // TODO: drop call_once and co. and make cached devices a local static, once we're on VS2015.
    static std::vector<cudaDeviceProp> s_cachedDeviceProps;
    static std::once_flag s_cachedDevicePropsInitFlag;
};

#define CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N) \
    CUDA_LONG id = GridDim::GetLinearThreadId();   \
    if (id >= N)                                   \
        return;

#ifdef __GNUC__
#define UNUSED_FUNCTION_ATTRIBUTE __attribute__((unused))
#else
#define UNUSED_FUNCTION_ATTRIBUTE
#endif

// ===========================================================================
// CUDA kernels follow, lots of them
// ===========================================================================

// _elementWise*() kernels
//
// Designed to operate on contiguous blocks of memory, where the output is a simple function of the inputs.
// The first parameters of every function are inputs, and the last two arguments to each function are always
// (ElemenType *res, CUDA_LONG N), a pointer and length of the output block. Each thread computes a function
// of the inputs for one value in the output.

template <class ElemType>
__global__ void _elementWisePowerOnCuda(
    const ElemType alpha,
    const ElemType* a,
    ElemType* res,
    const CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
    if (alpha == 0)
    {
        res[id] = 1;
    }
    else if (alpha == 1)
    {
        res[id] = a[id];
    }
    else if (alpha == 2)
    {
        res[id] = (comp_t)a[id] * (comp_t)a[id];
    }
    else if (alpha == 3)
    {
        res[id] = (comp_t)a[id] * (comp_t)a[id] * (comp_t)a[id];
    }
    else
    {
        res[id] = pow_((comp_t)a[id], (comp_t)alpha);
    }
};

// Note that this code is inefficient on CUDA due to diverging code paths.
// Use Sigmoid() in TensorOps.h instead, which solves this problem.
template <class ElemType>
__global__ void _elementWiseSigmoidOnCuda(
    const ElemType* a,
    ElemType* res,
    const CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
#if 0 // this computes the same thing but is twice as fast on CUDA
    res[id] = Microsoft::MSR::CNTK::Sigmoid((comp_t)a[id]);
#else
    if (a[id] >= 0)
    {
        comp_t e = exp_(-(comp_t)a[id]);
        res[id] = 1 / (1 + e);
    }
    else
    {
        comp_t e = exp_((comp_t)a[id]);
        res[id] = e / (1 + e);
    }
#endif
};

template <class ElemType>
__global__ void _assignSigmoidOf(
    const ElemType* a,
    ElemType* res,
    const CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);

// This function computes 1 / (1 + e^(-x)) which yields 1 / (1 + e^|x|) if x is negative,
// and e^x / (1 + e^x) if x is positive.
// BUGBUG: This does not invert the calculation when the exp argument becomes large, potentially causing overflows.
//         There is a second version of this function that does. That should be used.
#if 0 // this has the same speed now, although not identical accuracy
    res[id] = Microsoft::MSR::CNTK::Sigmoid((comp_t)a[id]);
#else
    comp_t negElem = -(comp_t)a[id];
    comp_t e = exp_(negElem);

    res[id] = 1 / (e + 1);
#endif
};

template <class ElemType>
__global__ void _elementWiseLinRectDerivativeOnCuda(
    const ElemType* a,
    ElemType* res,
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
    res[id] = (a[id] <= 0) ? 0 : 1;
}

template <class ElemType>
__global__ void _elementWiseSigmoidDerivativeOnCuda(
    const ElemType* a,
    ElemType* res,
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
    res[id] = a[id] * (1 - a[id]);
}

template <class ElemType>
__global__ void _elementWiseTanhOnCuda(
    const ElemType* a,
    ElemType* res,
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
    res[id] = tanh_(a[id]);
};

template <class ElemType>
__global__ void _elementWiseAtanhOnCuda(
    const ElemType* a,
    ElemType* res,
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
    res[id] = atanh_(a[id]);
};

//to prevent negative values caused by floating operations, we force inputs to be >=0
//this may, however, hide problems in the caller.
template <class ElemType>
__global__ void _elementWiseSqrtOnCuda(
    const ElemType* a,
    ElemType* res,
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
    res[id] = sqrt_(max((ElemType) 0, a[id]));
};

template <class ElemType>
__global__ void _elementWiseExpOnCuda(
    const ElemType* a,
    ElemType* res,
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
    res[id] = exp_(a[id]);
};

template <class ElemType>
__global__ void _elementWiseLogOnCuda(
    const ElemType* a,
    ElemType* res,
    const CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
    res[id] = ((comp_t)a[id] < EPS_IN_LOG) ? (comp_t)LOG_OF_EPS_IN_LOG : log_((comp_t)a[id]);
};

template <class ElemType>
__global__ void _elementWiseAbsOnCuda(
    const ElemType* a,
    ElemType* res,
    const CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
    res[id] = fabs_((comp_t)a[id]);
};

template <class ElemType>
__global__ void _elementWiseCosineOnCuda(
    const ElemType* a,
    ElemType* res,
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
    res[id] = cos_(a[id]);
};

template <class ElemType>
__global__ void _elementWiseNegativeSineOnCuda(
    const ElemType* a,
    ElemType* res,
    const CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
    res[id] = -sin_((comp_t)a[id]);
};

template <class ElemType>
__global__ void _elementWiseTanOnCuda(
    const ElemType* a,
    ElemType* res,
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
    res[id] = tan_(a[id]);
};

template <class ElemType>
__global__ void _elementWiseAcosOnCuda(
    const ElemType* a,
    ElemType* res,
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
    res[id] = acos_(a[id]);
};

template <class ElemType>
__global__ void _elementWiseAsinOnCuda(
    const ElemType* a,
    ElemType* res,
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
    res[id] = asin_(a[id]);
};

template <class ElemType>
__global__ void _elementWiseAtanOnCuda(
    const ElemType* a,
    ElemType* res,
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
    res[id] = atan_(a[id]);
};

template <class ElemType>
__global__ void _elementWiseCoshOnCuda(
    const ElemType* a,
    ElemType* res,
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
    res[id] = cosh_(a[id]);
};

template <class ElemType>
__global__ void _elementWiseSinhOnCuda(
    const ElemType* a,
    ElemType* res,
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
    res[id] = sinh_(a[id]);
};

template <class ElemType>
__global__ void _elementWiseAsinhOnCuda(
    const ElemType* a,
    ElemType* res,
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
    res[id] = asinh_(a[id]);
};

template <class ElemType>
__global__ void _setValue(
    ElemType* a,
    const ElemType v,
    const CUDA_LONG N)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    a[id] = v;
};

template <class ElemType>
__global__ void _setValue(
    ElemType* a,
    const ElemType* d_v,
    const CUDA_LONG N)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    a[id] = d_v[0];
};

template <class ElemType, class ElemType2>
__global__ void _castValue(
    const ElemType2* a,
    ElemType* c,
    const CUDA_LONG N)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    c[id] = (ElemType)a[id];
};

template <class ElemType>
__global__ void _copyColumnsStrided(ElemType* dest, ElemType* src, CUDA_LONG N, CUDA_LONG numRows, CUDA_LONG destNumColsStride, CUDA_LONG srcNumColsStride)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    CUDA_LONG denseColIdx = id / numRows;
    CUDA_LONG rowIdx = id - (denseColIdx * numRows);

    dest[(denseColIdx * destNumColsStride * numRows) + rowIdx] = src[(denseColIdx * srcNumColsStride * numRows) + rowIdx];
}

template <class ElemType>
__global__ void _assignToRowSliceValuesOf(ElemType* dest, ElemType* src, const CUDA_LONG N, const CUDA_LONG startIndex, const CUDA_LONG destRows, const CUDA_LONG srcRows)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    CUDA_LONG col = id / srcRows;
    CUDA_LONG row = id - (col * srcRows);

    dest[col * destRows + row + startIndex] = src[id];
}

template <class ElemType>
__global__ void _assignRowSliceValuesOf(ElemType* dest, ElemType* src, const CUDA_LONG N, const CUDA_LONG startIndex, const CUDA_LONG destRows, const CUDA_LONG srcRows)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    CUDA_LONG col = id / destRows;
    CUDA_LONG row = id - (col * destRows);

    // dest[id] = src[col*srcRows + row + startIndex];
    dest[id] = src[IDX2C(row + startIndex, col, srcRows)];
}

template <class ElemType>
__global__ void _addToRowSliceValuesOf(ElemType* dest, ElemType* src, const CUDA_LONG N, const CUDA_LONG startIndex, const CUDA_LONG destRows, const CUDA_LONG srcRows)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    CUDA_LONG col = id / srcRows; // src is the full matrix, rowslice is taken from the dest
    CUDA_LONG row = id - (col * srcRows);

    // dest[col*destRows + row + startIndex] += src[id];
    dest[IDX2C(row + startIndex, col, destRows)] = (comp_t)dest[IDX2C(row + startIndex, col, destRows)] + (comp_t)src[id];
}

template <class ElemType>
__global__ void _addWithRowSliceValuesOf(ElemType* dest, ElemType* src, const CUDA_LONG N, const CUDA_LONG startIndex, const CUDA_LONG destRows, const CUDA_LONG srcRows)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    CUDA_LONG col = id / destRows; // dest is the full matrix, rowslice is taken from the src
    CUDA_LONG row = id - (col * destRows);

    dest[id] = (comp_t)dest[id] + (comp_t)src[IDX2C(row + startIndex, col, srcRows)];
}

template <class ElemType>
__global__ void _assignToDiagonalValuesOf(ElemType* dest, ElemType* src, const CUDA_LONG N, const CUDA_LONG srcCols)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    CUDA_LONG col = id / srcCols;
    CUDA_LONG row = id - (col * srcCols);

    if (row == col)
        dest[row] = src[id];
}

template <class ElemType>
__global__ void _assignRowStackValuesOf(ElemType* dest, ElemType** srces, size_t* startRowIndeces, const CUDA_LONG numSrces, const CUDA_LONG N, const CUDA_LONG destRows, const CUDA_LONG destCols)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    CUDA_LONG col = id / destRows; // dest is the full matrix, rowslice is taken from the src
    CUDA_LONG row = id - (col * destRows);

    // can we replace the for loop with something better?
    int srcId = 0;
    for (; srcId < numSrces; srcId++)
    {
        if (startRowIndeces[srcId + 1] > row)
            break;
    }

    dest[id] = srces[srcId][IDX2C(row - startRowIndeces[srcId], col, startRowIndeces[srcId + 1] - startRowIndeces[srcId])];
}

template <class ElemType>
__global__ void _assignRepeatOf(ElemType* dest, ElemType* src, const CUDA_LONG N, const CUDA_LONG srcRows, const CUDA_LONG srcCols, const CUDA_LONG destRows)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    CUDA_LONG destCol = id / destRows;
    CUDA_LONG destRow = id - (destCol * destRows);

    CUDA_LONG srcRow = destRow % srcRows;
    CUDA_LONG srcCol = destCol % srcCols;

    dest[id] = src[IDX2C(srcRow, srcCol, srcRows)];
}

template <class ElemType>
__global__ void _addToRowRepeatValuesOf(ElemType* dest, ElemType* src, const CUDA_LONG N, const CUDA_LONG srcRows, const CUDA_LONG srcCols, const CUDA_LONG destRows)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    CUDA_LONG col = id / srcRows;
    CUDA_LONG row = (id - (col * srcRows)) % destRows;

    // dest[col*destRows + row + startIndex] += src[id];
    dest[IDX2C(row, col, destRows)] = (comp_t)dest[IDX2C(row, col, destRows)] + (comp_t)src[id];
}

template <class ElemType>
__global__ void _assignPositiveAndShiftedNegSample(ElemType* dest, const ElemType* src, const CUDA_LONG N, const CUDA_LONG srcRows, const CUDA_LONG srcCols, const CUDA_LONG destRows, const CUDA_LONG posNumber, const CUDA_LONG shiftNumber)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    CUDA_LONG destCol = id / destRows;
    CUDA_LONG destRow = id - (destCol * destRows);

    CUDA_LONG sampleInDestCol = destRow / srcRows;
    CUDA_LONG srcRow = destRow - srcRows * sampleInDestCol;
    CUDA_LONG srcCol = sampleInDestCol < posNumber ? destCol : (destCol + shiftNumber + sampleInDestCol - posNumber) % srcCols;

    dest[id] = src[IDX2C(srcRow, srcCol, srcRows)];
}

template <class ElemType>
__global__ void _addFoldedPositiveAndShiftedNegSample(ElemType* folded, const ElemType* unfolded, const CUDA_LONG unfoldedN, const CUDA_LONG unfoldedRows, const CUDA_LONG unfoldedCols, const CUDA_LONG foldedRows, const CUDA_LONG posNumber, const CUDA_LONG shiftNumber)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= unfoldedN)
        return;

    CUDA_LONG unfoldedCol = id / unfoldedRows;
    CUDA_LONG unfoldedRow = id - (unfoldedCol * unfoldedRows);

    CUDA_LONG sampleInUnfoldedCol = unfoldedRow / foldedRows;
    CUDA_LONG foldedRow = unfoldedRow - foldedRows * sampleInUnfoldedCol;
    CUDA_LONG foldedCol = sampleInUnfoldedCol < posNumber ? unfoldedCol : (unfoldedCol + shiftNumber + sampleInUnfoldedCol - posNumber) % unfoldedCols;

    atomicAdd(&folded[IDX2C(foldedRow, foldedCol, foldedRows)], unfolded[id]);
}

template <class ElemType>
__global__ void _assignDifferenceOf1(
    ElemType* us,
    const ElemType alpha,
    const ElemType* a,
    const CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    us[id] = (comp_t)alpha - (comp_t)a[id];
};

template <class ElemType>
__global__ void _assignDifferenceOf2(
    ElemType* us,
    const ElemType alpha,
    const ElemType* a,
    const CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    us[id] = (comp_t)a[id] - (comp_t)alpha;
};

///a is a scalar
template <class ElemType>
__global__ void _scaleAndAddScalar(
    ElemType* c,
    const CUDA_LONG N,
    const ElemType alpha,
    const ElemType* a,
    const ElemType* b)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    c[id] = (comp_t)alpha * (comp_t)a[0] + (comp_t)b[id];
};

template <class ElemType>
__global__ void _multiply1x1AndWeightedAdd(
    ElemType alpha, const ElemType* a, const ElemType* b, ElemType beta, ElemType* c, CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    comp_t f = (comp_t)alpha * (comp_t)*a; // scalar matrix
    if (beta == 0)           // don't even read the memory if beta is 0
        c[id] = (comp_t)b[id] * f;
    else
        c[id] = (comp_t)b[id] * f + (comp_t)c[id] * (comp_t)beta;
}

template <class ElemType>
__global__ void _addValue(
    ElemType* a,
    const ElemType v,
    const CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    a[id] = (comp_t)a[id] + (comp_t)v;
};

template <class ElemType>
__global__ void _addValue(
    ElemType* a,
    const ElemType* d_v,
    const CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    a[id] = (comp_t)a[id] + (comp_t)d_v[0];
};

template <class ElemType>
__global__ void _elemMul(
    ElemType* a,
    const ElemType* b,
    const CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    a[id] = (comp_t)a[id] * (comp_t)b[id];
};

template <class ElemType>
__global__ void _assignElementProductOf(
    ElemType* us,
    const ElemType* a,
    const ElemType* b,
    const CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    us[id] = (comp_t)a[id] * (comp_t)b[id];
}

template <class ElemType>
__global__ void _assignKhatriRaoProductOf(
    ElemType* us,
    const ElemType* a,
    const ElemType* b,
    const CUDA_LONG rowsA,
    const CUDA_LONG rowsB,
    const CUDA_LONG cols)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;

    const CUDA_LONG rows = rowsA * rowsB;
    const CUDA_LONG col = id / rows;
    if (col >= cols)
        return;

    const CUDA_LONG row = id % rows;
    const CUDA_LONG rowB = row / rowsA;
    const CUDA_LONG rowA = row % rowsA;

    us[id] = (comp_t)a[rowA + col * rowsA] * (comp_t)b[rowB + col * rowsB];
}

template <class ElemType>
__global__ void _addColumnReshapeProductOf(
    ElemType* us,
    const ElemType* a,
    const ElemType* b,
    const CUDA_LONG rowsB,
    const CUDA_LONG rowsC,
    const CUDA_LONG cols,
    const bool transposeAColumn)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;

    const CUDA_LONG col = id / rowsC;
    if (col >= cols)
        return;

    const CUDA_LONG row = id % rowsC;
    CUDA_LONG bBase = col * rowsB;
    CUDA_LONG aBase = bBase * rowsC;
    comp_t v = 0;

    if (transposeAColumn)
    {
        aBase += row * rowsB;
        for (CUDA_LONG i = 0; i < rowsB; i++)
        {
            v += (comp_t)a[aBase++] * (comp_t)b[bBase++];
        }
    }
    else
    {
        aBase += row;
        for (CUDA_LONG i = 0; i < rowsB; i++)
        {
            v += (comp_t)a[aBase] * (comp_t)b[bBase++];
            aBase += rowsC;
        }
    }
    us[row + col * rowsC] = (comp_t)us[row + col * rowsC] + v;
}

template <class ElemType>
__global__ void _assignElementDivisionOf(
    ElemType* us,
    const ElemType* a,
    const ElemType* b,
    const CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    comp_t smallValue = EPS_IN_INVERSE;

    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    comp_t v = b[id];

    if (v < 0 && v > -smallValue)
        us[id] = (comp_t)a[id] / (-smallValue);
    else if (v >= 0 && v < smallValue)
        us[id] = (comp_t)a[id] / smallValue;
    else
        us[id] = (comp_t)a[id] / v;
}

template <class ElemType>
__global__ void _elemInverse(
    ElemType* us,
    const CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    comp_t smallValue = EPS_IN_INVERSE;

    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    if (us[id] < 0 && us[id] > -smallValue)
        us[id] = 1 / -smallValue;
    else if (us[id] >= 0 && us[id] < smallValue)
        us[id] = 1 / smallValue;
    else
        us[id] = 1 / (comp_t)us[id];
}

template <class ElemType>
__global__ void _logSoftMaxColWise(
    ElemType* a,
    const CUDA_LONG m_numCols,
    const CUDA_LONG m_numRows) // ld
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    int col_id = blockDim.x * blockIdx.x + threadIdx.x;
    if (col_id >= m_numCols)
        return;

    __shared__ comp_t maxV[GridDim::maxThreadsPerBlock];
    __shared__ comp_t Sum[GridDim::maxThreadsPerBlock];
    maxV[threadIdx.x] = a[IDX2C(0, col_id, m_numRows)];
    Sum[threadIdx.x] = 0;

    for (CUDA_LONG i = 0; i < m_numRows; ++i)
    {
        if (a[IDX2C(i, col_id, m_numRows)] > maxV[threadIdx.x])
        {
            maxV[threadIdx.x] = a[IDX2C(i, col_id, m_numRows)];
        }
    }

    for (CUDA_LONG i = 0; i < m_numRows; ++i)
    {
        comp_t tmp = (comp_t)a[IDX2C(i, col_id, m_numRows)] - maxV[threadIdx.x];
        Sum[threadIdx.x] += exp_(tmp);
    }
    Sum[threadIdx.x] = maxV[threadIdx.x] + log_(Sum[threadIdx.x]);
    for (CUDA_LONG i = 0; i < m_numRows; ++i)
    {
        a[IDX2C(i, col_id, m_numRows)] = (comp_t)a[IDX2C(i, col_id, m_numRows)] - Sum[threadIdx.x];
    }
}

//template<class ElemType>
//__global__ void _assignColumnwiseSoftmaxOf(
//    const ElemType *a,
//    ElemType* us,
//    const CUDA_LONG m_numCols,
//    const CUDA_LONG m_numRows) // thead per column
//{
//    int col_id = blockDim.x * blockIdx.x + threadIdx.x;
//    if (col_id>=m_numCols)
//        return;
//
//    __shared__ ElemType maxV[GridDim::maxThreadsPerBlock];
//    __shared__ ElemType Sum[GridDim::maxThreadsPerBlock];
//    maxV[threadIdx.x]=a[IDX2C(0,col_id,m_numRows)];
//    Sum[threadIdx.x]=0;
//
//    for (CUDA_LONG i=0;i<m_numRows;++i)
//    {
//        if (a[IDX2C(i,col_id,m_numRows)]>maxV[threadIdx.x])
//        {
//            maxV[threadIdx.x]=a[IDX2C(i,col_id,m_numRows)];
//        }
//    }
//
//    for (CUDA_LONG i=0;i<m_numRows;++i)
//    {
//        us[IDX2C(i,col_id,m_numRows)] = exp_(a[IDX2C(i,col_id,m_numRows)]-maxV[threadIdx.x]);
//        Sum[threadIdx.x] +=  us[IDX2C(i,col_id,m_numRows)];
//    }
//
//    for (CUDA_LONG i=0;i<m_numRows;++i)
//    {
//        us[IDX2C(i,col_id,m_numRows)] /= Sum[threadIdx.x] ;
//    }
//}

// each block processes one column. There must be 512 threads in a block
template <class ElemType>
__global__ void _assignColumnwiseLogSoftmaxOf512Threads(
    const ElemType* a,
    ElemType* us,
    const CUDA_LONG m_numCols,
    const CUDA_LONG m_numRows)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    // We first find max per column
    __shared__ comp_t partials[512];
    partials[threadIdx.x] = -10000000;

    for (int i = threadIdx.x; i < m_numRows; i += 512)
    {
        partials[threadIdx.x] = max(partials[threadIdx.x], (comp_t)a[IDX2C(i, blockIdx.x, m_numRows)]);
    }
    __syncthreads();

    if (threadIdx.x < 256)
    {
        partials[threadIdx.x] = max(partials[threadIdx.x + 256], partials[threadIdx.x]);
    }
    __syncthreads();

    if (threadIdx.x < 128)
    {
        partials[threadIdx.x] = max(partials[threadIdx.x + 128], partials[threadIdx.x]);
    }
    __syncthreads();

    if (threadIdx.x < 64)
    {
        partials[threadIdx.x] = max(partials[threadIdx.x + 64], partials[threadIdx.x]);
    }
    __syncthreads();

    if (threadIdx.x < 32)
    {
        partials[threadIdx.x] = max(partials[threadIdx.x + 32], partials[threadIdx.x]);
    }
    __syncthreads();

    if (threadIdx.x < 16)
    {
        partials[threadIdx.x] = max(partials[threadIdx.x + 16], partials[threadIdx.x]);
    }
    __syncthreads();

    if (threadIdx.x < 8)
    {
        partials[threadIdx.x] = max(partials[threadIdx.x + 8], partials[threadIdx.x]);
    }
    __syncthreads();

    if (threadIdx.x < 4)
    {
        partials[threadIdx.x] = max(partials[threadIdx.x + 4], partials[threadIdx.x]);
    }
    __syncthreads();

    __shared__ comp_t colMax[1];
    if (threadIdx.x == 0)
    {
        colMax[0] = max(max(partials[0], partials[1]), max(partials[2], partials[3]));
    }
    __syncthreads();
    partials[threadIdx.x] = 0.0f;

    // Now start finding sums
    for (int i = threadIdx.x; i < m_numRows; i += 512)
    {
        comp_t tmp = (comp_t)a[IDX2C(i, blockIdx.x, m_numRows)] - colMax[0];
        us[IDX2C(i, blockIdx.x, m_numRows)] = tmp;
        partials[threadIdx.x] += exp_(tmp);
    }
    __syncthreads();

    if (threadIdx.x < 256)
    {
        partials[threadIdx.x] += partials[threadIdx.x + 256];
    }
    __syncthreads();

    if (threadIdx.x < 128)
    {
        partials[threadIdx.x] += partials[threadIdx.x + 128];
    }
    __syncthreads();

    if (threadIdx.x < 64)
    {
        partials[threadIdx.x] += partials[threadIdx.x + 64];
    }
    __syncthreads();

    if (threadIdx.x < 32)
    {
        partials[threadIdx.x] += partials[threadIdx.x + 32];
    }
    __syncthreads();

    if (threadIdx.x < 16)
    {
        partials[threadIdx.x] += partials[threadIdx.x + 16];
    }
    __syncthreads();

    if (threadIdx.x < 8)
    {
        partials[threadIdx.x] += partials[threadIdx.x + 8];
    }
    __syncthreads();

    if (threadIdx.x < 4)
    {
        partials[threadIdx.x] += partials[threadIdx.x + 4];
    }
    __syncthreads();

    __shared__ comp_t colSum[1];
    if (threadIdx.x == 0)
    {
        colSum[0] = partials[0] + partials[1] + partials[2] + partials[3];
        colSum[0] = log_(colSum[0]);
    }
    __syncthreads();

    for (int i = threadIdx.x; i < m_numRows; i += 512)
    {
        us[IDX2C(i, blockIdx.x, m_numRows)] = (comp_t)us[IDX2C(i, blockIdx.x, m_numRows)] - colSum[0];
    }
}

template <class ElemType>
__global__ void _logSoftMaxRowWise(
    ElemType* a,
    const CUDA_LONG m_numCols,
    const CUDA_LONG m_numRows) // ld
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    int row_id = blockDim.x * blockIdx.x + threadIdx.x;
    if (row_id >= m_numRows)
        return;

    __shared__ comp_t maxV[GridDim::maxThreadsPerBlock];
    __shared__ comp_t Sum[GridDim::maxThreadsPerBlock];
    maxV[threadIdx.x] = a[IDX2C(row_id, 0, m_numRows)];
    Sum[threadIdx.x] = 0;

    for (CUDA_LONG j = 0; j < m_numCols; ++j)
    {
        if (a[IDX2C(row_id, j, m_numRows)] > maxV[threadIdx.x])
        {
            maxV[threadIdx.x] = a[IDX2C(row_id, j, m_numRows)];
        }
    }

    for (CUDA_LONG j = 0; j < m_numCols; ++j)
    {
        comp_t tmp = (comp_t)a[IDX2C(row_id, j, m_numRows)] - maxV[threadIdx.x];
        Sum[threadIdx.x] += exp_(tmp);
    }
    Sum[threadIdx.x] = maxV[threadIdx.x] + log_(Sum[threadIdx.x]);
    for (CUDA_LONG j = 0; j < m_numCols; ++j)
    {
        a[IDX2C(row_id, j, m_numRows)] = (comp_t)a[IDX2C(row_id, j, m_numRows)] - Sum[threadIdx.x];
    }
}

// each block processes one column. There must be 512 threads in a block
template <class ElemType>
__global__ void _assignColumnwiseHardmaxOf512Threads(
    const ElemType* a,
    ElemType* us,
    const CUDA_LONG m_numCols,
    const CUDA_LONG m_numRows)
{
    // We first find max per column
    __shared__ ElemType partials[512];
    __shared__ int colMaxI[512];
    int row = threadIdx.x % m_numRows;
    colMaxI[threadIdx.x] = row;
    partials[threadIdx.x] = a[IDX2C(row, blockIdx.x, m_numRows)];

    for (int i = threadIdx.x; i < m_numRows; i += 512)
    {
        if (partials[threadIdx.x] < a[IDX2C(i, blockIdx.x, m_numRows)])
        {
            partials[threadIdx.x] = a[IDX2C(i, blockIdx.x, m_numRows)];
            colMaxI[threadIdx.x] = i;
        }
    }
    __syncthreads();

    if (m_numRows > 256)
    {
        if (threadIdx.x < 256)
        {
            int other = threadIdx.x + 256;
            if (partials[threadIdx.x] < partials[other])
            {
                partials[threadIdx.x] = partials[other];
                colMaxI[threadIdx.x] = colMaxI[other];
            }
        }
        __syncthreads();
    }

    if (m_numRows > 128)
    {
        if (threadIdx.x < 128)
        {
            int other = threadIdx.x + 128;

            if (partials[threadIdx.x] < partials[other])
            {
                partials[threadIdx.x] = partials[other];
                colMaxI[threadIdx.x] = colMaxI[other];
            }
        }
        __syncthreads();
    }

    if (m_numRows > 64)
    {
        if (threadIdx.x < 64)
        {
            int other = threadIdx.x + 64;
            if (partials[threadIdx.x] < partials[other])
            {
                partials[threadIdx.x] = partials[other];
                colMaxI[threadIdx.x] = colMaxI[other];
            }
        }
        __syncthreads();
    }

    if (m_numRows > 32)
    {
        if (threadIdx.x < 32)
        {
            int other = threadIdx.x + 32;
            if (partials[threadIdx.x] < partials[other])
            {
                partials[threadIdx.x] = partials[other];
                colMaxI[threadIdx.x] = colMaxI[other];
            }
        }
        __syncthreads();
    }

    if (m_numRows > 16)
    {
        if (threadIdx.x < 16)
        {
            int other = threadIdx.x + 16;
            if (partials[threadIdx.x] < partials[other])
            {
                partials[threadIdx.x] = partials[other];
                colMaxI[threadIdx.x] = colMaxI[other];
            }
        }
        __syncthreads();
    }

    if (m_numRows > 8)
    {
        if (threadIdx.x < 8)
        {
            int other = threadIdx.x + 8;
            if (partials[threadIdx.x] < partials[other])
            {
                partials[threadIdx.x] = partials[other];
                colMaxI[threadIdx.x] = colMaxI[other];
            }
        }
        __syncthreads();
    }

    if (m_numRows > 4)
    {
        if (threadIdx.x < 4)
        {
            int other = threadIdx.x + 4;
            if (partials[threadIdx.x] < partials[other])
            {
                partials[threadIdx.x] = partials[other];
                colMaxI[threadIdx.x] = colMaxI[other];
            }
        }
        __syncthreads();
    }

    if (threadIdx.x == 0)
    {
        for (int i = 1; i < 4 && i < m_numRows; i++)
        {
            if (partials[0] < partials[i])
            {
                partials[0] = partials[i];
                colMaxI[0] = colMaxI[i];
            }
        }
    }
    __syncthreads();

    for (int i = threadIdx.x; i < m_numRows; i += 512)
    {
        us[IDX2C(i, blockIdx.x, m_numRows)] = (i == colMaxI[0]) ? 1 : 0;
    }
}

template <class ElemType>
__global__ void _assignTruncateBottom(
    ElemType* us,
    const ElemType* a,
    const ElemType threshold,
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
    us[id] = a[id] < threshold ? threshold : a[id];
}

template <class ElemType>
__global__ void _assignTruncateTop(
    ElemType* us,
    const ElemType* a,
    const ElemType threshold,
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
    us[id] = a[id] > threshold ? threshold : a[id];
}

template <class ElemType>
__global__ void _setToZeroIfAbsLessThan(
    ElemType* a,
    const ElemType threshold,
    const CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    if (fabs_((comp_t)a[id]) < (comp_t)threshold)
        a[id] = 0;
}

template <class ElemType>
__global__ void _areEqual(
    const ElemType* a,
    const ElemType* b,
    const CUDA_LONG N,
    const ElemType threshold,
    long* d_res)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    if (fabs_((comp_t)a[id] - (comp_t)b[id]) > (comp_t)threshold)
    {
        d_res[0] = 0;
    }
}

// see Matrix<ElemType>::TensorShuffleScaleAndAdd() for comments
template <class ElemType>
__global__ void _tensorShuffleScaleAndAdd(
    ElemType keepWeight, const ElemType* pa, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const ElemType* pb, ElemType* pc)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    size_t N = D * S * M * K * T;
    CUDA_LONG na = blockDim.x * blockIdx.x + threadIdx.x; // input tensor of dimension (D x S x M x K x T)
    if (na >= N)
        return;
    // recover the 5 indices from the loop counter
    size_t d = na % D;
    size_t s = (na / D) % S;
    size_t m = (na / D / S) % M;
    size_t k = (na / D / S / M) % K;
    size_t t = (na / D / S / M / K) % T;
    // compute index for the a and b/c tensors
    size_t nb = (((t * S + s) * M + m) * K + k) * D + d; // output tensor of dimension (D x K x M x S x T): k/K and s/S swapped
    // perform the computation
    comp_t cval = (comp_t)keepWeight ? (comp_t)keepWeight * (comp_t)pb[nb] : (comp_t)0; // if weight is 0 then don't bother to read memory (efficiency) or to multiply (NaN-safe)
    cval += (comp_t)scaleFactor * (comp_t)pa[na];
    pc[nb] = cval;
}

// see Matrix<ElemType>::TensorShuffleScaleAndAdd() for comments
template <class ElemType>
__global__ void _tensorShuffleScaleAndAddRowSparse(
    const ElemType* anzValues, // source nz values
    const GPUSPARSE_INDEX_TYPE* aRowIndex,
    const GPUSPARSE_INDEX_TYPE* aColCSCIndex,
    ElemType* cnzValues, // target nz values
    GPUSPARSE_INDEX_TYPE* cRowIndex,
    GPUSPARSE_INDEX_TYPE* cColCSCIndex,
    size_t D, size_t S, size_t M, size_t K, size_t T,
    size_t nz)
{
    CUDA_LONG N = blockDim.x * blockIdx.x + threadIdx.x; // input tensor of dimension (D x S x M x K x T)
    if (N < aColCSCIndex[0] || N >= aColCSCIndex[T])
        return;

    size_t col;
    for (col = 0; col < T; col++)
    {
        if (aColCSCIndex[col + 1] > N)
            break;
    }

    size_t na = aRowIndex[N];
    int start = aColCSCIndex[col];
    int end = aColCSCIndex[col + 1];

    // recover the 5 indices from the loop counter
    size_t d = (na) % D;
    size_t s = (na / D) % S;
    size_t m = (na / D / S) % M;
    size_t k = (na / D / S / M) % K;

    // compute index for the a and b/c tensors
    size_t nc = ((s * M + m) * K + k) * D + d; // output tensor of dimension (D x K x M x S): k/K and s/S swapped

    int rowIdx = start;
    for (size_t j = start; j < end; j++)
    {
        // recover the 5 indices from the loop counter
        size_t na_i = aRowIndex[j];
        size_t d_i = (na_i) % D;
        size_t s_i = (na_i / D) % S;
        size_t m_i = (na_i / D / S) % M;
        size_t k_i = (na_i / D / S / M) % K;

        // compute index for the a and b/c tensors
        size_t nc_i = ((s_i * M + m_i) * K + k_i) * D + d_i; // output tensor of dimension (D x K x M x S): k/K and s/S swapped
        if (nc_i < nc)
        {
            rowIdx++;
        }
    }

    cnzValues[rowIdx] = anzValues[N];
    cRowIndex[rowIdx] = nc;

    if (N == 0)
    {
        for (int i = 0; i <= T; i++)
        {
            cColCSCIndex[i] = aColCSCIndex[i];
        }
    }
}

template <class ElemType>
__global__ void _hasElement(
    const ElemType* a,
    const CUDA_LONG N,
    ElemType* d_res // [2x1] vector. The first is the value to be compared and the second is the 0/1 to return
    )
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    if (a[id] == d_res[0])
    {
        d_res[1] = 1;
    }
}

template <class ElemType>
__global__ void _setDiagonalValue(
    ElemType* a,
    const ElemType v,
    const CUDA_LONG N,
    const CUDA_LONG ld)
{
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    a[IDX2C(id, id, ld)] = v;
}

template <class ElemType>
__global__ void _setDiagonalValueFromVector(
    ElemType* a,
    const ElemType* b,
    const CUDA_LONG N,
    const CUDA_LONG ld)
{
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    a[IDX2C(id, id, ld)] = b[id];
}

template <class ElemType>
__global__ void _adagrad(
    ElemType* a,
    ElemType* d_v,
    const CUDA_LONG N,
    ElemType* multipliers)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    const comp_t floor = 1e-16f;
    comp_t a_comp = a[id];
    comp_t d_v_comp = d_v[id];

    a_comp += d_v_comp * d_v_comp;
    a[id] = a_comp;
    comp_t temp = sqrt_(a_comp + floor);
    d_v[id] = d_v_comp / temp;

    if (multipliers != nullptr)
        multipliers[id] = 1 / temp;
}

template <class ElemType>
__global__ void _adagrad4BlockSparse(
    ElemType* a,          // dense
    const size_t numRows, // number of rows in a and in d_v
    ElemType* d_v,        // block sparse
    const GPUSPARSE_INDEX_TYPE* blockId2ColOrRow,
    ElemType* multipliers,
    const bool colMajor,
    const size_t len,  // major dim, numRows in colMajor and numcols in rowMajor
    const CUDA_LONG N) // total number of non-zero values
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    const ElemType floor = 1e-16f;
    CUDA_LONG blockid = id / len;
    CUDA_LONG row = colMajor ? id - blockid * len : blockId2ColOrRow[blockid];
    CUDA_LONG col = colMajor ? blockId2ColOrRow[blockid] : id - blockid * len;

    size_t indexInA = row + col * numRows;
    a[indexInA] += d_v[id] * d_v[id];
    ElemType temp = sqrt_(a[indexInA] + floor);
    d_v[id] /= temp;

    if (multipliers != nullptr)
        multipliers[id] = 1 / temp;
}

template <class ElemType>
__global__ void _fsadagrad(CUDA_LONG size, ElemType* grad, ElemType* smoothAda, ElemType* smoothMom, ElemType* val,
                           ElemType lr, ElemType mom, ElemType adaWeight, ElemType adaMul, ElemType typedUnitGainFactor)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    const comp_t unitGainFactor = (comp_t)typedUnitGainFactor;
    CUDA_LONG idx = blockIdx.x * blockDim.x + threadIdx.x;
    CUDA_LONG stride = blockDim.x * gridDim.x;
    for (; idx < size; idx += stride)
    {
        comp_t g = grad[idx];
        comp_t adaSqr = (comp_t)adaWeight * (comp_t)smoothAda[idx] + (1.0f - (comp_t)adaWeight) * g * g;
        smoothAda[idx] = adaSqr;
        if (adaSqr != 0.0f)
        {
            comp_t w;
            w = (comp_t)adaMul * rsqrt_(adaSqr);

            if (w > static_cast<comp_t>(10.0))
                w = static_cast<comp_t>(10.0);
            g *= w;
        }

        if ((comp_t)mom > 0.0f)
        {
            g = (comp_t)mom * (comp_t)smoothMom[idx] + unitGainFactor * g;
            smoothMom[idx] = g;
        }

        g *= (comp_t)lr;
        val[idx] = (comp_t)val[idx] - g;
    }
}

template<class ElemType>
inline __device__ ElemType _getvalue4BlockSparseCol(ElemType* v, const GPUSPARSE_INDEX_TYPE* colOrRow2blockId, const size_t len, CUDA_LONG idx)
{
    CUDA_LONG col = idx / len;
    CUDA_LONG row = idx - col * len;
    CUDA_LONG blockid = colOrRow2blockId[col];
    return (blockid == SparseIndex_NotAssigned) ? (ElemType)0 : v[blockid * len + row];
}

template<class ElemType>
inline __device__ void _scalevalue4BlockSparseCol(ElemType* v, const GPUSPARSE_INDEX_TYPE* colOrRow2blockId, const size_t len, CUDA_LONG idx, ElemType s)
{
    CUDA_LONG col = idx / len;
    CUDA_LONG row = idx - col * len;
    CUDA_LONG blockid = colOrRow2blockId[col];
    if (blockid != SparseIndex_NotAssigned)
    {
        v[blockid * len + row] *= s;
    }
}

template <class ElemType>
__global__ void _fsadagrad4BlockSparseCol(CUDA_LONG size,
    ElemType* grad_bsc, const GPUSPARSE_INDEX_TYPE* colOrRow2blockId, const size_t len,
    ElemType* smoothAda, ElemType* smoothMom, ElemType* val,
    ElemType lr, ElemType mom, ElemType adaWeight, ElemType adaMul, ElemType unitGainFactor)
{
    CUDA_LONG idx = blockIdx.x * blockDim.x + threadIdx.x;
    CUDA_LONG stride = blockDim.x * gridDim.x;
    for (; idx < size; idx += stride)
    {
        ElemType g = _getvalue4BlockSparseCol(grad_bsc, colOrRow2blockId, len, idx);
        ElemType adaSqr = adaWeight * smoothAda[idx] + (1.0f - adaWeight) * g * g;
        smoothAda[idx] = adaSqr;
        if (adaSqr != 0.0f)
        {
            ElemType w;
            w = adaMul * rsqrt_(adaSqr);

            if (w > 10.0f)
                w = 10.0f;
            g *= w;
        }

        if (mom > 0.0f)
        {
            g = mom * smoothMom[idx] + unitGainFactor * g;
            smoothMom[idx] = g;
        }

        g *= lr;
        val[idx] -= g;
    }
}

template <class ElemType>
__global__ void _rmsprop_init(
    ElemType* avars, ElemType* signs, ElemType* steps,
    ElemType* curr_grad,
    const CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= N)
        return;

    comp_t tmp = curr_grad[i];
    avars[i] = tmp * tmp;
    signs[i] = ElemType(0.0);
    steps[i] = ElemType(0.02);
}

template <class ElemType>
__global__ void _rmsprop_init4BlockSparseCol(
    ElemType* avars, ElemType* signs, ElemType* steps,
    ElemType* curr_grad, const GPUSPARSE_INDEX_TYPE* colOrRow2blockId, const size_t len,
    const CUDA_LONG N)
{
    CUDA_LONG i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= N)
        return;

    ElemType tmp = _getvalue4BlockSparseCol(curr_grad, colOrRow2blockId, len, i);

    avars[i] = tmp * tmp;
    signs[i] = ElemType(0.0);
    steps[i] = ElemType(0.02);
}

template <class ElemType>
__global__ void _rmsprop(
    ElemType* avars, ElemType* signs, ElemType* steps,
    ElemType* curr_grad,
    const CUDA_LONG N,
    ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN,
    ElemType floor,
    ElemType* upd_gpu,
    ElemType* multipliers)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= N)
        return;

    comp_t avars_comp = avars[i];
    comp_t RMS_GAMMA_comp = RMS_GAMMA;
    comp_t curr_grad_comp = curr_grad[i];
    comp_t steps_comp = steps[i];

    avars_comp = RMS_GAMMA_comp * avars_comp + (comp_t(1.0) - RMS_GAMMA_comp) * (curr_grad_comp * curr_grad_comp);

    // // grad sign base 3: 0->neg, 1->zero, 2->pos
    // const int grad_sign = 1 + (ElemType(0) < curr_grad[i]) - (curr_grad[i] < ElemType(0));

    // // signs[i] contains three consecutive grad_sign
    // signs[i]  = 3*(int(signs[i]) % 9) + grad_sign;

    // // update according to the following table:
    // // (!pos,!pos,!pos) or (!neg,!neg,!neg): RMS_WGT_INC
    // // (!neg,!neg,neg) or (!pos,!pos,pos): RMS_WGT_DEC
    // // otherwise: no action

    // switch(int(upd_gpu[int(signs[i])]))
    // {
    // case 0:
    //    steps[i] = max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);
    //    break;
    // case 2:
    //    steps[i] = min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX);
    //    break;
    // }
    // curr_grad[i] *= steps[i] / sqrt_(avars[i] + floor);

    const int grad_sign = (comp_t(0) < curr_grad_comp) - (curr_grad_comp < comp_t(0));

    if (signs[i] * grad_sign > 0)
        steps_comp = min(steps_comp * (comp_t)RMS_WGT_INC, (comp_t)RMS_WGT_MAX);
    else
        steps_comp = max(steps_comp * (comp_t)RMS_WGT_DEC, (comp_t)RMS_WGT_MIN);

    comp_t temp = steps_comp / sqrt_(avars_comp + (comp_t)floor);
    curr_grad[i] = curr_grad_comp * temp;
    signs[i] = grad_sign;
    avars[i] = avars_comp;
    steps[i] = steps_comp;

    if (multipliers != nullptr)
        multipliers[i] = temp;
}

template <class ElemType>
__global__ void _rmsprop4BlockSparseCol(
    ElemType* avars, ElemType* signs, ElemType* steps,
    ElemType* grad_bsc, const GPUSPARSE_INDEX_TYPE* colOrRow2blockId, const size_t len,
    const CUDA_LONG N,
    ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN,
    ElemType floor,
    ElemType* upd_gpu,
    ElemType* multipliers)
{
    CUDA_LONG i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= N)
        return;

    ElemType g = _getvalue4BlockSparseCol(grad_bsc, colOrRow2blockId, len, i);

    avars[i] = RMS_GAMMA * avars[i] + (ElemType(1.0) - RMS_GAMMA) * (g * g);

    // // grad sign base 3: 0->neg, 1->zero, 2->pos
    // const int grad_sign = 1 + (ElemType(0) < curr_grad[i]) - (curr_grad[i] < ElemType(0));

    // // signs[i] contains three consecutive grad_sign
    // signs[i]  = 3*(int(signs[i]) % 9) + grad_sign;

    // // update according to the following table:
    // // (!pos,!pos,!pos) or (!neg,!neg,!neg): RMS_WGT_INC
    // // (!neg,!neg,neg) or (!pos,!pos,pos): RMS_WGT_DEC
    // // otherwise: no action

    // switch(int(upd_gpu[int(signs[i])]))
    // {
    // case 0:
    //    steps[i] = max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);
    //    break;
    // case 2:
    //    steps[i] = min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX);
    //    break;
    // }
    // curr_grad[i] *= steps[i] / sqrt(avars[i] + floor);

    const int grad_sign = (ElemType(0) < g) - (g < ElemType(0));

    if (signs[i] * grad_sign > 0)
        steps[i] = min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX);
    else
        steps[i] = max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);

    ElemType temp = steps[i] / sqrt_(avars[i] + floor);
    _scalevalue4BlockSparseCol(grad_bsc, colOrRow2blockId, len, i, temp);
    signs[i] = grad_sign;

    if (multipliers != nullptr)
        multipliers[i] = temp;
}

template <class ElemType>
__global__ void _rescaleToRange(
    ElemType* a,
    const CUDA_LONG N,
    const ElemType low,
    const ElemType high)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    a[id] = (comp_t)a[id] * ((comp_t)high - (comp_t)low) + (comp_t)low;
}

template <class ElemType>
__global__ void _truncated_normal_transform(
    ElemType* a,
    const CUDA_LONG N,
    const ElemType mean,
    const ElemType sigma)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    const comp_t high = (comp_t)0.97724986805182079; // normcdf(2);
    const comp_t low = (comp_t)0.022750131948179195; // normcdf(-2);
    a[id] = normcdfinv((comp_t)a[id] * (high - low) + low) * (comp_t)sigma + (comp_t)mean;
}

template <class ElemType>
__global__ void _gumbelFromUniform(
    ElemType* a,
    const CUDA_LONG N,
    const ElemType loc,
    const ElemType scale)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    a[id] = (comp_t)loc - (comp_t)scale * log_(comp_t(1e-40) - log_((comp_t)a[id])); //a[id] is uniform in (0,1] exactly opposite from every other rng implementation
}

template <class ElemType>
__global__ void _setMaskAndScale(
    ElemType* a,
    const CUDA_LONG N,
    const ElemType maskRate,
    const ElemType scaleValue)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    bool tmp = a[id] <= maskRate;
    if(tmp)     a[id] = 0;
    else     a[id] = scaleValue;
}

template <class ElemType>
__global__ void _vectorSum(
    ElemType* c,       // output
    const ElemType* a, // input
    const CUDA_LONG n, // a.numRows
    const CUDA_LONG m, // a.numCols
    const bool isColWise)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    if ((isColWise && id >= m) || (!isColWise && id >= n))
        return;

    comp_t sum = 0;

    if (isColWise)
    {
        for (CUDA_LONG i = 0; i < n; ++i)
        {
            sum += (comp_t)a[IDX2C(i, id, n)];
        }
    }
    else
    {
        for (CUDA_LONG j = 0; j < m; ++j)
        {
            sum += (comp_t)a[IDX2C(id, j, n)];
        }
    }
    c[id] = sum;
}

template <class ElemType>
__global__ void _vectorNorm1(
    ElemType* c,       // output
    const ElemType* a, // input
    const CUDA_LONG n, // a.numRows
    const CUDA_LONG m, // a.numCols
    const bool isColWise)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    if ((isColWise && id >= m) || (!isColWise && id >= n))
        return;

    comp_t sum = 0;

    if (isColWise)
    {
        for (CUDA_LONG i = 0; i < n; ++i)
        {
            sum += fabs_((comp_t)a[IDX2C(i, id, n)]);
        }
    }
    else
    {
        for (CUDA_LONG j = 0; j < m; ++j)
        {
            sum += fabs_((comp_t)a[IDX2C(id, j, n)]);
        }
    }
    c[id] = sum;
}

//one column per thread
template <class ElemType>
__global__ void _vectorNorm2(
    ElemType* c,       // output
    const ElemType* a, // input
    const CUDA_LONG N, // a.GetNumRows();
    const CUDA_LONG M, // a.GetNumCols();
    const bool isColWise)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if ((isColWise && id >= M) || (!isColWise && id >= N))
        return;

    comp_t sum = 0;
    if (isColWise)
    {
        for (CUDA_LONG i = 0; i < N; ++i)
        {
            comp_t v = (comp_t)a[IDX2C(i, id, N)];
            sum += v * v;
        }
    }
    else
    {
        for (CUDA_LONG j = 0; j < M; ++j)
        {
            comp_t v = (comp_t)a[IDX2C(id, j, N)];
            sum += v * v;
        }
    }

    c[id] = sqrt_(sum);
}

template <class ElemType>
__global__ void _convertInd2ValsAdjustInd(
    ElemType* inds,
    const ElemType* M,
    ElemType* vals,
    const CUDA_LONG n, // number of cols
    const CUDA_LONG m, // number of rows
    const bool isColWise)
{
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    if ((isColWise && id >= n) || (!isColWise && id >= m))
        return;
    inds[id]--;
    if (isColWise)
    {
        vals[id] = M[IDX2C((int) inds[id], id, m)];
    }
    else
    {
        vals[id] = M[IDX2C(id, (int) inds[id], m)];
    }
}

//assume each column is an input sample. Each sample is stored in [channel, row, col]  (r00, g00, b00, r01, g01, b01, r10, g10, b10, r11, g11, b11)
template <class ElemType>
__global__ void _assignPackedConvolutionInput(ElemType* packedMatrix, const ElemType* inputSubBatch, const CUDA_LONG batchSize,
                                              const CUDA_LONG inputWidth, const CUDA_LONG inputHeight, const CUDA_LONG inputChannels,
                                              const CUDA_LONG outputWidth, const CUDA_LONG outputHeight, const CUDA_LONG outputChannels,
                                              const CUDA_LONG kernelWidth, const CUDA_LONG kernelHeight, const CUDA_LONG horizontalSubsample, const CUDA_LONG verticalSubsample, const bool zeroPadding)
{
    const CUDA_LONG inputHeightTimesChannel = inputHeight * inputChannels;
    const size_t inputDim = inputWidth * inputHeightTimesChannel;

    const CUDA_LONG idall = blockIdx.x * blockDim.x + threadIdx.x;
    const CUDA_LONG sample = idall / inputDim;
    if (sample >= batchSize)
        return;

    const CUDA_LONG id = idall % inputDim;
    const CUDA_LONG y = id / inputHeightTimesChannel; // inputCol

    const size_t packedInputRows = kernelWidth * kernelHeight * inputChannels;
    const size_t packedInputColsPerSample = outputWidth * outputHeight; // output size per channel

    // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * inputChannels)
    // IN_ELEM_COLPOS = sample

    const CUDA_LONG nXC = id % inputHeightTimesChannel; // channel + inputRow*inputChannels
    const CUDA_LONG x = nXC / inputChannels;            // inputRow
    const CUDA_LONG c = nXC % inputChannels;            // channel

    ElemType currentInputValue = inputSubBatch[id + sample * inputDim];

    CUDA_LONG x0 = 0, y0 = 0, x1 = 0, y1 = 0;
    if (zeroPadding)
    {
        const CUDA_LONG halfKernelWidth = kernelWidth / 2;
        const CUDA_LONG halfKernelHeight = kernelHeight / 2;

        x0 = max((ElemType)0.0f, ceil((x - (ElemType) kernelHeight + 1.0f + halfKernelHeight) / (ElemType) verticalSubsample)); // row : first wrow in which x is in
        x1 = x + halfKernelHeight - x0 * verticalSubsample;                                                           // first posxInKernel
        y0 = max((ElemType)0.0f, ceil((y - (ElemType) kernelWidth + 1.0f + halfKernelWidth) / (ElemType) horizontalSubsample)); // col : first wcol in which y is in
        y1 = y + halfKernelWidth - y0 * horizontalSubsample;                                                          // first posyInKernel
    }
    else
    {
        x0 = max((ElemType)0.0f, ceil((x - (ElemType) kernelHeight + 1) / (ElemType) verticalSubsample));  // row : first wrow in which x is in
        x1 = x - x0 * verticalSubsample;                                                         // first posxInKernel
        y0 = max((ElemType)0.0f, ceil((y - (ElemType) kernelWidth + 1) / (ElemType) horizontalSubsample)); // col : first wcol in which y is in
        y1 = y - y0 * horizontalSubsample;                                                       // first posyInKernel
    }

    // PACK_ELEM_ROWPOS(channel, posxInKernel, posyInKernel) = (channel * kernelWidth * kernelHeight + posxInKernel + posyInKernel * kernelHeight)
    // PACK_ELEM_COLPOS(sample, wrow, wcol) = (sample*packedInputColsPerSample + outputHeight*wcol + wrow

    CUDA_LONG packColBase = sample * packedInputColsPerSample + y0 * outputHeight;
    for (CUDA_LONG wcol = y0, posyInKernel = y1; wcol < outputWidth && posyInKernel >= 0; wcol++, posyInKernel -= horizontalSubsample)
    {
        CUDA_LONG packRowBase = c * kernelWidth * kernelHeight + posyInKernel * kernelHeight;
        for (CUDA_LONG wrow = x0, posxInKernel = x1; wrow < outputHeight && posxInKernel >= 0; wrow++, posxInKernel -= verticalSubsample)
        {
            const CUDA_LONG packRow = packRowBase + posxInKernel;
            const CUDA_LONG packCol = packColBase + wrow;
            packedMatrix[packRow + packCol * packedInputRows] = currentInputValue;
        }
        packColBase += outputHeight;
    }
}

//assume each column is an input sample. Each sample is stored in [channel, row, col]  (r00, g00, b00, r01, g01, b01, r10, g10, b10, r11, g11, b11)
template <class ElemType>
__global__ void _unpackConvolutionInput(const ElemType* packedMatrix, ElemType* inputSubBatch, const CUDA_LONG batchSize,
                                        const CUDA_LONG inputWidth, const CUDA_LONG inputHeight, const CUDA_LONG inputChannels,
                                        const CUDA_LONG outputWidth, const CUDA_LONG outputHeight, const CUDA_LONG outputChannels,
                                        const CUDA_LONG kernelWidth, const CUDA_LONG kernelHeight, const CUDA_LONG horizontalSubsample, const CUDA_LONG verticalSubsample, const bool zeroPadding)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    const CUDA_LONG inputHeightTimesChannel = inputHeight * inputChannels;
    const size_t inputDim = inputWidth * inputHeightTimesChannel;

    const CUDA_LONG idall = blockIdx.x * blockDim.x + threadIdx.x;
    const CUDA_LONG sample = idall / inputDim;
    if (sample >= batchSize)
        return;

    const CUDA_LONG id = idall % inputDim;
    const CUDA_LONG y = id / inputHeightTimesChannel; // inputCol

    const size_t packedInputRows = kernelWidth * kernelHeight * inputChannels;
    const size_t packedInputColsPerSample = outputWidth * outputHeight; // output size per channel

    // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * inputChannels)
    // IN_ELEM_COLPOS = sample

    const CUDA_LONG nXC = id % inputHeightTimesChannel; // channel + inputRow*inputChannels
    const CUDA_LONG x = nXC / inputChannels;            // inputRow
    const CUDA_LONG c = nXC % inputChannels;            // channel

    CUDA_LONG x0 = 0, y0 = 0, x1 = 0, y1 = 0;
    if (zeroPadding)
    {
        const CUDA_LONG halfKernelWidth = kernelWidth / 2;
        const CUDA_LONG halfKernelHeight = kernelHeight / 2;

        x0 = max(0.0f, ceil((x - (comp_t) kernelHeight + 1.0f + halfKernelHeight) / (comp_t) verticalSubsample)); // row : first wrow in which x is in
        x1 = x + halfKernelHeight - x0 * verticalSubsample;                                                           // first posxInKernel
        y0 = max(0.0f, ceil((y - (comp_t) kernelWidth + 1.0f + halfKernelWidth) / (comp_t) horizontalSubsample)); // col : first wcol in which y is in
        y1 = y + halfKernelWidth - y0 * horizontalSubsample;                                                          // first posyInKernel
    }
    else
    {
        x0 = max(0.0f, ceil((x - (comp_t) kernelHeight + 1) / (comp_t) verticalSubsample));  // row : first wrow in which x is in
        x1 = x - x0 * verticalSubsample;                                                         // first posxInKernel
        y0 = max(0.0f, ceil((y - (comp_t) kernelWidth + 1) / (comp_t) horizontalSubsample)); // col : first wcol in which y is in
        y1 = y - y0 * horizontalSubsample;                                                       // first posyInKernel
    }

    // PACK_ELEM_ROWPOS(channel, posxInKernel, posyInKernel) = (channel * kernelWidth * kernelHeight + posxInKernel + posyInKernel * kernelHeight)
    // PACK_ELEM_COLPOS(sample, wrow, wcol) = (sample*packedInputColsPerSample + outputHeight*wcol + wrow

    comp_t currentInputValue = inputSubBatch[id + sample * inputDim];
    CUDA_LONG packColBase = sample * packedInputColsPerSample + y0 * outputHeight;
    for (CUDA_LONG wcol = y0, posyInKernel = y1; wcol < outputWidth && posyInKernel >= 0; wcol++, posyInKernel -= horizontalSubsample)
    {
        CUDA_LONG packRowBase = c * kernelWidth * kernelHeight + posyInKernel * kernelHeight;
        for (CUDA_LONG wrow = x0, posxInKernel = x1; wrow < outputHeight && posxInKernel >= 0; wrow++, posxInKernel -= verticalSubsample)
        {
            const CUDA_LONG packRow = packRowBase + posxInKernel;
            const CUDA_LONG packCol = packColBase + wrow;
            currentInputValue += (comp_t)packedMatrix[packRow + packCol * packedInputRows];
        }
        packColBase += outputHeight;
    }

    inputSubBatch[id + sample * inputDim] = currentInputValue;
}

template <class ElemType>
__global__ void _assignMaxPoolingResult(ElemType* outputBatch, const ElemType* inputBatch, const CUDA_LONG batchSize, const CUDA_LONG channels,
                                        const CUDA_LONG inputWidth, const CUDA_LONG inputHeight, const CUDA_LONG inputSizePerSample,
                                        const CUDA_LONG outputWidth, const CUDA_LONG outputHeight, const CUDA_LONG outputSizePerSample,
                                        const CUDA_LONG windowWidth, const CUDA_LONG windowHeight, const CUDA_LONG horizontalSubsample, const CUDA_LONG verticalSubsample)
{
    const CUDA_LONG outputIndex = blockIdx.x * blockDim.x + threadIdx.x;
    const CUDA_LONG sample = outputIndex / outputSizePerSample;
    if (sample >= batchSize)
        return;

    const CUDA_LONG outputIndexWithinSample = outputIndex % outputSizePerSample;
    const CUDA_LONG inputHeightTimesChannel = inputHeight * channels;
    const CUDA_LONG outputHeightTimesChannel = outputHeight * channels;

    // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * channels)
    // IN_ELEM_COLPOS = sample

    // OUT_ELEM_ROWPOS(channel, wrow, wcol) = (channel + (wrow + wcol * outputHeight) * channels)
    // OUT_ELEM_COLPOS = sample

    const CUDA_LONG y = outputIndexWithinSample / outputHeightTimesChannel;   // wcol
    const CUDA_LONG nXC = outputIndexWithinSample % outputHeightTimesChannel; // channel + wrow*channels
    const CUDA_LONG x = nXC / channels;                                       // wrow
    const CUDA_LONG c = nXC % channels;                                       // channel

    const ElemType* inputBatchBase4Sample = inputBatch + sample * inputSizePerSample;
    register ElemType maxVal = -FLT_MAX;
    const CUDA_LONG rowInWindowBase = (x * verticalSubsample + y * horizontalSubsample * inputHeight) * channels + c;
    for (CUDA_LONG colInWindow = 0; colInWindow < windowWidth; colInWindow++)
    {
        CUDA_LONG rowInInput = rowInWindowBase + colInWindow * inputHeightTimesChannel;
        for (CUDA_LONG rowInWindow = 0; rowInWindow < windowHeight; rowInWindow++)
        {
            const ElemType val = inputBatchBase4Sample[rowInInput];
            maxVal = max(maxVal, val);
            rowInInput += channels;
        }
    }
    outputBatch[outputIndexWithinSample + sample * outputSizePerSample] = maxVal;
}

template <class ElemType>
__global__ void _addMaxPoolingGradient(ElemType* inputGradientBatch, const ElemType* outputGradientBatch, const ElemType* inputBatch, const ElemType* outputBatch,
                                       const CUDA_LONG batchSize, const CUDA_LONG channels,
                                       const CUDA_LONG inputWidth, const CUDA_LONG inputHeight, const CUDA_LONG inputSizePerSample,
                                       const CUDA_LONG outputWidth, const CUDA_LONG outputHeight, const CUDA_LONG outputSizePerSample,
                                       const CUDA_LONG windowWidth, const CUDA_LONG windowHeight, const CUDA_LONG horizontalSubsample, const CUDA_LONG verticalSubsample)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    const CUDA_LONG inputIndex = blockIdx.x * blockDim.x + threadIdx.x;
    const CUDA_LONG sample = inputIndex / inputSizePerSample;
    if (sample >= batchSize)
        return;

    const CUDA_LONG inputIndexWithinSample = inputIndex % inputSizePerSample;

    const CUDA_LONG inputHeightTimesChannel = inputHeight * channels;
    const CUDA_LONG outputHeightTimesChannel = outputHeight * channels;

    // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * channels)
    // IN_ELEM_COLPOS = sample

    // OUT_ELEM_ROWPOS(channel, wrow, wcol) = (channel + (wrow + wcol * outputHeight) * channels)
    // OUT_ELEM_COLPOS = sample

    const CUDA_LONG y = inputIndexWithinSample / inputHeightTimesChannel;   // col in input
    const CUDA_LONG nXC = inputIndexWithinSample % inputHeightTimesChannel; // channel + row*chanels
    const CUDA_LONG x = nXC / channels;                                     // row in input
    const CUDA_LONG c = nXC % channels;                                     // channel

    CUDA_LONG startOutX = max(0.0f, ceil((x - (ElemType) windowHeight + 1) / (ElemType) verticalSubsample));     // inclusive start
    CUDA_LONG endOutX = (x / verticalSubsample < outputHeight - 1) ? x / verticalSubsample : outputHeight - 1;   // inclusive end
    CUDA_LONG startOutY = max(0.0f, ceil((y - (ElemType) windowWidth + 1) / (ElemType) horizontalSubsample));    // inclusive start
    CUDA_LONG endOutY = (y / horizontalSubsample < outputWidth - 1) ? y / horizontalSubsample : outputWidth - 1; // inclusive end

    ElemType* inputGradientBatchBase4Sample = inputGradientBatch + sample * inputSizePerSample;
    const ElemType* outputGradientBatchBase4Sample = outputGradientBatch + sample * outputSizePerSample;
    const ElemType* outputBatchBase4Sample = outputBatch + sample * outputSizePerSample;

    comp_t inputValue = inputBatch[inputIndexWithinSample + sample * inputSizePerSample];
    for (CUDA_LONG outY = startOutY; outY <= endOutY; outY++)
    {
        for (CUDA_LONG outX = startOutX; outX <= endOutX; outX++)
        {
            CUDA_LONG outputIndex = outY * outputHeightTimesChannel + outX * channels + c;
            if (inputValue == (comp_t)outputBatchBase4Sample[outputIndex])
                inputGradientBatchBase4Sample[inputIndexWithinSample] = (comp_t)inputGradientBatchBase4Sample[inputIndexWithinSample] + (comp_t)outputGradientBatchBase4Sample[outputIndex];
        }
    }
}
template <class ElemType>
__global__ void _assignAveragePoolingResult(ElemType* outputBatch, const ElemType* inputBatch, const CUDA_LONG batchSize, const CUDA_LONG channels,
                                            const CUDA_LONG inputWidth, const CUDA_LONG inputHeight, const CUDA_LONG inputSizePerSample,
                                            const CUDA_LONG outputWidth, const CUDA_LONG outputHeight, const CUDA_LONG outputSizePerSample,
                                            const CUDA_LONG windowWidth, const CUDA_LONG windowHeight, const CUDA_LONG horizontalSubsample, const CUDA_LONG verticalSubsample)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    const CUDA_LONG outputIndex = blockIdx.x * blockDim.x + threadIdx.x;
    const CUDA_LONG sample = outputIndex / outputSizePerSample;
    if (sample >= batchSize)
        return;

    const CUDA_LONG outputIndexWithinSample = outputIndex % outputSizePerSample;
    const CUDA_LONG inputHeightTimesChannel = inputHeight * channels;
    const CUDA_LONG outputHeightTimesChannel = outputHeight * channels;

    // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * channels)
    // IN_ELEM_COLPOS = sample

    // OUT_ELEM_ROWPOS(channel, wrow, wcol) = (channel + (wrow + wcol * outputHeight) * channels)
    // OUT_ELEM_COLPOS = sample

    const CUDA_LONG y = outputIndexWithinSample / outputHeightTimesChannel;   // wcol
    const CUDA_LONG nXC = outputIndexWithinSample % outputHeightTimesChannel; // channel + wrow*channels
    const CUDA_LONG x = nXC / channels;                                       // wrow
    const CUDA_LONG c = nXC % channels;                                       // channel

    const ElemType* inputBatchBase4Sample = inputBatch + sample * inputSizePerSample;

    register comp_t average = 0;
    const CUDA_LONG rowInWindowBase = (x * verticalSubsample + y * horizontalSubsample * inputHeight) * channels + c;
    for (CUDA_LONG colInWindow = 0; colInWindow < windowWidth; colInWindow++)
    {
        CUDA_LONG rowInInput = rowInWindowBase + colInWindow * inputHeightTimesChannel;
        for (CUDA_LONG rowInWindow = 0; rowInWindow < windowHeight; rowInWindow++)
        {
            average += (comp_t)inputBatchBase4Sample[rowInInput];
            rowInInput += channels;
        }
    }

    outputBatch[outputIndexWithinSample + sample * outputSizePerSample] = average / (comp_t)windowWidth / (comp_t)windowHeight;
}

template <class ElemType>
__global__ void _addAveragePoolingGradient(ElemType* inputGradientBatch, const ElemType* outputGradientBatch,
                                           const CUDA_LONG batchSize, const CUDA_LONG channels,
                                           const CUDA_LONG inputWidth, const CUDA_LONG inputHeight, const CUDA_LONG inputSizePerSample,
                                           const CUDA_LONG outputWidth, const CUDA_LONG outputHeight, const CUDA_LONG outputSizePerSample,
                                           const CUDA_LONG windowWidth, const CUDA_LONG windowHeight, const CUDA_LONG horizontalSubsample, const CUDA_LONG verticalSubsample)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    const CUDA_LONG inputIndex = blockIdx.x * blockDim.x + threadIdx.x;
    const CUDA_LONG sample = inputIndex / inputSizePerSample;
    if (sample >= batchSize)
        return;

    const CUDA_LONG inputIndexWithinSample = inputIndex % inputSizePerSample;

    const CUDA_LONG inputHeightTimesChannel = inputHeight * channels;
    const CUDA_LONG outputHeightTimesChannel = outputHeight * channels;
    const CUDA_LONG windowSize = windowWidth * windowHeight;

    // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * channels)
    // IN_ELEM_COLPOS = sample

    // OUT_ELEM_ROWPOS(channel, wrow, wcol) = (channel + (wrow + wcol * outputHeight) * channels)
    // OUT_ELEM_COLPOS = sample

    const CUDA_LONG y = inputIndexWithinSample / inputHeightTimesChannel;   // col in input
    const CUDA_LONG nXC = inputIndexWithinSample % inputHeightTimesChannel; // channel + row*chanels
    const CUDA_LONG x = nXC / channels;                                     // row in input
    const CUDA_LONG c = nXC % channels;                                     // channel

    CUDA_LONG startOutX = max(0.0f, ceil((x - (ElemType) windowHeight + 1) / (ElemType) verticalSubsample));     // inclusive start
    CUDA_LONG endOutX = (x / verticalSubsample < outputHeight - 1) ? x / verticalSubsample : outputHeight - 1;   // inclusive end
    CUDA_LONG startOutY = max(0.0f, ceil((y - (ElemType) windowWidth + 1) / (ElemType) horizontalSubsample));    // inclusive start
    CUDA_LONG endOutY = (y / horizontalSubsample < outputWidth - 1) ? y / horizontalSubsample : outputWidth - 1; // inclusive end

    ElemType* inputGradientBatchBase4Sample = inputGradientBatch + sample * inputSizePerSample;
    const ElemType* outputGradientBatchBase4Sample = outputGradientBatch + sample * outputSizePerSample;

    for (CUDA_LONG outY = startOutY; outY <= endOutY; outY++)
    {
        for (CUDA_LONG outX = startOutX; outX <= endOutX; outX++)
        {
            CUDA_LONG outputIndex = outY * outputHeightTimesChannel + outX * channels + c;
            inputGradientBatchBase4Sample[inputIndexWithinSample] = (comp_t)inputGradientBatchBase4Sample[inputIndexWithinSample] + (comp_t)outputGradientBatchBase4Sample[outputIndex] / (comp_t)windowSize;
        }
    }
}

template <class ElemType>
__global__ void _addMaxPoolingGradientLoopOut(ElemType* inputGradientBatch, const ElemType* outputGradientBatch, const ElemType* inputBatch, const ElemType* outputBatch,
                                              const CUDA_LONG batchSize, const CUDA_LONG channels,
                                              const CUDA_LONG inputWidth, const CUDA_LONG inputHeight, const CUDA_LONG inputSizePerSample,
                                              const CUDA_LONG outputWidth, const CUDA_LONG outputHeight, const CUDA_LONG outputSizePerSample,
                                              const CUDA_LONG windowWidth, const CUDA_LONG windowHeight, const CUDA_LONG horizontalSubsample, const CUDA_LONG verticalSubsample)
{
    const CUDA_LONG outputIndex = blockIdx.x * blockDim.x + threadIdx.x;
    const CUDA_LONG sample = outputIndex / outputSizePerSample;
    if (sample >= batchSize)
        return;

    const CUDA_LONG outputIndexWithinSample = outputIndex % outputSizePerSample;
    const CUDA_LONG inputWidthTimesChannel = inputWidth * channels;
    const CUDA_LONG outputWidthTimesChannel = outputWidth * channels;
    const CUDA_LONG y = outputIndexWithinSample / outputWidthTimesChannel;
    const CUDA_LONG nXC = outputIndexWithinSample % outputWidthTimesChannel;
    const CUDA_LONG x = nXC / channels;
    const CUDA_LONG c = nXC % channels;

    const CUDA_LONG offset0 = sample * inputSizePerSample + y * verticalSubsample * inputWidthTimesChannel + x * horizontalSubsample * channels;
    const ElemType* pCurWindow4Input = inputBatch + offset0; // pooling to current window's first input pixel
    ElemType* pCurWindow4InGradient = inputGradientBatch + offset0;
    for (CUDA_LONG yy = 0; yy < windowHeight; yy++)
    {
        const CUDA_LONG offset1 = yy * inputWidthTimesChannel + c;
        const ElemType* pf0 = pCurWindow4Input + offset1;
        ElemType* pf1 = pCurWindow4InGradient + offset1;
        for (CUDA_LONG xx = 0; xx < windowWidth; xx++)
        {
            const CUDA_LONG offset2 = xx * channels;
            if (pf0[offset2] == outputBatch[outputIndex])
            {
                pf1[offset2] += outputGradientBatch[outputIndex]; // need to be atomic however atomicAdd on double is not supported.
            }
        }
    }
}

template <class ElemType>
__global__ void _addElementProductOf(
    ElemType* us,
    const ElemType* a,
    const ElemType* b,
    const CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    us[id] = (comp_t)us[id] + ((comp_t)a[id] * (comp_t)b[id]);
}

template <class ElemType>
__global__ void _columnElementMultiplyWith(
    ElemType* us,
    const ElemType* a,
    const CUDA_LONG N, // a.GetNumRows();
    const CUDA_LONG M) // us.GetNumCols();
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    // __shared__ ElemType _a[GridDim::maxThreadsPerBlock];
    // _a[threadIdx.x]=a[id];
    comp_t mul = a[id];
    for (CUDA_LONG j = 0; j < M; ++j)
    {
        us[IDX2C(id, j, N)] = (comp_t)us[IDX2C(id, j, N)] * mul;
    }
}

template <class ElemType>
__global__ void _rowElementMultiplyWith(
    ElemType* us,
    const ElemType* a,
    const CUDA_LONG N, // us.GetNumRows();
    const CUDA_LONG M) // a.GetNumCols();
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= M)
        return;

    // __shared__ ElemType _a[GridDim::maxThreadsPerBlock];
    // _a[threadIdx.x]=a[id];
    comp_t mul = a[id];
    for (CUDA_LONG i = 0; i < N; ++i)
    {
        us[IDX2C(i, id, N)] = (comp_t)us[IDX2C(i, id, N)] * mul;
    }
}

template <class ElemType>
__global__ void _rowElementDivideBy(
    ElemType* us,
    const ElemType* a,
    const CUDA_LONG N, // us.GetNumRows();
    const CUDA_LONG M) // a.GetNumCols();
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= M)
        return;

    // __shared__ ElemType _a[GridDim::maxThreadsPerBlock];
    // _a[threadIdx.x]=a[id];
    comp_t v = a[id];
    if (v >= 0 && v < EPS_IN_INVERSE)
        v = EPS_IN_INVERSE;
    else if (v < 0 && v > -EPS_IN_INVERSE)
        v = (-EPS_IN_INVERSE);

    for (CUDA_LONG i = 0; i < N; ++i)
    {
        us[IDX2C(i, id, N)] = (comp_t)us[IDX2C(i, id, N)] / v;
    }
}

template <class ElemType>
__global__ void _ColumnElementDivideBy(
    ElemType* us,
    const ElemType* a,
    const CUDA_LONG N, // a.GetNumRows();
    const CUDA_LONG M) // us.GetNumCols();
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    comp_t smallValue = EPS_IN_INVERSE;

    // __shared__ ElemType _a[GridDim::maxThreadsPerBlock];
    // _a[threadIdx.x]=a[id];
    comp_t v = a[id];
    for (CUDA_LONG j = 0; j < M; ++j)
    {
        if (v < 0 && v > -smallValue)
            us[IDX2C(id, j, N)] = (comp_t)us[IDX2C(id, j, N)] / (-smallValue);
        else if (v >= 0 && v < smallValue)
            us[IDX2C(id, j, N)] = (comp_t)us[IDX2C(id, j, N)] / smallValue;
        else
            us[IDX2C(id, j, N)] = (comp_t)us[IDX2C(id, j, N)] / v;
    }
}

template <class ElemType>
__global__ void _innerProduct(
    ElemType* c,
    const ElemType* a,
    const ElemType* b,
    const CUDA_LONG N, // a.GetNumRows();
    const CUDA_LONG M, // a.GetNumCols();
    const bool isColWise)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if ((isColWise && id >= M) || (!isColWise && id >= N))
        return;

    comp_t sum = 0;
    CUDA_LONG index;
    if (isColWise)
    {
        for (CUDA_LONG i = 0; i < N; ++i)
        {
            index = IDX2C(i, id, N);
            sum += (comp_t)a[index] * (comp_t)b[index];
        }
    }
    else
    {
        for (CUDA_LONG j = 0; j < M; ++j)
        {
            index = IDX2C(id, j, N);
            sum += (comp_t)a[index] * (comp_t)b[index];
        }
    }

    c[id] = sum;
}

template <class ElemType>
__global__ void _innerProduct4SparseCSC(
    ElemType* c,
    const ElemType* a,
    const GPUSPARSE_INDEX_TYPE* aRowIndex,
    const GPUSPARSE_INDEX_TYPE* aColCSCIndex,
    const ElemType* b,
    const CUDA_LONG M, // a.GetNumRows();
    const CUDA_LONG N, // a.GetNumCols();
    const bool isColWise)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if ((isColWise && id >= N) || (!isColWise && id >= M))
        return;

    ElemType sum = 0;
    CUDA_LONG index;

    if (isColWise)
    {
        for (CUDA_LONG i = aColCSCIndex[id]; i < aColCSCIndex[id+1]; i++)
        {
            index = IDX2C(aRowIndex[i], id, M);
            sum += a[i] * b[index];
        }
    }
    else
    {
        for (CUDA_LONG j = 0; j < N; ++j)
        {
            for (CUDA_LONG i = aColCSCIndex[j]; i < aColCSCIndex[j+1]; i++)
            {
                if (aRowIndex[i] == id)
                {
                    index = IDX2C(id, j, M);
                    sum += a[i] * b[index];
                    break;
                }
            }
        }
    }

    c[id] = sum;
}

template <class ElemType>
__global__ void _assignSignOf(
    ElemType* a,
    const ElemType* b,
    const CUDA_LONG N)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    ElemType v = b[id];
    a[id] = (v == (ElemType) 0 ? (ElemType) 0 : (v > 0 ? (ElemType) 1 : (ElemType)(-1)));
}

template <class ElemType>
__global__ void _addSignOf(
    ElemType* a,
    const ElemType* b,
    const CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    comp_t v = b[id];
    a[id] = (comp_t)a[id] + (v == (comp_t) 0 ? (comp_t) 0 : (v > (comp_t)0 ? (comp_t) 1 : (comp_t)(-1)));
}

// This function processes 1 column per block. this function needs 512 threads
template <class ElemType, bool IsMax>
__global__ void _vectorMaxMinReduce512Threads(
    const ElemType* us,
    ElemType* Indexes,
    ElemType* Values,
    const CUDA_LONG numRows,
    const CUDA_LONG numCols)
{
    // we first find max per column
    __shared__ ElemType partials[512];
    __shared__ int partialsInd[512];
    if (IsMax)
    {
        partials[threadIdx.x] = -10000000;
    }
    else
    {
        partials[threadIdx.x] = 10000000;
    }
    partialsInd[threadIdx.x] = -1;

    for (int i = threadIdx.x; i < numRows; i += 512)
    {
        if ((IsMax ? (us[IDX2C(i, blockIdx.x, numRows)] > partials[threadIdx.x]) : (us[IDX2C(i, blockIdx.x, numRows)] < partials[threadIdx.x])) || (partialsInd[threadIdx.x] == -1))
        {
            partials[threadIdx.x] = us[IDX2C(i, blockIdx.x, numRows)];
            partialsInd[threadIdx.x] = i;
        }
    }
    __syncthreads();

    if (threadIdx.x < 256)
    {
        if ((IsMax ? (partials[threadIdx.x + 256] > partials[threadIdx.x]) : (partials[threadIdx.x + 256] < partials[threadIdx.x])) || (partialsInd[threadIdx.x] == -1))
        {
            partials[threadIdx.x] = partials[threadIdx.x + 256];
            partialsInd[threadIdx.x] = partialsInd[threadIdx.x + 256];
        }
    }
    __syncthreads();

    if (threadIdx.x < 128)
    {
        if ((IsMax ? (partials[threadIdx.x + 128] > partials[threadIdx.x]) : (partials[threadIdx.x + 128] < partials[threadIdx.x])) || (partialsInd[threadIdx.x] == -1))
        {
            partials[threadIdx.x] = partials[threadIdx.x + 128];
            partialsInd[threadIdx.x] = partialsInd[threadIdx.x + 128];
        }
    }
    __syncthreads();

    if (threadIdx.x < 64)
    {
        if ((IsMax ? (partials[threadIdx.x + 64] > partials[threadIdx.x]) : (partials[threadIdx.x + 64] < partials[threadIdx.x])) || (partialsInd[threadIdx.x] == -1))
        {
            partials[threadIdx.x] = partials[threadIdx.x + 64];
            partialsInd[threadIdx.x] = partialsInd[threadIdx.x + 64];
        }
    }
    __syncthreads();

    if (threadIdx.x < 32)
    {
        if ((IsMax ? (partials[threadIdx.x + 32] > partials[threadIdx.x]) : (partials[threadIdx.x + 32] < partials[threadIdx.x])) || (partialsInd[threadIdx.x] == -1))
        {
            partials[threadIdx.x] = partials[threadIdx.x + 32];
            partialsInd[threadIdx.x] = partialsInd[threadIdx.x + 32];
        }
    }
    __syncthreads();

    if (threadIdx.x < 16)
    {
        if ((IsMax ? (partials[threadIdx.x + 16] > partials[threadIdx.x]) : (partials[threadIdx.x + 16] < partials[threadIdx.x])) || (partialsInd[threadIdx.x] == -1))
        {
            partials[threadIdx.x] = partials[threadIdx.x + 16];
            partialsInd[threadIdx.x] = partialsInd[threadIdx.x + 16];
        }
    }
    __syncthreads();

    if (threadIdx.x < 8)
    {
        if ((IsMax ? (partials[threadIdx.x + 8] > partials[threadIdx.x]) : (partials[threadIdx.x + 8] < partials[threadIdx.x])) || (partialsInd[threadIdx.x] == -1))
        {
            partials[threadIdx.x] = partials[threadIdx.x + 8];
            partialsInd[threadIdx.x] = partialsInd[threadIdx.x + 8];
        }
    }
    __syncthreads();

    if (threadIdx.x < 4)
    {
        if ((IsMax ? (partials[threadIdx.x + 4] > partials[threadIdx.x]) : (partials[threadIdx.x + 4] < partials[threadIdx.x])) || (partialsInd[threadIdx.x] == -1))
        {
            partials[threadIdx.x] = partials[threadIdx.x + 4];
            partialsInd[threadIdx.x] = partialsInd[threadIdx.x + 4];
        }
    }
    __syncthreads();

    if (threadIdx.x == 0)
    {
        ElemType mx = partials[0];
        int ind = partialsInd[0];
        if ((IsMax ? (mx < partials[1]) : (mx > partials[1])) || (ind == -1))
        {
            mx = partials[1];
            ind = partialsInd[1];
        }
        if ((IsMax ? (mx < partials[2]) : (mx > partials[2])) || (ind == -1))
        {
            mx = partials[2];
            ind = partialsInd[2];
        }
        if ((IsMax ? (mx < partials[3]) : (mx > partials[3])) || (ind == -1))
        {
            mx = partials[3];
            ind = partialsInd[3];
        }
        Values[blockIdx.x] = mx;
        Indexes[blockIdx.x] = ind;
    }
}

template <class ElemType>
__global__ void _vectorMax(
    const ElemType* us,
    ElemType* maxIndexes,
    ElemType* maxValues,
    const CUDA_LONG m, // number of rows
    const CUDA_LONG n, // number of cols
    const bool isColWise)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    CUDA_LONG maxInd = -1;
    ElemType maxVal = -100000;

    if (isColWise)
    {
        if (id >= n)
            return;

        for (CUDA_LONG i = 0; i < m; i++)
        {
            if (maxInd == -1 || us[IDX2C(i, id, m)] >= maxVal)
            {
                maxInd = i;
                maxVal = us[IDX2C(i, id, m)];
            }
        }
    }
    else
    {
        if (id >= m)
            return;

        for (CUDA_LONG j = 0; j < n; j++)
        {
            if (maxInd == -1 || us[IDX2C(id, j, m)] >= maxVal)
            {
                maxInd = j;
                maxVal = us[IDX2C(id, j, m)];
            }
        }
    }
    maxIndexes[id] = maxInd;
    maxValues[id] = maxVal;
}

template <class ElemType>
__global__ void _vectorMin(
    const ElemType* us,
    ElemType* minIndexes,
    ElemType* minValues,
    const CUDA_LONG m, // number of rows
    const CUDA_LONG n, // number of cols
    const bool isColWise)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    CUDA_LONG minInd = -1;
    ElemType minVal = -100000;

    if (isColWise)
    {
        if (id >= n)
            return;

        for (CUDA_LONG i = 0; i < m; i++)
        {
            if (minInd == -1 || us[IDX2C(i, id, m)] <= minVal)
            {
                minInd = i;
                minVal = us[IDX2C(i, id, m)];
            }
        }
    }
    else
    {
        if (id >= m)
            return;

        for (CUDA_LONG j = 0; j < n; j++)
        {
            if (minInd == -1 || us[IDX2C(id, j, m)] <= minVal)
            {
                minInd = j;
                minVal = us[IDX2C(id, j, m)];
            }
        }
    }
    minIndexes[id] = minInd;
    minValues[id] = minVal;
}

template <class ElemType>
__global__ void _matrixMatrixAddOnCuda(
    const ElemType alpha,
    const ElemType* a,
    const ElemType* b,
    ElemType* c,
    const CUDA_LONG N)
{
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    c[id] = (comp_t)alpha * (comp_t)a[id] + (comp_t)b[id];
}

template <class ElemType>
__global__ void _matrixVectorRowWiseAddWithThreadPerElem(
    const ElemType* a,
    const ElemType* b,
    ElemType* us,
    ElemType alpha,
    const CUDA_LONG m, // number of rows
    const CUDA_LONG n) // number of cols
{
    CUDA_LONG N = m * n; // used in CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N) macro
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);

    CUDA_LONG col = id / m;

    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    us[id] = (comp_t)alpha * (comp_t)a[col] + (comp_t)b[id];
}

//this implementation uses more threads but also more memory access
template <class ElemType>
__global__ void _matrixVectorColumnWiseAddWithThreadPerElem(
    const ElemType* a,
    const ElemType* b,
    ElemType* us,
    ElemType alpha,
    const CUDA_LONG m, // number of rows
    const CUDA_LONG n) // number of cols
{
    CUDA_LONG N = m * n; // used in CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N) macro
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);

    CUDA_LONG col = id / m;
    CUDA_LONG row = id - col * m;

    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    us[id] = (comp_t)alpha * (comp_t)a[row] + (comp_t)b[id];
}

template <class ElemType>
__global__ void _matrixVectorColumnWiseAddWithThreadPerRow(
    const ElemType* a,
    ElemType* us,
    ElemType alpha,
    const CUDA_LONG m, // number of rows
    const CUDA_LONG n) // number of cols
{
#ifdef VALIDATION
    if (blockDim.x * blockIdx.x + threadIdx.x == 0)
    {
        printf("** _matrixVectorColumnWiseAdd on device:\na = %p, us = %p, alpha = %f, m = %ld, n = %ld\n",
               a, us, alpha, m, n);
        printf("us[0] = %f\n", us[0]);
        printf("a[0] = %f\n", a[0]);
    }
#endif
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= m)
        return;
    ElemType tmp = a[id];
#ifdef VALIDATION
    printf("  a[%d] = %f\n", id, tmp);
#endif
    for (CUDA_LONG j = 0; j < n; ++j)
    {
        us[j * m + id] += alpha * tmp;
    }
}

template <class ElemType>
__global__ void _matrixVectorColumnWiseAddBlockPerRow(
    const ElemType* a,
    ElemType* us,
    ElemType alpha,
    const CUDA_LONG m, // number of rows
    const CUDA_LONG n) // number of cols
{
    ElemType tmp;

    if (threadIdx.x == 0)
    {
        tmp = a[blockIdx.x];
    }
    __syncthreads();

    int loadPerThread = n / blockDim.x;

    for (int i = threadIdx.x * loadPerThread; i < (threadIdx.x == blockDim.x - 1 ? n : (threadIdx.x + 1) * loadPerThread); ++i)
    {
        us[m * blockIdx.x + i] += alpha * tmp;
    }
}

template <class ElemType>
__global__ void _addScaledDifference(
    ElemType alpha,
    ElemType* a,
    ElemType* b,
    ElemType* c,
    CUDA_LONG N)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    c[id] = (comp_t)c[id] + ((comp_t)a[id] - (comp_t)b[id]) * ((comp_t)alpha);
}

template <class ElemType>
__global__ void _assignScaledDifference(
    ElemType alpha,
    ElemType* a,
    ElemType* b,
    ElemType* c,
    CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    c[id] = ((comp_t)a[id] - (comp_t)b[id]) * ((comp_t)alpha);
}

template <class ElemType>
__global__ void _addScaledDifference(
    ElemType* alpha,
    ElemType* a,
    ElemType* b,
    ElemType* c,
    CUDA_LONG N)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    c[id] = (comp_t)c[id] + ((comp_t)a[id] - (comp_t)b[id]) * (comp_t)alpha[0];
}

template <class ElemType>
__global__ void _assignScaledDifference(
    ElemType* alpha,
    ElemType* a,
    ElemType* b,
    ElemType* c,
    CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    c[id] = ((comp_t)a[id] - (comp_t)b[id]) * (comp_t)alpha[0];
}

template <class ElemType>
__global__ void _addElementToElement(
    ElemType beta,
    const ElemType* a, CUDA_LONG indexA,
    ElemType* c, CUDA_LONG indexC)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    //CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;  // only one thread launched
    //if (id > 0)
    //    return;
    comp_t us = (comp_t)beta ? (comp_t)beta * (comp_t)c[indexC] : (comp_t)0; // do not multiply if beta is 0, could be a NaN
    us += (comp_t)a[indexA];
    c[indexC] = us;
}

template <class ElemType>
__global__ void _assignNumOfDiff1024Threads(
    const ElemType* a,
    const ElemType* b,
    ElemType* c,
    CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    __shared__ comp_t partialSums[1024];
    partialSums[threadIdx.x] = 0;
    // int id = blockDim.x * blockIdx.x + threadIdx.x;
    CUDA_LONG loadPerThread = N / blockDim.x;
    for (CUDA_LONG i = threadIdx.x * loadPerThread; i < (threadIdx.x == blockDim.x - 1 ? N : (threadIdx.x + 1) * loadPerThread); ++i)
    {
        partialSums[threadIdx.x] += ((comp_t)a[i] != (comp_t)b[i]);
    }
    __syncthreads();

    // 512
    if (threadIdx.x < 512)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 512];
    }
    __syncthreads();

    // 256
    if (threadIdx.x < 256)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 256];
    }
    __syncthreads();

    // 128
    if (threadIdx.x < 128)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 128];
    }
    __syncthreads();

    // 64
    if (threadIdx.x < 64)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 64];
    }
    __syncthreads();

    // 32
    if (threadIdx.x < 32)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 32];
    }
    __syncthreads();

    // 16
    if (threadIdx.x < 16)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 16];
    }
    __syncthreads();

    // 8
    if (threadIdx.x < 8)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 8];
    }
    __syncthreads();

    // 4
    if (threadIdx.x < 4)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 4];
    }
    __syncthreads();

    if (threadIdx.x == 0)
    {
        c[0] = partialSums[0] + partialSums[1] + partialSums[2] + partialSums[3];
    }
}

/*template<class ElemType>
__global__ void _assignNumOfDiff1024Threads(
ElemType *a,
ElemType *b,
ElemType *c,
CUDA_LONG N)
{
//TO DO: replace atomic operation with reduction

__shared__ int totalSum;
if (threadIdx.x == 0) totalSum = 0;
__syncthreads();

int id = blockDim.x * blockIdx.x + threadIdx.x;
if (id>=N)
return;

int localVal = (a[id] != b[id]);
atomicAdd(&totalSum, localVal);
__syncthreads();

c[id] = totalSum;
}*/

template <class ElemType>
__global__ void _scaleArray(
    ElemType alpha,
    ElemType* us,
    CUDA_LONG N)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    us[id] = us[id] * alpha;
}

template <class ElemType>
__global__ void _sparseCSRPlusDense(
    ElemType alpha,
    const ElemType* m_dVal,
    const int* m_dRow,
    const int* m_dCol,
    ElemType* pArrayDev,
    CUDA_LONG M)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= M)
        return;
    int start = m_dRow[id];
    int end = m_dRow[id + 1];
    for (int _i = start; _i < end; ++_i) // _i is index in m_dVal and m_dCol
    {
        int j = m_dCol[_i];
        pArrayDev[IDX2C(id, j, M)] += (alpha * m_dVal[_i]);
    }
}

template <class ElemType>
__global__ void _sparseCSRElemMulDense(
    const ElemType* m_dVal,
    const int* m_dRow,
    const int* m_dCol,
    const ElemType* b,
    ElemType* c,
    CUDA_LONG M)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= M)
        return;
    int start = m_dRow[id];
    int end = m_dRow[id + 1];
    for (int _i = start; _i < end; ++_i) // _i is index in m_dVal and m_dCol
    {
        int j = m_dCol[_i];
        c[IDX2C(id, j, M)] = b[IDX2C(id, j, M)] * m_dVal[_i];
    }
}

template <class ElemType>
__global__ void _isValid(
    const GPUSPARSE_INDEX_TYPE* rowIndex,
    const GPUSPARSE_INDEX_TYPE* colCSCIndex,
    const int rows,
    const int cols,
    const int nz,
    long* d_res)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= cols || d_res[0] <= 0)
        return;

    int start = colCSCIndex[id];
    int end = colCSCIndex[id + 1];

    if (start > end)
    {
        if (d_res[0] > 0)
        {
            d_res[0] = -1;
            d_res[1] = id;
            d_res[2] = start;
            d_res[3] = end;
        }
    }
    else if (end > nz)
    {
        if (d_res[0] > 0)
        {
            d_res[0] = -2;
            d_res[1] = id + 1;
            d_res[2] = end;
            d_res[3] = nz;
        }
    }
    else
    {
        for (int j = start; j < end; j++) // j points to the value
        {
            if (rowIndex[j] >= rows)
            {
                if (d_res[0] > 0)
                {
                    d_res[0] = -3;
                    d_res[1] = j;
                    d_res[2] = rowIndex[j];
                    d_res[3] = rows;
                    break;
                }
            }
            if (j > start && rowIndex[j] < rowIndex[j - 1])
            {
                if (d_res[0] > 0)
                {
                    d_res[0] = -4;
                    d_res[1] = id;
                    d_res[2] = j;
                    d_res[3] = rowIndex[j];
                    break;
                }
            }
        }
    }
}

template <class ElemType>
__global__ void _shiftColCSCIndexFromSliceViewToAbsolute(
    GPUSPARSE_INDEX_TYPE* colCSCIndex,
    const int cols,
    const int nz)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= cols)
        return;

    colCSCIndex[id] = colCSCIndex[id] - colCSCIndex[0];

    if (id == cols - 1)
        colCSCIndex[cols] = nz;
}

//c = alpha * op(a) * op(b) + beta*c
// TODO: This function can be further improved by loading the kernel in shared memory
template <class ElemType>
__global__ void _dense1DConvMultSparseCSCAndWeightedAddToDense(
    const int m,                   // rowDense
    const int k,                   // colDense
    const int n,                   // colSparse
    const int numChannels,         // input num channels
    const int numSteps,            // convolution num steps
    const int horizontalSubsample, // convolution step size
    const bool channelwise,        // pixelwise for normal multiplication and channelwise for convolution operation
    const ElemType alpha,
    const ElemType* a, // dense
    const bool transposeA,
    const ElemType* bnzValues, // sparse nz values
    const GPUSPARSE_INDEX_TYPE* rowIndex,
    const GPUSPARSE_INDEX_TYPE* colCSCIndex,
    const ElemType beta,
    ElemType* c // dense target
    )
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= m * numSteps * n)
        return;

    int colInC = id / (m * numSteps);
    int rowInC = id % (m * numSteps);
    int stepIdx = rowInC / m;

    int start = colCSCIndex[colInC];
    int end = colCSCIndex[colInC + 1];

    ElemType s = 0;
    for (int j = start; j < end; j++) // j points to the value
    {
        int i = rowIndex[j] - (horizontalSubsample * numChannels * stepIdx); // offset row index by the convolution step

        if (i >= 0)
        {
            if (i >= k)
                break;

            // Convert to channelwise index.
            // This is similar to rowwise to columnwise conversion
            if (channelwise)
            {
                int pixel = i / numChannels;
                int channel = i % numChannels;
                int numPixels = k / numChannels;
                i = channel * numPixels + pixel;
            }

            if (!transposeA)
                s += a[IDX2C(rowInC % m, i, m)] * bnzValues[j];
            else
                s += a[IDX2C(i, rowInC % m, k)] * bnzValues[j];
        }
    }

    c[IDX2C(rowInC, colInC, m * numSteps)] = alpha * s + (beta == 0 ? (ElemType)0 : beta * c[IDX2C(rowInC, colInC, m * numSteps)]); // If beta is zero then don't lookup c
}

/// c += alpha * a * b^T
template <class ElemType>
__global__ void _dense1DConvMultSparseCSCTransposeAndAddToDense(
    int m,                   // rowDense
    int k,                   // colDense
    int n,                   // colSparse
    int numChannels,         // input num channels
    int numSteps,            // convolution num steps
    int horizontalSubsample, // convolution step size
    bool channelwise,        // pixelwise for normal multiplication and channelwise for convolution operation
    int rowInB,              // row index of the sparse matrix
    ElemType alpha,
    const ElemType* a, // dense
    bool transposeA,
    const ElemType* bnzValues, // sparse nz values
    const GPUSPARSE_INDEX_TYPE* rowIndex,
    const GPUSPARSE_INDEX_TYPE* colCSCIndex,
    ElemType* c // dense target
    )
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= m * numSteps)
        return;

    int rowInC = id;
    int stepIdx = rowInC / m;
    int i = rowInB - (horizontalSubsample * numChannels * stepIdx); // offset row index by the convolution step

    if (i < 0 || i >= k)
        return;

    // Convert to channelwise index.
    // This is similar to rowwise to columnwise conversion
    if (channelwise)
    {
        int pixel = i / numChannels;
        int channel = i % numChannels;
        int numPixels = k / numChannels;
        i = channel * numPixels + pixel;
    }

    int start = colCSCIndex[rowInB];
    int end = colCSCIndex[rowInB + 1];

    ElemType s = 0;
    for (int j = start; j < end; j++) // j points to the value that are in the same row
    {
        int colInC = rowIndex[j]; // the column index because of transpose

        // bnzValues[j] = the b[][j] value
        if (!transposeA)
            s = a[IDX2C(rowInC % m, i, m)] * bnzValues[j];
        else
            s = a[IDX2C(i, rowInC % m, k)] * bnzValues[j];

        atomicAdd(&c[IDX2C(rowInC, colInC, m * numSteps)], alpha * s);
    }
}

template <class ElemType>
__global__ void _columnwiseScaleAndWeightedAdd(
    ElemType alpha,
    const ElemType* aData,
    const ElemType* vData,
    ElemType beta,
    ElemType* cData,
    int m, int n)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= m * n)
        return;

    CUDA_LONG col = id / m;

    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    if (beta == 0) // don't even read the memory if beta is 0
        cData[id] = (comp_t)alpha * (comp_t)vData[col] * (comp_t)aData[id];
    else
        cData[id] = (comp_t)alpha * (comp_t)vData[col] * (comp_t)aData[id] + (comp_t)beta * (comp_t)cData[id];
}

template <class ElemType>
__global__ void _columnwiseScaleAndWeightedAdd4CSC(
    ElemType alpha,
    const ElemType* aData, const GPUSPARSE_INDEX_TYPE* aSecondaryIndices, const GPUSPARSE_INDEX_TYPE* aMajorIndices,
    const ElemType* vData,
    ElemType beta,
    ElemType* cData,
    int m, int n)
{
    CUDA_LONG col = blockDim.x * blockIdx.x + threadIdx.x;
    if (col >= n)
        return;

    GPUSPARSE_INDEX_TYPE start = aSecondaryIndices[col];
    GPUSPARSE_INDEX_TYPE end = aSecondaryIndices[col + 1];

    for (GPUSPARSE_INDEX_TYPE p = start; p < end; p++)
    {
        GPUSPARSE_INDEX_TYPE row = aMajorIndices[p];
        ElemType val = aData[p];

        if (beta == 0) // don't even read the memory if beta is 0
            cData[IDX2C(row, col, m)] = alpha * vData[col] * val;
        else
            cData[IDX2C(row, col, m)] = alpha * vData[col] * val + beta * cData[IDX2C(row, col, m)];
    }
}

///
/// adjusts the sparse block column matrix with the new Col2BlockId
/// For each column, if new Col2BlockId contains valid index, a corresponding block exists at the index
/// if old col2BlockId[i] contains value at that column, it would be copied over; otherwise the block would be filled with zeros
///
template <class ElemType>
__global__ void _adjustCol2BlockId(
    const int numRows,
    const int numCols,
    const GPUSPARSE_INDEX_TYPE* oldCol2BlockId,
    const ElemType* oldNZ,
    const GPUSPARSE_INDEX_TYPE* newCol2BlockId,
    ElemType* newNZ,
    GPUSPARSE_INDEX_TYPE* newBlockId2Col)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= numCols)
        return;

    int newBlockId = newCol2BlockId[id];
    if (newBlockId != SparseIndex_NotAssigned)
    {
        newBlockId2Col[newBlockId] = id;
        int oldBlockId = oldCol2BlockId[id];
        if (oldBlockId != SparseIndex_NotAssigned)
        {
            const ElemType* oldValue = oldNZ + numRows * oldBlockId;
            ElemType* newValue = newNZ + numRows * newBlockId;
            for (int row = 0; row < numRows; row++)
            {
                newValue[row] = oldValue[row];
            }
        }
        else
        {
            ElemType* newValue = newNZ + numRows * newBlockId;
            for (int row = 0; row < numRows; row++)
            {
                newValue[row] = 0;
            }
        }
    }
}

template <class ElemType>
__global__ void _reshape(
    const int oldNumRows,                       // old row count
    const int oldNumCols,                       // old col count
    const int newNumRows,                       // new row count
    const int newNumCols,                       // new col count
    const GPUSPARSE_INDEX_TYPE* oldRowIndex,    // old row index array
    const GPUSPARSE_INDEX_TYPE* oldColumnIndex, // old column index array
    GPUSPARSE_INDEX_TYPE* newRowIndex,          // new row index array
    GPUSPARSE_INDEX_TYPE* newColumnIndex        // new column index array
    )
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= newNumCols)
        return;

    int currentCol = id;
    int oldColLower = (newNumRows * currentCol) / oldNumRows;

    // initialize to the end and then scan in the right direction in the for-loop
    int currentColStart = oldColumnIndex[oldNumCols];

    for (int oldCol = oldColLower; oldCol < oldNumCols; oldCol++)
    {
        int start = oldColumnIndex[oldCol];
        int end = oldColumnIndex[oldCol + 1];
        bool done = false;

        for (int j = start; j < end; j++) // j points to the value
        {
            int oldRow = oldRowIndex[j];
            int index = (oldCol * oldNumRows + oldRow);
            int newCol = index / newNumRows;
            int newRow = index % newNumRows;

            if (newCol == currentCol)
                newRowIndex[j] = newRow;

            if (newCol >= currentCol && currentColStart > j)
                currentColStart = j;

            if (newCol > currentCol)
            {
                done = true;
                break;
            }
        }

        if (done)
            break;
    }

    newColumnIndex[currentCol] = currentColStart;

    if (currentCol == (newNumCols - 1))
        newColumnIndex[newNumCols] = oldColumnIndex[oldNumCols]; // set end pointer
}

//called before _determineBlockIds and _denseMulSparseCSCTransposeToSparseBlockCol to determine which columns have values and
//what's the mapping from the column id in the resulted SparseBlockCol format to the column id in the dense format
//input: rowIndexes: the row indexes of the CSC sparse matrix to be multiplied with
//blockIDs: the blockID mapping in the resulting matrix;
//nnz: number of nonzero value or the size of rowIndexes;
template <class ElemType>
__global__ void _findColsWithValues(
    const GPUSPARSE_INDEX_TYPE* rowIndexes, GPUSPARSE_INDEX_TYPE* col2BlockIds, const size_t nnz)
{
    const size_t nzIndex = blockIdx.x * blockDim.x + threadIdx.x;
    if (nzIndex >= nnz)
        return;

    if (col2BlockIds[rowIndexes[nzIndex]] == SparseIndex_NotAssigned)
        col2BlockIds[rowIndexes[nzIndex]] = SparseIndex_Pending; // this row has value.
}

//called before _denseMulSparseCSCTransposeToSparseBlockCol and after _findColsWithValuesto determine which columns have values and
//what's the mapping from the column id in the resulted SparseBlockCol format to the column id in the dense format
//input: rowIndexes: the row indexes of the CSC sparse matrix to be multiplied with
//blockId2Col: the blockID to colum id mapping in the resulting matrix;
//col2BlockId: the col2BlockId to blockID mapping in the resulting matrix;
//numCols: number of columns in the resulting matrix or the size of blockIDs
//blockSize: return the blockSize with values
template <class ElemType>
__global__ void _determineBlockIds(
    GPUSPARSE_INDEX_TYPE* blockId2Col, GPUSPARSE_INDEX_TYPE* col2BlockId, size_t numCols, size_t* blockSize)
{
    const size_t col = blockIdx.x * blockDim.x + threadIdx.x;
    if (col >= numCols)
        return;

    if (col2BlockId[col] == SparseIndex_Pending)
    {
        GPUSPARSE_INDEX_TYPE blockIndex = atomicAdd((unsigned int*)blockSize, (unsigned int)1);
        col2BlockId[col] = blockIndex;
        blockId2Col[blockIndex] = col;
    }
}

// backward pass from hidden layer to feature weight
//result (sparse BlockCol)= alpha * (lhs (dense) X rhs^T (sparse CSC)
//assume resultValues are 0-initialized
template <class ElemType>
__global__ void _denseMulSparseCSCTransposeToSparseBlockCol2(
    const ElemType alpha,
    const ElemType* lhsValues,
    const size_t numRowsLhs,
    const size_t numColsRhs,
    const ElemType* rhsNZValues,
    const GPUSPARSE_INDEX_TYPE* rhsRows,
    const GPUSPARSE_INDEX_TYPE* rhsCols,
    const GPUSPARSE_INDEX_TYPE* col2blockIds,
    ElemType* resultValues)
{
    const CUDA_LONG index = blockIdx.x * blockDim.x + threadIdx.x;
    const CUDA_LONG lhsCol = index / numRowsLhs; // rhsCol == lhsCol
    if (lhsCol >= numColsRhs)
        return;
    const CUDA_LONG lhsRow = index - numRowsLhs * lhsCol; // resultRow == lhsRow

    // each thread handles one [row, col] combination
    ElemType lhsValue = alpha * lhsValues[IDX2C(lhsRow, lhsCol, numRowsLhs)];

    CUDA_LONG start = rhsCols[lhsCol]; // rhsCol == lhsCol
    CUDA_LONG end = rhsCols[lhsCol + 1];

    for (CUDA_LONG p = start; p < end; p++)
    {
        CUDA_LONG rhsRow = rhsRows[p];
        ElemType rhsVal = rhsNZValues[p];
        CUDA_LONG resultCol = col2blockIds[rhsRow]; // resultCol == rhsRow maps to columnid

        // assume resultValues are 0-initialized
        atomicAdd(&resultValues[IDX2C(lhsRow, resultCol, numRowsLhs)], lhsValue * rhsVal); //TODO: this does not work with fp16 for sparse embedding
    }
}

// backward pass from hidden layer to feature weight
//result (sparse BlockCol)= alpha * (lhs (dense) X rhs^T (sparse CSC)
//assume resultValues are 0-initialized
template <class ElemType>
__global__ void _denseMulSparseCSCTransposeToSparseBlockCol(
    const ElemType alpha,
    const ElemType* lhsValues,
    const size_t numRowsLhs,
    const size_t numColsRhs,                // The number of columns of rhs matrix before transpose. I.e. it is the 'conttacting' dimension in the matrix product to be computed.
    const ElemType* rhsNZValues,
    const GPUSPARSE_INDEX_TYPE* rhsRows,    // Mapping the ids of the non-zero values to their row index.
    const GPUSPARSE_INDEX_TYPE* rhsCols,    // Start id of each column.
    const GPUSPARSE_INDEX_TYPE* rhsRowIdx,  // Each non-zero row of the rhs sparse matrix get's an index (call it block-id). This array (size nnz) maps the nz-value row to the corresponding block-id.
    ElemType* resultValues,                 // Modified on return to contain values of the product.
    GPUSPARSE_INDEX_TYPE* blockId2Col       // Maps block-ids to column of the result matrix.
    )
{
    const CUDA_LONG index = blockIdx.x * blockDim.x + threadIdx.x;
    const CUDA_LONG lhsCol = index / numRowsLhs; // rhsCol == lhsCol
    if (lhsCol >= numColsRhs)
        return;
    const CUDA_LONG lhsRow = index - numRowsLhs * lhsCol; // resultRow == lhsRow

    // each thread handles one [row, col] combination of lhs
    ElemType lhsValue = alpha * lhsValues[IDX2C(lhsRow, lhsCol, numRowsLhs)];

    CUDA_LONG start = rhsCols[lhsCol]; // rhsCol == lhsCol
    CUDA_LONG end = rhsCols[lhsCol + 1];

    for (CUDA_LONG p = start; p < end; p++)
    {
        CUDA_LONG rhsRow = rhsRows[p];
        ElemType rhsVal = rhsNZValues[p];
        CUDA_LONG blockId = rhsRowIdx[p]; // resultCol == blockId
        blockId2Col[blockId] = rhsRow;    // indicate which colmn it actually points to

        // assume resultValues are 0-initialized
        atomicAdd(&resultValues[IDX2C(lhsRow, blockId, numRowsLhs)], lhsValue * rhsVal);
    }
}

// gradients update
template <class ElemType>
__global__ void _scaleSparseBlockAndAddToDense(
    const ElemType alpha,
    const bool blockCol, // true if blockRow
    const size_t numRows,
    const size_t numCols,
    const size_t numBlocks,
    const ElemType* lhsValues, // lhs is blockCol or blockRow
    const GPUSPARSE_INDEX_TYPE* blockIds,
    ElemType* rhs)
{
    const CUDA_LONG index = blockIdx.x * blockDim.x + threadIdx.x;
    CUDA_LONG row, col;
    if (blockCol)
    {
        const CUDA_LONG blockId = index / numRows;
        if (blockId >= numBlocks)
            return;
        row = index - numRows * blockId;
        col = blockIds[blockId];
    }
    else
    {
        const CUDA_LONG blockId = index / numCols;
        if (blockId >= numBlocks)
            return;
        col = index - numCols * blockId;
        row = blockIds[blockId];
    }
    rhs[IDX2C(row, col, numRows)] += alpha * lhsValues[index];
}

#if 0
// compute predictions in cross entropy node
template <class ElemType>
__global__ void _computePrediction(
    int nv,
    const ElemType* a,
    int numrows,
    const ElemType* weight,
    int nrs,
    int labelSize,
    const GPUSPARSE_INDEX_TYPE* labelRow,
    const size_t* block2Id,
    const ElemType* cls,
    const ElemType* idx2cls,
    ElemType* val,
    GPUSPARSE_INDEX_TYPE* row,
    GPUSPARSE_INDEX_TYPE* pb)
{
    // get label block id
    int id = -1;
    int offset = -1;
    for (int i = 1; i < labelSize; i++)
    {
        if (blockIdx.x < block2Id[i])
        {
            id = i - 1;
            offset = blockIdx.x - block2Id[i - 1];
            break;
        }
    }
    if (id == -1)
    {
        id = labelSize - 1;
        offset = blockIdx.x - block2Id[labelSize - 1];
    }

    int t = labelRow[id];
    int iStt;
    int iEnd;
    if (t < nv)
    {
        int clsid = idx2cls[t];
        iStt = cls[IDX2C(0, clsid, 2)];
        iEnd = cls[IDX2C(1, clsid, 2)];
    }
    else
    {
        iStt = nv;
        iEnd = nrs;
    }
    int i = iStt + offset;
    int j = id / 2;

    int loadPerThread = (numrows + blockDim.x - 1) / blockDim.x;
    int tStart = loadPerThread * threadIdx.x;
    int tEnd = min((int) numrows, loadPerThread + tStart);

    ElemType v = 0.0;
    for (int h = tStart; h < tEnd; h++)
    {
        v += weight[IDX2C(i, h, nrs)] * a[IDX2C(h, j, numrows)];
    }
    atomicAdd(&val[blockIdx.x], v);
    row[blockIdx.x] = i;

    if (blockIdx.x == 0 && threadIdx.x == 0)
        pb[0] = 0;

    if ((threadIdx.x == 0) && (i == iEnd - 1) && (i >= nv))
        pb[j + 1] = blockIdx.x + 1;
}

// normalize predictions in cross entropy node
template <class ElemType>
__global__ void _normalizePrediction(
    const size_t labelSize,
    const size_t expandedLabelSize,
    const GPUSPARSE_INDEX_TYPE* labelRow,
    const size_t* block2Id,
    const GPUSPARSE_INDEX_TYPE* row,
    ElemType* val,
    ElemType* entropyScore)
{
    __shared__ ElemType partials[512];
    partials[threadIdx.x] = 0;

    int p = blockIdx.x;
    int t = labelRow[p];
    int start = block2Id[p];
    int end;
    if (p == labelSize - 1)
    {
        end = expandedLabelSize;
    }
    else
    {
        end = block2Id[p + 1];
    }
    int len = end - start;

    int loadPerThread = (len + blockDim.x - 1) / blockDim.x;
    int tStart = loadPerThread * threadIdx.x;
    int tLen = min((int) len, loadPerThread + tStart);

    for (int i = start + tStart; i < start + tLen; i++)
    {
        partials[threadIdx.x] += exp(val[i]);
    }

    __syncthreads();

    // now sum up the objective function
    int nTotalThreads = blockDim.x;

    while (nTotalThreads > 1)
    {
        int halfPoint = (nTotalThreads >> 1);

        if (threadIdx.x < halfPoint)
            partials[threadIdx.x] += partials[threadIdx.x + halfPoint];

        __syncthreads();

        nTotalThreads = (nTotalThreads >> 1);
    }

    for (int i = start + tStart; i < start + tLen; i++)
    {
        val[i] = log(exp(val[i]) / partials[0]);
        if (row[i] == t)
        {
            atomicAdd(entropyScore, -val[i]);
            val[i] *= -1;
        }
    }
}

// compute prediction error in cross entropy node
template <class ElemType>
__global__ void _computePredictionError(
    ElemType* val,
    int N)
{
    int p = blockDim.x * blockIdx.x + threadIdx.x;
    if (p >= N)
        return;

    if (val[p] < 0)
        val[p] = exp(val[p]); // negative;
    else
        val[p] = exp(-val[p]) - 1; // positive
}

// compute gradients of input in cross entropy node
template <class ElemType>
__global__ void _computeGradientOfInput(
    const ElemType* val,
    const GPUSPARSE_INDEX_TYPE* row,
    const GPUSPARSE_INDEX_TYPE* pb,
    ElemType* weight,
    size_t nrs,
    ElemType* grd,
    size_t numrows)
{
    int h = blockIdx.x % numrows;
    int j = blockIdx.x / numrows;

    int start = pb[j];
    int end = pb[j + 1];
    int len = end - start;

    int load = (len + blockDim.x - 1) / blockDim.x;
    int pStart = start + load * threadIdx.x;
    int pEnd = start + min(len, load * (threadIdx.x + 1));

    ElemType sum = 0;
    for (int p = pStart; p < pEnd; p++)
    {
        int i = row[p];
        sum += val[p] * weight[IDX2C(i, h, nrs)];
    }

    atomicAdd(&grd[IDX2C(h, j, numrows)], sum);
}
#endif

#if 0
template <class ElemType>
__global__ void computeNCEForwardProp512Threads(
    const ElemType* val,
    const int* col,
    int numRows,
    int sampleCount,
    const ElemType* a,
    int numCols_a,
    const ElemType* b,
    ElemType* res)
{
    // val and col are in CSR format
    // val is an array contains log_Pn(w). To differentiate positive and negative samples,
    // we store log_Pn(w) as it is for positive samples, and -log_Pn(w) for negative samples
    // col is an array contains index of the word samples
    // a is a matrix in column major format contains output from hidden layer
    // b is the weight matrix for output layer
    // res is the buffer to store computed output (sparse)

    // follow the convention, this kernel must be run on 512 threads per block
    __shared__ ElemType partials[512];
    partials[threadIdx.x] = 0;

    // determine the elements to be handled by this block
    int total = numRows * sampleCount;
    int loadPerBlock = (total + gridDim.x - 1) / gridDim.x;

    int start = loadPerBlock * blockIdx.x;
    int end = min(total, loadPerBlock * (blockIdx.x + 1));

    for (int i = start; i < end; i++)
    {
        int colIndex = col[i];
        int rowIndex = i / sampleCount;

        int loadPerThread = (numCols_a + blockDim.x - 1) / blockDim.x;
        int tstart = loadPerThread * threadIdx.x;
        int tend = min(numCols_a, loadPerThread * (threadIdx.x + 1));

        for (int j = tstart; j < tend; j++)
            partials[threadIdx.x] = a[IDX2C(rowIndex, j, numRows)] * b[IDX2C(j, colIndex, numCols_a)];

        __syncthreads();

        // sum up
        int nTotalThreads = blockDim.x;

        while (nTotalThreads > 1)
        {
            int halfPoint = (nTotalThreads >> 1);

            if (threadIdx.x < halfPoint)
                partials[threadIdx.x] += partials[threadIdx.x + halfPoint];

            __syncthreads();

            nTotalThreads = (nTotalThreads >> 1);
        }

        if (threadIdx.x == 0)
            res[i] = partials[0];
    }
}
#endif

template <class ElemType>
__global__ void _computeNceOutputMax512Threads(
    const ElemType* col,
    int numRows,
    int sampleCount,
    const ElemType* a,
    int numCols_a,
    const ElemType* b,
    const ElemType* bias,
    ElemType* res)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    // val and col are in CSR format
    // val is an array contains log_Pn(w). To differentiate positive and negative samples,
    // we store log_Pn(w) as it is for positive samples, and -log_Pn(w) for negative samples
    // col is an array contains index of the word samples
    // a is a matrix in column major format contains output from hidden layer
    // b is the weight matrix for output layer
    // res is the buffer to store computed output (sparse)

    // follow the convention, this kernel must be run on 512 threads per block
    __shared__ comp_t partials[512];
    partials[threadIdx.x] = 0;

    // threadIdx.x range from[0 ~ 512)
    // blockIdx.x range from[0 ~ nnz)
    // blockDim.x equal to 512
    // gridDim.x equal to nnz

    // determine the elements to be handled by this block
    int total = numRows * sampleCount;
    int loadPerBlock = (total + gridDim.x - 1) / gridDim.x;

    int start = loadPerBlock * blockIdx.x;
    int end = min(total, loadPerBlock * (blockIdx.x + 1));

    for (int i = start; i < end; i++)
    {
        int wid = (int) col[2 * i];
        int batchid = i / sampleCount;

        int loadPerThread = (numCols_a + blockDim.x - 1) / blockDim.x;
        int tstart = loadPerThread * threadIdx.x;
        int tend = min(numCols_a, loadPerThread * (threadIdx.x + 1));

        for (int j = tstart; j < tend; j++)
            partials[threadIdx.x] = (comp_t)a[IDX2C(j, batchid, numCols_a)] * (comp_t)b[IDX2C(j, wid, numCols_a)];

        __syncthreads();

        // sum up
        int nTotalThreads = blockDim.x;

        while (nTotalThreads > 1)
        {
            int halfPoint = (nTotalThreads >> 1);

            if (threadIdx.x < halfPoint)
                partials[threadIdx.x] += partials[threadIdx.x + halfPoint];

            __syncthreads();

            nTotalThreads = (nTotalThreads >> 1);
        }

        if (threadIdx.x == 0)
            res[i] = partials[0] + (comp_t)bias[wid];
    }
}

template <class ElemType>
__global__ void _assignSoftmaxSumMax512Threads(
    const ElemType* softmax,
    int sampleCount,
    const ElemType* a,
    ElemType* c) // run on 512 threads per block
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    // val and col are in CSR format
    // val is an array contains log_Pn(w). To differentiate positive and negative samples,
    // we store log_Pn(w) as it is for positive samples, and -log_Pn(w) for negative samples
    // col is an array contains index of the word samples
    // a is a matrix in column major format contains output from hidden layer
    // b is the weight matrix for output layer
    // tmp is the buffer that stores NCE output calculated from _computeNceOutputMax512Threads
    // c is the matrix to store objective

    __shared__ comp_t partials[512];
    partials[threadIdx.x] = 0;

    int total = sampleCount;
    int loadPerThread = (total + blockDim.x - 1) / blockDim.x;

    // find out the items this thread is responsible for
    int start = loadPerThread * threadIdx.x;
    int end = min(total, loadPerThread * (threadIdx.x + 1));
    for (int i = start; i < end; i++)
    {
        int wid = (int) a[i];
        partials[threadIdx.x] += (comp_t)softmax[IDX2C(i, wid, sampleCount)];
    }

    __syncthreads();

    // now sum up the objective function
    int nTotalThreads = blockDim.x;

    while (nTotalThreads > 1)
    {
        int halfPoint = (nTotalThreads >> 1);

        if (threadIdx.x < halfPoint)
            partials[threadIdx.x] += partials[threadIdx.x + halfPoint];

        __syncthreads();

        nTotalThreads = (nTotalThreads >> 1);
    }

    if (threadIdx.x == 0)
        c[0] = -partials[0];
}

template <class ElemType>
__global__ void _assignNoiseContrastiveEstimationMax512Threads(
    const ElemType* val,
    int numRows,
    int sampleCount,
    const ElemType* a,
    int width, // number of columns in a
    const ElemType* b,
    ElemType* tmp,
    ElemType* c) // run on 512 threads per block
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    // val and col are in CSR format
    // val is an array contains log_Pn(w). To differentiate positive and negative samples,
    // we store log_Pn(w) as it is for positive samples, and -log_Pn(w) for negative samples
    // col is an array contains index of the word samples
    // a is a matrix in column major format contains output from hidden layer
    // b is the weight matrix for output layer
    // tmp is the buffer that stores NCE output calculated from _computeNceOutputMax512Threads
    // c is the matrix to store objective

    __shared__ comp_t partials[512];
    partials[threadIdx.x] = 0;

    int total = numRows * sampleCount;
    int loadPerThread = (total + blockDim.x - 1) / blockDim.x;

    // find out the items this thread is responsible for
    int start = loadPerThread * threadIdx.x;
    int end = min(total, loadPerThread * (threadIdx.x + 1));

    comp_t log_num_noise_samples = log_((comp_t)(sampleCount - 1));
    for (int i = start; i < end; i++)
    {
        comp_t prob = -(comp_t)val[2 * i + 1];
        bool positive = (prob > 0);
        if (positive)
            prob = -prob;
        comp_t score_noise = log_num_noise_samples + prob;
        comp_t z = logaddk((comp_t)tmp[i], score_noise);
        comp_t logprob = (comp_t)tmp[i] - z;
        comp_t logprob_noise = score_noise - z;
        tmp[i] = -exp_(logprob);
        if (positive)
            tmp[i] = (comp_t)tmp[i] + 1;
        if (positive)
            partials[threadIdx.x] += logprob;
        else
            partials[threadIdx.x] += logprob_noise;
    }

    __syncthreads();

    // now sum up the objective function
    int nTotalThreads = blockDim.x;

    while (nTotalThreads > 1)
    {
        int halfPoint = (nTotalThreads >> 1);

        if (threadIdx.x < halfPoint)
            partials[threadIdx.x] += partials[threadIdx.x + halfPoint];

        __syncthreads();

        nTotalThreads = (nTotalThreads >> 1);
    }

    if (threadIdx.x == 0)
        c[0] = -partials[0];
}

template <class ElemType>
__global__ void _assignNceDerivative(
    const ElemType* val,
    int numRows,
    int sampleCount,
    const ElemType* a,
    int width, // number of columns in a
    const ElemType* b,
    const ElemType* tmp,
    ElemType* c,
    size_t inputIndex)
{
    // val and col are CSR format sparse matrix for label
    // val is an array contains log_Pn(w). To differentiate positive and negative samples
    // we store log_Pn(w) as it is for positive samples, and -log_Pn(w) for negative samples
    // col is an array contains index of the word samples
    // a is a matrix in column major format contains output from hidden layer
    // b is the weight matrix for output layer
    // tmp is a matrix of precalculated error
    // c is the output matrix to store calculated gradients

    int total = numRows * sampleCount;
    int loadPerBlock = (total + gridDim.x - 1) / gridDim.x;

    // find out the items this block is responsible for
    int start = loadPerBlock * blockIdx.x;
    int end = min(total, loadPerBlock * (blockIdx.x + 1));

    for (int i = start; i < end; i++)
    {
        int wid = (int) val[2 * i];
        int batchId = i / sampleCount;

        ElemType er = tmp[i]; // precalculated error for this output node

        // calculate gradients
        int loadPerThread = (width + blockDim.x - 1) / blockDim.x;
        int tstart = loadPerThread * threadIdx.x;
        int tend = min(width, loadPerThread * (threadIdx.x + 1));

        if (inputIndex == 1) // hidden layer output
        {
            for (int j = tstart; j < tend; j++)
            {
                ElemType val = -er * b[IDX2C(j, wid, width)];
                atomicAdd(&c[IDX2C(j, batchId, width)], val);
                // c[IDX2C(j, batchId, width)] += val;
                // c[IDX2C(batchId, j, numRows)] += val;
            }
        }
        else if (inputIndex == 2) // weight
        {
            for (int j = tstart; j < tend; j++)
            {
                ElemType val = -er * a[IDX2C(j, batchId, width)];
                atomicAdd(&c[IDX2C(j, wid, width)], val);
                // c[IDX2C(j, wid, width)] += val;
            }
        }
        else // bias vector
        {
            // ElemType val = -er;
            atomicAdd(&c[wid], -er);
            // c[wid] -= er;
        }
    }
}

template <class ElemType>
__global__ void _assignNceDerivativeNew(
    const ElemType* val,
    int numRows,
    int sampleCount,
    const ElemType* a,
    int width, // number of columns in a
    const ElemType* b,
    const ElemType* tmp,
    ElemType* c,
    size_t inputIndex)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    // val and col are CSR format sparse matrix for label
    // val is an array contains log_Pn(w). To differentiate positive and negative samples
    // we store log_Pn(w) as it is for positive samples, and -log_Pn(w) for negative samples
    // col is an array contains index of the word samples
    // a is a matrix in column major format contains output from hidden layer
    // b is the weight matrix for output layer
    // tmp is a matrix of precalculated error
    // c is the output matrix to store calculated gradients

    // logical single index for this thread
    int n = threadIdx.x + blockDim.x * blockIdx.x;

    int batchId = n / sampleCount;
    int total = numRows * sampleCount;
    // is thread in range for the addition
    if (n < total)
    {
        int wid = (int) val[2 * n];
        comp_t er = tmp[n];
        if (inputIndex == 1)
        {
            for (int i = 0; i < width; i++)
            {
                int j = (i + n) % width; // introduce randomization to avoid conflicts
                comp_t val = -er * (comp_t)b[IDX2C(j, wid, width)];
                atomicAdd(&c[IDX2C(j, batchId, width)], (ElemType)val);
            }
        }
        else if (inputIndex == 2)
        {
            for (int i = 0; i < width; i++)
            {
                int j = (i + n) % width; // introduce randomization to avoid conflicts
                comp_t val = -er * (comp_t)a[IDX2C(j, batchId, width)];
                atomicAdd(&c[IDX2C(j, wid, width)], (ElemType)val);
            }
        }
        else
            atomicAdd(&c[wid], (ElemType)-er);
    }
}

#if 0
// compute gradients of weights in cross entropy node
template <class ElemType>
__global__ void _computeGradientOfWeight(
    const ElemType* val,
    const GPUSPARSE_INDEX_TYPE* row,
    const GPUSPARSE_INDEX_TYPE* pb,
    size_t mb,
    size_t nv,
    const GPUSPARSE_INDEX_TYPE* labelRow,
    const size_t* labelBlock2UniqId,
    const ElemType* cls,
    const ElemType* idx2cls,
    ElemType* input,
    size_t nrs,
    ElemType* blockVal,
    GPUSPARSE_INDEX_TYPE* blockIds)
{
    int p = blockIdx.x;
    ElemType v = val[p];
    int i = row[p];
    int j = -1;
    for (int k = 1; k < mb; k++)
    {
        if (p < pb[k])
        {
            j = k - 1;
            break;
        }
    }
    if (j == -1)
    {
        j = mb - 1;
    }

    // figure out blocks
    int bId = i < nv ? 2 * j : 2 * j + 1;
    int t = labelRow[bId];
    int iStt;
    if (t < nv)
    {
        int clsid = idx2cls[t];
        iStt = cls[IDX2C(0, clsid, 2)];
    }
    else
    {
        iStt = nv;
    }
    int offset = i - iStt;
    int ii = labelBlock2UniqId[bId] + offset;

    int load = (nrs + blockDim.x - 1) / blockDim.x;
    int pStart = load * threadIdx.x;
    int pEnd = min((int) nrs, load + pStart);

    for (int h = pStart; h < pEnd; h++)
    {
        ElemType temp = v * input[IDX2C(h, j, nrs)];
        atomicAdd(&blockVal[ii * nrs + h], temp);
        blockIds[ii] = i;
    }
}
#endif

// used in clipping gradients
template <class ElemType>
__global__ void _inplaceTruncate(
    ElemType* a,
    const ElemType threshold,
    const CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
    comp_t locThresholdPos = fabs_((comp_t)threshold);
    comp_t locTHresholdNeg = -locThresholdPos;
    if (a[id] > locThresholdPos)
    {
        a[id] = locThresholdPos;
    }
    else if ((comp_t)a[id] < locTHresholdNeg)
    {
        a[id] = locTHresholdNeg;
    }
}

template <class ElemType>
__global__ void _inplaceSoftThreshold(
    ElemType* a,
    const ElemType threshold,
    const CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    if (a[id] > threshold)
    {
        a[id] = (comp_t)a[id] - (comp_t)threshold;
    }
    else if (a[id] < -(comp_t)threshold)
    {
        a[id] = (comp_t)a[id] + (comp_t)threshold;
    }
    else
        a[id] = 0;
}

template <class ElemType>
__global__ void _normalGradForSparseBlock(
    const ElemType momentum,
    const bool blockCol, // true if blockRow
    const size_t numRows,
    const size_t numCols,
    const size_t numBlocks,
    ElemType* lhsValues, // lhs is blockCol or blockRow
    const GPUSPARSE_INDEX_TYPE* blockIds,
    ElemType* rhs,
    ElemType unitGainFactor)
{
    const CUDA_LONG index = blockIdx.x * blockDim.x + threadIdx.x;
    CUDA_LONG row, col;
    if (blockCol)
    {
        const CUDA_LONG blockId = index / numRows;
        if (blockId >= numBlocks)
            return;
        row = index - numRows * blockId;
        col = blockIds[blockId];
    }
    else
    {
        const CUDA_LONG blockId = index / numCols;
        if (blockId >= numBlocks)
            return;
        col = index - numCols * blockId;
        row = blockIds[blockId];
    }
    rhs[IDX2C(row, col, numRows)] = unitGainFactor * lhsValues[index] + momentum * rhs[IDX2C(row, col, numRows)];
    lhsValues[index] = rhs[IDX2C(row, col, numRows)];
}

//This function should be called with 1024 threads per block and 1 block
//THIS IS NOT THE MOST EFFICIENT IMPLEMENTATION!!!
template <class ElemType>
__global__ void _reductionSum1024Threads(
    const ElemType* data,
    ElemType* sum,
    CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    __shared__ comp_t partialSums[1024];
    partialSums[threadIdx.x] = 0;
    // int id = blockDim.x * blockIdx.x + threadIdx.x;
    CUDA_LONG loadPerThread = N / blockDim.x;
    for (CUDA_LONG i = threadIdx.x * loadPerThread; i < (threadIdx.x == blockDim.x - 1 ? N : (threadIdx.x + 1) * loadPerThread); ++i)
    {
        partialSums[threadIdx.x] += (comp_t)data[i];
    }
    __syncthreads();

    // 512
    if (threadIdx.x < 512)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 512];
    }
    __syncthreads();

    // 256
    if (threadIdx.x < 256)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 256];
    }
    __syncthreads();

    // 128
    if (threadIdx.x < 128)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 128];
    }
    __syncthreads();

    // 64
    if (threadIdx.x < 64)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 64];
    }
    __syncthreads();

    // 32
    if (threadIdx.x < 32)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 32];
    }
    __syncthreads();

    // 16
    if (threadIdx.x < 16)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 16];
    }
    __syncthreads();

    // 8
    if (threadIdx.x < 8)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 8];
    }
    __syncthreads();

    // 4
    if (threadIdx.x < 4)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 4];
    }
    __syncthreads();

    if (threadIdx.x == 0)
    {
        sum[0] = partialSums[0] + partialSums[1] + partialSums[2] + partialSums[3];
    }
}

//This function should be called with 1024 threads per block and 1 block
//THIS IS NOT THE MOST EFFICIENT IMPLEMENTATION!!!
template <class ElemType>
__global__ void _reductionSumAndAssign1024Threads(
    ElemType* toAssign,
    const ElemType* data,
    CUDA_LONG N, // length of data
    CUDA_LONG M) // length of toAssign
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    __shared__ comp_t partialSums[1024];
    __shared__ comp_t res;
    partialSums[threadIdx.x] = 0;
    // int id = blockDim.x * blockIdx.x + threadIdx.x;
    CUDA_LONG loadPerThread = N / blockDim.x;
    for (CUDA_LONG i = threadIdx.x * loadPerThread; i < (threadIdx.x == blockDim.x - 1 ? N : (threadIdx.x + 1) * loadPerThread); ++i)
    {
        partialSums[threadIdx.x] += (comp_t)data[i];
    }
    __syncthreads();

    // 512
    if (threadIdx.x < 512)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 512];
    }
    __syncthreads();

    // 256
    if (threadIdx.x < 256)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 256];
    }
    __syncthreads();

    // 128
    if (threadIdx.x < 128)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 128];
    }
    __syncthreads();

    // 64
    if (threadIdx.x < 64)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 64];
    }
    __syncthreads();

    // 32
    if (threadIdx.x < 32)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 32];
    }
    __syncthreads();

    // 16
    if (threadIdx.x < 16)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 16];
    }
    __syncthreads();

    // 8
    if (threadIdx.x < 8)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 8];
    }
    __syncthreads();

    // 4
    if (threadIdx.x < 4)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 4];
    }
    __syncthreads();

    if (threadIdx.x == 0)
    {
        res = partialSums[0] + partialSums[1] + partialSums[2] + partialSums[3];
        for (CUDA_LONG i = 0; i < M; ++i)
            toAssign[i] = res;
    }
}

//This function should be called with 1024 threads per block and 1 block
//THIS IS NOT THE MOST EFFICIENT IMPLEMENTATION!!!
template <class ElemType>
__global__ void _reductionSum21024Threads(
    const ElemType* data,
    ElemType* sum,
    CUDA_LONG N,
    bool takeSqrt = false)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    __shared__ comp_t partialSums[1024];
    partialSums[threadIdx.x] = 0;
    // int id = blockDim.x * blockIdx.x + threadIdx.x;
    CUDA_LONG loadPerThread = N / blockDim.x;
    for (CUDA_LONG i = threadIdx.x * loadPerThread; i < (threadIdx.x == blockDim.x - 1 ? N : (threadIdx.x + 1) * loadPerThread); ++i)
    // for (int i= threadIdx.x*loadPerThread; i<(threadIdx.x+1)*loadPerThread;++i)
    {
        partialSums[threadIdx.x] += ((comp_t)data[i] * (comp_t)data[i]);
    }
    __syncthreads();

    // 512
    if (threadIdx.x < 512)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 512];
    }
    __syncthreads();

    // 256
    if (threadIdx.x < 256)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 256];
    }
    __syncthreads();

    // 128
    if (threadIdx.x < 128)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 128];
    }
    __syncthreads();

    // 64
    if (threadIdx.x < 64)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 64];
    }
    __syncthreads();

    // 32
    if (threadIdx.x < 32)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 32];
    }
    __syncthreads();

    // 16
    if (threadIdx.x < 16)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 16];
    }
    __syncthreads();

    // 8
    if (threadIdx.x < 8)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 8];
    }
    __syncthreads();

    // 4
    if (threadIdx.x < 4)
    {
        partialSums[threadIdx.x] += partialSums[threadIdx.x + 4];
    }
    __syncthreads();

    if (threadIdx.x == 0)
    {
        if (takeSqrt)
        {
            sum[0] = sqrt_(partialSums[0] + partialSums[1] + partialSums[2] + partialSums[3]);
        }
        else
        {
            sum[0] = partialSums[0] + partialSums[1] + partialSums[2] + partialSums[3];
        }
    }
}

//This function should be called with 1024 threads per block and 1 block
//THIS IS NOT THE MOST EFFICIENT IMPLEMENTATION!!!
template <class ElemType>
__global__ void _reductionMatrixNormInf1024Threads(
    const ElemType* data,
    ElemType* maxAbs,
    CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    __shared__ comp_t partialSums[1024];
    partialSums[threadIdx.x] = 0;
    // int id = blockDim.x * blockIdx.x + threadIdx.x;
    int loadPerThread = N / blockDim.x;
    for (int i = threadIdx.x * loadPerThread; i < (threadIdx.x == blockDim.x - 1 ? N : (threadIdx.x + 1) * loadPerThread); ++i)
    {
        partialSums[threadIdx.x] = max(fabs_((comp_t)data[i]), partialSums[threadIdx.x]);
    }
    __syncthreads();

    // 512
    if (threadIdx.x < 512)
    {
        partialSums[threadIdx.x] = max(partialSums[threadIdx.x + 512], partialSums[threadIdx.x]);
    }
    __syncthreads();

    // 256
    if (threadIdx.x < 256)
    {
        partialSums[threadIdx.x] = max(partialSums[threadIdx.x + 256], partialSums[threadIdx.x]);
    }
    __syncthreads();

    // 128
    if (threadIdx.x < 128)
    {
        partialSums[threadIdx.x] = max(partialSums[threadIdx.x + 128], partialSums[threadIdx.x]);
    }
    __syncthreads();

    // 64
    if (threadIdx.x < 64)
    {
        partialSums[threadIdx.x] = max(partialSums[threadIdx.x + 64], partialSums[threadIdx.x]);
    }
    __syncthreads();

    // 32
    if (threadIdx.x < 32)
    {
        partialSums[threadIdx.x] = max(partialSums[threadIdx.x + 32], partialSums[threadIdx.x]);
    }
    __syncthreads();

    // 16
    if (threadIdx.x < 16)
    {
        partialSums[threadIdx.x] = max(partialSums[threadIdx.x + 16], partialSums[threadIdx.x]);
    }
    __syncthreads();

    // 8
    if (threadIdx.x < 8)
    {
        partialSums[threadIdx.x] = max(partialSums[threadIdx.x + 8], partialSums[threadIdx.x]);
    }
    __syncthreads();

    // 4
    if (threadIdx.x < 4)
    {
        partialSums[threadIdx.x] = max(partialSums[threadIdx.x + 4], partialSums[threadIdx.x]);
    }
    __syncthreads();

    if (threadIdx.x == 0)
    {
        maxAbs[0] = max(max(partialSums[0], partialSums[1]), max(partialSums[2], partialSums[3]));
    }
}

//This function should be called with 1024 threads per block and 1 block
//THIS IS NOT THE MOST EFFICIENT IMPLEMENTATION!!!
template <class ElemType>
__global__ void _reductionMatrixNorm01024Threads(
    const ElemType* data,
    ElemType* nz,
    CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    __shared__ comp_t partialSums[1024];
    partialSums[threadIdx.x] = 0;
    // int id = blockDim.x * blockIdx.x + threadIdx.x;
    CUDA_LONG loadPerThread = N / blockDim.x;
    for (CUDA_LONG i = threadIdx.x * loadPerThread; i < (threadIdx.x == blockDim.x - 1 ? N : (threadIdx.x + 1) * loadPerThread); ++i)
    {
        if ((comp_t)data[i] != 0)
            ++partialSums[threadIdx.x];
    }
    __syncthreads();

    // 512
    if (threadIdx.x < 512)
    {
        partialSums[threadIdx.x] = partialSums[threadIdx.x + 512] + partialSums[threadIdx.x];
    }
    __syncthreads();

    // 256
    if (threadIdx.x < 256)
    {
        partialSums[threadIdx.x] = partialSums[threadIdx.x + 256] + partialSums[threadIdx.x];
    }
    __syncthreads();

    // 128
    if (threadIdx.x < 128)
    {
        partialSums[threadIdx.x] = partialSums[threadIdx.x + 128] + partialSums[threadIdx.x];
    }
    __syncthreads();

    // 64
    if (threadIdx.x < 64)
    {
        partialSums[threadIdx.x] = partialSums[threadIdx.x + 64] + partialSums[threadIdx.x];
    }
    __syncthreads();

    // 32
    if (threadIdx.x < 32)
    {
        partialSums[threadIdx.x] = partialSums[threadIdx.x + 32] + partialSums[threadIdx.x];
    }
    __syncthreads();

    // 16
    if (threadIdx.x < 16)
    {
        partialSums[threadIdx.x] = partialSums[threadIdx.x + 16] + partialSums[threadIdx.x];
    }
    __syncthreads();

    // 8
    if (threadIdx.x < 8)
    {
        partialSums[threadIdx.x] = partialSums[threadIdx.x + 8] + partialSums[threadIdx.x];
    }
    __syncthreads();

    // 4
    if (threadIdx.x < 4)
    {
        partialSums[threadIdx.x] = partialSums[threadIdx.x + 4] + partialSums[threadIdx.x];
    }
    __syncthreads();

    if (threadIdx.x == 0)
    {
        nz[0] = partialSums[0] + partialSums[1] + partialSums[2] + partialSums[3];
    }
}

template <class ElemType>
__global__ void _getSparseVectorRepresntationForCSCMatrix(
    const int* m_dRow,
    const int* m_dCol,
    int* vectArray,
    const CUDA_LONG M,
    const CUDA_LONG N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= M)
        return;
    int start = m_dRow[i];
    int end = m_dRow[i + 1];
    for (int _i = start; _i < end; ++_i) // _i is index in m_dVal and m_dCol
    {
        int j = m_dCol[_i];
        vectArray[_i] = i * N + j;
    }
}

template <class ElemType>
__global__ void _lrHelper512Threads(
    const ElemType* data1,
    const ElemType* data2,
    const CUDA_LONG N,
    ElemType* d_res)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    __shared__ comp_t partialSums1[512];
    __shared__ comp_t partialSums2[512];
    partialSums1[threadIdx.x] = 0;
    partialSums2[threadIdx.x] = 0;

    // int id = blockDim.x * blockIdx.x + threadIdx.x;
    int loadPerThread = N / blockDim.x;
    for (int i = threadIdx.x * loadPerThread; i < (threadIdx.x == blockDim.x - 1 ? N : (threadIdx.x + 1) * loadPerThread); ++i)
    {
        partialSums1[threadIdx.x] += ((comp_t)data1[i] * (comp_t)data1[i]);
        partialSums2[threadIdx.x] += ((comp_t)data2[i] * (comp_t)data2[i]);
    }
    __syncthreads();

    /*
    // 512
    if (threadIdx.x<512)
    {
    partialSums1[threadIdx.x]+=partialSums1[threadIdx.x+512];
    partialSums2[threadIdx.x]+=partialSums2[threadIdx.x+512];
    }
    __syncthreads();*/

    // 256
    if (threadIdx.x < 256)
    {
        partialSums1[threadIdx.x] += partialSums1[threadIdx.x + 256];
        partialSums2[threadIdx.x] += partialSums2[threadIdx.x + 256];
    }
    __syncthreads();

    // 128
    if (threadIdx.x < 128)
    {
        partialSums1[threadIdx.x] += partialSums1[threadIdx.x + 128];
        partialSums2[threadIdx.x] += partialSums2[threadIdx.x + 128];
    }
    __syncthreads();

    // 64
    if (threadIdx.x < 64)
    {
        partialSums1[threadIdx.x] += partialSums1[threadIdx.x + 64];
        partialSums2[threadIdx.x] += partialSums2[threadIdx.x + 64];
    }
    __syncthreads();

    // 32
    if (threadIdx.x < 32)
    {
        partialSums1[threadIdx.x] += partialSums1[threadIdx.x + 32];
        partialSums2[threadIdx.x] += partialSums2[threadIdx.x + 32];
    }
    __syncthreads();

    // 16
    if (threadIdx.x < 16)
    {
        partialSums1[threadIdx.x] += partialSums1[threadIdx.x + 16];
        partialSums2[threadIdx.x] += partialSums2[threadIdx.x + 16];
    }
    __syncthreads();

    // 8
    if (threadIdx.x < 8)
    {
        partialSums1[threadIdx.x] += partialSums1[threadIdx.x + 8];
        partialSums2[threadIdx.x] += partialSums2[threadIdx.x + 8];
    }
    __syncthreads();

    // 4
    if (threadIdx.x < 4)
    {
        partialSums1[threadIdx.x] += partialSums1[threadIdx.x + 4];
        partialSums2[threadIdx.x] += partialSums2[threadIdx.x + 4];
    }
    __syncthreads();

    if (threadIdx.x == 0)
    {
        comp_t fns1 = partialSums1[0] + partialSums1[1] + partialSums1[2] + partialSums1[3];
        comp_t fns2 = partialSums2[0] + partialSums2[1] + partialSums2[2] + partialSums2[3];
        d_res[0] = max((comp_t)0, (comp_t)d_res[0] / max((comp_t) 1.0e-10, sqrt_(fns1)) / max((ElemType) 1.0e-10, sqrt_(fns2)));
    }
}

/*
template<class ElemType>
__global__ void _lrHelper512Threads(
ElemType* d_tmp)
{
    d_tmp[0] = max((ElemType)0, d_tmp[0]/max((ElemType)1.0e-10,sqrt_(d_tmp[1]))/max((ElemType)1.0e-10,sqrt_(d_tmp[2])));
}
*/

template <class ElemType>
__global__ void _assignElementProductOfWithShiftNeg(
    ElemType* us,
    const ElemType* a,
    const ElemType* b,
    const int shift,
    const int NTPlusOne,
    const int BS)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
    CUDA_LONG idy = blockDim.y * blockIdx.y + threadIdx.y;

    if (idx >= NTPlusOne || idy >= BS)
        return;

    if (idx == 0)
    {
        // this is row-0. No need to shift
        us[IDX2C(idx, idy, NTPlusOne)] = (comp_t)a[idy] * (comp_t)b[idy];
    }
    else
    {
        int cs = shift + idx - 1;
        int tmpidy = (idy + cs) % BS;
        us[IDX2C(idx, idy, NTPlusOne)] = (comp_t)a[idy] * (comp_t)b[tmpidy];
    }
}

template <class ElemType>
__global__ void _innerProductWithShiftNeg(
    ElemType* c,
    const ElemType* a,
    const ElemType* b,
    const CUDA_LONG N, // a.GetNumRows();
    const CUDA_LONG M, // a.GetNumCols();
    const CUDA_LONG shift,
    const CUDA_LONG NTPlusOne)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
    CUDA_LONG idy = blockDim.y * blockIdx.y + threadIdx.y;

    if (idx >= NTPlusOne || idy >= M)
        return;

    comp_t sum = 0;
    CUDA_LONG index_a = 0;
    CUDA_LONG index_b = 0;
    CUDA_LONG col_a = 0;
    CUDA_LONG col_b = 0;
    if (idx == 0)
    {
        // this is row 0. No need to shift
        // the product of a(:,idy) dot b(:,idy)
        col_a = idy;
        for (CUDA_LONG i = 0; i < N; ++i)
        {
            index_a = IDX2C(i, col_a, N);
            sum += (comp_t)a[index_a] * (comp_t)b[index_a];
        }
    }
    else
    {
        int cs = shift + idx - 1;
        col_a = idy;
        col_b = (idy + cs) % M;
        for (int i = 0; i < N; ++i)
        {
            index_a = IDX2C(i, col_a, N);
            index_b = IDX2C(i, col_b, N);
            sum += (comp_t)a[index_a] * (comp_t)b[index_b];
        }
    }
    c[IDX2C(idx, idy, NTPlusOne)] = sum;
}

template <class ElemType>
__global__ void _getARowByIndex(
    ElemType* us,
    const ElemType* a,
    const int O, // a's rows
    const int P, // a's cols
    const int m  // the m-th row of a
    )
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= P)
        return;
    //    us[id] = a[id] * b[id];
    us[id] = a[IDX2C(m, id, O)];
}

template <class ElemType>
__global__ void _conductRowElementMultiplyWithShift(
    ElemType* us,
    const ElemType* a,
    const ElemType* b,
    const int O, // b's rows
    const int P, // b's cols
    const int shift,
    const bool isafixed)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
    CUDA_LONG idy = blockDim.y * blockIdx.y + threadIdx.y;

    if (idx >= O || idy >= P)
        return;

    int tmpidy = (idy + shift) % P;
    if (isafixed)
    {
        // we fix a, and shift b
        us[IDX2C(idx, idy, O)] = (comp_t)a[idy] * (comp_t)b[IDX2C(idx, tmpidy, O)];
    }
    else
    {
        // we fix b, but shift a
        us[IDX2C(idx, idy, O)] = (comp_t)a[tmpidy] * (comp_t)b[IDX2C(idx, idy, O)];
    }
}

template <class ElemType>
__global__ void _assignElementProductOfWithShift(
    ElemType* us,
    const ElemType* a,
    const ElemType* b,
    const int shift,
    const CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;

    int tmpidb = (id + shift) % N;
    us[id] = (comp_t)a[id] * (comp_t)b[tmpidb];
}

// minus 1 at a specific position
template <class ElemType>
__global__ void _minusOneAt(
    ElemType* c,
    CUDA_LONG position,
    CUDA_LONG N)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    if (id == position)
        c[id] = c[id] - 1.0;
}

// the kernel function for CRFLSTMNetwork  backward computation
// assume a column slice of input and output
// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == 3 * iNumLab.
template <class ElemType>
__global__ void _rcrfBackwardComputeMax1024Labels(
    const size_t t, // time position
    const size_t iNumPos,
    const ElemType* galpha,       // column slice at current time t
    ElemType* gbeta,              // column slices with [row, 2] at current time t for [
    const ElemType* gzeta,        // column slices with [row, 2] at current time t for [
    const ElemType* gpair_scores, // column slice at current time t
    const size_t iNumLab, const int shift)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    int id = blockDim.x * blockIdx.x + threadIdx.x;

    extern __shared__ double sh_alpha_and_beta[]; // [id] or [id + iNumLab] or [id + 2 * iNumLab)]
    // need byte size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)

    comp_t* alpha = (comp_t*) (sh_alpha_and_beta);
    comp_t* beta_t1 = (comp_t*) (alpha + iNumLab);
    comp_t* zeta = (comp_t*) (beta_t1 + iNumLab);
    comp_t pair_scores[1024];  // [j=0..iNumLab-1]

    if (id < 0 || id >= iNumLab)
        return;

    // copy global memory to shared memory to save time
    alpha[id] = galpha[IDX2C(id, t, iNumLab)];
    if (t < iNumPos - 1)
        beta_t1[id] = gbeta[IDX2C(id, t + 1, iNumLab)];
    zeta[id] = gzeta[id];

    __syncthreads();

    for (int j = 0; j < iNumLab; j++)
        pair_scores[j] = gpair_scores[IDX2C(j, id, iNumLab)];

    comp_t fTmp = LZERO;
    if (t == iNumPos - 1)
    {
        fTmp = alpha[id] - zeta[id];
    }
    else
    {
        for (int j = 0; j < iNumLab; j++)
        {
            fTmp = logaddk(fTmp, beta_t1[j] + alpha[id] + pair_scores[j] - zeta[j]);
        }
    }

    gbeta[IDX2C(id, t, iNumLab)] = fTmp;
}

// $\zeta_t(j) = {\sum_k exp(\delta_{t-1}(k) + a_{kj}(t))}$.
// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab.
template <class ElemType>
__global__ void _rcrfBackwardComputeZetaMax1024Labels(
    const size_t t, // time position
    const size_t iNumPos,
    const ElemType* galpha, // column slice at current time t
    ElemType* gzeta,        // column slices with [row, 2] at current time t for [
    const ElemType* gpair_scores,
    const size_t iNumLab, const int shift)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    int id = blockDim.x * blockIdx.x + threadIdx.x;

    extern __shared__ double sh_alpha_and_beta[]; // [id]
    // need byte size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)

    comp_t* alpha = (comp_t*) (sh_alpha_and_beta);
    comp_t pair_scores[1024]; // [j=0..iNumLab-1]

    if (id < 0 || id >= iNumLab)
        return;

    // copy global memory to shared memory to save time
    alpha[id] = galpha[IDX2C(id, t, iNumLab)];

    __syncthreads();

    for (int j = 0; j < iNumLab; j++)
        pair_scores[j] = gpair_scores[IDX2C(id, j, iNumLab)];

    comp_t fSum = LZERO;
    for (int m = 0; m < iNumLab; m++)
    {
        if (t == iNumPos - 1)
            fSum = logaddk(fSum, alpha[IDX2C(m, 0, iNumLab)]);
        else
            fSum = logaddk(fSum, alpha[IDX2C(m, 0, iNumLab)] + pair_scores[m]);
    }

    gzeta[id] = fSum;
}

/// $\zeta_t(j) = {\sum_k exp(\delta_{t-1}(k) + a_{kj}(t))}$.
// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab.
template <class ElemType>
__global__ void _rcrfTransGrdComputeZetaMax1024Labels(
    const int t, // time position
    const size_t iNumPos,
    const ElemType* galpha, // column slice at current time t
    ElemType* gzeta,        // column slices with [row, 2] at current time t for [
    const ElemType* gpair_scores,
    const size_t iNumLab,
    const size_t start_lbl,
    const int shift)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    int id = blockDim.x * blockIdx.x + threadIdx.x;

    extern __shared__ double sh_alpha_and_beta[]; // [id]
    // need byte size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)

    comp_t* alpha = (comp_t*) (sh_alpha_and_beta);
    comp_t pair_scores[1024]; // [j=0..iNumLab-1]

    if (id < 0 || id >= iNumLab)
        return;

    // copy global memory to shared memory to save time
    if (t >= 0)
        alpha[id] = galpha[IDX2C(id, t, iNumLab)];

    __syncthreads();

    for (int j = 0; j < iNumLab; j++)
        pair_scores[j] = gpair_scores[IDX2C(id, j, iNumLab)];

    comp_t fSum = LZERO;
    comp_t fTmp;
    for (int m = 0; m < iNumLab; m++)
    {
        if (t < 0)
        {
            if (m == start_lbl)
                fTmp = 0;
            else
                fTmp = LZERO;
        }
        else
            fTmp = alpha[m];

        fSum = logaddk(fSum, pair_scores[m] + fTmp);
    }

    gzeta[id] = fSum;
}

// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab.
template <class ElemType>
__global__ void _rcrfTransGrdComputeMax1024Labels(
    int t,
    const size_t start_lbl,
    const ElemType* galpha,
    const ElemType* gbeta,
    const ElemType* gzeta,
    const ElemType* gpair_scores,
    const ElemType* lbls,
    ElemType* grd,
    const size_t iNumPos,
    const size_t iNumLab,
    const int shift)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    int id = blockDim.x * blockIdx.x + threadIdx.x;

    extern __shared__ double sh_alpha_and_beta[]; // [id]
    // need byte size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)

    comp_t* alpha = (comp_t*) (sh_alpha_and_beta);
    comp_t* beta = (comp_t*) (alpha + iNumLab);
    comp_t* zeta = (comp_t*) (beta + iNumLab);
    comp_t pair_scores[1024]; // [j=0..iNumLab-1]

    if (id < 0 || id >= iNumLab)
        return;

    // copy global memory to shared memory to save time
    if (t > 0)
        alpha[id] = galpha[IDX2C(id, t - 1, iNumLab)];
    beta[id] = gbeta[IDX2C(id, t, iNumLab)];
    zeta[id] = gzeta[id];

    __syncthreads();

    for (int j = 0; j < iNumLab; j++)
        pair_scores[j] = gpair_scores[IDX2C(j, id, iNumLab)];

    comp_t fTmp;
    comp_t fTmp2;
    for (int j = 0; j < iNumLab; j++)
    {
        if (t == 0)
        {
            if (id == start_lbl)
                fTmp = 0;
            else
                fTmp = LZERO;
        }
        else
            fTmp = alpha[id];

        fTmp2 = fTmp + pair_scores[j] - zeta[j];
        assert(fTmp2 <= 0.0);
        fTmp2 += beta[j];

        fTmp = exp_(fTmp2);
        grd[IDX2C(j, id, iNumLab)] = (comp_t)grd[IDX2C(j, id, iNumLab)] + fTmp;
    }

    if ((t == 0 && id == start_lbl) || (t > 0 && t < iNumPos && lbls[IDX2C(id, t - 1, iNumLab)] != 0))
    {
        for (int ik = 0; ik < iNumLab; ik++)
        {
            if (lbls[IDX2C(ik, t, iNumLab)] != 0)
                grd[IDX2C(ik, id, iNumLab)] = (comp_t)grd[IDX2C(ik, id, iNumLab)] - 1.0;
        }
    }
};

template <class ElemType>
__global__ void _reductionLogAddSum(
    const ElemType* data,
    ElemType* sum,
    const size_t sum_size,
    CUDA_LONG N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    __shared__ comp_t partialLogAddSum[GridDim::maxThreadsPerBlock];

    int id = blockDim.x * blockIdx.x + threadIdx.x;
    int tid = threadIdx.x;

    if (id < N)
        partialLogAddSum[tid] = data[id];
    else
        partialLogAddSum[tid] = LZERO;

    __syncthreads();

    // do reduction on the shared memory
    size_t start_width = ceil((N + 0.0) / 2.0);
    for (size_t s = start_width; s > 0; s >>= 1)
    {
        comp_t lSum = LZERO;
        if (tid < s)
        {
            lSum = logaddk(partialLogAddSum[tid], partialLogAddSum[tid + s]);
            partialLogAddSum[tid] = lSum;
        }
    }
    __syncthreads();

    if (tid == 0)
        sum[0] = partialLogAddSum[0];
}

// set the value of certain columns to be zero
// the column is decided by threshhold value
// TODO: This kernel has very poor performace and needs to
// be optimized
template <class ElemType>
__global__ void _DropFrame(
    ElemType* a,
    const ElemType* label,
    const ElemType* gamma,
    const ElemType framedropthreshhold,
    const long m_numCols,
    const long m_numRows) // ld
{
    int col_id = blockDim.x * blockIdx.x + threadIdx.x;
    if (col_id >= m_numCols)
        return;
    bool dropframe = false;
    // find the 1 in the one-hot representation of the labels
    // This is a linear scan--bad perf!
    for (long i = 0; i < m_numRows; ++i)
    {
        int idx = IDX2C(i, col_id, m_numRows);
        // printf("%u ", idx);
        if (fabs_(label[idx] - 1.0) < 0.1) // we found the 1 in the vector
        {
            if (gamma[idx] < framedropthreshhold)
                dropframe = true;
            break;
        }
    }

    if (dropframe)
    {
        // printf("frame dropped %u ", col_id);
        for (long i = 0; i < m_numRows; ++i)
        {
            a[IDX2C(i, col_id, m_numRows)] = 0.0;
        }
    }
}

template <class ElemType>
__global__ void _AssignSequenceError(const ElemType hsmoothingWeight, ElemType* error, const ElemType* label,
                                     const ElemType* dnnoutput, const ElemType* gamma, ElemType alpha, const long N)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= N)
        return;
    error[id] = (comp_t)error[id] - (comp_t)alpha * ((comp_t)label[id] - (1.0 - (comp_t)hsmoothingWeight) * (comp_t)dnnoutput[id] - (comp_t)hsmoothingWeight * (comp_t)gamma[id]);
    // change to ce
    // error[id] -= alpha * (label[id] - dnnoutput[id] );
}

template <class ElemType>
__global__ void _copyTopKResults(const uint64_t* indexes, const ElemType* values, ElemType* maxIndexes, ElemType* maxValues,
                                 CUDA_LONG crow, CUDA_LONG ccol, int topK)
{
    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id >= topK * ccol)
        return;
    CUDA_LONG irow = id % topK;
    CUDA_LONG icol = id / topK;
    maxIndexes[id] = static_cast<CUDA_LONG>(indexes[icol * crow + irow] >> 32);
    maxValues[id] = values[icol * crow + irow];
}

template <int BlockSize, class ElemType>
__global__ void _assignNumOfDiffCol(const ElemType* a, const ElemType* b, ElemType* c, CUDA_LONG crowB, CUDA_LONG ccol)
{
    assert(gridDim.x == 1 && gridDim.y == 1 && gridDim.z == 1);

    int cur = 0;
    CUDA_LONG icol = threadIdx.x;
    for (; icol < ccol; icol += blockDim.x)
    {
        ElemType key = a[icol];
        CUDA_LONG idxB = icol * crowB;
        CUDA_LONG irow = 0;
        for (; irow < crowB; irow++, idxB++)
        {
            if (b[idxB] == key)
                break;
        }

        cur += (irow == crowB);
    }

    using BlockReduceT = cub::BlockReduce<int, BlockSize>;
    __shared__ typename BlockReduceT::TempStorage tmp;

    int res = BlockReduceT(tmp).Sum(cur);
    if (threadIdx.x == 0)
        *c = res;
}

template <class ElemType>
__global__ void _maskColumnsValue(ElemType* a, const char* columnsMask, CUDA_LONG numCols, CUDA_LONG numRows, ElemType val, CUDA_LONG numColsPerMaskEntry)
{
    CUDA_LONG maskColIdx = blockIdx.x;
    CUDA_LONG matrixStartColIdx = maskColIdx * numColsPerMaskEntry;

    for (CUDA_LONG k = 0; k < numColsPerMaskEntry; ++k)
    {
        CUDA_LONG colIdx = matrixStartColIdx + k;
        if (colIdx > numCols)
            return;

        if (columnsMask[IDX2C(0, maskColIdx, 1)] == 1)
            return;

        CUDA_LONG rowIdx = threadIdx.x;
        for (; rowIdx < numRows; rowIdx += blockDim.x)
        {
            a[IDX2C(rowIdx, colIdx, numRows)] = val;
        }
    }
}

template <class ElemType>
__global__ void _adam(CUDA_LONG size, ElemType* grad, ElemType* smoothAda, ElemType* smoothMom, ElemType* val,
    ElemType lr, ElemType mom, ElemType adaWeight, ElemType adaMul, ElemType epsilon, ElemType typedUnitGainFactor, bool adamax)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    const comp_t unitGainFactor = (comp_t)typedUnitGainFactor;
    CUDA_LONG idx = blockIdx.x * blockDim.x + threadIdx.x;
    CUDA_LONG stride = blockDim.x * gridDim.x;
    for (; idx < size; idx += stride)
    {
        comp_t g = grad[idx];
        comp_t w;
        if (!adamax)
        {
            comp_t adaSqr = (comp_t)adaWeight * (comp_t)smoothAda[idx] + (1.0f - (comp_t)adaWeight) * g * g;
            smoothAda[idx] = adaSqr;
            w = (comp_t)adaMul * 1.0 / (sqrt_(adaSqr) + (comp_t)epsilon);
        }
        else
        {
            comp_t gAbs;
            gAbs = fabs_(g);
            smoothAda[idx] = max((comp_t)adaWeight * (comp_t)smoothAda[idx], gAbs);
            w = (comp_t)adaMul / (comp_t)smoothAda[idx];
        }

        g = (comp_t)mom * (comp_t)smoothMom[idx] + unitGainFactor * g;
        smoothMom[idx] = g;
        g = (comp_t)lr*g*w;
        val[idx] = (comp_t)val[idx] - g;
    }
}

template <class ElemType>
__global__ void _adam4BlockSparseCol(CUDA_LONG size,
    ElemType* grad_bsc, const GPUSPARSE_INDEX_TYPE* colOrRow2blockId, const size_t len,
    ElemType* smoothAda, ElemType* smoothMom, ElemType* val,
    ElemType lr, ElemType mom, ElemType adaWeight, ElemType adaMul, ElemType epsilon, ElemType unitGainFactor, bool adamax)
{
    CUDA_LONG idx = blockIdx.x * blockDim.x + threadIdx.x;
    CUDA_LONG stride = blockDim.x * gridDim.x;
    for (; idx < size; idx += stride)
    {
        ElemType g = _getvalue4BlockSparseCol(grad_bsc, colOrRow2blockId, len, idx);
        ElemType w;
        if (!adamax)
        {
            ElemType adaSqr = adaWeight * smoothAda[idx] + (1.0f - adaWeight) * g * g;
            smoothAda[idx] = adaSqr;

            w = adaMul * (ElemType)1.0 / (sqrt_(adaSqr) + epsilon);
        }
        else
        {
            ElemType gAbs;
            gAbs = fabs_(g);
            smoothAda[idx] = max(adaWeight * smoothAda[idx], gAbs);
            w = adaMul / smoothAda[idx];
        }

        g = mom * smoothMom[idx] + unitGainFactor * g;
        smoothMom[idx] = g;
        g = lr*g*w;
        val[idx] -= g;
    }
}

template <class ElemType, class GradType>
__global__ void _adadelta(CUDA_LONG size, GradType* grad, ElemType* smoothAda, ElemType* smoothX2, ElemType* val,
    ElemType learningRate, ElemType rho, ElemType epsilon)
{
    CUDA_LONG idx = blockIdx.x * blockDim.x + threadIdx.x;
    CUDA_LONG stride = blockDim.x * gridDim.x;
    for (; idx < size; idx += stride)
    {
        ElemType g = (ElemType)grad[idx];
        ElemType adaSqr = rho * smoothAda[idx] + (1.0f - rho) * g * g;
        smoothAda[idx] = adaSqr;
        ElemType x2 = smoothX2[idx];
        ElemType deltaX;
        deltaX = -sqrt_(x2 + epsilon) * rsqrt_(adaSqr + epsilon) * g;

        smoothX2[idx] = rho * smoothX2[idx] + (1.0f - rho) * deltaX * deltaX;
        val[idx] = val[idx] + learningRate * deltaX;
    }
}

template <class GradType, class AccumType>
__global__ void _adadelta4BlockSparseCol(CUDA_LONG size,
    const GradType* grad_bsc, const GPUSPARSE_INDEX_TYPE* blockId2ColOrRow, size_t numRows,
    AccumType* smoothAda, AccumType* smoothX2, AccumType* val,
    AccumType learningRate, AccumType rho, AccumType epsilon,
    const int* timestamps, int currentTimestamp)
{
    auto sparseIndex = blockDim.x * blockIdx.x + threadIdx.x;
    if (sparseIndex >= size)
        return;
    auto blockid = sparseIndex / numRows;
    auto col = blockId2ColOrRow[blockid];
    auto decay = pow_(rho, (AccumType)(currentTimestamp - 1 - timestamps[col]));
    auto denseIndex = col * numRows + sparseIndex % numRows;
    AccumType g = (AccumType)grad_bsc[sparseIndex];
    AccumType adaSqr = rho * decay * smoothAda[denseIndex] + (1.0f - rho) * g * g;
    smoothAda[denseIndex] = adaSqr;
    AccumType x2 = decay * smoothX2[denseIndex];
    AccumType deltaX;
    if (sizeof(AccumType) == sizeof(double))
    {
        deltaX = -sqrt_(x2 + epsilon) * rsqrt_(adaSqr + epsilon) * g;
    }
    else
    {
        deltaX = -sqrt_(x2 + epsilon) * rsqrt_(adaSqr + epsilon) * g;
    }
    smoothX2[denseIndex] = rho * x2 + (1.0f - rho) * deltaX * deltaX;
    val[denseIndex] += learningRate * deltaX;
}


template <class ElemType>
__global__ void _adadeltaFlush(CUDA_LONG N, size_t rows, ElemType* smoothAda, ElemType* smoothX2,
    ElemType rho, int* timestamps, int currentTimestamp)
{
    auto col = blockIdx.x * blockDim.x + threadIdx.x;
    if (col >= N)
        return;
    
    auto decay = pow_(rho, (ElemType)(currentTimestamp - timestamps[col]));
    auto offset = rows * col;
    timestamps[col] = 0;
    for (auto row = 0; row < rows; ++row)
    {
        smoothAda[offset + row] *= decay;
        smoothX2[offset + row] *= decay;
    }
}


// Calculate alpha in forward-backward calculation. equation (6), (7) in ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf
// GPU x dimension corresponds to utterances, y dimension corresponds to phone sequence in each utterance
// prob (input): the posterior output from the network
// alpha (output): alpha for forward-backward calculation.
// phoneSeq (input): phone ID sequence for each utterance in this minibatch, each col is one utterance
// phoneBound (input): phone boundary (frame index) of each phone for each utterance in this minibatch, each col is one utterance
// uttToChanInd (input):  map from utterance ID to minibatch channel ID. We need this because each channel may contain more than one utterance.
// uttFrameNum (input): the frame number of each utterance. The size of this vector =  the number of all utterances in this minibatch
// uttBeginFrame(input): the position of the first frame of each utterance in the minibatch channel. We need this because each channel may contain more than one utterance.
// uttPhoneNum (input): the phone number of each utterance. The size of this vector =  the number of all utterances in this minibatch
// numChannels (input): channel number in this minibatch
// uttNum (input): number of utterances
// t (input): time stamp to process
// maxPhoneNum (input): the max number of phones between utterances
// totalPhoneNum (input): the total number of phones of all utterances
// blankTokenId (input): id of the CTC blank token
// delayConstraint -- label output delay constraint introduced during training that allows to have shorter delay during inference.
//      Alpha and Beta scores outside of the delay boundary are set to zero.
//      Setting this parameter smaller will result in shorted delay between label output during decoding.
//      delayConstraint=-1 means no constraint
template<class ElemType>
__global__ void _assignAlphaScore(
    const ElemType *prob,
    ElemType *alphaScore,
    ElemType *phoneSeq,
    ElemType *phoneBound,
    const size_t *uttToChanInd,
    const size_t *uttFrameNum,
    const size_t *uttBeginFrame,
    const size_t *uttPhoneNum,
    size_t numChannels,
    const size_t uttNum,
    const size_t  t,
    const size_t maxPhoneNum, // Maximum length of utterance in this MB
    const size_t totalPhoneNum, // Total number of phones
    const size_t blankTokenId,
    const int delayConstraint)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    LONG64 uttId = blockDim.x * blockIdx.x + threadIdx.x;
    // Index of the label in the sequence
    LONG64 phoneSeqId = blockDim.y * blockIdx.y + threadIdx.y;

    // Number of phones and frames in this utterance
    LONG64 phoneNum = uttPhoneNum[uttId];
    LONG64 frameNum = uttFrameNum[uttId];

    if (uttId >= uttNum || phoneSeqId >= phoneNum - 1 || t >= frameNum || phoneSeqId == 0) return;

    // Current and previous phone indices in phoneSeq matrix
    LONG64 labelid = uttId*maxPhoneNum + phoneSeqId;
    LONG64 labelid_2 = labelid - 2;

    // Actual current phone label
    LONG64 phoneId = (LONG64)(phoneSeq[labelid]);

    // Index of the current frame in minibatch
    LONG64 timeId = (t + uttBeginFrame[uttId])*numChannels + uttToChanInd[uttId];

    // Index of probability of observing phoneId at frame timeId
    LONG64 probId = timeId*totalPhoneNum + phoneId;

    LONG64 alphaId = maxPhoneNum* timeId + phoneSeqId; // alpha_t(s)
    // Previous time frame
    LONG64 timeId_1 = timeId - numChannels; // Index corresponding to (t-1)
    LONG64 alphaId_0 = maxPhoneNum* timeId_1 + phoneSeqId; // alpha_{t-1}(s)
    LONG64 alphaId_1 = alphaId_0 - 1; // alpha_{t-1}(s-1)
    LONG64 alphaId_2 = alphaId_0 - 2; // alpha_{t-1}(s-2)

    if (t == 0)
    {
        // Initialize recursion
        if (phoneSeqId == 1 || phoneSeqId == 2)
        {
            alphaScore[alphaId] = prob[probId];
        }
    }
    else
    {
        if (phoneSeqId >= 1)
        {
            comp_t x = LZERO;

            comp_t ascore;
            if (phoneSeqId > 2)
            {
                // if current label is not blank and not equal prev non-blank label
                if ((LONG64)(phoneSeq[labelid]) != blankTokenId && phoneId != (LONG64)(phoneSeq[labelid_2]))
                {
                    x = logaddk(x, (comp_t)alphaScore[alphaId_2]);
                }
            }

            if (phoneSeqId > 1)
            {
                x = logaddk(x, (comp_t)alphaScore[alphaId_1]);
            }

            x = logaddk(x, (comp_t)alphaScore[alphaId_0]);

            if (phoneId != SIZE_MAX)
                ascore = prob[probId]; // Probability of observing given label at given time
            else
                ascore = 0;
            alphaScore[alphaId] = x + ascore;
            if (delayConstraint != -1)
            {
                LONG64 labelid_r = labelid + 2;
                LONG64 phoneBoundId_r = (LONG64)(phoneBound[labelid_r]);
                if (phoneId == blankTokenId)
                {
                    // only constraint right side
                    if (t > phoneBoundId_r + delayConstraint - 1)
                        alphaScore[alphaId] = LZERO;
                }
                else if (phoneId != blankTokenId)
                {
                    if (t > phoneBoundId_r + delayConstraint)
                        alphaScore[alphaId] = LZERO;
                }
            }
        }
    }
}

// Calculate beta in forward-backward calculation, equation (10), (11) in ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf
// See _assignAlphaScore for the explanation of parameters
template<class ElemType>
__global__ void _assignBetaScore(
    const ElemType *prob,
    ElemType *betaScore,
    ElemType *phoneSeq,
    ElemType *phoneBound,
    const size_t *uttToChanInd,
    const size_t *uttFrameNum,
    const size_t *uttBeginFrame,
    const size_t *uttPhoneNum,
    const size_t numChannels,
    const size_t uttNum,
    const size_t  t,
    const size_t maxPhoneNum,
    const size_t totalPhoneNum,
    const size_t blankTokenId,
    const int delayConstraint)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    LONG64 uttId = blockDim.x * blockIdx.x + threadIdx.x;
    // Index of the label in the sequence
    LONG64 phoneSeqId = blockDim.y * blockIdx.y + threadIdx.y;
    LONG64 phoneNum = uttPhoneNum[uttId];
    LONG64 frameNum = uttFrameNum[uttId];

    if (uttId >= uttNum || phoneSeqId >= phoneNum - 1 || t >= frameNum || phoneSeqId == 0) return;

    LONG64 labelid = uttId*maxPhoneNum + phoneSeqId;
    LONG64 labelid_2 = labelid + 2;
    LONG64 phoneId = (LONG64)(phoneSeq[labelid]);
    LONG64 timeId = (t + uttBeginFrame[uttId])*numChannels + uttToChanInd[uttId];
    LONG64 probId = timeId*totalPhoneNum + phoneId;
    LONG64 betaid = maxPhoneNum* timeId + phoneSeqId;
    LONG64 timeId_1 = timeId + numChannels;
    LONG64 betaid_0 = maxPhoneNum* timeId_1 + phoneSeqId;
    LONG64 betaid_1 = betaid_0 + 1;
    LONG64 betaid_2 = betaid_0 + 2;

    if (t == frameNum - 1)
    {
        if (phoneSeqId == phoneNum - 3 || phoneSeqId == phoneNum - 2)
        {
            betaScore[betaid] = prob[probId];
        }
    }
    else
    {
        if (phoneSeqId >= 1)
        {
            comp_t x = LZERO;
            comp_t ascore;
            if (phoneSeqId < phoneNum - 3)
            {
                if (phoneSeq[labelid] != blankTokenId && phoneId != phoneSeq[labelid_2])
                {
                    x = logaddk(x, (comp_t)betaScore[betaid_2]);
                }
            }

            if (phoneSeqId < phoneNum - 2)
            {
                x = logaddk(x, (comp_t)betaScore[betaid_1]);
            }

            x = logaddk(x, (comp_t)betaScore[betaid_0]);

            if (phoneId != SIZE_MAX)
                ascore = prob[probId];
            else
                ascore = 0;
            betaScore[betaid] = x + ascore;
            if (delayConstraint != -1)
            {
                LONG64 phoneBoundId_r = (LONG64)(phoneBound[labelid_2]);
                if (phoneId == blankTokenId)
                {
                    if (t > phoneBoundId_r + delayConstraint - 1)
                        betaScore[betaid] = LZERO;
                }
                else if (phoneId != blankTokenId)
                {
                    if (t > phoneBoundId_r + delayConstraint)
                        betaScore[betaid] = LZERO;
                }
            }
        }
    }
}

// Calculate derivative, equation (15) in ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf
// See _assignAlphaScore for the explanation of parameters
template<class ElemType>
__global__ void _assignCTCScore(
    ElemType *CTCscore,
    ElemType *prob,
    ElemType *alphaScore,
    ElemType *betaScore,
    ElemType *phoneSeq,
    const size_t uttNum,
    const size_t *uttToChanInd,
    const size_t *uttBeginFrame,
    const size_t *uttPhoneNum,
    const size_t *uttFrameNum,
    const long numChannels,
    const long maxPhoneNum,
    const long totalPhoneNum)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    LONG64 uttId = blockDim.x * blockIdx.x + threadIdx.x;
    LONG64 t = blockDim.y * blockIdx.y + threadIdx.y;

    if (uttId < uttNum && t < uttFrameNum[uttId])
    {
        LONG64 phoneNum = uttPhoneNum[uttId];
        LONG64 alphaId_0 = (uttBeginFrame[uttId] * numChannels + uttToChanInd[uttId]) * maxPhoneNum;
        LONG64 timeId = (t + uttBeginFrame[uttId])*numChannels + uttToChanInd[uttId];
        comp_t P_lx = betaScore[alphaId_0];

        for (int s = 1; s < phoneNum - 1; s++)
        {
            long phoneId = (comp_t)phoneSeq[uttId*maxPhoneNum + s];
            LONG64 alphaId = maxPhoneNum* timeId + s;
            LONG64 probId = timeId*totalPhoneNum + phoneId;

            if (phoneId != SIZE_MAX)
            {
                comp_t logoccu = (comp_t)alphaScore[alphaId] + (comp_t)betaScore[alphaId] - (comp_t)prob[probId] - P_lx;
                CTCscore[probId] = logaddk((comp_t)CTCscore[probId], logoccu);
            }
        }

        for (int s = 0; s < totalPhoneNum; s++)
        {
            LONG64 probId = timeId*totalPhoneNum + s;
            comp_t logoccu = CTCscore[probId];
            if (logoccu < LZERO)
                CTCscore[probId] = 0.0f;
            else
                CTCscore[probId] = exp_(logoccu);
        }
    }
}

// Calculate CTC score. equation (8) in ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf
template<class ElemType>
__global__ void _assignTotalScore(ElemType *betaScore,
    ElemType *totalScore,
    const size_t uttNum,
    const size_t *uttToChanInd,
    const size_t *uttBeginFrame,
    const size_t numChannels,
    const size_t maxPhoneNum)
{
    typedef typename TypeSelector<ElemType>::comp_t comp_t;
    LONG64 uttId = blockIdx.x;
    if (uttId < uttNum)
    {
        LONG64 alphaId_0 = (uttBeginFrame[uttId] * numChannels + uttToChanInd[uttId]) * maxPhoneNum;

        betaScore[alphaId_0] = logaddk((comp_t)betaScore[alphaId_0 + 1], (comp_t)betaScore[alphaId_0 + 2]);
        // Negative sum
        atomicAdd(&totalScore[0], -1 * betaScore[alphaId_0]);
    }
}

template<class ElemType>
__global__ void _assignOneHot(ElemType *indices,
                                  ElemType *targetBuffer,
                                  size_t num_class,
                                  size_t num_item,
                                  size_t num_element)
{
    const CUDA_LONG index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < num_element)
    {
        if (indices[index] >= 0 && indices[index] < num_class)
        {
            size_t block_id = index / num_item;
            size_t item_id = index % num_item;
            targetBuffer[block_id * num_class * num_item + item_id + num_item * (size_t)(unsigned long long int)indices[index]] = 1;
        }
    }
}

template<class ElemType>
__global__ void _gatherFromTarget(ElemType *indices,
                                  ElemType *target,
                                  ElemType *buffer,
                                  size_t num_row_elements,
                                  size_t num_indices,
                                  CUDA_LONG num_elements)
{
    const CUDA_LONG index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < num_elements)
    {
        size_t indices_index = index / num_row_elements;
        size_t offset = index % num_row_elements;
        buffer[index] = target[(size_t)(unsigned long long int)indices[indices_index] * num_row_elements + offset];
    }
}

template<class ElemType>
__global__ void _scatterToIndices(ElemType *indices,
                                  ElemType *value,
                                  ElemType *buffer,
                                  char *mask,
                                  size_t num_indices_elems_per_mask_col,
                                  size_t num_row_elements,
                                  size_t num_indices,
                                  CUDA_LONG num_elements)
{
    const CUDA_LONG index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < num_elements)
    {
        size_t indices_index = index / num_row_elements;
        size_t offset = index % num_row_elements;
        //Skip missing values
        assert(!mask || num_indices_elems_per_mask_col != 0);
        if (mask && mask[indices_index / num_indices_elems_per_mask_col] == 0) return;
        //We resort to nondeterministic behavior (floating point addition is not associative).
        //Note that the CPU parallel algorithm will have poor performance on the GPU because of thread divergence
        atomicAdd(&buffer[(size_t)(unsigned long long int)indices[indices_index] * num_row_elements + offset], value[index]);
    }
}


template<class ElemType>
__global__ void _assignOneHotAsSparse(ElemType *indices,
                                      GPUSPARSE_INDEX_TYPE *secondaryIndices,
                                      GPUSPARSE_INDEX_TYPE *majorIndices,
                                      ElemType *targetBuffer,
                                      size_t num_class,
                                      int num_item,
                                      size_t num_elements)
{
    const CUDA_LONG index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < num_elements)
    {
        int block_id = index / num_item;
        int item_id = index % num_item;
        // for invalid indices, theorically they should not belong to nz elements.
        // but if we scan the indices to count the valid indices number,
        // it will be difficult for parallel calculation, especially on GPU.
        // here we chose to keep those elements in nz element list, but with value 0 at row 0
        if (indices[index] >= 0 && indices[index] < num_class)
        {
            targetBuffer[index] = 1;
            majorIndices[index] = ((int)indices[index] * num_item) + item_id;
        }
        else
        {
            targetBuffer[index] = 0;
            majorIndices[index] = item_id;
        }

        if (item_id == 0)
            secondaryIndices[block_id + 1] = num_item * (block_id + 1);

        if (index == 0)
            secondaryIndices[0] = 0;
    }
}

template<class ElemType>
__global__ void _setSparseDiagonalValue(ElemType v,
    GPUSPARSE_INDEX_TYPE *secondaryIndices,
    GPUSPARSE_INDEX_TYPE *majorIndices,
    ElemType *targetBuffer,
    size_t diagSize,
    size_t num_elements)
{
    const CUDA_LONG index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < diagSize)
    {
        majorIndices[index] = index;
        targetBuffer[index] = v;
        secondaryIndices[index] = index;
    }
    else if (index < num_elements)
    {
        secondaryIndices[index] = diagSize;
    }
}

template<class ElemType>
__global__ void _setSparseDiagonalValue(ElemType *vector,
    GPUSPARSE_INDEX_TYPE *secondaryIndices,
    GPUSPARSE_INDEX_TYPE *majorIndices,
    ElemType *targetBuffer,
    size_t diagSize,
    size_t num_elements)
{
    const CUDA_LONG index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < diagSize)
    {
        majorIndices[index] = index;
        targetBuffer[index] = vector[index];
        secondaryIndices[index] = index;
    }
    else if (index < num_elements)
    {
        secondaryIndices[index] = diagSize;
    }
}

}}}

#endif // !CPUONLY
Browse the archive

https://github.com/Microsoft/CNTK