//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#include "stdafx.h"
#include "Basics.h"
#include "BestGpu.h"

#ifndef CPUONLY

#include "GPUMatrix.h"
#include "GPUMatrixCUDAKernels.cuh"
#include "GPUSparseMatrix.h"
#include "GPUTensor.h"
#include "CommonMatrix.h"
#define TENSOR_OPS_DECL __device__ __host__
#include "TensorOps.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand.h>
#include "cublas_v2.h"
#include <assert.h>
#include <memory>
#include "CntkBatchNormalization.cuh"
#include "Convolution.cuh"

#pragma comment(lib, "cudart.lib") // instruct linker to reference these libs
#pragma comment(lib, "cublas.lib")
#pragma comment(lib, "cusparse.lib")
#pragma comment(lib, "curand.lib")

#pragma warning(disable : 4267) // conversion from 'size_t' to 'unsigned int'; happens in CUDA <<<a, b>>> syntax if a and b are size_t
#pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
#pragma warning(disable : 4702) // unreachable code; triggered for unknown reasons

#define DEFAULT_THREAD_PER_DIM 16

#define UNCONST(t, c, uc) GPUMatrix<t>& uc = const_cast<GPUMatrix<t>&>(c);

#ifdef _WIN32
// thread local storage to access the current stream, initialize to default stream
__declspec(thread)
#endif
    cudaStream_t t_stream = cudaStreamDefault;

extern int _ConvertSMVer2Cores(int major, int minor); // forward declaration

// SetStream - set the stream that will be used by the GPU routines
void MATH_API SetStream(cudaStream_t stream)
{
    t_stream = stream;
}

// GetStream - get the stream that will be used by the GPU routines
cudaStream_t MATH_API GetStream()
{
    return t_stream;
}

// Helper macro patterns for element-wise methods
#define DEF_ELEMWISE_INPLACE_FUNC(f)                                    \
    template <class ElemType>                                           \
    GPUMatrix<ElemType>& GPUMatrix<ElemType>::Inplace##f()              \
    {                                                                   \
        performElementWiseFunction(ElementWiseOperator::op##f, Data()); \
        return *this;                                                   \
    }

#define DEF_ELEMWISE_ASSIGN_FUNC(f)                                                       \
    template <class ElemType>                                                             \
    GPUMatrix<ElemType>& GPUMatrix<ElemType>::Assign##f##Of(const GPUMatrix<ElemType>& a) \
    {                                                                                     \
        if (a.IsEmpty())                                                                  \
            LogicError("Assign##f##Of: Matrix a is empty.");                              \
        if (this != &a)                                                                   \
            RequireSize(a.GetNumRows(), a.GetNumCols());                                  \
        performElementWiseFunction(ElementWiseOperator::op##f, a.Data());                 \
        return *this;                                                                     \
    }

template <>
const char* CudaErrString(cudaError_t x)
{
    cudaDeviceSynchronize();
    return cudaGetErrorString(x);
}

template <>
const char* CudaErrString(cublasStatus_t e)
{
    cudaDeviceSynchronize();
    switch (e)
    {
    case CUBLAS_STATUS_SUCCESS:          return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:  return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:     return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:    return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:    return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:    return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:   return "CUBLAS_STATUS_INTERNAL_ERROR";
    case CUBLAS_STATUS_NOT_SUPPORTED:    return "CUBLAS_STATUS_NOT_SUPPORTED";
    case CUBLAS_STATUS_LICENSE_ERROR:    return "CUBLAS_STATUS_LICENSE_ERROR";
    default:                             return "(look for CUBLAS_STATUS_xxx in cublas_api.h)";
    }
}

template <>
const char* CudaErrString(curandStatus)
{
    cudaDeviceSynchronize();
    return "(see curand.h & look for curandStatus or CURAND_STATUS_xxx)";
}

namespace Microsoft { namespace MSR { namespace CNTK {

template <typename AllocatedElemType>
AllocatedElemType*
TracingGPUMemoryAllocator::Allocate(int deviceId, size_t numRows, size_t numCols)
{
    if (IsTraceEnabled())
    {
        auto freeAndTotalMemory = GetFreeAndTotalMemoryInMBs(deviceId);
        fprintf(stderr, "Allocating Matrix<%s> (Rows = %d, Cols = %d) buffer on DeviceId = %d; GPU Memory Free = %d MB of %d MB\n",
                typeid(AllocatedElemType).name(), (int)numRows, (int)numCols, (int)deviceId, (int)freeAndTotalMemory.first, (int)freeAndTotalMemory.second);
        Microsoft::MSR::CNTK::DebugUtil::PrintCallStack();
    }

    AllocatedElemType* deviceBufferPtr = AllocateNoTrace(deviceId, numRows * numCols);

    if (IsTraceEnabled())
    {
        fprintf(stderr, "Allocated DeviceData = %p\n", (void*) deviceBufferPtr);
    }

    return deviceBufferPtr;
}

template <typename AllocatedElemType>
AllocatedElemType* TracingGPUMemoryAllocator::Allocate(int deviceId, size_t numElements)
{
    if (IsTraceEnabled())
    {
        auto freeAndTotalMemory = GetFreeAndTotalMemoryInMBs(deviceId);
        fprintf(stderr, "Allocating array<%s> (NumElements = %d) on DeviceId = %d; GPU Memory Free = %d MB of %d MB\n",
                typeid(AllocatedElemType).name(), (int)numElements, (int)deviceId, (int)freeAndTotalMemory.first, (int)freeAndTotalMemory.second);
        Microsoft::MSR::CNTK::DebugUtil::PrintCallStack();
    }

    AllocatedElemType* deviceBufferPtr = AllocateNoTrace(deviceId, numElements);

    if (IsTraceEnabled())
    {
        fprintf(stderr, "Allocated DeviceData = %p\n", (void*)deviceBufferPtr);
    }

    return deviceBufferPtr;
}

template <typename AllocatedElemType>
void TracingGPUMemoryAllocator::Free(int deviceId, AllocatedElemType* bufferPtr, bool ignoreCUDARetCode /*= false*/)
{
    PrepareDevice(deviceId);
    if (ignoreCUDARetCode)
        cudaFree((void*) bufferPtr);
    else
        CUDA_CALL(cudaFree((void*) bufferPtr));

    if (IsTraceEnabled())
    {
        auto freeAndTotalMemory = GetFreeAndTotalMemoryInMBs(deviceId);
        fprintf(stderr, "Freed buffer<%s> DeviceData = %p on DeviceId = %d; GPU Memory Free = %d MB of %d MB\n",
                typeid(AllocatedElemType).name(), (void*) bufferPtr, (int) deviceId, (int) freeAndTotalMemory.first, (int) freeAndTotalMemory.second);
        Microsoft::MSR::CNTK::DebugUtil::PrintCallStack();
    }
}

template <typename AllocatedElemType>
AllocatedElemType* TracingGPUMemoryAllocator::AllocateNoTrace(int deviceId, size_t numElements)
{
    AllocatedElemType* deviceBufferPtr;

    PrepareDevice(deviceId);
    CUDA_CALL(cudaMalloc((void**) &deviceBufferPtr, sizeof(AllocatedElemType) * numElements));

    return deviceBufferPtr;
}

std::pair<size_t, size_t> TracingGPUMemoryAllocator::GetFreeAndTotalMemoryInMBs(int deviceId)
{
    PrepareDevice(deviceId);

    size_t free, total;
    CUDA_CALL(cudaMemGetInfo(&free, &total));

    size_t numBytesPerMB = 1 << 20;
    return {free / numBytesPerMB, total / numBytesPerMB};
}

// PrepareDevice - Setup the correct cuda context for an operation
// deviceId - the device on which the operation will take place
void PrepareDevice(DEVICEID_TYPE deviceId)
{
    static DEVICEID_TYPE currentDevice = DEVICEID_NOTYETDETERMINED;
    // and if we last set the device to be this device we are good
    if (deviceId == currentDevice)
        return;
    CUDA_CALL(cudaSetDevice(deviceId));
    currentDevice = deviceId;
}

#pragma region DeviceBoundNumber class

template <class ElemType>
DeviceBoundNumber<ElemType>::DeviceBoundNumber(const DeviceBoundNumber<ElemType>& /*deepCopy*/)
{
    NOT_IMPLEMENTED;
}

template <class ElemType>
DeviceBoundNumber<ElemType>::DeviceBoundNumber(DeviceBoundNumber<ElemType>&& shallowCopy)
{
    ShallowCopyFrom(shallowCopy.m_data, shallowCopy.m_computeDevice);
    shallowCopy.m_data = NULL;
}

template <class ElemType>
void DeviceBoundNumber<ElemType>::ShallowCopyFrom(ElemType* newVal, int newValsDevceId)
{
    m_computeDevice = newValsDevceId;
    m_data = newVal;
}

template <class ElemType>
DeviceBoundNumber<ElemType>::~DeviceBoundNumber()
{
    if (m_data != NULL)
    {
        if (m_computeDevice < 0)
        {
            delete m_data;
            m_data = NULL;
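            // Note: a negative m_computeDevice marks a CPU-resident value, so a plain delete is the right
            // disposal here; GPU-resident values instead fall through to TracingGPUMemoryAllocator::Free below.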
} else { TracingGPUMemoryAllocator::Free(m_computeDevice, m_data); } } } #pragma endregion DeviceBoundNumber class #pragma region Helper functions template cublasHandle_t _initCUBLAS(int devId) { PrepareDevice((DEVICEID_TYPE) devId); cublasHandle_t cuHandle; CUBLAS_CALL(cublasCreate(&cuHandle)); return cuHandle; } template void GPUMatrix::SetDevice(DEVICEID_TYPE deviceId) { assert(deviceId >= 0); CUDA_CALL(cudaSetDevice(deviceId)); } // PrepareDevice - Setup the correct cuda context for an operation // deviceId - the device on which the operation will take place // defaults to -1, which means use matrices current device template DEVICEID_TYPE GPUMatrix::PrepareDevice(DEVICEID_TYPE deviceId /*=-1*/) const { // if default value use current compute device DEVICEID_TYPE newId = deviceId >= 0 ? deviceId : GetComputeDeviceId(); Microsoft::MSR::CNTK::PrepareDevice(newId); return newId; } template ElemType* GPUMatrix::CopyToArray() const { size_t numElements = GetNumElements(); if (numElements != 0) { PrepareDevice(); ElemType* pArray = new ElemType[numElements]; CUDA_CALL(cudaMemcpy(pArray, Data(), sizeof(ElemType) * m_numRows * m_numCols, cudaMemcpyDeviceToHost)); return pArray; } else { return NULL; } } //memory will be allocated by the callee if not enough but need to be deleted by the caller after it's done //return number of elements copied template size_t GPUMatrix::CopyToArray(ElemType*& arrayCopyTo, size_t& currentArraySize) const { size_t numElements = GetNumElements(); if (numElements > currentArraySize) { delete arrayCopyTo; arrayCopyTo = new ElemType[numElements]; currentArraySize = numElements; } if (numElements != 0) { PrepareDevice(); CUDA_CALL(cudaMemcpy(arrayCopyTo, Data(), sizeof(ElemType) * numElements, cudaMemcpyDeviceToHost)); } return numElements; } template void GPUMatrix::CopySection(size_t numRows, size_t numCols, ElemType* dst, size_t colStride) const { CUBLAS_CALL(cublasGetMatrix((int) numRows, (int) numCols, sizeof(ElemType), Data(), (int) GetNumRows(), dst, (int) colStride)); } template void GPUMatrix::ChangeDeviceTo(DEVICEID_TYPE to_id) { if (to_id == CPUDEVICE) LogicError("to_id must be valid GPU"); if (GetComputeDeviceId() == to_id) return; ElemType* d_dst = TracingGPUMemoryAllocator::Allocate(to_id, m_numRows, m_numCols); SetSizeAllocated(m_numRows * m_numCols); // check to make sure we have something to copy (on init we often have zero sized allocations) if (GetSizeAllocated() > 0) { // first try peer access int canAccessPeer = false; CUDA_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, to_id, GetComputeDeviceId())); if (canAccessPeer) { cudaError_t cudaStatus = cudaDeviceEnablePeerAccess(GetComputeDeviceId(), 0); if (cudaStatus != cudaErrorPeerAccessAlreadyEnabled) { CUDA_CALL(cudaStatus); } CUDA_CALL(cudaMemcpyPeer(d_dst, to_id, Data(), GetComputeDeviceId(), sizeof(ElemType) * m_numRows * m_numCols)); } else { // peer access didn't work, just copy normal // make this more efficient by keeping some buffers available for each copy ElemType* h_dst = NULL; PrepareDevice(); CUDA_CALL(cudaMallocHost((void**) &h_dst, sizeof(ElemType) * m_numRows * m_numCols)); CUDA_CALL(cudaMemcpy(h_dst, Data(), sizeof(ElemType) * m_numRows * m_numCols, cudaMemcpyDeviceToHost)); PrepareDevice((DEVICEID_TYPE) to_id); CUDA_CALL(cudaMemcpy(d_dst, h_dst, sizeof(ElemType) * m_numRows * m_numCols, cudaMemcpyHostToDevice)); CUDA_CALL(cudaFreeHost(h_dst)); } } TracingGPUMemoryAllocator::Free(GetComputeDeviceId(), Buffer()); SetBuffer(d_dst, m_numRows * m_numCols * sizeof(ElemType)); 
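    // Make the destination device current and record it as this matrix's new compute device.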
PrepareDevice((DEVICEID_TYPE) to_id); SetComputeDeviceId(to_id); } template void GPUMatrix::performElementWiseFunction(ElementWiseOperator kind, const ElemType* src) { PrepareDevice(); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); switch (kind) { case ElementWiseOperator::opSigmoid: { PROFILE_CUDA_STREAM("_elementWiseSigmoidOnCuda", t_stream); _elementWiseSigmoidOnCuda<<>>(src, Data(), N); break; } case ElementWiseOperator::opTanh: { PROFILE_CUDA_STREAM("_elementWiseTanhOnCuda", t_stream); _elementWiseTanhOnCuda<<>>(src, Data(), N); break; } case ElementWiseOperator::opSqrt: { PROFILE_CUDA_STREAM("_elementWiseSqrtOnCuda", t_stream); _elementWiseSqrtOnCuda<<>>(src, Data(), N); break; } case ElementWiseOperator::opExp: { PROFILE_CUDA_STREAM("_elementWiseExpOnCuda", t_stream); _elementWiseExpOnCuda<<>>(src, Data(), N); break; } case ElementWiseOperator::opLog: { PROFILE_CUDA_STREAM("_elementWiseLogOnCuda", t_stream); _elementWiseLogOnCuda<<>>(src, Data(), N); break; } case ElementWiseOperator::opAbs: { PROFILE_CUDA_STREAM("_elementWiseAbsOnCuda", t_stream); _elementWiseAbsOnCuda<<>>(src, Data(), N); break; } case ElementWiseOperator::opLinearRectifierDerivative: { PROFILE_CUDA_STREAM("_elementWiseLinRectDerivativeOnCuda", t_stream); _elementWiseLinRectDerivativeOnCuda<<>>(src, Data(), N); break; } case ElementWiseOperator::opCosine: { PROFILE_CUDA_STREAM("_elementWiseCosineOnCuda", t_stream); _elementWiseCosineOnCuda<<>>(src, Data(), N); break; } case ElementWiseOperator::opNegativeSine: { PROFILE_CUDA_STREAM("_elementWiseNegativeSineOnCuda", t_stream); _elementWiseNegativeSineOnCuda<<>>(src, Data(), N); break; } case ElementWiseOperator::opSigmoidDerivative: { PROFILE_CUDA_STREAM("_elementWiseSigmoidDerivativeOnCuda", t_stream); _elementWiseSigmoidDerivativeOnCuda<<>>(src, Data(), N); break; } default: LogicError("performElementWiseFunction: unexpected op code %d", (int)kind); } } #pragma endregion Helper functions #pragma region Constructors and Destructor // should only be used by constructors template void GPUMatrix::ZeroInit(int deviceId) { BaseMatrix::ZeroInit(); SetComputeDeviceId(deviceId); } template GPUMatrix::GPUMatrix(int deviceId) { ZeroInit(deviceId); }; template GPUMatrix::GPUMatrix(const size_t numRows, const size_t numCols, int deviceId) { ZeroInit(deviceId); m_numRows = numRows; m_numCols = numCols; SetSizeAllocated(GetNumElements()); if (GetNumElements() != 0) { SetBuffer(TracingGPUMemoryAllocator::Allocate(GetComputeDeviceId(), m_numRows, m_numCols), GetNumElements() * sizeof(ElemType)); CUDA_CALL(cudaMemset(Buffer(), 0, sizeof(ElemType) * GetSizeAllocated())); } }; template GPUMatrix::GPUMatrix(const size_t numRows, const size_t numCols, int deviceId, ElemType* pArray, const size_t matrixFlags) { ZeroInit(deviceId); SetValue(numRows, numCols, deviceId, pArray, matrixFlags); }; template GPUMatrix::GPUMatrix(const GPUMatrix& deepCopyFrom) { ZeroInit(); SetValue(deepCopyFrom); } template GPUMatrix::GPUMatrix(GPUMatrix&& moveFrom) { ShallowCopyFrom(moveFrom); moveFrom.ZeroValues(); } //assignment operator, deep copy template GPUMatrix& GPUMatrix::operator=(const GPUMatrix& deepCopyFrom) { if (this != &deepCopyFrom) { SetValue(deepCopyFrom); } return *this; } //move assignment operator, shallow copy template GPUMatrix& GPUMatrix::operator=(GPUMatrix&& moveFrom) { if (this != &moveFrom) { ShallowCopyFrom(moveFrom); moveFrom.ZeroValues(); } return *this; } template GPUMatrix::~GPUMatrix(void) { } // TODO: This 
should be in the storage object. // Clear will clear your storage, zeroinit just drops it on the ground. template void GPUMatrix::Clear() { VerifyWritable(__func__); //if (OwnBuffer() && m_pArray != NULL) if (m_sob != nullptr) { if (GetComputeDeviceId()>= 0) { // BUG: We do not check the CUDA return code for cudaFree here since this may get called // during processExit when cudaFree will fail. The destruction of CUDA objects during // process exit must be avoided ReleaseStorageMemory(); } } ZeroInit(GetComputeDeviceId()); } #pragma endregion Constructors and Destructor template std::unique_ptr> GPUMatrix::GetOrCreateWorkspace() const { // REVIEW alexeyk: not thread-safe, fine for now. if (m_workspace == nullptr) m_workspace = std::make_unique>>>(); assert(m_workspace != nullptr); auto deviceId = GetComputeDeviceId(); return m_workspace->pop_or_create([deviceId]() { return std::make_unique>(deviceId); }); } template void GPUMatrix::ReleaseWorkspace(std::unique_ptr> src) const { assert(m_workspace != nullptr); m_workspace->push(std::move(src)); } #pragma region Basic Operators template GPUMatrix GPUMatrix::ColumnSlice(size_t startColumn, size_t numCols) const { if (startColumn + numCols > GetNumCols()) InvalidArgument("The slice (%d+%d) is out of range of the source matrix (%d).", (int) startColumn, (int) numCols, (int) GetNumCols()); GPUMatrix slice(GetComputeDeviceId()); slice.ShallowCopyFrom(*this); slice.m_numCols = numCols; slice.m_sliceViewOffset = m_sliceViewOffset + startColumn * GetNumRows(); return slice; } template GPUMatrix& GPUMatrix::AssignColumnSlice(const GPUMatrix& fromMatrix, size_t startColumn, size_t numCols) { if (numCols == 0) LogicError("The slice cannot have 0 columns."); if (startColumn + numCols > fromMatrix.GetNumCols()) InvalidArgument("The slice (%d+%d) is out of range of the source matrix (%d).", (int) startColumn, (int) numCols, (int) fromMatrix.GetNumCols()); Clear(); ShallowCopyFrom(fromMatrix); m_numCols = numCols; m_sliceViewOffset = fromMatrix.m_sliceViewOffset + startColumn * GetNumRows(); return *this; } template GPUMatrix& GPUMatrix::SetColumnSlice(const GPUMatrix& fromMatrix, size_t startColumn, size_t numCols) { if (startColumn + numCols > GetNumCols()) LogicError("The slice is out of range of the destination matrix."); if (numCols > fromMatrix.GetNumCols()) InvalidArgument("The slice (%d) is out of range of the source matrix (%d).", (int) numCols, (int) fromMatrix.GetNumCols()); if (m_numRows != fromMatrix.m_numRows) LogicError("The number of rows in source and destination matrices do not match"); if (m_numRows * numCols > 0) // TODO: remove if unnecessary CUDA_CALL(cudaMemcpy(Data() + m_sliceViewOffset + LocateColumn(startColumn), fromMatrix.Data(), sizeof(ElemType) * m_numRows * numCols, cudaMemcpyDeviceToDevice)); return *this; } template void GPUMatrix::CopyColumnsStrided(const GPUMatrix& fromMatrix, size_t numCols, size_t srcNumColsStride, size_t destNumColsStride) { if ((((numCols - 1) * srcNumColsStride) + 1) > fromMatrix.m_numCols) LogicError("The numCols to copy and srcNumColsStride specified is out of range of the source matrix."); if ((((numCols - 1) * destNumColsStride) + 1) > m_numCols) LogicError("The numCols to copy and srcNumColsStride specified is out of range of the destination matrix."); if (m_numRows != fromMatrix.m_numRows) LogicError("The number of rows in source and destination matrices do not match"); if ((m_numRows * numCols) > 0) { // Launch a kernel to do the strided copy CUDA_LONG N = (CUDA_LONG)(m_numRows * numCols); int 
blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); PROFILE_CUDA_STREAM("_copyColumnsStrided", t_stream); _copyColumnsStrided<<>>(Data(), fromMatrix.Data(), N, (CUDA_LONG) m_numRows, (CUDA_LONG) destNumColsStride, (CUDA_LONG) srcNumColsStride); } } //for each column of a, we assign all rows of a to this starting from startIndex template GPUMatrix& GPUMatrix::AssignToRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) { if (a.IsEmpty()) LogicError("AddToRowSliceValuesOf: input matrix a is empty."); if (a.GetNumRows() != numRows) LogicError("AddToRowSliceValuesOf: a.GetNumRows() != numRows."); if (startIndex + numRows > GetNumRows()) LogicError("AddToRowSliceValuesOf: startIndex + numRows exceeds GetNumRows()."); if (a.GetNumCols() != GetNumCols()) LogicError("AddToRowSliceValuesOf: columns does not match."); CUDA_LONG N = (CUDA_LONG) a.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); PROFILE_CUDA_STREAM("_assignToRowSliceValuesOf", t_stream); _assignToRowSliceValuesOf<<>>(Data(), a.Data(), N, (CUDA_LONG) startIndex, (CUDA_LONG) GetNumRows(), (CUDA_LONG) a.GetNumRows()); return *this; } //for each column of a, we assign numRows starting from startIndex to this template GPUMatrix& GPUMatrix::AssignRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) { if (a.IsEmpty()) LogicError("AssignRowSliceValuesOf: input matrix a is empty."); if (startIndex + numRows > a.GetNumRows()) LogicError("AssignRowSliceValuesOf: startIndex + numRows exceeds a.GetNumRows()."); RequireSize(numRows, a.GetNumCols()); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); PROFILE_CUDA_STREAM("_assignRowSliceValuesOf", t_stream); _assignRowSliceValuesOf<<>>(Data(), a.Data(), N, (CUDA_LONG) startIndex, (CUDA_LONG) numRows, (CUDA_LONG) a.GetNumRows()); return *this; } //for the row slice of this starting from startIndex we add a to it. 
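// (Illustrative example of the row-slice family, not from the original source: given a 4 x n matrix A and a
// 2 x n matrix B, B.AssignRowSliceValuesOf(A, /*startIndex=*/1, /*numRows=*/2) copies rows 1 and 2 of A into B,
// while A.AddToRowSliceValuesOf(B, 1, 2) adds B back into those same rows of A.)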
template GPUMatrix& GPUMatrix::AddToRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) { if (a.IsEmpty()) LogicError("AddToRowSliceValuesOf: input matrix a is empty."); if (a.GetNumRows() != numRows) LogicError("AddToRowSliceValuesOf: a.GetNumRows() != numRows."); if (startIndex + numRows > GetNumRows()) LogicError("AddToRowSliceValuesOf: startIndex + numRows exceeds GetNumRows()."); if (a.GetNumCols() != GetNumCols()) LogicError("AddToRowSliceValuesOf: columns does not match."); CUDA_LONG N = (CUDA_LONG) a.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); PROFILE_CUDA_STREAM("_addToRowSliceValuesOf", t_stream); _addToRowSliceValuesOf<<>>(Data(), a.Data(), N, (CUDA_LONG) startIndex, (CUDA_LONG) GetNumRows(), (CUDA_LONG) a.GetNumRows()); return *this; } //for each column of this, we add row slice of a starting from startIndex template GPUMatrix& GPUMatrix::AddWithRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) { if (a.IsEmpty()) LogicError("AddWithRowSliceValuesOf: input matrix a is empty."); if (GetNumRows() != numRows) LogicError("AddWithRowSliceValuesOf: GetNumRows() != numRows."); if (startIndex + numRows > a.GetNumRows()) LogicError("AddWithRowSliceValuesOf: startIndex + numRows exceeds a.GetNumRows()."); if (a.GetNumCols() != GetNumCols()) LogicError("AddWithRowSliceValuesOf: columns does not match."); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); PROFILE_CUDA_STREAM("_addWithRowSliceValuesOf", t_stream); _addWithRowSliceValuesOf<<>>(Data(), a.Data(), N, (CUDA_LONG) startIndex, (CUDA_LONG) GetNumRows(), (CUDA_LONG) a.GetNumRows()); return *this; } template GPUMatrix GPUMatrix::Diagonal() const { size_t m = GetNumRows(); size_t n = GetNumCols(); if (m != n) LogicError("Diagonal can be called only for square matrix. (rows=%d, cols=%d)", (int) m, (int) n); GPUMatrix diag(1, n, GetComputeDeviceId()); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); PROFILE_CUDA_STREAM("_assignToDiagonalValuesOf", t_stream); _assignToDiagonalValuesOf<<>>(diag.Data(), Data(), N, (CUDA_LONG) n); return diag; } // c = c - 1.0 for a specific position template void GPUMatrix::MinusOneAt(GPUMatrix& c, const size_t position) { assert(position < c.GetNumElements()); CUDA_LONG n = (CUDA_LONG) c.GetNumElements(); CUDA_LONG p = (CUDA_LONG) position; int blocksPerGrid = (int) ceil(1.0 * n / GridDim::maxThreadsPerBlock); c.PrepareDevice(); PROFILE_CUDA_STREAM("_minusOneAt", t_stream); _minusOneAt<<>>(c.Data(), p, n); } template GPUMatrix& GPUMatrix::AssignRepeatOf(const GPUMatrix& a, const size_t numRowRepeats, const size_t numColRepeats) { if (this == &a) LogicError("AssignRepeatOf: a is the same as [this]. 
Does not support inplace repeat."); if (a.IsEmpty()) LogicError("AssignRepeatOf: Matrix a is empty."); RequireSize(a.GetNumRows() * numRowRepeats, a.GetNumCols() * numColRepeats); CUDA_LONG N = (CUDA_LONG) GetNumElements(); CUDA_LONG n = (CUDA_LONG) a.GetNumCols(), m = (CUDA_LONG) a.GetNumRows(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); PROFILE_CUDA_STREAM("_assignRepeatOf", t_stream); _assignRepeatOf<<>>(Data(), a.Data(), N, m, n, (CUDA_LONG) GetNumRows()); return *this; } template GPUMatrix& GPUMatrix::AddToRowRepeatValuesOf(const GPUMatrix& a, const size_t numRepeats) { if (a.IsEmpty()) LogicError("AddToRowRepeatValuesOf: input matrix a is empty."); if (a.GetNumRows() != GetNumRows() * numRepeats) LogicError("AddToRowSliceValuesOf: a.GetNumRows() != GetNumRows() * numRepeats."); RequireSize(a.GetNumRows() / numRepeats, a.GetNumCols()); CUDA_LONG N = (CUDA_LONG) a.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); PROFILE_CUDA_STREAM("_addToRowRepeatValuesOf", t_stream); _addToRowRepeatValuesOf<<>>(Data(), a.Data(), N, (CUDA_LONG) a.GetNumRows(), (CUDA_LONG) a.GetNumCols(), (CUDA_LONG) GetNumRows()); return *this; } template GPUMatrix& GPUMatrix::AssignPositiveAndShiftedNegSample(const GPUMatrix& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber) { if (this == &a) LogicError("AssignPositiveAndShiftedNegSample: a is the same as [this]. Does not support inplace assignment."); if (a.IsEmpty()) LogicError("AssignPositiveAndShiftedNegSample: Matrix a is empty."); RequireSize(a.GetNumRows() * (posNumber + negNumber), a.GetNumCols()); CUDA_LONG N = (CUDA_LONG) GetNumElements(); CUDA_LONG n = (CUDA_LONG) a.GetNumCols(), m = (CUDA_LONG) a.GetNumRows(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); PROFILE_CUDA_STREAM("_assignPositiveAndShiftedNegSample", t_stream); _assignPositiveAndShiftedNegSample<<>>(Data(), a.Data(), N, m, n, (CUDA_LONG) GetNumRows(), posNumber, shiftNumber); return *this; } template GPUMatrix& GPUMatrix::AddFoldedPositiveAndShiftedNegSample(const GPUMatrix& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber) { if (this == &a) LogicError("AddFoldedPositiveAndShiftedNegSample: a is the same as [this]. 
Does not support inplace assignment."); if (a.IsEmpty()) LogicError("AddFoldedPositiveAndShiftedNegSample: Matrix a is empty."); if (a.GetNumRows() != GetNumRows() * (posNumber + negNumber) || a.GetNumCols() != GetNumCols()) LogicError("AddFoldedPositiveAndShiftedNegSample: dimensions mismatch."); CUDA_LONG N = (CUDA_LONG) a.GetNumElements(); CUDA_LONG n = (CUDA_LONG) a.GetNumCols(), m = (CUDA_LONG) a.GetNumRows(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); PROFILE_CUDA_STREAM("_addFoldedPositiveAndShiftedNegSample", t_stream); _addFoldedPositiveAndShiftedNegSample<<>>(Data(), a.Data(), N, m, n, (CUDA_LONG) GetNumRows(), posNumber, shiftNumber); return *this; } template GPUMatrix GPUMatrix::Transpose() const { if (IsEmpty()) LogicError("Transpose: Matrix is empty."); GPUMatrix c(GetComputeDeviceId()); c.AssignTransposeOf(*this); return c; } // GetCublasHandle - get a cublas handle for the given GPU, should only need one per GPU // computeDevice - The compute device for which the cublas handle is desired // returns: cublas handle // NOTE: we currently don't bother to ever free the CUBLAS handle, it will be freed automatically by CUDA when the process ends template cublasHandle_t GPUMatrix::GetCublasHandle(int computeDevice /*=-1*/) { // if the compute device is not passed, get the current device from CUDA if (computeDevice < 0) cudaGetDevice(&computeDevice); if (computeDevice < 0 || computeDevice >= MaxGpus) LogicError("GetCublasHandle: Maximum GPU exceeded"); cublasHandle_t cuHandle = s_cuHandle[computeDevice]; if (cuHandle == NULL) { s_cuHandle[computeDevice] = cuHandle = _initCUBLAS(computeDevice); } CUBLAS_CALL(cublasSetStream(cuHandle, t_stream)); return cuHandle; } template GPUMatrix& GPUMatrix::AssignTransposeOf(const GPUMatrix& a) { if (this == &a) LogicError("AssignTransposeOf: a is the same as [this]. 
Does not support inplace transpose."); if (a.IsEmpty()) LogicError("AssignTransposeOf: Matrix a is empty."); if (GetNumRows() != a.GetNumCols() || GetNumCols() != a.GetNumRows()) RequireSize(a.GetNumCols(), a.GetNumRows()); cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); cublasOperation_t transA = CUBLAS_OP_T; cublasOperation_t transB = CUBLAS_OP_T; int m = (int) a.m_numCols; int n = (int) a.m_numRows; ElemType alpha = 1; ElemType beta = 0; cublasStatus_t st; if (sizeof(ElemType) == sizeof(float)) { PROFILE_CUDA_STREAM("cublasSgeam", t_stream); st = cublasSgeam(cuHandle, transA, transB, m, n, reinterpret_cast(&alpha), reinterpret_cast(a.Data()), (int)a.m_numRows, reinterpret_cast(&beta), reinterpret_cast(a.Data()), (int)a.m_numRows, reinterpret_cast(Data()), (int)m_numRows); } else if (sizeof(ElemType) == sizeof(double)) { PROFILE_CUDA_STREAM("cublasDgeam", t_stream); st = cublasDgeam(cuHandle, transA, transB, m, n, reinterpret_cast(&alpha), reinterpret_cast(a.Data()), (int)a.m_numRows, reinterpret_cast(&beta), reinterpret_cast(a.Data()), (int)a.m_numRows, reinterpret_cast(Data()), (int)m_numRows); } else RuntimeError("Unsupported template argument in GPUMatrix"); if (st != CUBLAS_STATUS_SUCCESS) RuntimeError("AssignTransposeOf failed"); m_numRows = a.m_numCols; m_numCols = a.m_numRows; return *this; } template __global__ void _doGatherColumnsOf(ElemType* us, size_t usStride, const ElemType beta, const ElemType* idx, size_t idxStride, const ElemType* a, size_t aStride, size_t aCols, const ElemType alpha, CUDA_LONG numElements) { CUDA_LONG id = GridDim::GetLinearThreadId(); if (id >= numElements) // note: there are no __syncthread() calls inside return; // id = i + jOut * usStride; // Each thread processes one element of the output matrix. CUDA_LONG i = id % usStride; // row index into 'us' and 'a' CUDA_LONG jOut = id / usStride; // col index into 'us' and 'idx' auto jInF = idx[jOut * idxStride]; // this is the column we need to get if (jInF < 0) // negative index means gap return; size_t jIn = (size_t)jInF; if (jIn >= aCols) return; // actually a failure const ElemType& ra = a[ i + jIn * aStride ]; ElemType& rus = us[id/*i + jOut * usStride*/]; ElemType res = ra * alpha; if (beta != 0) res += rus * beta; rus = res; } // *this[:,j] = a[:,idx[j]] * alpha + *this[:,j] * beta template GPUMatrix& GPUMatrix::DoGatherColumnsOf(ElemType beta, const GPUMatrix& idx, const GPUMatrix& a, ElemType alpha) { if (idx.GetNumRows() != 1) // index is 1-dimensional only InvalidArgument("DoGatherColumnsOf: Map must be a row vector."); if (beta == 0) RequireSize(a.GetNumRows(), idx.GetNumCols()); // output has same column format as a, but number of columns comes from idx else VerifySize(a.GetNumRows(), idx.GetNumCols()); if (idx.GetComputeDeviceId() != a.GetComputeDeviceId() || GetComputeDeviceId() != a.GetComputeDeviceId()) InvalidArgument("All matrices must be on the same GPU"); a.PrepareDevice(); // launch the kernel CUDA_LONG NN = (CUDA_LONG)GetNumElements(); // linear space identifying each individual input element GridDim grid(NN); PROFILE_CUDA_STREAM("_doGatherColumnsOf", t_stream); _doGatherColumnsOf<<>>(Data(), GetNumRows(), beta, idx.Data(), idx.GetNumRows(), a.Data(), a.GetNumRows(), a.GetNumCols(), alpha, grid.m_N); // Note: The following fails silently (no error, immediate or delayed) for numcols = 10000 under CUDA 7.0. 
//_doGatherColumnsOf<<>>(Data(), GetNumRows(), beta, idx.Data(), idx.GetNumRows(), a.Data(), a.GetNumRows(), a.GetNumCols(), alpha); return *this; } template __global__ void _doScatterColumnsOf(ElemType* us, size_t usStride, size_t usCols, const ElemType* idx, size_t idxStride, const ElemType* a, size_t aStride, const ElemType alpha, CUDA_LONG numElements) { CUDA_LONG id = GridDim::GetLinearThreadId(); if (id >= numElements) // note: there are no __syncthread() calls inside return; // id = i + jIn * aStride // Each thread processes one element of a CUDA_LONG i = id % aStride; // row index into 'a' and 'us' CUDA_LONG jIn = id / aStride; // col index into 'a' and 'idx' auto jOutF = idx[jIn * idxStride]; // this is the column we copy/add into if (jOutF < 0) // negative index means gap return; size_t jOut = (size_t)jOutF; if (jOut >= usCols) return; // actually a failure --TODO: This should not be necessary. Why is it? const ElemType& ra = a[id/*i + jIn * aStride*/]; ElemType& rus = us[ i + jOut * usStride ]; ElemType res = ra * alpha; if (res != 0) // avoid memory conflict if e.g. an entire column has no gradient atomicAdd(&rus, res); // rus += res; // Note: atomicAdd() is supposed to be fast in case of no conflict (the simple case of Scatter()) } // little helper for debugging template static void Peek(const GPUMatrix& m, const char* which) { size_t rows = m.GetNumRows(); size_t cols = m.GetNumCols(); ElemType buf[10000] = { 0 }; size_t n = min(rows * cols, _countof(buf)); CUDA_CALL(cudaMemcpy(buf, m.Data(), sizeof(ElemType) * n, cudaMemcpyDeviceToHost)); UNUSED(which); UNUSED(rows); UNUSED(cols); sin(1.0f); // set breakpoint here //CUDA_CALL(cudaMemcpy(const_cast(m.Data()), buf, sizeof(ElemType) * n, cudaMemcpyHostToDevice)); } // *this[:,idx[j]] = a[:,j] * alpha + *this[:,idx[j]] * beta template GPUMatrix& GPUMatrix::DoScatterColumnsOf(ElemType beta, const GPUMatrix& idx, const GPUMatrix& a, ElemType alpha) { if (idx.GetNumRows() != 1) // index is 1-dimensional only InvalidArgument("DoScatterColumnsOf: Map must be a row vector."); if (idx.GetNumCols() != a.GetNumCols()) InvalidArgument("DoScatterColumnsOf: Map must have width of input vector."); if (a.GetNumRows() != GetNumRows()) InvalidArgument("DoScatterColumnsOf: Output must have same height as input vector."); if (idx.GetComputeDeviceId() != a.GetComputeDeviceId() || GetComputeDeviceId() != a.GetComputeDeviceId()) InvalidArgument("All matrices must be on the same GPU"); a.PrepareDevice(); auto& us = *this; // pre-scale with beta upfront // Scatter may add more than one source column to the same target, so we must pre-scale with beta, and then just keep adding. 
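    // (Example: with idx = [2 0 2], output column 2 receives the sum of input columns 0 and 2, which is why
    // the kernel above accumulates with atomicAdd rather than a plain store.)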
Scale(beta, us); // if beta is 0, then this will be a memset() // launch the kernel CUDA_LONG NN = (CUDA_LONG)(a.GetNumElements()); // linear space identifying each individual input element GridDim grid(NN); PROFILE_CUDA_STREAM("_doScatterColumnsOf", t_stream); _doScatterColumnsOf<<>>(Data(), GetNumRows(), GetNumCols(), idx.Data(), idx.GetNumRows(), a.Data(), a.GetNumRows(), alpha, NN); //_doScatterColumnsOf<<>>(Data(), GetNumRows(), GetNumCols(), idx.Data(), idx.GetNumRows(), a.Data(), a.GetNumRows(), alpha, NN); return *this; } template void GPUMatrix::SetValue(const ElemType v) { if (IsEmpty()) return; CUDA_LONG N = (CUDA_LONG) GetNumElements(); // Check if value is zero, which can be set using cudaMemset bool isZero = true; const char* valArray = reinterpret_cast(&v); for (int i = 0; i < sizeof(ElemType); i++) { if (valArray[i] != 0) { isZero = false; break; } } if (isZero) { CUDA_CALL(cudaMemset(Data(), 0, N * sizeof(ElemType))); } else { int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); PROFILE_CUDA_STREAM("_setValue", t_stream); _setValue<<>>(Data(), v, N); } } template void GPUMatrix::SetValue(const ElemType* d_v) // d_v is pointer to the the value in GPU memory { if (IsEmpty()) LogicError("SetValue: Matrix is empty."); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); PROFILE_CUDA_STREAM("_setValue", t_stream); _setValue<<>>(Data(), d_v, N); } template void GPUMatrix::MaskColumnsValue(const GPUMatrix& columnsMask, ElemType val) { if (GetNumCols() != columnsMask.GetNumCols()) RuntimeError("Matrix and column mask must have equal number of columns"); if (GetComputeDeviceId() != columnsMask.GetComputeDeviceId()) RuntimeError("Matrix and column mask must be on the same device"); int blocksPerGrid = (int) GetNumCols(); PrepareDevice(); PROFILE_CUDA_STREAM("_maskColumnsValue", t_stream); _maskColumnsValue<<>>(Data(), columnsMask.Data(), (CUDA_LONG) GetNumCols(), (CUDA_LONG) GetNumRows(), val); } template void GPUMatrix::SetColumn(const ElemType* colPointer, size_t colInd) { if (IsEmpty()) LogicError("SetValue: Matrix is empty."); if (colPointer == NULL) return; CUDA_CALL(cudaMemcpy(Data() + LocateColumn(colInd), colPointer, sizeof(ElemType) * m_numRows, cudaMemcpyHostToDevice)); } template void GPUMatrix::SetColumn(const GPUMatrix& valMat, size_t colInd) { if (IsEmpty()) LogicError("SetColumn: Matrix is empty."); if (valMat.GetNumCols() != 1) LogicError("SetColumn: only support one column matrix now."); CUDA_CALL(cudaMemcpy(Data() + LocateColumn(colInd), valMat.Data(), sizeof(ElemType) * m_numRows, cudaMemcpyDeviceToDevice)); } template void GPUMatrix::SetValue(const GPUMatrix& deepCopyFrom) { if (this == &deepCopyFrom) return; SetValue(deepCopyFrom.GetNumRows(), deepCopyFrom.GetNumCols(), deepCopyFrom.GetComputeDeviceId(), deepCopyFrom.Data(), matrixFlagSetValueOnDevice); } template void GPUMatrix::SetValue(const size_t numRows, const size_t numCols, int deviceId, ElemType* pArray, size_t matrixFlags) { // handle externally managed case // BUGBUG: This is super super ugly, and needs to be fixed, but if matrixFlags has the right value, then we can't free anything, // and everything gets wonky. This should be fixed, and would go away if it is made a shared_ptr. 
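    // Externally managed case: adopt pArray directly as a non-owned buffer (no copy is made; per the BUGBUG
    // note above, the buffer is not owned or freed by this matrix).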
if (matrixFlags & matrixFlagDontOwnBuffer) { // free the existing array if it used to be an owned array if ( Buffer() != NULL) { TracingGPUMemoryAllocator::Free(GetComputeDeviceId(), Buffer()); } m_numRows = numRows; m_numCols = numCols; SetBuffer(pArray, GetNumElements() * sizeof(ElemType), true); SetSizeAllocated(GetNumElements()); SetFormat(matrixFormatDense); SetComputeDeviceId(deviceId); } else { // if the devices are different move it now if (GetComputeDeviceId() != deviceId && deviceId >= 0) { Clear(); ZeroInit(deviceId); } // now RequireSize/allocate as necessary RequireSize(numRows, numCols); // copy over the content to the buffer PrepareDevice(); if (pArray != NULL) { if (!(matrixFlags & matrixFormatRowMajor)) { CUDA_CALL(cudaMemcpy(Data(), pArray, sizeof(ElemType) * GetNumElements(), (matrixFlags & matrixFlagSetValueOnDevice) ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice)); } else // row major: must transpose (this is not meant to be efficient, but very useful for defining inline matrices for test code) { vector transposed(GetNumElements()); for (size_t i = 0; i < numRows; i++) for (size_t j = 0; j < numCols; j++) transposed[i + numRows * j] = pArray[j + numCols * i]; CUDA_CALL(cudaMemcpy(Data(), transposed.data(), sizeof(ElemType) * GetNumElements(), (matrixFlags & matrixFlagSetValueOnDevice) ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice)); } } } SetFormat(matrixFormatDense); } template void GPUMatrix::SetDiagonalValue(const ElemType v) { CUDA_LONG N = (CUDA_LONG) GetNumRows(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); PROFILE_CUDA_STREAM("_setDiagonalValue", t_stream); _setDiagonalValue<<>>(Data(), v, N, (CUDA_LONG) GetNumRows()); } template void GPUMatrix::SetDiagonalValue(const GPUMatrix& vector) { if (IsEmpty() || vector.IsEmpty()) LogicError("SetDiagonalValue: Matrix is empty."); if (GetNumRows() != GetNumCols()) LogicError("SetDiagonalValue: NumRows and NumCols do not agree."); if (vector.GetNumRows() != 1 && vector.GetNumCols() != 1) LogicError("SetDiagonalValue: input vector must be a vector."); if (vector.GetNumElements() == 1) // reduce to simple form SetDiagonalValue(vector.Data()[0]); else if (vector.GetNumRows() != GetNumRows()) LogicError("SetDiagonalValue: input vector's dimension does not agree with [this]."); else { CUDA_LONG N = (CUDA_LONG) GetNumRows(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); PROFILE_CUDA_STREAM("_setDiagonalValueFromVector", t_stream); _setDiagonalValueFromVector<<>>(Data(), vector.Data(), N); } } template void GPUMatrix::SetUniformRandomValue(const ElemType low, const ElemType high, unsigned long seed) { PrepareDevice(); CreateCurandObject(seed, __FUNCTION__); // TODO call ResetCurandObject() instead? 
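    // curandGenerateUniform/curandGenerateUniformDouble below fill the buffer with uniform samples in (0, 1];
    // the _rescaleToRange kernel at the end then maps them onto the requested [low, high] range.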
cudaEvent_t done = nullptr; CUDA_CALL(cudaEventCreate(&done)); if (sizeof(ElemType) == sizeof(float)) { PROFILE_CUDA("curandGenerateUniform"); CURAND_CALL(curandGenerateUniform(((curandGenerator_t*)s_curandGenerator)[0], reinterpret_cast(Data()), GetNumElements())); } else { PROFILE_CUDA("curandGenerateUniformDouble"); CURAND_CALL(curandGenerateUniformDouble(((curandGenerator_t*)s_curandGenerator)[0], reinterpret_cast(Data()), GetNumElements())); } // Wait for curand* to complete before continuing CUDA_CALL(cudaEventRecord(done)); CUDA_CALL(cudaEventSynchronize(done)); // CURAND_CALL(curandDestroyGenerator(gen)); CUDA_CALL(cudaEventDestroy(done)); size_t N = GetNumElements(); size_t blocksPerGrid = (size_t) ceil(N / (double) GridDim::maxThreadsPerBlock); PROFILE_CUDA_STREAM("_rescaleToRange", t_stream); _rescaleToRange<<>>(Data(), N, low, high); } template void GPUMatrix::SetGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed) { PrepareDevice(); CreateCurandObject(seed, __FUNCTION__); // TODO call ResetCurandObject() instead? if (sizeof(ElemType) == sizeof(float)) { PROFILE_CUDA("curandGenerateNormal"); CURAND_CALL(curandGenerateNormal(((curandGenerator_t*)s_curandGenerator)[0], reinterpret_cast(Data()), GetNumElements(), (float)mean, (float)sigma)); } else { PROFILE_CUDA("curandGenerateNormalDouble"); CURAND_CALL(curandGenerateNormalDouble(((curandGenerator_t*)s_curandGenerator)[0], reinterpret_cast(Data()), GetNumElements(), (double)mean, (double)sigma)); } // CURAND_CALL(curandDestroyGenerator(gen)); } //maskRate: percentage of values masked out (similar to dropout rate) //scaleValue: which scale value to set to the left ones (unmasked items). template void GPUMatrix::SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, unsigned long seed) { PrepareDevice(); CreateCurandObject(seed, __FUNCTION__); // TODO call ResetCurandObject() instead? cudaEvent_t done = nullptr; CUDA_CALL(cudaEventCreate(&done)); if (sizeof(ElemType) == sizeof(float)) { PROFILE_CUDA("curandGenerateUniform"); CURAND_CALL(curandGenerateUniform((((curandGenerator_t*)s_curandGenerator)[0]), reinterpret_cast(Data()), GetNumElements())); } else { PROFILE_CUDA("curandGenerateUniformDouble"); CURAND_CALL(curandGenerateUniformDouble((((curandGenerator_t*)s_curandGenerator)[0]), reinterpret_cast(Data()), GetNumElements())); } // Wait for curand* to complete before continuing CUDA_CALL(cudaEventRecord(done)); CUDA_CALL(cudaEventSynchronize(done)); CUDA_CALL(cudaEventDestroy(done)); // CURAND_CALL(curandDestroyGenerator(gen)); size_t N = GetNumElements(); size_t blocksPerGrid = (size_t) ceil(N / (double) GridDim::maxThreadsPerBlock); PROFILE_CUDA_STREAM("_setMaskAndScale", t_stream); _setMaskAndScale<<>>(Data(), N, maskRate, scaleValue); } template ElemType GPUMatrix::Adagrad(GPUMatrix& gradients, const bool needAveMultiplier) { size_t numColsNeeded = gradients.GetNumCols(); if (needAveMultiplier) numColsNeeded += gradients.GetNumCols(); if (IsEmpty() || GetNumCols() < numColsNeeded) { RequireSize(gradients.GetNumRows(), numColsNeeded); SetValue(0.0); } assert(GetNumRows() == gradients.GetNumRows() && GetNumCols() == numColsNeeded); size_t n = gradients.GetNumElements(); ElemType* multipliers = nullptr; if (needAveMultiplier) multipliers = Data() + n; // temp memory used to store multipliers, int blocksPerGrid = (n + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock; { // BUGBUG: Should this use CUDA streams? 
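        // Layout reminder: the first n elements of this matrix hold the AdaGrad accumulator; when needAveMultiplier
        // is set, the second block of n elements (multipliers = Data() + n) receives per-element multipliers whose
        // average, computed with cublasSasum/cublasDasum below, is returned to the caller.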
PROFILE_CUDA("_adagrad"); _adagrad<<>>(Data(), gradients.Data(), n, multipliers); } if (!needAveMultiplier) return 1; cublasHandle_t cuHandle = GetCublasHandle(GetComputeDeviceId()); if (sizeof(ElemType) == sizeof(float)) { float aveMultiplier = 0; PROFILE_CUDA_STREAM("cublasSasum", t_stream); CUBLAS_CALL(cublasSasum(cuHandle, (CUDA_LONG) n, reinterpret_cast(multipliers), 1, &aveMultiplier)); return (ElemType) aveMultiplier / n; } else { double aveMultiplier = 0; PROFILE_CUDA_STREAM("cublasDasum", t_stream); CUBLAS_CALL(cublasDasum(cuHandle, (CUDA_LONG) n, reinterpret_cast(multipliers), 1, &aveMultiplier)); return (ElemType) aveMultiplier / n; } } template void GPUMatrix::FSAdagrad(GPUMatrix& gradients, GPUMatrix& functionValues, ElemType learnRatePerSample, ElemType momentum, ElemType adaWeight, ElemType adaMul) { size_t numColsNeeded = 2 * gradients.GetNumCols(); if (IsEmpty() || (GetNumCols() < numColsNeeded)) { RequireSize(gradients.GetNumRows(), numColsNeeded); SetValue(0.0); } assert((GetNumRows() == gradients.GetNumRows()) && (GetNumCols() == numColsNeeded)); size_t n = gradients.GetNumElements(); int blocksPerGrid = (n + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock; PROFILE_CUDA("_fsadagrad"); _fsadagrad<<>>(n, gradients.Data(), Data(), Data()+ n, functionValues.Data(), learnRatePerSample, momentum, adaWeight, adaMul); } template ElemType GPUMatrix::RmsProp(GPUMatrix& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier) { const ElemType floor = 1e-6f; static ElemType* upd_gpu = (ElemType*) 0; size_t n = gradients.GetNumElements(); int blocksPerGrid = (GetNumElements() + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock; size_t numColsNeeded = gradients.GetNumCols() * 3; if (needAveMultiplier) numColsNeeded += gradients.GetNumCols(); if (IsEmpty() || GetNumCols() < numColsNeeded) { RequireSize(gradients.GetNumRows(), numColsNeeded); SetValue(0.0); ElemType* avars = Data(); // accumulated variances for RMS scaling ElemType* signs = Data() + n; // sign of previous gradient ElemType* steps = Data() + 2 * n; // current step size // Data()+3*n is temp memory used to store multipliers, no need to initialize PROFILE_CUDA("_rmsprop_init"); _rmsprop_init<<>>(avars, signs, steps, gradients.Data(), n); } assert(GetNumRows() == gradients.GetNumRows() && GetNumCols() == numColsNeeded); ElemType* avars = Data(); // accumulated variances for RMS scaling ElemType* signs = Data() + n; // sign of previous gradient ElemType* steps = Data() + 2 * n; // current step size ElemType* multipliers = nullptr; if (needAveMultiplier) multipliers = Data() + 3 * n; // temp memory used to store multipliers, if (!upd_gpu) { const ElemType upd[] = { 2, 2, 0, 2, 2, 0, 1, 1, 1, 2, 2, 0, 1, 2, 1, 0, 2, 2, 1, 1, 1, 0, 2, 2, 0, 2, 2, }; upd_gpu = TracingGPUMemoryAllocator::Allocate(GetComputeDeviceId(), 27); CUDA_CALL(cudaMemcpy(upd_gpu, upd, sizeof(ElemType) * _countof(upd), cudaMemcpyHostToDevice)); } { PROFILE_CUDA("_rmsprop"); _rmsprop<<>>(avars, signs, steps, gradients.Data(), n, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, floor, upd_gpu, multipliers); } if (!needAveMultiplier) return 1; cublasHandle_t cuHandle = GetCublasHandle(GetComputeDeviceId()); if (sizeof(ElemType) == sizeof(float)) { float aveMultiplier = 0; PROFILE_CUDA_STREAM("cublasSasum", t_stream); CUBLAS_CALL(cublasSasum(cuHandle, (CUDA_LONG) n, reinterpret_cast(multipliers), 1, &aveMultiplier)); return 
aveMultiplier / n; } else { double aveMultiplier = 0; PROFILE_CUDA_STREAM("cublasDasum", t_stream); CUBLAS_CALL(cublasDasum(cuHandle, (CUDA_LONG) n, reinterpret_cast(multipliers), 1, &aveMultiplier)); return (ElemType) aveMultiplier / n; } } template void GPUMatrix::Reshape(const size_t numRows, const size_t numCols) { assert(numRows * numCols == GetNumElements()); if (numRows * numCols != GetNumElements()) InvalidArgument("Reshape: total number of elements does not match."); m_numRows = numRows; m_numCols = numCols; } template void GPUMatrix::RequireSize(const size_t numRows, const size_t numCols, bool growOnly) { if (GetNumRows() != numRows || GetNumCols() != numCols) Resize(numRows, numCols, growOnly); } template void GPUMatrix::Resize(const size_t numRows, const size_t numCols, bool growOnly) { VerifyResizable(__func__); if (GetNumRows() == numRows && GetNumCols() == numCols) return; m_numRows = numRows; m_numCols = numCols; size_t numElements = GetNumElements(); if (numElements > GetSizeAllocated() || (!growOnly && numElements != GetSizeAllocated())) { if (IsEmpty()) { SetSizeAllocated(0); SetBuffer(nullptr, 0); } else { if (Buffer()) { TracingGPUMemoryAllocator::Free(GetComputeDeviceId(), Buffer()); } SetSizeAllocated(numElements); SetBuffer(TracingGPUMemoryAllocator::Allocate(GetComputeDeviceId(), m_numRows, m_numCols), numElements * sizeof(ElemType)); CUDA_CALL(cudaMemset(Buffer(), 0, sizeof(ElemType) * GetSizeAllocated())); } } m_sliceViewOffset = 0; } template size_t GPUMatrix::LocateElement(const size_t row, const size_t col) const { assert(row < m_numRows && col < m_numCols); return LocateColumn(col) + row; // matrix in column-wise storage } template size_t GPUMatrix::LocateColumn(const size_t col) const { assert(col < GetNumCols()); return col * m_numRows; // matrix in column-wise storage } template ElemType GPUMatrix::Get00Element() const { ElemType res = 0; CUDA_CALL(cudaMemcpy(&res, Data(), sizeof(ElemType), cudaMemcpyDeviceToHost)); return res; } #pragma endregion Basic Operators #pragma region Member BLAS Functions template GPUMatrix& GPUMatrix::operator+=(ElemType alpha) { if (IsEmpty()) LogicError("operator+=: Matrix is empty."); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PROFILE_CUDA_STREAM("_addValue", t_stream); _addValue<<>>(Data(), alpha, N); return *this; } template GPUMatrix GPUMatrix::operator+(ElemType alpha) const { if (IsEmpty()) LogicError("operator+: Matrix is empty."); GPUMatrix c(*this); c += alpha; return c; } template GPUMatrix& GPUMatrix::AssignSumOf(const ElemType alpha, const GPUMatrix& a) { SetValue(a); (*this) += alpha; return (*this); } template GPUMatrix& GPUMatrix::operator+=(const GPUMatrix& a) { ScaleAndAdd(1, a, *this); return *this; } template GPUMatrix GPUMatrix::operator+(const GPUMatrix& a) const { if (GetNumElements() == 1) { GPUMatrix c(a); c += Get00Element(); return c; } else if (a.GetNumElements() == 1) { GPUMatrix c(*this); c += a.Get00Element(); return c; } else { GPUMatrix c(*this); // this implementation will introduce a copy overhead. 
but make resue of the code c += a; return c; } } template GPUMatrix& GPUMatrix::AssignSumOf(const GPUMatrix& a, const GPUMatrix& b) { SetValue(a); (*this) += b; return (*this); } template GPUMatrix& GPUMatrix::operator-=(ElemType alpha) { if (IsEmpty()) LogicError("operato-=: Matrix is empty."); return operator+=(-1 * alpha); } template GPUMatrix GPUMatrix::operator-(ElemType alpha) const { if (IsEmpty()) LogicError("operator-: Matrix is empty."); return operator+(-1 * alpha); } template GPUMatrix& GPUMatrix::AssignDifferenceOf(const ElemType alpha, const GPUMatrix& a) { RequireSize(a.m_numRows, a.m_numCols); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); a.PrepareDevice(); PROFILE_CUDA_STREAM("_assignDifferenceOf1", t_stream); _assignDifferenceOf1<<>>(Data(), alpha, a.Data(), N); return *this; } template GPUMatrix& GPUMatrix::AssignDifferenceOf(const GPUMatrix& a, const ElemType alpha) { RequireSize(a.m_numRows, a.m_numCols); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); a.PrepareDevice(); PROFILE_CUDA_STREAM("_assignDifferenceOf2", t_stream); _assignDifferenceOf2<<>>(Data(), alpha, a.Data(), N); return *this; } template GPUMatrix& GPUMatrix::operator-=(const GPUMatrix& a) { ScaleAndAdd(-1, a, *this); return *this; } template GPUMatrix GPUMatrix::operator-(const GPUMatrix& a) const { GPUMatrix c(*this); // this implementation will introduce a copy overhead. but make resue of the code c -= a; return c; } template GPUMatrix& GPUMatrix::AssignDifferenceOf(const GPUMatrix& a, const GPUMatrix& b) { if (this != &a) { RequireSize(a.GetNumRows(), a.GetNumCols()); SetValue(a); } (*this) -= b; return *this; } template GPUMatrix& GPUMatrix::operator*=(ElemType alpha) { Scale(alpha, *this); return *this; } template GPUMatrix GPUMatrix::operator*(ElemType alpha) const { GPUMatrix c(GetNumRows(), GetNumCols(), GetComputeDeviceId()); Scale(alpha, *this, c); return c; } template GPUMatrix& GPUMatrix::AssignProductOf(const ElemType alpha, const GPUMatrix& a) { Scale(alpha, a, *this); return *this; } template GPUMatrix& GPUMatrix::AssignProductOf(const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB) { if (a.GetNumElements() == 1) { if (transposeB) AssignTransposeOf(b); (*this) *= a.Get00Element(); } else if (b.GetNumElements() == 1) { if (transposeA) AssignTransposeOf(a); (*this) *= b.Get00Element(); } else Multiply(a, transposeA, b, transposeB, *this); return *this; } template GPUMatrix GPUMatrix::operator*(const GPUMatrix& a) const { const GPUMatrix& us = *this; if (GetNumElements() == 1) { GPUMatrix c(GetComputeDeviceId()); c.AssignProductOf(Get00Element(), a); return c; } else if (a.GetNumElements() == 1) { GPUMatrix c(GetComputeDeviceId()); c.AssignProductOf(a.Get00Element(), us); return c; } else { GPUMatrix c(GetNumRows(), a.GetNumCols(), GetComputeDeviceId()); Multiply(*this, a, c); return c; } } template GPUMatrix& GPUMatrix::operator/=(ElemType alpha) { (*this) *= 1 / alpha; return (*this); } template GPUMatrix GPUMatrix::operator/(ElemType alpha) const { return ((*this) * (1 / alpha)); } //element-wise power template GPUMatrix& GPUMatrix::operator^=(ElemType alpha) { GPUMatrix& us = *this; ElementWisePower(alpha, us, us); return us; } template GPUMatrix GPUMatrix::operator^(ElemType alpha) const { GPUMatrix c(GetNumRows(), GetNumCols(), GetComputeDeviceId()); ElementWisePower(alpha, *this, c); return c; } template GPUMatrix& 
GPUMatrix::AssignElementPowerOf(const GPUMatrix& a, const ElemType power) { ElementWisePower(power, a, *this); return *this; } template GPUMatrix& GPUMatrix::AddElementProductOf(const GPUMatrix& a, const GPUMatrix& b) { if (a.IsEmpty() || b.IsEmpty()) LogicError("AddElementProductOf: Matrix is empty."); assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) InvalidArgument("The input matrix dimensions do not match."); if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == GetNumCols())) InvalidArgument("The input matrix dimensions do not match [this]."); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); a.PrepareDevice(); PROFILE_CUDA_STREAM("_addElementProductOf", t_stream); _addElementProductOf<<>>(Data(), a.Data(), b.Data(), N); return *this; } template GPUMatrix& GPUMatrix::ColumnElementMultiplyWith(const GPUMatrix& a) { if (a.IsEmpty() || IsEmpty()) LogicError("ColumnElementMultiplyWith: Matrix is empty."); if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == 1)) InvalidArgument("ColumnElementMultiplyWith: The input matrix should be a col vector and match [this]'s rows."); CUDA_LONG N = (CUDA_LONG) a.GetNumRows(); CUDA_LONG M = (CUDA_LONG) GetNumCols(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); a.PrepareDevice(); PROFILE_CUDA_STREAM("_columnElementMultiplyWith", t_stream); _columnElementMultiplyWith<<>>(Data(), a.Data(), N, M); return *this; } template GPUMatrix& GPUMatrix::RowElementMultiplyWith(const GPUMatrix& a) { if (a.IsEmpty() || IsEmpty()) LogicError("RowElementMultiplyWith: Matrix is empty."); if (!(a.GetNumRows() == 1 && a.GetNumCols() == GetNumCols())) InvalidArgument("RowElementMultiplyWith: The input matrix should be a row vector and match [this]'s columns."); CUDA_LONG N = (CUDA_LONG) GetNumRows(); CUDA_LONG M = (CUDA_LONG) a.GetNumCols(); int blocksPerGrid = (int) ceil(1.0 * M / GridDim::maxThreadsPerBlock); a.PrepareDevice(); PROFILE_CUDA_STREAM("_rowElementMultiplyWith", t_stream); _rowElementMultiplyWith<<>>(Data(), a.Data(), N, M); return *this; } template GPUMatrix& GPUMatrix::RowElementDivideBy(const GPUMatrix& a) { if (a.IsEmpty() || IsEmpty()) LogicError("RowElementDivideBy: Matrix is empty."); if (!(a.GetNumRows() == 1 && a.GetNumCols() == GetNumCols())) InvalidArgument("RowElementDivideBy: The input matrix should be a row vector and match [this]'s columns."); CUDA_LONG N = (CUDA_LONG) GetNumRows(); CUDA_LONG M = (CUDA_LONG) a.GetNumCols(); int blocksPerGrid = (int) ceil(1.0 * M / GridDim::maxThreadsPerBlock); a.PrepareDevice(); PROFILE_CUDA_STREAM("_rowElementDivideBy", t_stream); _rowElementDivideBy<<>>(Data(), a.Data(), N, M); return *this; } template GPUMatrix& GPUMatrix::ColumnElementDivideBy(const GPUMatrix& a) { if (a.IsEmpty() || IsEmpty()) LogicError("ColumnElementDivideBy: Matrix is empty."); if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == 1)) InvalidArgument("ColumnElementDivideBy: The input matrix should be a col vector and match [this]'s rows."); CUDA_LONG N = (CUDA_LONG) a.GetNumRows(); CUDA_LONG M = (CUDA_LONG) GetNumCols(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); a.PrepareDevice(); PROFILE_CUDA_STREAM("_ColumnElementDivideBy", t_stream); _ColumnElementDivideBy<<>>(Data(), a.Data(), N, M); return *this; } template GPUMatrix& GPUMatrix::ElementInverse() { if (IsEmpty()) LogicError("ElementInverse: Matrix is 
empty."); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); PROFILE_CUDA_STREAM("_elemInverse", t_stream); _elemInverse<<>>(Data(), N); return *this; } template GPUMatrix& GPUMatrix::AssignElementInverseOf(const GPUMatrix& a) { SetValue(a); return ElementInverse(); } DEF_ELEMWISE_INPLACE_FUNC(Sigmoid) template GPUMatrix& GPUMatrix::AssignSigmoidOf(const GPUMatrix& a) { RequireSize(a.GetNumRows(), a.GetNumCols()); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); PROFILE_CUDA_STREAM("_assignSigmoidOf", t_stream); // _elementWIseSigmoidOnCuda has an implementation that avoids possible overflow errors, but has a slight accuracy regression. #if 0 _elementWiseSigmoidOnCuda<<>>(a.Data(), Data(), N); #else _assignSigmoidOf<<>>(a.Data(), Data(), N); #endif return *this; } DEF_ELEMWISE_INPLACE_FUNC(SigmoidDerivative) DEF_ELEMWISE_ASSIGN_FUNC(SigmoidDerivative) template void GPUMatrix::AssignNoiseContrastiveEstimation(const GPUMatrix& a, const GPUMatrix& b, const GPUMatrix& bias, size_t sampleCount, GPUMatrix& tmp, GPUMatrix& c) //this: samples+probs // a : hidden // b : embedding // tmp: softmax // c : loglikelihood { UNCONST(ElemType, a, my_a); UNCONST(ElemType, b, my_b); UNCONST(ElemType, bias, my_bias); // a: dim * minibatch // b: dim * |vocab| int p = 512; int width = a.GetNumRows(); // dimension of hidden vector while (p / 2 > width) p = p / 2; { PROFILE_CUDA("_computeNceOutput"); _computeNceOutput<<>>( Data(), sampleCount, m_numRows / 2, my_a.Data(), // a a.GetNumRows(), my_b.Data(), // b my_bias.Data(), tmp.Data()); // tmp } p = 512; while (p / 2 > GetNumElements() / 2) p = p / 2; { PROFILE_CUDA("_assignNoiseContrastiveEstimation"); // summing up objective must be done in one block _assignNoiseContrastiveEstimation<<<1, p>>>( Data(), sampleCount, m_numRows / 2, my_a.Data(), a.GetNumCols(), my_b.Data(), tmp.Data(), c.Data()); } } template void GPUMatrix::AssignNCEDerivative(GPUMatrix& tmp, const GPUMatrix& a, const GPUMatrix& b, size_t inputIndex, GPUMatrix& c) { UNCONST(ElemType, a, my_a); UNCONST(ElemType, b, my_b); int p = 512; int width = a.GetNumRows(); while (p / 2 > width) p = p / 2; PROFILE_CUDA("_assignNceDerivativeNew"); _assignNceDerivativeNew<<<(tmp.GetNumElements() + p - 1) / p, p>>>( Data(), tmp.GetNumCols(), m_numRows / 2, my_a.Data(), a.GetNumRows(), my_b.Data(), tmp.Data(), c.Data(), inputIndex); } template void GPUMatrix::AssignSoftmaxSum(const GPUMatrix& a, GPUMatrix& c) { UNCONST(ElemType, a, my_a); int p = 512; int width = a.GetNumRows(); while (p / 2 > width) p = p / 2; PROFILE_CUDA("_assignSoftmaxSum"); _assignSoftmaxSum<<<1, p>>>( my_a.Data(), width, Data(), c.Data()); } template void GPUMatrix::AssignNCEUnnormalizedEval(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) { assert(a.GetComputeDeviceId() == b.GetComputeDeviceId()); assert(GetNumRows() == a.GetNumRows()); assert(GetNumCols() == b.GetNumRows()); assert(a.GetNumCols() == b.GetNumRows()); UNUSED(a); UNUSED(b); UNUSED(c); // TODO: this function seems like a stub /* EnsureAuxMemory(); int p = 512; int width = a.GetNumCols(); while (p / 2 > width) p = p / 2; // this kernel need be launched in nnz blocks _sparseInnerProductDenseTimesDense << > >( m_dVal, m_buf, m_dCol, m_nz, GetNumRows(), a.Buffer(), b.Buffer(), b.GetNumRows(), m_res); // sum up the results _reductionSum32 << <1, 32 >> >(m_res, c.Buffer(), m_nz);*/ } DEF_ELEMWISE_INPLACE_FUNC(Tanh) 
DEF_ELEMWISE_ASSIGN_FUNC(Tanh) template GPUMatrix& GPUMatrix::InplaceLogSoftmax(const bool isColWise) { if (IsEmpty()) LogicError("InplaceLogSoftmax: Matrix is empty."); PrepareDevice(); if (isColWise) { CUDA_LONG N = (CUDA_LONG) GetNumCols(); // one kernel per column int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock); PROFILE_CUDA_STREAM("_logSoftMaxColWise", t_stream); _logSoftMaxColWise<<>>(Data(), (CUDA_LONG) m_numCols, (CUDA_LONG) m_numRows); } else { CUDA_LONG N = (CUDA_LONG) GetNumRows(); // one kernel per column int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock); PROFILE_CUDA_STREAM("_logSoftMaxRowWise", t_stream); _logSoftMaxRowWise<<>>(Data(), (CUDA_LONG) m_numCols, (CUDA_LONG) m_numRows); } return *this; } template GPUMatrix& GPUMatrix::AssignLogSoftmaxOf(const GPUMatrix& a, const bool isColWise) { RequireSize(a.GetNumRows(), a.GetNumCols()); if (isColWise) { PrepareDevice(); CUDA_LONG N = (CUDA_LONG) GetNumCols(); CUDA_LONG M = (CUDA_LONG) GetNumRows(); PROFILE_CUDA_STREAM("_assignColumnwiseLogSoftmaxOf", t_stream); _assignColumnwiseLogSoftmaxOf<<>>(a.Data(), Data(), N, M); } else { NOT_IMPLEMENTED; } return *this; } template GPUMatrix& GPUMatrix::InplaceHardmax(const bool isColWise) { return AssignHardmaxOf(*this, isColWise); } template GPUMatrix& GPUMatrix::AssignHardmaxOf(const GPUMatrix& a, const bool isColWise) { RequireSize(a.GetNumRows(), a.GetNumCols()); if (isColWise) { PrepareDevice(); CUDA_LONG N = (CUDA_LONG) GetNumCols(); CUDA_LONG M = (CUDA_LONG) GetNumRows(); PROFILE_CUDA_STREAM("_assignColumnwiseHardmaxOf", t_stream); _assignColumnwiseHardmaxOf<<>>(a.Data(), Data(), N, M); } else { NOT_IMPLEMENTED; } return *this; } DEF_ELEMWISE_INPLACE_FUNC(Sqrt) DEF_ELEMWISE_ASSIGN_FUNC(Sqrt) DEF_ELEMWISE_INPLACE_FUNC(Exp) DEF_ELEMWISE_ASSIGN_FUNC(Exp) DEF_ELEMWISE_INPLACE_FUNC(Log) DEF_ELEMWISE_ASSIGN_FUNC(Log) DEF_ELEMWISE_INPLACE_FUNC(Abs) DEF_ELEMWISE_ASSIGN_FUNC(Abs) DEF_ELEMWISE_INPLACE_FUNC(LinearRectifierDerivative) DEF_ELEMWISE_ASSIGN_FUNC(LinearRectifierDerivative) DEF_ELEMWISE_INPLACE_FUNC(Cosine) DEF_ELEMWISE_ASSIGN_FUNC(Cosine) DEF_ELEMWISE_INPLACE_FUNC(NegativeSine) DEF_ELEMWISE_ASSIGN_FUNC(NegativeSine) template GPUMatrix& GPUMatrix::InplaceTruncateBottom(const ElemType threshold) { return AssignTruncateBottomOf(*this, threshold); } template GPUMatrix& GPUMatrix::AssignTruncateBottomOf(const GPUMatrix& a, const ElemType threshold) { if (a.IsEmpty()) LogicError("AssignTruncateBottomOf: Matrix a is empty."); if (this != &a) { RequireSize(a.GetNumRows(), a.GetNumCols()); } CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock); PrepareDevice(); PROFILE_CUDA_STREAM("_assignTruncateBottom", t_stream); _assignTruncateBottom<<>>(Data(), a.Data(), threshold, N); return *this; } template GPUMatrix& GPUMatrix::InplaceTruncateTop(const ElemType threshold) { return AssignTruncateTopOf(*this, threshold); } template GPUMatrix& GPUMatrix::AssignTruncateTopOf(const GPUMatrix& a, const ElemType threshold) { if (a.IsEmpty()) LogicError("AssignTruncateTopOf: Matrix a is empty."); if (this != &a) { RequireSize(a.GetNumRows(), a.GetNumCols()); } CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock); a.PrepareDevice(); PROFILE_CUDA_STREAM("_assignTruncateTop", t_stream); _assignTruncateTop<<>>(Data(), a.Data(), threshold, N); return *this; } template GPUMatrix& GPUMatrix::InplaceTruncate(const ElemType threshold) { if (IsEmpty()) 
LogicError("InplaceTruncate: Matrix is empty."); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock); PrepareDevice(); PROFILE_CUDA_STREAM("_inplaceTruncate", t_stream); _inplaceTruncate<<>>(Data(), threshold, N); return *this; } template GPUMatrix& GPUMatrix::InplaceSoftThreshold(const ElemType threshold) { if (IsEmpty()) LogicError("InplaceSoftThreshold: Matrix is empty."); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock); PrepareDevice(); PROFILE_CUDA_STREAM("_inplaceSoftThreshold", t_stream); _inplaceSoftThreshold<<>>(Data(), threshold, N); return *this; } template GPUMatrix& GPUMatrix::SetToZeroIfAbsLessThan(const ElemType threshold) { if (IsEmpty()) LogicError("SetToZeroIfAbsLessThan: Matrix is empty."); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock); PrepareDevice(); PROFILE_CUDA_STREAM("_setToZeroIfAbsLessThan", t_stream); _setToZeroIfAbsLessThan<<>>(Data(), threshold, N); return *this; } template ElemType GPUMatrix::SumOfAbsElements() const { if (IsEmpty()) LogicError("SumOfAbsElements: Matrix is empty"); cublasHandle_t cuHandle = GetCublasHandle(GetComputeDeviceId()); if (sizeof(ElemType) == sizeof(float)) { float res = 0; PROFILE_CUDA_STREAM("cublasSasum", t_stream); CUBLAS_CALL(cublasSasum(cuHandle, (CUDA_LONG) GetNumElements(), reinterpret_cast(Data()), 1, &res)); return res; } else { double res = 0; PROFILE_CUDA_STREAM("cublasDasum", t_stream); CUBLAS_CALL(cublasDasum(cuHandle, (CUDA_LONG) GetNumElements(), reinterpret_cast(Data()), 1, &res)); return ElemType(res); } } template ElemType GPUMatrix::SumOfElements() const { if (IsEmpty()) LogicError("SumOfElements: Matrix is empty"); ElemType* d_sum = TracingGPUMemoryAllocator::Allocate(GetComputeDeviceId(), 1); ElemType h_sum; // WARNING: THIS kernel is not the most efficient way! { PROFILE_CUDA_STREAM("_reductionSum", t_stream); _reductionSum<<<1, 1024, 0, t_stream>>>(Data(), d_sum, (CUDA_LONG) GetNumElements()); } CUDA_CALL(cudaMemcpy(&h_sum, d_sum, sizeof(ElemType), cudaMemcpyDeviceToHost)); TracingGPUMemoryAllocator::Free(GetComputeDeviceId(), d_sum); return h_sum; } template GPUMatrix& GPUMatrix::AssignSumOfElements(const GPUMatrix& a) { if (a.IsEmpty()) LogicError("AssignSumOfElements: Matrix a is empty"); RequireSize(1, 1); PrepareDevice(); PROFILE_CUDA("_reductionSumAndAssign"); // WARNING: THIS kernel is not the most efficient way! // BUGBUG: Should this use CUDA streams (t_stream)? _reductionSumAndAssign<<<1, 1024>>>(Data(), a.Data(), (CUDA_LONG) a.GetNumElements(), (CUDA_LONG) GetNumElements()); return (*this); } template DeviceBoundNumber GPUMatrix::Sum_AsDeviceBoundNum() const { if (IsEmpty()) LogicError("Matrix is empty"); ElemType* d_sum = TracingGPUMemoryAllocator::Allocate(GetComputeDeviceId(), 1); // WARNING: THIS kernel is not the most efficient way! 
{ PROFILE_CUDA_STREAM("_reductionSum", t_stream); _reductionSum<<<1, 1024, 0, t_stream>>>(Data(), d_sum, (CUDA_LONG) GetNumElements()); } DeviceBoundNumber result; result.ShallowCopyFrom(d_sum, GetComputeDeviceId()); return result; } template ElemType GPUMatrix::Max() const { cublasHandle_t cuHandle = GetCublasHandle(GetComputeDeviceId()); ElemType res; if (sizeof(ElemType) == sizeof(float)) { int resInd = 0; { PROFILE_CUDA_STREAM("cublasIsamax", t_stream); cublasIsamax(cuHandle, (CUDA_LONG)GetNumElements(), reinterpret_cast(Data()), 1, &resInd); } resInd--; CUDA_CALL(cudaMemcpy(reinterpret_cast(&res), reinterpret_cast(Data()+ resInd), sizeof(float), cudaMemcpyDeviceToHost)); return res; } else { int resInd = 0; { PROFILE_CUDA_STREAM("cublasIdamax", t_stream); cublasIdamax(cuHandle, (CUDA_LONG)GetNumElements(), reinterpret_cast(Data()), 1, &resInd); } resInd--; CUDA_CALL(cudaMemcpy(reinterpret_cast(&res), Data()+ resInd, sizeof(float), cudaMemcpyDeviceToHost)); return res; } } template GPUMatrix& GPUMatrix::ElementMultiplyWith(const GPUMatrix& a) { if (IsEmpty() || a.IsEmpty()) LogicError("ElementMultiplyWith: Matrix is empty."); GPUMatrix& us = *this; assert(us.GetNumRows() == a.GetNumRows() && us.GetNumCols() == a.GetNumCols()); if (us.GetNumRows() != a.GetNumRows() || us.GetNumCols() != a.GetNumCols()) InvalidArgument("The matrix dimensions do not match."); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(((double) N) / GridDim::maxThreadsPerBlock); a.PrepareDevice(); PROFILE_CUDA_STREAM("_elemMul", t_stream); _elemMul<<>>(Data(), a.Data(), N); return *this; } template GPUMatrix& GPUMatrix::AssignElementProductOf(const GPUMatrix& a, const GPUMatrix& b) { if (a.IsEmpty() || b.IsEmpty()) LogicError("AssignElementProductOf: Matrix is empty."); assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) InvalidArgument("The input matrix dimensions do not match."); RequireSize(a.GetNumRows(), a.GetNumCols()); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(((double) N) / GridDim::maxThreadsPerBlock); a.PrepareDevice(); PROFILE_CUDA_STREAM("_assignElementProductOf", t_stream); _assignElementProductOf<<>>(Data(), a.Data(), b.Data(), N); return *this; } template GPUMatrix& GPUMatrix::ElementDivideBy(const GPUMatrix& a) { return AssignElementDivisionOf(*this, a); } template GPUMatrix& GPUMatrix::AssignElementDivisionOf(const GPUMatrix& a, const GPUMatrix& b) { if (a.IsEmpty() || b.IsEmpty()) LogicError("AssignElementDivisionOf: Matrix is empty."); assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) InvalidArgument("The input matrix dimensions do not match."); RequireSize(a.GetNumRows(), a.GetNumCols()); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(((double) N) / GridDim::maxThreadsPerBlock); a.PrepareDevice(); PROFILE_CUDA_STREAM("_assignElementDivisionOf", t_stream); _assignElementDivisionOf<<>>(Data(), a.Data(), b.Data(), N); return *this; } template bool GPUMatrix::IsEqualTo(const GPUMatrix& a, const ElemType threshold /*= 1e-8*/) const { return AreEqual(*this, a, threshold); } template void GPUMatrix::VectorSum(const GPUMatrix& a, GPUMatrix& c, const bool isColWise) { if (a.GetComputeDeviceId() != c.GetComputeDeviceId()) { InvalidArgument("All matrices must be on the same GPU"); } a.PrepareDevice(); if (a.IsEmpty()) 
LogicError("VectorSum: Input matrix is empty."); const CUDA_LONG n = (CUDA_LONG) a.GetNumRows(); const CUDA_LONG m = (CUDA_LONG) a.GetNumCols(); assert(m > 0 && n > 0); // converting from size_t to int may cause overflow int blocksPerGrid = 0; if (isColWise) // col-wise { c.RequireSize(1, m); blocksPerGrid = (int) ceil(1.0 * m / GridDim::maxThreadsPerBlock); } else { c.RequireSize(n, 1); blocksPerGrid = (int) ceil(1.0 * n / GridDim::maxThreadsPerBlock); } PROFILE_CUDA_STREAM("_vectorSum", t_stream); _vectorSum<<>>(c.Data(), a.Data(), n, m, isColWise); } template void GPUMatrix::VectorNorm1(GPUMatrix& c, const bool isColWise) const { if (IsEmpty()) LogicError("VectorNorm1: Matrix is empty."); const CUDA_LONG n = (CUDA_LONG) GetNumRows(); const CUDA_LONG m = (CUDA_LONG) GetNumCols(); assert(m > 0 && n > 0); // converting from size_t to int may cause overflow PrepareDevice(); c.ChangeDeviceTo(GetComputeDeviceId()); int blocksPerGrid = 0; if (isColWise) // col-wise { c.RequireSize(1, m); blocksPerGrid = (int) ceil(1.0 * m / GridDim::maxThreadsPerBlock); } else { c.RequireSize(n, 1); blocksPerGrid = (int) ceil(1.0 * n / GridDim::maxThreadsPerBlock); } PROFILE_CUDA_STREAM("_vectorNorm1", t_stream); _vectorNorm1<<>>(c.Data(), Data(), n, m, isColWise); } template GPUMatrix& GPUMatrix::AssignVectorNorm1Of(GPUMatrix& a, const bool isColWise) { a.VectorNorm1(*this, isColWise); return *this; } template void GPUMatrix::VectorNorm2(GPUMatrix& c, const bool isColWise) const { if (IsEmpty()) LogicError("VectorNorm2: Matrix is empty."); const CUDA_LONG n = (CUDA_LONG) GetNumRows(); const CUDA_LONG m = (CUDA_LONG) GetNumCols(); assert(m > 0 && n > 0); // converting from size_t to int may cause overflow PrepareDevice(); c.ChangeDeviceTo(GetComputeDeviceId()); int blocksPerGrid = 0; if (isColWise) // col-wise { c.RequireSize(1, m); blocksPerGrid = (int) ceil(1.0 * m / GridDim::maxThreadsPerBlock); } else { c.RequireSize(n, 1); c.ChangeDeviceTo(GetComputeDeviceId()); blocksPerGrid = (int) ceil(1.0 * n / GridDim::maxThreadsPerBlock); } PROFILE_CUDA_STREAM("_vectorNorm2", t_stream); _vectorNorm2<<>>(c.Data(), Data(), n, m, isColWise); } template GPUMatrix& GPUMatrix::AssignVectorNorm2Of(GPUMatrix& a, const bool isColWise) { a.VectorNorm2(*this, isColWise); return *this; } template void GPUMatrix::VectorNormInf(GPUMatrix& c, const bool isColWise) const { if (IsEmpty()) LogicError("VectorMax: Matrix is empty."); // this implementation is not efficient GPUMatrix tmp(GetComputeDeviceId()); GPUMatrix tmp1(GetComputeDeviceId()); tmp.AssignAbsOf((*this)); tmp.VectorMax(tmp1, c, isColWise); } template GPUMatrix& GPUMatrix::AssignVectorNormInfOf(GPUMatrix& a, const bool isColWise) { a.VectorNormInf(*this, isColWise); return *this; } template GPUMatrix& GPUMatrix::AssignInnerProductOf(const GPUMatrix& a, const GPUMatrix& b, const bool isColWise) { InnerProduct(a, b, *this, isColWise); return *this; } template GPUMatrix& GPUMatrix::AssignKhatriRaoProductOf(const GPUMatrix& a, const GPUMatrix& b) { if (a.IsEmpty() || b.IsEmpty()) LogicError("AssignKhatriRaoProductOf: Matrix is empty."); CUDA_LONG cols = a.GetNumCols(); assert(cols == b.GetNumCols()); if (!(cols == b.GetNumCols())) InvalidArgument("AssignKhatriRaoProductOf: The input matrix dimensions do not match."); CUDA_LONG rowsA = (CUDA_LONG) a.GetNumRows(); CUDA_LONG rowsB = (CUDA_LONG) b.GetNumRows(); RequireSize(rowsA * rowsB, cols); float N = (float) GetNumElements(); int blocksPerGrid = (int) ceil(N / GridDim::maxThreadsPerBlock); a.PrepareDevice(); 
PROFILE_CUDA_STREAM("_assignKhatriRaoProductOf", t_stream); _assignKhatriRaoProductOf<<>>(Data(), a.Data(), b.Data(), rowsA, rowsB, cols); return *this; } //column-wise reshaped product. Used to compute KhatriRaoProduct Gradient // this = reshape each column of a from (K1xK2,1) to (K1, K2) // if each column of a is not transposed, each (K1, K2) times each column of b (K2, frames). // the output is a (K1, frames) matrix // if each column of a is tranposed, each (K1, K2)^T times each column of b(K1, frames) and output is (K2, frames) template GPUMatrix& GPUMatrix::AddColumnReshapeProductOf(const GPUMatrix& a, const GPUMatrix& b, const bool transposeAColumn) { if (a.IsEmpty() || b.IsEmpty()) LogicError("AddColumnReshapeProductOf: Matrix is empty."); CUDA_LONG cols = a.GetNumCols(); assert(cols == b.GetNumCols()); if (!(cols == b.GetNumCols())) InvalidArgument("AddColumnReshapeProductOf: The input matrix dimensions do not match."); CUDA_LONG rowsA = (CUDA_LONG) a.GetNumRows(); CUDA_LONG rowsB = (CUDA_LONG) b.GetNumRows(); if (rowsA % rowsB != 0) InvalidArgument("AddColumnReshapeProductOf: number of rows in a should be multiples of that in b."); CUDA_LONG rowsC = rowsA / rowsB; if (rowsC != GetNumRows() || cols != GetNumCols()) InvalidArgument("AddColumnReshapeProductOf: This matrix does not have the right size."); float N = (float) GetNumElements(); int blocksPerGrid = (int) ceil(N / GridDim::maxThreadsPerBlock); a.PrepareDevice(); PROFILE_CUDA_STREAM("_addColumnReshapeProductOf", t_stream); _addColumnReshapeProductOf<<>>(Data(), a.Data(), b.Data(), rowsB, rowsC, cols, transposeAColumn); return *this; } template GPUMatrix& GPUMatrix::AddWithScaleOf(ElemType alpha, const GPUMatrix& a) { ScaleAndAdd(alpha, a, *this); return *this; } template ElemType GPUMatrix::FrobeniusNorm() const { if (IsEmpty()) LogicError("FrobeniusNorm: Matrix is empty."); ElemType* d_sum = TracingGPUMemoryAllocator::Allocate(GetComputeDeviceId(), 1); ElemType h_sum = 0; // WARNING: THIS kernel is not the most efficient way! { PROFILE_CUDA_STREAM("_reductionSum2", t_stream); _reductionSum2<<<1, 1024, 0, t_stream>>>(Data(), d_sum, (CUDA_LONG) GetNumElements(), true); } CUDA_CALL(cudaMemcpy(&h_sum, d_sum, sizeof(ElemType), cudaMemcpyDeviceToHost)); TracingGPUMemoryAllocator::Free(GetComputeDeviceId(), d_sum); return (h_sum); } template GPUMatrix& GPUMatrix::AssignFrobeniusNormOf(const GPUMatrix& a) { if (a.IsEmpty()) LogicError("AssignFrobeniusNormOf: Matrix a is empty."); RequireSize(1, 1); PrepareDevice(); // WARNING: THIS kernel is not the most efficient way! PROFILE_CUDA_STREAM("_reductionSum2", t_stream); _reductionSum2<<<1, 1024, 0, t_stream>>>(a.Data(), Data(), (CUDA_LONG) a.GetNumElements(), true); return *this; } template ElemType GPUMatrix::MatrixNormInf() const { if (IsEmpty()) LogicError("MatrixNorm1: Matrix is empty."); ElemType* d_maxAbs = TracingGPUMemoryAllocator::Allocate(GetComputeDeviceId(), 1); ElemType h_maxAbs = 0; // WARNING: THIS kernel is not the most efficient way! 
{ PROFILE_CUDA_STREAM("_reductionMatrixNormInf", t_stream); _reductionMatrixNormInf<<<1, 1024, 0, t_stream>>>(Data(), d_maxAbs, (CUDA_LONG) GetNumElements()); } CUDA_CALL(cudaMemcpy(&h_maxAbs, d_maxAbs, sizeof(ElemType), cudaMemcpyDeviceToHost)); TracingGPUMemoryAllocator::Free(GetComputeDeviceId(), d_maxAbs); return h_maxAbs; } template ElemType GPUMatrix::MatrixNorm1() const { if (IsEmpty()) LogicError("MatrixNorm1: Matrix is empty."); return SumOfAbsElements(); } template ElemType GPUMatrix::MatrixNorm0() const { if (IsEmpty()) LogicError("MatrixNorm0: Matrix is empty."); ElemType* d_nz = TracingGPUMemoryAllocator::Allocate(GetComputeDeviceId(), 1); ElemType h_nz = 0; // WARNING: THIS kernel is not the most efficient way! { PROFILE_CUDA_STREAM("_reductionMatrixNorm0", t_stream); _reductionMatrixNorm0<<<1, 1024, 0, t_stream>>>(Data(), d_nz, (CUDA_LONG) GetNumElements()); } CUDA_CALL(cudaMemcpy(&h_nz, d_nz, sizeof(ElemType), cudaMemcpyDeviceToHost)); TracingGPUMemoryAllocator::Free(GetComputeDeviceId(), d_nz); return h_nz; } template GPUMatrix& GPUMatrix::AssignSignOf(const GPUMatrix& a) { if (a.IsEmpty()) LogicError("AssignSignOf: Matrix a is empty."); if (this != &a) RequireSize(a.GetNumRows(), a.GetNumCols()); PrepareDevice(); int blocksPerGrid = (int) ceil(1.0 * GetNumElements() / GridDim::maxThreadsPerBlock); PROFILE_CUDA_STREAM("_assignSignOf", t_stream); _assignSignOf<<>>(Data(), a.Data(), (CUDA_LONG) GetNumElements()); return *this; } template GPUMatrix& GPUMatrix::AddSignOf(const GPUMatrix& a) { if (a.IsEmpty()) LogicError("AddSignOf: Matrix a is empty."); if (this != &a) RequireSize(a.GetNumRows(), a.GetNumCols()); PrepareDevice(); int blocksPerGrid = (int) ceil(1.0 * GetNumElements() / GridDim::maxThreadsPerBlock); PROFILE_CUDA_STREAM("_addSignOf", t_stream); _addSignOf<<>>(Data(), a.Data(), (CUDA_LONG) GetNumElements()); return *this; } template void GPUMatrix::VectorMax(GPUMatrix& maxIndexes, GPUMatrix& maxValues, const bool isColWise) const { if (IsEmpty()) LogicError("VectorMax: Matrix is empty."); const GPUMatrix& us = *this; const CUDA_LONG m = (CUDA_LONG) GetNumRows(); const CUDA_LONG n = (CUDA_LONG) GetNumCols(); assert(m > 0 && n > 0); // converting from size_t to int may cause overflow PrepareDevice(); if (isColWise) { maxValues.RequireSize(1, n); maxIndexes.RequireSize(1, n); int blocksPerGrid = n; // we'll have 1 block processing 1 column PROFILE_CUDA_STREAM("_vectorMaxMinReduce", t_stream); _vectorMaxMinReduce<<>>(us.Data(), maxIndexes.Data(), maxValues.Data(), m, n); /*int blocksPerGrid=(int)ceil(1.0*n/GridDim::maxThreadsPerBlock); _vectorMax<<>>(us.Data(),maxIndexes.Data(),maxValues.Data(),m,n,isColWise);*/ } else { maxValues.RequireSize(m, 1); maxIndexes.RequireSize(m, 1); int blocksPerGrid = (int) ceil(1.0 * m / GridDim::maxThreadsPerBlock); PROFILE_CUDA_STREAM("_vectorMax", t_stream); _vectorMax<<>>(us.Data(), maxIndexes.Data(), maxValues.Data(), m, n, isColWise); } } __global__ void _initIndicesForSort(uint64_t* indexes, CUDA_LONG crow, CUDA_LONG ccol) { CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x; if (id >= crow * ccol) return; uint32_t irow = id % crow; uint32_t icol = id / crow; indexes[id] = (static_cast(irow) << 32) | icol; } template void GPUMatrix::VectorMax(GPUMatrix& maxIndexes, GPUMatrix& maxValues, const bool isColWise, int topK) const { if (IsEmpty()) LogicError("VectorMax: Matrix is empty."); if (topK == 1) { VectorMax(maxIndexes, maxValues, isColWise); return; } if (!isColWise) RuntimeError("Row-wise TopK max is not supported."); const 
GPUMatrix& us = *this; const CUDA_LONG m = (CUDA_LONG) GetNumRows(); const CUDA_LONG n = (CUDA_LONG) GetNumCols(); assert(topK <= m); assert(m > 0 && n > 0); // converting from size_t to int may cause overflow PrepareDevice(); maxValues.RequireSize(topK, n); maxIndexes.RequireSize(topK, n); // To sort matrix columns we use 2-pass _stable_ sort algorithm: // 1. Sort by values (descending) with corresponding row/col indexes. // 2. Sort by col indices (ascending) with corresponding values/row indices. // Indices are stored as 64-bit ints where low 32 bits represent column and high 32 bits - row index. // On the second pass only first 32 bits of the index are used in sorting, so SortPairs has // begin_bit and end_bit set accordingly. CUDA_LONG celt = static_cast(GetNumElements()); ElemType* inVal = us.Data(); ElemType* outVal1 = nullptr; ElemType* outVal2 = nullptr; uint64_t* inIdx = nullptr; uint64_t* outIdx = nullptr; // Determine temp buffer size needed for SortPairsDescending to sort values on the first pass. size_t cbtemp = 0; // If first param is nullptr then no actual work is done except writing result to cbtemp. { PROFILE_CUDA_STREAM("SortPairsDescending", t_stream); CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending(nullptr, cbtemp, inVal, outVal1, inIdx, outIdx, celt, 0, sizeof(ElemType) * 8, t_stream)); } size_t ctemp1 = (cbtemp + sizeof(ElemType) - 1) / sizeof(ElemType); // Determine temp buffer size needed for SortPairs to sort indices on the second pass. cbtemp = 0; { PROFILE_CUDA_STREAM("SortPairs", t_stream); CUDA_CALL(cub::DeviceRadixSort::SortPairs(nullptr, cbtemp, outIdx, inIdx, outVal1, outVal2, celt, 0, 32, t_stream)); } size_t ctemp2 = (cbtemp + sizeof(ElemType) - 1) / sizeof(ElemType); size_t ctemp = std::max(ctemp1, ctemp2); cbtemp = ctemp * sizeof(ElemType); // ElemType count needed to store indices, accounting for natural alignment for uint64_t type. size_t cidx = ((celt + 1) * sizeof(uint64_t) - 1 + sizeof(ElemType) - 1) / sizeof(ElemType); // Get temp workspace. auto workspace = GetOrCreateWorkspace(); // RequireSize to store: output values for the 1st and 2nd passes, input indices, output indices, and temp storage. workspace->RequireSize(m, 2 * n + (2 * cidx + ctemp + m - 1) / m); outVal1 = workspace->Data(); outVal2 = outVal1 + celt; inIdx = reinterpret_cast(outVal2 + celt); // Align indices pointer if needed. size_t cbAlign = reinterpret_cast(inIdx) % sizeof(uint64_t); if (cbAlign != 0) reinterpret_cast(inIdx) += sizeof(uint64_t) - cbAlign; outIdx = inIdx + celt; void* ptmp = outIdx + celt; assert(reinterpret_cast(reinterpret_cast(ptmp) + cbtemp) <= workspace->Data()+ workspace->GetNumElements()); // Initialize indices. const int ThreadsPerBlock = 128; int cblock = (celt + ThreadsPerBlock - 1) / ThreadsPerBlock; { PROFILE_CUDA_STREAM("_initIndicesForSort", t_stream); _initIndicesForSort<<>>(inIdx, m, n); } // Sort by values. { PROFILE_CUDA_STREAM("SortPairsDescending", t_stream); CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending(ptmp, cbtemp, inVal, outVal1, inIdx, outIdx, celt, 0, sizeof(ElemType) * 8, t_stream)); } // Sort by column indices. outIdx contains indices after the first pass so it's used as an input. { PROFILE_CUDA_STREAM("SortPairs", t_stream); CUDA_CALL(cub::DeviceRadixSort::SortPairs(ptmp, cbtemp, outIdx, inIdx, outVal1, outVal2, celt, 0, 32, t_stream)); } // Copy results. 
cblock = (topK * n + ThreadsPerBlock - 1) / ThreadsPerBlock; { PROFILE_CUDA_STREAM("_copyTopKResults", t_stream); _copyTopKResults<<>>(inIdx, outVal2, maxIndexes.Data(), maxValues.Data(), m, n, topK); } ReleaseWorkspace(std::move(workspace)); } template void GPUMatrix::VectorMin(GPUMatrix& minIndexes, GPUMatrix& minValues, const bool isColWise) const { if (IsEmpty()) LogicError("VectorMax: Matrix is empty."); const GPUMatrix& us = *this; const int m = (int) GetNumRows(); const int n = (int) GetNumCols(); assert(m > 0 && n > 0); // converting from size_t to int may cause overflow PrepareDevice(); if (isColWise) { minValues.RequireSize(1, n); minIndexes.RequireSize(1, n); int blocksPerGrid = n; // we'll have 1 block processing 1 column PROFILE_CUDA_STREAM("_vectorMaxMinReduce", t_stream); _vectorMaxMinReduce<<>>(us.Data(), minIndexes.Data(), minValues.Data(), m, n); /* int blocksPerGrid=(int)ceil(1.0*n/GridDim::maxThreadsPerBlock); _vectorMin<<>>(us.Data(),minIndexes.Data(),minValues.Data(),m,n,isColWise);*/ } else { minValues.RequireSize(m, 1); minIndexes.RequireSize(m, 1); int blocksPerGrid = (int) ceil(1.0 * m / GridDim::maxThreadsPerBlock); PROFILE_CUDA_STREAM("_vectorMin", t_stream); _vectorMin<<>>(us.Data(), minIndexes.Data(), minValues.Data(), m, n, isColWise); } } template GPUMatrix& GPUMatrix::AssignNumOfDiff(const GPUMatrix& a, const GPUMatrix& b, bool searchInCol) { if (a.GetNumCols() != b.GetNumCols()) InvalidArgument("AssignNumOfDiff: a and b must have the same number of columns."); if (!searchInCol && a.GetNumRows() != b.GetNumRows()) InvalidArgument("AssignNumOfDiff: a and b must have the same number of rows."); RequireSize(1, 1); // result should be one element PrepareDevice(); if (!searchInCol) { // int blocksPerGrid=(int)ceil(1.0*a.GetNumElements()/GridDim::maxThreadsPerBlock); // _assignNumOfDiff<<>>(a.Data(), b.Data(), Data(), a.GetNumElements()); PROFILE_CUDA_STREAM("_assignNumOfDiff", t_stream); _assignNumOfDiff<<<1, 1024, 0, t_stream>>>(a.Data(), b.Data(), Data(), (CUDA_LONG) a.GetNumElements()); } else { const int blockSize = 1024; PROFILE_CUDA_STREAM("_assignNumOfDiffCol", t_stream); _assignNumOfDiffCol<<<1, blockSize, 0, t_stream>>>(a.Data(), b.Data(), Data(), static_cast(b.GetNumRows()), static_cast(a.GetNumCols())); } return *this; } #pragma endregion Member BLAS Functions #pragma region Other helper functions template void GPUMatrix::Print(const char* /*matrixName*/, size_t /*rowStart*/, size_t /*rowEnd*/, size_t /*colStart*/, size_t /*colEnd*/) const { NOT_IMPLEMENTED; } template void GPUMatrix::Print(const char* matrixName /*=nullptr*/) const { Print(matrixName, 0, GetNumRows() - 1, 0, GetNumCols() - 1); } //helpfer function used for convolution neural network template GPUMatrix& GPUMatrix::AssignPackedConvolutionInput(const GPUMatrix& inputSubBatch, const size_t inputWidth, const size_t inputHeight, const size_t inputChannels, const size_t outputWidth, const size_t outputHeight, const size_t outputChannels, const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample, const bool zeroPadding) { assert(verticalSubsample <= kernelHeight && horizontalSubsample <= kernelWidth); size_t packedInputRows = kernelWidth * kernelHeight * inputChannels; size_t packedInputColsPerSample = outputWidth * outputHeight; size_t smallBatchSize = inputSubBatch.GetNumCols(); RequireSize(packedInputRows, packedInputColsPerSample * smallBatchSize); if (zeroPadding) SetValue((ElemType) 0); PrepareDevice(); int numThreadPerBlock = 
GridDim::maxThreadsPerBlock; #if 1 int blocksPerGrid = (smallBatchSize * inputWidth * inputHeight * inputChannels + numThreadPerBlock - 1) / numThreadPerBlock; #else dim3 blocksPerGrid((inputWidth * inputHeight * inputChannels + numThreadPerBlock - 1) / numThreadPerBlock, smallBatchSize); #endif PROFILE_CUDA_STREAM("_assignPackedConvolutionInput", t_stream); _assignPackedConvolutionInput<<>>(Data(), inputSubBatch.Data(), smallBatchSize, inputWidth, inputHeight, inputChannels, outputWidth, outputHeight, outputChannels, kernelWidth, kernelHeight, horizontalSubsample, verticalSubsample, zeroPadding); return *this; } //helpfer function used for convolution neural network template GPUMatrix& GPUMatrix::UnpackConvolutionInput(GPUMatrix& inputSubBatch, const size_t inputWidth, const size_t inputHeight, const size_t inputChannels, const size_t outputWidth, const size_t outputHeight, const size_t outputChannels, const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample, const bool zeroPadding) const { assert(verticalSubsample <= kernelHeight && horizontalSubsample <= kernelWidth); size_t smallBatchSize = inputSubBatch.GetNumCols(); PrepareDevice(); int numThreadPerBlock = GridDim::maxThreadsPerBlock; #if 1 int blocksPerGrid = (smallBatchSize * inputWidth * inputHeight * inputChannels + numThreadPerBlock - 1) / numThreadPerBlock; #else dim3 blocksPerGrid((inputWidth * inputHeight * inputChannels + numThreadPerBlock - 1) / numThreadPerBlock, smallBatchSize); #endif PROFILE_CUDA_STREAM("_unpackConvolutionInput", t_stream); _unpackConvolutionInput<<>>(Data(), inputSubBatch.Data(), smallBatchSize, inputWidth, inputHeight, inputChannels, outputWidth, outputHeight, outputChannels, kernelWidth, kernelHeight, horizontalSubsample, verticalSubsample, zeroPadding); return inputSubBatch; } template GPUMatrix& GPUMatrix::AssignMaxPoolingResult(const GPUMatrix& inputBatch, const size_t channels, const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) { assert(verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); unsigned int batchSize = inputBatch.GetNumCols(); RequireSize(outputSizePerSample, batchSize); int numThreadPerBlock = GridDim::maxThreadsPerBlock; int blocksPerGrid = (batchSize * outputSizePerSample + numThreadPerBlock - 1) / numThreadPerBlock; PrepareDevice(); PROFILE_CUDA_STREAM("_assignMaxPoolingResult", t_stream); _assignMaxPoolingResult<<>>(Data(), inputBatch.Data(), batchSize, channels, inputWidth, inputHeight, inputSizePerSample, outputWidth, outputHeight, outputSizePerSample, windowWidth, windowHeight, horizontalSubsample, verticalSubsample); return *this; } template GPUMatrix& GPUMatrix::AddMaxPoolingGradient(const GPUMatrix& outputGradientBatch, const GPUMatrix& inputBatch, const GPUMatrix& outputBatch, const size_t channels, const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) { assert(verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); unsigned int batchSize = outputGradientBatch.GetNumCols(); int numThreadPerBlock = 
GridDim::maxThreadsPerBlock; PrepareDevice(); int blocksPerGrid = (batchSize * inputSizePerSample + numThreadPerBlock - 1) / numThreadPerBlock; PROFILE_CUDA_STREAM("_addMaxPoolingGradient", t_stream); _addMaxPoolingGradient<<>>(Data(), outputGradientBatch.Data(), inputBatch.Data(), outputBatch.Data(), batchSize, channels, inputWidth, inputHeight, inputSizePerSample, outputWidth, outputHeight, outputSizePerSample, windowWidth, windowHeight, horizontalSubsample, verticalSubsample); return *this; } template GPUMatrix& GPUMatrix::AssignAveragePoolingResult(const GPUMatrix& inputBatch, const size_t channels, const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) { assert(verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); unsigned int batchSize = inputBatch.GetNumCols(); RequireSize(outputSizePerSample, batchSize); int numThreadPerBlock = GridDim::maxThreadsPerBlock; int blocksPerGrid = (batchSize * outputSizePerSample + numThreadPerBlock - 1) / numThreadPerBlock; PrepareDevice(); PROFILE_CUDA_STREAM("_assignAveragePoolingResult", t_stream); _assignAveragePoolingResult<<>>(Data(), inputBatch.Data(), batchSize, channels, inputWidth, inputHeight, inputSizePerSample, outputWidth, outputHeight, outputSizePerSample, windowWidth, windowHeight, horizontalSubsample, verticalSubsample); return *this; } template GPUMatrix& GPUMatrix::AddAveragePoolingGradient(const GPUMatrix& outputGradientBatch, const size_t channels, const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) { assert(verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); size_t batchSize = outputGradientBatch.GetNumCols(); int numThreadPerBlock = GridDim::maxThreadsPerBlock; PrepareDevice(); PROFILE_CUDA_STREAM("_addAveragePoolingGradient", t_stream); size_t blocksPerGrid = (batchSize * inputSizePerSample + numThreadPerBlock - 1) / numThreadPerBlock; _addAveragePoolingGradient<<>>(Data(), outputGradientBatch.Data(), (CUDA_LONG) batchSize, channels, inputWidth, inputHeight, inputSizePerSample, outputWidth, outputHeight, outputSizePerSample, windowWidth, windowHeight, horizontalSubsample, verticalSubsample); return *this; } #pragma endregion Other helper functions template void GPUMatrix::ConvolutionForward(const GPUMatrix& kernel, const GPUMatrix& mpRowCol, const GPUMatrix& mpRowIwht, const GPUMatrix& mpRowRun, const GPUMatrix& runs, GPUMatrix& output) const { const int BlockSize = 128; auto gdim = dim3((output.GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535)); PrepareDevice(); PROFILE_CUDA_STREAM("kConvolutionForward", t_stream); kConvolutionForward<<>>((int)GetNumCols(), kernel.Data(), mpRowCol.Data(), mpRowIwht.Data(), mpRowRun.Data(), runs.Data(), Data(), (int)GetNumRows(), output.Data(), (int)output.GetNumRows()); } template void GPUMatrix::ConvolutionBackwardData(const GPUMatrix& kernel, const GPUMatrix& mpRowCol, const GPUMatrix& mpRowIwht, const GPUMatrix& mpRowRun, const GPUMatrix& runs, GPUMatrix& grad) const { const int BlockSize = 128; auto gdim = dim3((GetNumRows() + BlockSize - 1)/ BlockSize, 
std::min((int)GetNumCols(), 65535)); PrepareDevice(); PROFILE_CUDA_STREAM("kConvolutionBackwardData", t_stream); kConvolutionBackwardData<<>>((int)GetNumCols(), kernel.Data(), mpRowCol.Data(), mpRowIwht.Data(), mpRowRun.Data(), runs.Data(), Data(), (int)GetNumRows(), grad.Data(), (int)grad.GetNumRows()); } template void GPUMatrix::ConvolutionBackwardKernel(const GPUMatrix& in, const GPUMatrix& mpRowCol, const GPUMatrix& mpRowIwht, const GPUMatrix& mpRowRun, const GPUMatrix& runs, GPUMatrix& kernelGrad) const { const int BlockSize = 128; auto gdim = dim3((GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535)); PrepareDevice(); PROFILE_CUDA_STREAM("kConvolutionBackwardKernel", t_stream); kConvolutionBackwardKernel<<>>((int)GetNumCols(), (int)in.GetNumRows(), (int)GetNumRows(), in.Data(), mpRowCol.Data(), mpRowIwht.Data(), mpRowRun.Data(), runs.Data(), Data(), kernelGrad.Data()); } template void GPUMatrix::MaxPoolingForward(const GPUMatrix& mpRowCol, const GPUMatrix& mpRowIndices, const GPUMatrix& indices, GPUMatrix& output) const { const int BlockSize = 128; auto gdim = dim3((output.GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535)); PrepareDevice(); PROFILE_CUDA_STREAM("kMaxPoolingForward", t_stream); kMaxPoolingForward<<>>((int)GetNumCols(), mpRowCol.Data(), mpRowIndices.Data(), indices.Data(), Data(), (int)GetNumRows(), output.Data(), (int)output.GetNumRows()); } template void GPUMatrix::MaxPoolingBackward(const GPUMatrix& out, const GPUMatrix& in, const GPUMatrix& mpRowCol, const GPUMatrix& mpRowIndices, const GPUMatrix& indices, GPUMatrix& grad) const { const int BlockSize = 128; auto gdim = dim3((GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535)); PrepareDevice(); PROFILE_CUDA_STREAM("kMaxPoolingBackward", t_stream); kMaxPoolingBackward<<>>((int)GetNumCols(), out.Data(), in.Data(), mpRowCol.Data(), mpRowIndices.Data(), indices.Data(), Data(), (int)GetNumRows(), grad.Data(), (int)grad.GetNumRows()); } template void GPUMatrix::AveragePoolingForward(const GPUMatrix& mpRowCol, const GPUMatrix& mpRowIndices, const GPUMatrix& indices, GPUMatrix& output) const { const int BlockSize = 128; auto gdim = dim3((output.GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535)); PrepareDevice(); PROFILE_CUDA_STREAM("kAveragePoolingForward", t_stream); kAveragePoolingForward<<>>((int)GetNumCols(), mpRowCol.Data(), mpRowIndices.Data(), indices.Data(), Data(), (int)GetNumRows(), output.Data(), (int)output.GetNumRows()); } template void GPUMatrix::AveragePoolingBackward(const GPUMatrix& mpRowCol, const GPUMatrix& mpRowIndices, const GPUMatrix& indices, GPUMatrix& grad) const { const int BlockSize = 128; auto gdim = dim3((GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535)); PrepareDevice(); PROFILE_CUDA_STREAM("kAveragePoolingBackward", t_stream); kAveragePoolingBackward<<>>((int)GetNumCols(), mpRowCol.Data(), mpRowIndices.Data(), indices.Data(), Data(), (int)GetNumRows(), grad.Data(), (int)grad.GetNumRows()); } template void GPUMatrix::BatchNormalizationForward(const GPUMatrix& scale, const GPUMatrix& bias, double expAvgFactor, double blendFactor, GPUMatrix& runMean, GPUMatrix& runInvStdDev, GPUMatrix& out, double epsilon, GPUMatrix& saveMean, GPUMatrix& saveInvStdDev) const { assert((GetNumRows() % scale.GetNumRows()) == 0); bool spatial = GetNumRows() != scale.GetNumRows(); size_t vectorSize = GetNumRows(); size_t spatialSize = spatial ? 
(GetNumRows() / scale.GetNumRows()) : 1; size_t batchSize = GetNumCols(); assert(0 < vectorSize && vectorSize <= std::numeric_limits::max()); assert(0 < batchSize && batchSize <= std::numeric_limits::max()); // If expAvgFactor == 0 && blendFactor == 1 then we don't need to compute current minibatch statistics. if (expAvgFactor > 0 || blendFactor < 1) { if (spatial) { PROFILE_CUDA_STREAM("ComputeSpatialBatchMeanAndInvStdDev", t_stream); Call(spatialSize, vectorSize, spatialSize, batchSize, Data(), expAvgFactor, runMean.Data(), runInvStdDev.Data(), epsilon, saveMean.Data(), saveInvStdDev.Data(), GetStream()); } else { PROFILE_CUDA_STREAM("ComputeBatchMeanAndInvStdDev", t_stream); Call(vectorSize, vectorSize, batchSize, Data(), expAvgFactor, runMean.Data(), runInvStdDev.Data(), epsilon, saveMean.Data(), saveInvStdDev.Data(), GetStream()); } } // When: // blendFactor == 1 - use running mean/var instead of the current minibatch mean/var. // 0 < blendFactor < 1 - blend running mean/var with mean/var of the current minibatch: saveMean = (1 - blendFactor) * saveMean + blendFactor * runMean // blendFactor == 0 - use mean/var of the current minibatch. if (blendFactor < 1) { if (blendFactor > 0) { // REVIEW alexeyk: can be rolled into NormalizeBatchTraining to save bandwidth. Scale((ElemType)(1 - blendFactor), saveMean); ScaleAndAdd((ElemType)blendFactor, runMean, saveMean); Scale((ElemType)(1 - blendFactor), saveInvStdDev); ScaleAndAdd((ElemType)blendFactor, runInvStdDev, saveInvStdDev); } PROFILE_CUDA_STREAM("NormalizeBatchTraining", t_stream); Call(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial, Data(), out.Data(), scale.Data(), bias.Data(), saveMean.Data(), saveInvStdDev.Data(), GetStream()); } else { PROFILE_CUDA_STREAM("NormalizeBatchTraining", t_stream); Call(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial, Data(), out.Data(), scale.Data(), bias.Data(), runMean.Data(), runInvStdDev.Data(), GetStream()); } } template void GPUMatrix::BatchNormalizationBackward(const GPUMatrix& in, GPUMatrix& grad, const GPUMatrix& scale, const GPUMatrix& saveMean, const GPUMatrix& saveInvStdDev, GPUMatrix& scaleGrad, GPUMatrix& biasGrad) const { assert((GetNumRows() % scale.GetNumRows()) == 0); bool spatial = GetNumRows() != scale.GetNumRows(); size_t vectorSize = GetNumRows(); size_t spatialSize = spatial ? (GetNumRows() / scale.GetNumRows()) : 1; size_t batchSize = GetNumCols(); assert(0 < vectorSize && vectorSize <= std::numeric_limits::max()); assert(0 < batchSize && batchSize <= std::numeric_limits::max()); if (spatial) { PROFILE_CUDA_STREAM("ComputeSpatialScaleAndBiasGradients", t_stream); Call(spatialSize, vectorSize, spatialSize, batchSize, in.Data(), Data(), scaleGrad.Data(), biasGrad.Data(), saveMean.Data(), saveInvStdDev.Data(), GetStream()); } else { PROFILE_CUDA_STREAM("ComputeScaleAndBiasGradients", t_stream); Call(vectorSize, vectorSize, batchSize, in.Data(), Data(), scaleGrad.Data(), biasGrad.Data(), saveMean.Data(), saveInvStdDev.Data(), GetStream()); } PROFILE_CUDA_STREAM("BackpropagateBatchNormGradients", t_stream); Call(spatial ? 
spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial, in.Data(), Data(), grad.Data(), scale.Data(), scaleGrad.Data(), biasGrad.Data(), saveMean.Data(), saveInvStdDev.Data(), GetStream()); } #pragma region Static BLAS Functions // float/double overloads of cublasSgemm()/cublasDgemm() static cublasStatus_t cublas_gemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float* alpha, const float* A, int lda, const float* B, int ldb, const float* beta, float* C, int ldc) { return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } static cublasStatus_t cublas_gemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double* alpha, const double* A, int lda, const double* B, int ldb, const double* beta, double* C, int ldc) { return cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } static cublasStatus_t cublas_axpy(cublasHandle_t handle, int n, const float* alpha, const float* x, int incx, float* y, int incy) { return cublasSaxpy(handle, n, alpha, x, incx, y, incy); } static cublasStatus_t cublas_axpy(cublasHandle_t handle, int n, const double* alpha, const double* x, int incx, double* y, int incy) { return cublasDaxpy(handle, n, alpha, x, incx, y, incy); } template void GPUMatrix::MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, ElemType beta, GPUMatrix& c) { a.PrepareDevice(); if ((a.GetComputeDeviceId() != b.GetComputeDeviceId()) || (b.GetComputeDeviceId() != c.GetComputeDeviceId())) // different GPUs InvalidArgument("All matrices must be on the same GPU"); cublasHandle_t cuHandle = GetCublasHandle(b.GetComputeDeviceId()); cublasOperation_t transA = transposeA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t transB = transposeB ? CUBLAS_OP_T : CUBLAS_OP_N; int m = int(transposeA ? a.m_numCols : a.m_numRows); int n = int(transposeB ? b.m_numRows : b.m_numCols); int k = int(transposeA ? a.m_numRows : a.m_numCols); int l = int(transposeB ? 
b.m_numCols : b.m_numRows); c.RequireSize(m, n); if (!(m > 0 && k > 0 && l > 0 && n > 0)) RuntimeError("!(m>0 && k>0 && l>0 && n>0)"); // converting from size_t to int may cause overflow if (k != l) RuntimeError("matrix dim mismatch in MultiplyAndWeightedAdd"); PROFILE_CUDA_STREAM("cublas_gemm", t_stream); CUBLAS_CALL(cublas_gemm(cuHandle, transA, transB, m, n, k, &alpha, a.Data(), (int) a.m_numRows, b.Data(), (int) b.m_numRows, &beta, c.Data(), (int) c.m_numRows)); c.m_numRows = m; c.m_numCols = n; } template void GPUMatrix::Multiply1x1AndWeightedAdd(ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, ElemType beta, GPUMatrix& c) { a.PrepareDevice(); if ((a.GetComputeDeviceId() != b.GetComputeDeviceId()) || (b.GetComputeDeviceId() != c.GetComputeDeviceId())) // different GPUs InvalidArgument("All matrices must be on the same GPU"); CUDA_LONG N = (CUDA_LONG) c.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PROFILE_CUDA_STREAM("_multiply1x1AndWeightedAdd", t_stream); _multiply1x1AndWeightedAdd<<>>(alpha, a.Data(), b.Data(), beta, c.Data(), N); } template void GPUMatrix::MultiplyAndAdd(const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, GPUMatrix& c) { return GPUMatrix::MultiplyAndWeightedAdd(1, a, transposeA, b, transposeB, 1, c); } template void GPUMatrix::Multiply(const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, GPUMatrix& c) { return GPUMatrix::MultiplyAndWeightedAdd(1, a, transposeA, b, transposeB, 0, c); } template void GPUMatrix::Multiply(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) { return GPUMatrix::MultiplyAndWeightedAdd(1, a, false, b, false, 0, c); } /// Matrix-scalar multiply with col-major matrices: c = alpha * a + c /// if a is a column vector, add to all columns of c /// if a is a row vector, add to all rows of c /// if a is a scalar, add to all elements of c /// Scalar /// Input matrix /// Resulting matrix, user is responsible for allocating this template /*static*/ void GPUMatrix::ScaleAndAdd(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) { if (a.GetComputeDeviceId() != c.GetComputeDeviceId()) { InvalidArgument("All matrices must be on the same GPU"); } else { if (a.IsEmpty() && c.IsEmpty()) return; a.PrepareDevice(); if (a.IsEmpty() || c.IsEmpty()) LogicError("ScaleAndAdd: one of the input matrices is empty."); // if (a.GetNumRows() != 1 && a.GetNumCols() != 1) // a is not a col or row vector if (a.GetNumRows() == c.GetNumRows() && a.GetNumCols() == c.GetNumCols()) // dimensions match { const int m = (int) a.GetNumRows(); const int n = (int) a.GetNumCols(); const int len = m * n; const int incx = 1; const int incy = 1; assert(m > 0 && n > 0 && len > 0); // converting from size_t to int may cause overflow assert((int) c.GetNumRows() == m && (int) c.GetNumCols() == n); if ((int) c.GetNumRows() != m || (int) c.GetNumCols() != n) InvalidArgument("dimension of matrix c does not match dimension of matrix a."); cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); // TODO: Overload the call to cublas_axpy to remove these ugly if/else statements. 
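        // The float/double cublas_axpy overloads defined above already provide this dispatch, so the
        // if/else below could in principle collapse to a single call along the lines of
        //     CUBLAS_CALL(cublas_axpy(cuHandle, len, &alpha, a.Data(), incx, c.Data(), incy));
        // (sketch only; the explicit branches are kept as-is here).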
if (sizeof(ElemType) == sizeof(float)) { PROFILE_CUDA_STREAM("cublasSaxpy", t_stream); CUBLAS_CALL(cublasSaxpy(cuHandle, len, reinterpret_cast(&alpha), reinterpret_cast(a.Data()), incx, reinterpret_cast(c.Data()), incy)); } else if (sizeof(ElemType) == sizeof(double)) { PROFILE_CUDA_STREAM("cublasDaxpy", t_stream); CUBLAS_CALL(cublasDaxpy(cuHandle, len, reinterpret_cast(&alpha), reinterpret_cast(a.Data()), incx, reinterpret_cast(c.Data()), incy)); } else { RuntimeError("Unsupported template argument in GPUMatrix"); } } else if (a.GetNumElements() == 1) { CUDA_LONG N = (CUDA_LONG) c.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); c.PrepareDevice(); PROFILE_CUDA_STREAM("_scaleAndAddScalar", t_stream); _scaleAndAddScalar<<>>(c.Data(), N, alpha, a.Data(), c.Data()); } else if (a.GetNumCols() == 1) // col vector, add it to all columns { CUDA_LONG m = (CUDA_LONG) c.GetNumRows(); CUDA_LONG n = (CUDA_LONG) c.GetNumCols(); if (m != (CUDA_LONG) a.GetNumRows()) InvalidArgument("To add column vector, rows should match."); int blocksPerGrid = (int) (ceil(1.0 * m * n / GridDim::maxThreadsPerBlock)); PROFILE_CUDA_STREAM("_matrixVectorColumnWiseAddWithThreadPerElem", t_stream); #ifdef VALIDATION printf(">>>> CUDA compute device is %d\n", a.GetComputeDeviceId()); printf(">>>> a.Data()= %p, c.Data()= %p, alpha = %f, m = %ld, n = %ld\n", a.Data(), c.Data(), alpha, m, n); for (int i = 0; i < 2; i++) { ElemType buffer[10] = {-1.234f}; cudaError_t error = cudaMemcpy(buffer, !i ? a.Data(): c.Data(), sizeof(buffer), cudaMemcpyKind::cudaMemcpyDeviceToHost); if (error == cudaError::cudaSuccess) printf("buffer valid\n"); } #endif _matrixVectorColumnWiseAddWithThreadPerElem<<>>(a.Data(), c.Data(), c.Data(), alpha, m, n); } else if (a.GetNumRows() == 1) // row vector, add it to all rows { cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); int m = (int) c.GetNumRows(); int n = (int) c.GetNumCols(); assert(n == (int) a.GetNumCols()); if (n != (int) a.GetNumCols()) InvalidArgument("To add row vector, cols should match."); // TODO: Overload the call to cublas_axpy to remove these ugly if/else statements. 
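        // This row-vector branch adds one axpy per row of c: the length-n vector a is added into
        // row i with stride m (the column-major leading dimension), i.e. c(i,k) += alpha * a(0,k);
        // it therefore issues one cuBLAS call per row of c.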
if (sizeof(ElemType) == sizeof(double)) { foreach_row (i, c) { PROFILE_CUDA_STREAM("cublasDaxpy", t_stream); CUBLAS_CALL(cublasDaxpy(cuHandle, n, reinterpret_cast(&alpha), reinterpret_cast(a.Data()), 1, reinterpret_cast(c.Data()+ i), m)); } } else { foreach_row (i, c) { PROFILE_CUDA_STREAM("cublasSaxpy", t_stream); CUBLAS_CALL(cublasSaxpy(cuHandle, n, reinterpret_cast(&alpha), reinterpret_cast(a.Data()), 1, reinterpret_cast(c.Data()+ i), m)); } } } else InvalidArgument("dimension of matrix c does not match dimension of matrix a."); } } /// Matrix-scalar multiply with col-major matrices: c = alpha * a + b /// if a is a column vector, add to all columns of b /// if a is a row vector, add to all rows of b /// if a is a scalar, add to all elements of b /// Scalar /// Input matrix /// Input matrix /// Resulting matrix, user is responsible for allocating this template /*static*/ void GPUMatrix::ScaleAndAdd(ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) { if (a.GetComputeDeviceId() != c.GetComputeDeviceId() || a.GetComputeDeviceId() != b.GetComputeDeviceId()) { InvalidArgument("All matrices must be on the same GPU"); } else { if (a.IsEmpty() && b.IsEmpty()) return; a.PrepareDevice(); if (a.IsEmpty() || b.IsEmpty()) LogicError("ScaleAndAdd: One of the input matrices is empty."); c.RequireSize(b.GetNumRows(), b.GetNumCols()); // if (a.GetNumRows() != 1 && a.GetNumCols() != 1) // a is not a col or row vector if (a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()) // dimensions match { /* const int m = (int)a.GetNumRows(); const int n = (int)a.GetNumCols(); const int len = m * n; const int incx = 1; const int incy = 1; assert (m>0 && n>0 && len>0); // converting from size_t to int may cause overflow */ CUDA_LONG N = (CUDA_LONG) c.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); c.PrepareDevice(); PROFILE_CUDA_STREAM("_matrixMatrixAddOnCuda", t_stream); _matrixMatrixAddOnCuda<<>>(alpha, a.Data(), b.Data(), c.Data(), N); } else if (a.GetNumElements() == 1) { CUDA_LONG N = (CUDA_LONG) c.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); c.PrepareDevice(); PROFILE_CUDA_STREAM("_scaleAndAddScalar", t_stream); _scaleAndAddScalar<<>>(c.Data(), N, alpha, a.Data(), b.Data()); } else if (a.GetNumCols() == 1) // col vector, add it to all columns { CUDA_LONG m = (CUDA_LONG) c.GetNumRows(); CUDA_LONG n = (CUDA_LONG) c.GetNumCols(); if (m != (CUDA_LONG) a.GetNumRows()) InvalidArgument("To add column vector, rows should match."); int blocksPerGrid = (int) (ceil(1.0 * m * n / GridDim::maxThreadsPerBlock)); PROFILE_CUDA_STREAM("_matrixVectorColumnWiseAddWithThreadPerElem", t_stream); _matrixVectorColumnWiseAddWithThreadPerElem<<>>(a.Data(), b.Data(), c.Data(), alpha, m, n); } else if (a.GetNumRows() == 1) // row vector, add it to all rows { CUDA_LONG m = (CUDA_LONG) c.GetNumRows(); CUDA_LONG n = (CUDA_LONG) c.GetNumCols(); if (m != (CUDA_LONG) a.GetNumRows()) InvalidArgument("To add column vector, rows should match."); int blocksPerGrid = (int) (ceil(1.0 * m * n / GridDim::maxThreadsPerBlock)); PROFILE_CUDA_STREAM("_matrixVectorRowWiseAddWithThreadPerElem", t_stream); _matrixVectorRowWiseAddWithThreadPerElem<<>>(a.Data(), b.Data(), c.Data(), alpha, m, n); } else InvalidArgument("Dimension of matrix c does not match dimension of matrix a."); } } /// c += alpha * (a-b) /// if a, b, c must have same dim /// Scalar /// Input matrix /// Input matrix /// Resulting matrix, user is responsible for allocating 
this template void GPUMatrix::AddScaledDifference(const ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) { if (a.GetComputeDeviceId() != c.GetComputeDeviceId()) { InvalidArgument("All matrices must be on the same GPU"); } else { a.PrepareDevice(); assert(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols()); if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols())) { InvalidArgument("AddScaledDifference: a, b, and c must have same dimension."); } if (a.IsEmpty()) LogicError("AddScaledDifference: Input matrix a is empty."); CUDA_LONG n = (CUDA_LONG) a.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * n / GridDim::maxThreadsPerBlock); PROFILE_CUDA_STREAM("_addScaledDifference", t_stream); _addScaledDifference<<>>(alpha, a.Data(), b.Data(), c.Data(), n); } } /// c = alpha * (a-b) /// if a, b, c must have same dim /// Scalar /// Input matrix /// Input matrix /// Resulting matrix, user is responsible for allocating this template void GPUMatrix::AssignScaledDifference(const ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) { if (a.GetComputeDeviceId() != c.GetComputeDeviceId()) { InvalidArgument("All matrices must be on the same GPU"); } else { a.PrepareDevice(); assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) InvalidArgument("AssignScaledDifference: a, b must have same dimension."); if (a.IsEmpty()) LogicError("AssignScaledDifference: Input matrix a is empty."); if (&c != &a && &c != &b) c.RequireSize(a.GetNumRows(), a.GetNumCols()); CUDA_LONG n = (CUDA_LONG) a.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * n / GridDim::maxThreadsPerBlock); PROFILE_CUDA_STREAM("_assignScaledDifference", t_stream); _assignScaledDifference<<>>(alpha, a.Data(), b.Data(), c.Data(), n); } } /// c += alpha * (a-b) /// if a, b, c must have same dim /// 1X1 matrix /// Input matrix /// Input matrix /// Resulting matrix, user is responsible for allocating this template void GPUMatrix::AddScaledDifference(const GPUMatrix& alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) { assert(alpha.GetNumElements() == 1); if (!(alpha.GetNumElements() == 1)) InvalidArgument("AddScaledDifference: alpha must be a 1X1 matrix."); if (a.GetComputeDeviceId() != c.GetComputeDeviceId()) { InvalidArgument("All matrices must be on the same GPU"); } else { a.PrepareDevice(); assert(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols()); if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols())) { InvalidArgument("AddScaledDifference: a, b, and c must have same dimension."); } if (a.IsEmpty()) LogicError("AddScaledDifference: Input matrix a is empty."); CUDA_LONG n = (CUDA_LONG) a.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * n / GridDim::maxThreadsPerBlock); PROFILE_CUDA_STREAM("_addScaledDifference", t_stream); _addScaledDifference<<>>(alpha.Data(), a.Data(), b.Data(), c.Data(), n); } } /// c = alpha * (a-b) /// if a, b, c must have same dim /// Scalar /// Input matrix /// Input matrix /// Resulting matrix, user is responsible for allocating this template void GPUMatrix::AssignScaledDifference(const GPUMatrix& alpha, 
const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) { assert(alpha.GetNumElements() == 1); if (!(alpha.GetNumElements() == 1)) InvalidArgument("AddScaledDifference: alpha must be a 1X1 matrix."); if (a.GetComputeDeviceId() != c.GetComputeDeviceId()) { InvalidArgument("All matrices must be on the same GPU"); } else { a.PrepareDevice(); assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) { InvalidArgument("AssignScaledDifference: a, b must have same dimension."); } if (a.IsEmpty()) LogicError("AssignScaledDifference: Input matrix a is empty."); c.RequireSize(a.GetNumRows(), a.GetNumCols()); CUDA_LONG n = (CUDA_LONG) a.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * n / GridDim::maxThreadsPerBlock); PROFILE_CUDA_STREAM("_assignScaledDifference", t_stream); _assignScaledDifference<<>>(alpha.Data(), a.Data(), b.Data(), c.Data(), n); } } //c[ci,cj] += a[ai,aj] template void GPUMatrix::AddElementToElement(ElemType beta, const GPUMatrix& a, const size_t ai, const size_t aj, GPUMatrix& c, const size_t ci, const size_t cj) { if (ai >= a.GetNumRows() || aj >= a.GetNumCols() || ci >= c.GetNumRows() || cj >= c.GetNumCols()) InvalidArgument("AddElementToElement: Index out of range."); a.PrepareDevice(); PROFILE_CUDA_STREAM("_addElementToElement", t_stream); _addElementToElement<<<1, 1, 0, t_stream>>>(beta, a.Data(), (CUDA_LONG) a.LocateElement(ai, aj), c.Data(), (CUDA_LONG) c.LocateElement(ci, cj)); } template /*static*/ void GPUMatrix::Scale(ElemType alpha, GPUMatrix& a) { if (alpha == 0) // if 0 then do not access the value, so that we can use this to multiply uninitialized matrices with beta=0 { CUDA_CALL(cudaMemset(a.Data(), 0, a.m_numRows * a.m_numCols * sizeof(ElemType))); return; } cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); // TODO: Overload the call to cublas_axpy to remove these ugly if/else statements. 
if (sizeof(ElemType) == sizeof(float)) { float alph = (float) alpha; PROFILE_CUDA_STREAM("cublasSscal", t_stream); CUBLAS_CALL(cublasSscal(cuHandle, int(a.m_numRows * a.m_numCols), &alph, (float*) a.Data(), 1)); } else if (sizeof(ElemType) == sizeof(double)) { double alph = alpha; PROFILE_CUDA_STREAM("cublasDscal", t_stream); CUBLAS_CALL(cublasDscal(cuHandle, int(a.m_numRows * a.m_numCols), &alph, (double*) a.Data(), 1)); } else { RuntimeError("Unsupported template argument in GPUMatrix"); } } template /*static*/ void GPUMatrix::Scale(GPUMatrix& alpha, GPUMatrix& a) { if (alpha.GetNumElements() != 1) { RuntimeError("Matrix alpha must be 1x1"); } cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); if (sizeof(ElemType) == sizeof(float)) { PROFILE_CUDA_STREAM("cublasSscal", t_stream); CUBLAS_CALL(cublasSscal(cuHandle, int(a.m_numRows * a.m_numCols), (float*) alpha.Data(), (float*) a.Data(), 1)); } else if (sizeof(ElemType) == sizeof(double)) { PROFILE_CUDA_STREAM("cublasDscal", t_stream); CUBLAS_CALL(cublasDscal(cuHandle, int(a.m_numRows * a.m_numCols), (double*) alpha.Data(), (double*) a.Data(), 1)); } else { cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); RuntimeError("Unsupported template argument in GPUMatrix"); } cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); } template // c = alpha * a /*static*/ void GPUMatrix::Scale(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) { c = a; Scale(alpha, c); } template void GPUMatrix::InnerProduct(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c, const bool isColWise) { if (a.GetComputeDeviceId() != b.GetComputeDeviceId() || b.GetComputeDeviceId() != c.GetComputeDeviceId()) // different GPUs InvalidArgument("All matrices must be on the same GPU"); if (a.IsEmpty() || b.IsEmpty()) LogicError("Scale: one of the input matrices is empty."); const int m = (int) a.GetNumRows(); const int n = (int) a.GetNumCols(); const int k = (int) b.GetNumRows(); const int l = (int) b.GetNumCols(); assert(m > 0 && n > 0 && k > 0 && l > 0); // converting from size_t to int may cause overflow assert(m == k && n == l); // converting from size_t to int may cause overflow if (m != k || n != l) InvalidArgument("Matrices a and b should have same dimension."); if (isColWise) c.RequireSize(1, n); else c.RequireSize(m, 1); if ((isColWise && m == 1) || !isColWise && n == 1) // in this case it's equivalent to element-wise product { c.AssignElementProductOf(a, b); } else { c.PrepareDevice(); int blocksPerGrid = 0; if (isColWise) // col-wise { c.RequireSize(1, n); blocksPerGrid = (int) ceil(1.0 * n / GridDim::maxThreadsPerBlock); } else { c.RequireSize(m, 1); blocksPerGrid = (int) ceil(1.0 * m / GridDim::maxThreadsPerBlock); } PROFILE_CUDA_STREAM("_innerProduct", t_stream); _innerProduct<<>>(c.Data(), a.Data(), b.Data(), m, n, isColWise); } } template ElemType GPUMatrix::InnerProductOfMatrices(const GPUMatrix& a, const GPUMatrix& b) { if (a.IsEmpty() || b.IsEmpty()) LogicError("InnerProductOfMatrices: one of the input matrices is empty."); const int m = (int) a.GetNumRows(); const int n = (int) a.GetNumCols(); const int k = (int) b.GetNumRows(); const int l = (int) b.GetNumCols(); assert(m > 0 && n > 0 && k > 0 && l > 0); // converting from size_t to int may cause overflow assert(m == k && n == l); // converting from size_t to int may cause overflow if (m != k || n != l) InvalidArgument("InnerProductOfMatrices: Matrices a and b should have same dimension."); cublasHandle_t cuHandle = 
GetCublasHandle(a.GetComputeDeviceId()); if (sizeof(ElemType) == sizeof(double)) { double tmp = 0; PROFILE_CUDA_STREAM("cublasDdot", t_stream); CUBLAS_CALL(cublasDdot(cuHandle, m * n, reinterpret_cast(a.Data()), 1, reinterpret_cast(b.Data()), 1, &tmp)); return ElemType(tmp); // return (ElemType)ddot((int)a.GetNumElements(), reinterpret_cast (a.Data()), 1, reinterpret_cast (b.Data()), 1); } else { float tmp = 0; PROFILE_CUDA_STREAM("cublasSdot", t_stream); CUBLAS_CALL(cublasSdot(cuHandle, m * n, reinterpret_cast(a.Data()), 1, reinterpret_cast(b.Data()), 1, &tmp)); return tmp; // return (ElemType)sdot((int)a.GetNumElements(), reinterpret_cast (a.Data()), 1, reinterpret_cast (b.Data()), 1); } } template GPUMatrix& GPUMatrix::AssignInnerProductOfMatrices(const GPUMatrix& a, const GPUMatrix& b) { if (a.IsEmpty() || b.IsEmpty()) LogicError("InnerProductOfMatrices: one of the input matrices is empty."); RequireSize(1, 1); const int m = (int) a.GetNumRows(); const int n = (int) a.GetNumCols(); const int k = (int) b.GetNumRows(); const int l = (int) b.GetNumCols(); assert(m > 0 && n > 0 && k > 0 && l > 0); // converting from size_t to int may cause overflow assert(m == k && n == l); // converting from size_t to int may cause overflow if (m != k || n != l) InvalidArgument("InnerProductOfMatrices: Matrices a and b should have same dimension."); cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); if (sizeof(ElemType) == sizeof(double)) { PROFILE_CUDA_STREAM("cublasDdot", t_stream); CUBLAS_CALL(cublasDdot(cuHandle, m * n, reinterpret_cast(a.Data()), 1, reinterpret_cast(b.Data()), 1, reinterpret_cast(Data()))); } else { PROFILE_CUDA_STREAM("cublasSdot", t_stream); CUBLAS_CALL(cublasSdot(cuHandle, m * n, reinterpret_cast(a.Data()), 1, reinterpret_cast(b.Data()), 1, reinterpret_cast(Data()))); } cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); return *this; } template void GPUMatrix::ElementWisePower(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) { if (a.GetComputeDeviceId() != c.GetComputeDeviceId()) { InvalidArgument("All matrices must be on the same GPU"); } else { if (a.IsEmpty()) LogicError("ElementWisePower: The input matrix a is empty."); c.RequireSize(a.GetNumRows(), a.GetNumCols()); a.PrepareDevice(); CUDA_LONG N = (CUDA_LONG) a.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PROFILE_CUDA_STREAM("_elementWisePowerOnCuda", t_stream); _elementWisePowerOnCuda<<>>(alpha, a.Data(), c.Data(), N); } } template bool GPUMatrix::AreEqual(const GPUMatrix& a, const GPUMatrix& b, const ElemType threshold /*= 1e-8*/) { if (a.IsEmpty() || b.IsEmpty()) LogicError("AreEqual: one of the input matrices is empty."); if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols()) return false; bool bResult = false; long* res = new long[1]; res[0] = 1; long* d_res = TracingGPUMemoryAllocator::Allocate(a.GetComputeDeviceId(), 1); CUDA_CALL(cudaMemcpy(d_res, res, sizeof(long) * 1, cudaMemcpyHostToDevice)); CUDA_LONG N = (CUDA_LONG) a.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); { PROFILE_CUDA_STREAM("_areEqual", t_stream); _areEqual<<>>(a.Data(), b.Data(), N, threshold, d_res); } CUDA_CALL(cudaMemcpy(res, d_res, sizeof(long) * 1, cudaMemcpyDeviceToHost)); TracingGPUMemoryAllocator::Free(a.GetComputeDeviceId(), d_res); if (res[0] != 0) bResult = true; delete[] res; return bResult; } // see Matrix::TensorShuffleScaleAndAdd() for comments 
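// Like most element-wise kernels in this file, the launch below uses one thread per element:
// blocksPerGrid = ceil(N / GridDim::maxThreadsPerBlock). For example, with a typical
// maxThreadsPerBlock of 1024 (the actual value comes from GridDim, so this is only illustrative)
// and N = 100,000 elements, that is ceil(100000 / 1024) = 98 blocks; threads whose global index
// is >= N are expected to return immediately inside the kernel.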
template <class ElemType>
void GPUMatrix<ElemType>::TensorShuffleScaleAndAdd(ElemType keepWeight, const GPUMatrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c)
{
    CUDA_LONG N = (CUDA_LONG) c.GetNumElements();
    assert(N == (CUDA_LONG) a.GetNumElements() && N == (CUDA_LONG) b.GetNumElements());
    assert(a.GetComputeDeviceId() == c.GetComputeDeviceId() && b.GetComputeDeviceId() == c.GetComputeDeviceId());
    a.PrepareDevice();
    int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock);
    PROFILE_CUDA_STREAM("_tensorShuffleScaleAndAdd", t_stream);
    _tensorShuffleScaleAndAdd<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(keepWeight, a.Data(), D, S, M, K, T, scaleFactor, b.Data(), c.Data());
}

template <class ElemType>
bool GPUMatrix<ElemType>::HasElement(const GPUMatrix<ElemType>& a, const ElemType v)
{
    if (a.IsEmpty())
        LogicError("HasElement: the input matrix is empty.");

    bool bResult = false;
    ElemType* res = new ElemType[2];
    res[0] = v;
    res[1] = 0;
    ElemType* d_res = TracingGPUMemoryAllocator::Allocate<ElemType>(a.GetComputeDeviceId(), 2);
    CUDA_CALL(cudaMemcpy(d_res, res, sizeof(ElemType) * 2, cudaMemcpyHostToDevice));
    CUDA_LONG N = (CUDA_LONG) a.GetNumElements();
    int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock);
    {
        PROFILE_CUDA_STREAM("_hasElement", t_stream);
        _hasElement<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(a.Data(), N, d_res);
    }
    CUDA_CALL(cudaMemcpy(res, d_res, sizeof(ElemType) * 2, cudaMemcpyDeviceToHost));
    TracingGPUMemoryAllocator::Free<ElemType>(a.GetComputeDeviceId(), d_res);
    if (res[1] != 0)
        bResult = true;
    else
        bResult = false;

    delete[] res;
    return bResult;
}

template <class ElemType>
void GPUMatrix<ElemType>::CreateCurandObject(unsigned long seed, const char* caller)
{
    assert(caller != nullptr);

    if (s_curandGenerator == NULL)
    {
        unsigned long long cudaSeed = (seed == USE_TIME_BASED_SEED) ? time(NULL) : seed;
        fprintf(stderr, "%s (GPU): creating curand object with seed %llu, sizeof(ElemType)==%lu\n",
                caller, cudaSeed, (unsigned long) sizeof(ElemType));
        s_curandGenerator = new curandGenerator_t;
        // Create pseudo-random number generator
        CURAND_CALL(curandCreateGenerator(&(((curandGenerator_t*) s_curandGenerator)[0]), CURAND_RNG_PSEUDO_XORWOW));
        CURAND_CALL(curandSetPseudoRandomGeneratorSeed(((curandGenerator_t*) s_curandGenerator)[0], cudaSeed));
        CURAND_CALL(curandSetGeneratorOrdering(((curandGenerator_t*) s_curandGenerator)[0], CURAND_ORDERING_PSEUDO_SEEDED));
    }
}

template <class ElemType>
void GPUMatrix<ElemType>::ResetCurandObject(unsigned long seed, const char* caller)
{
    assert(caller != nullptr);

    if (s_curandGenerator && (seed != USE_TIME_BASED_SEED))
    {
        // Note: this might be slow.
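        // Re-seeding path: curandSetPseudoRandomGeneratorSeed resets the XORWOW state to the
        // given seed and curandSetGeneratorOffset(..., 0) rewinds the sequence to its start, so
        // repeated resets with the same fixed seed reproduce the same random stream. If no
        // generator exists yet, or a time-based seed was requested, we fall back to
        // CreateCurandObject (defined above) in the else branch.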
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(((curandGenerator_t*) s_curandGenerator)[0], seed)); CURAND_CALL(curandSetGeneratorOffset(((curandGenerator_t*) s_curandGenerator)[0], 0)); } else { CreateCurandObject(seed, caller); } } template GPUMatrix GPUMatrix::Ones(const size_t rows, const size_t cols, int deviceId) { GPUMatrix c(rows, cols, deviceId); // will initialize to 0 c.SetValue(1); return c; } template GPUMatrix GPUMatrix::Zeros(const size_t rows, const size_t cols, int deviceId) { GPUMatrix c(rows, cols, deviceId); // will initialize to 0 // c.SetValue(0); return c; } template GPUMatrix GPUMatrix::Eye(const size_t rows, int deviceId) { GPUMatrix c(rows, rows, deviceId); // will initialize to 0 c.SetDiagonalValue(1); return c; } template GPUMatrix GPUMatrix::RandomUniform(const size_t rows, const size_t cols, int deviceId, const ElemType low, const ElemType high, unsigned long seed) { GPUMatrix c(rows, cols, deviceId); // will initialize to 0 c.SetUniformRandomValue(low, high, seed); return c; } template GPUMatrix GPUMatrix::RandomGaussian(const size_t rows, const size_t cols, int deviceId, const ElemType mean, const ElemType sigma, unsigned long seed) { GPUMatrix c(rows, cols, deviceId); // will initialize to 0 c.SetGaussianRandomValue(mean, sigma, seed); return c; } template ElemType GPUMatrix::GetLearnRateForBlock_Helper(const GPUMatrix& Gradients, const GPUMatrix& SmoothedGradients) { ElemType* d_res = TracingGPUMemoryAllocator::Allocate(Gradients.GetComputeDeviceId(), 1); // Compute inner product of matrices and keep it on device const int m = (int) Gradients.GetNumRows(); const int n = (int) Gradients.GetNumCols(); const int k = (int) SmoothedGradients.GetNumRows(); const int l = (int) SmoothedGradients.GetNumCols(); assert(m > 0 && n > 0 && k > 0 && l > 0); // converting from size_t to int may cause overflow assert(m == k && n == l); // converting from size_t to int may cause overflow if (m != k || n != l) InvalidArgument("InnerProductOfMatrices: Matrices a and b should have same dimension."); if (sizeof(ElemType) == sizeof(double)) { cublasHandle_t cuHandle = GetCublasHandle(Gradients.GetComputeDeviceId()); cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); { PROFILE_CUDA_STREAM("cublasDdot", t_stream); CUBLAS_CALL(cublasDdot(cuHandle, m * n, reinterpret_cast(Gradients.Data()), 1, reinterpret_cast(SmoothedGradients.Data()), 1, reinterpret_cast(d_res))); } cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); } else { cublasHandle_t cuHandle = GetCublasHandle(Gradients.GetComputeDeviceId()); cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); { PROFILE_CUDA_STREAM("cublasSdot", t_stream); CUBLAS_CALL(cublasSdot(cuHandle, m * n, reinterpret_cast(Gradients.Data()), 1, reinterpret_cast(SmoothedGradients.Data()), 1, reinterpret_cast(d_res))); } cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); } // d_res[0] should now contain inner product of matrices // Compute squared Frobenius norms (squared sums of elements) { PROFILE_CUDA_STREAM("_lrHelper", t_stream); _lrHelper<<<1, 512, 0, t_stream>>>(Gradients.Data(), SmoothedGradients.Data(), (CUDA_LONG) Gradients.GetNumElements(), d_res); } ElemType res; CUDA_CALL(cudaMemcpy(&res, d_res, sizeof(ElemType), cudaMemcpyDeviceToHost)); TracingGPUMemoryAllocator::Free(Gradients.GetComputeDeviceId(), d_res); return res; } // The inputs are two row vectors [a1 a2 a3 a4] [b1 b2 b3 b4] // The outputs are one matrix of size (nt+1)*4 // The first row is just element multiplication // The rest rows will be with 
shift template GPUMatrix& GPUMatrix::AssignElementProductOfWithShiftNeg(const GPUMatrix& a, const GPUMatrix& b, const size_t shift, const size_t nt) { if (a.IsEmpty() || b.IsEmpty()) LogicError("AssignElementProductOf: Matrix is empty."); assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) InvalidArgument("The input matrix dimensions do not match."); if (!(a.GetNumRows() == 1)) InvalidArgument("The input matrix must be a row vector."); RequireSize(nt + 1, a.GetNumCols()); int BS = a.GetNumCols(); // the output matrix is of size (nt+1, BS) dim3 thread_tail(DEFAULT_THREAD_PER_DIM, DEFAULT_THREAD_PER_DIM); dim3 block_tail((nt + 1 + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM, (BS + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM); a.PrepareDevice(); PROFILE_CUDA_STREAM("_assignElementProductOfWithShiftNeg", t_stream); _assignElementProductOfWithShiftNeg<<>>(Data(), a.Data(), b.Data(), shift, nt + 1, BS); // _assignElementProductOf << > >(Data(), a.Data(), b.Data(), nt); return *this; } template void GPUMatrix::InnerProductWithShiftNeg(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c, const size_t shift, const size_t nt) { if (a.GetComputeDeviceId() != b.GetComputeDeviceId() || b.GetComputeDeviceId() != c.GetComputeDeviceId()) // different GPUs InvalidArgument("All matrices must be on the same GPU"); if (a.IsEmpty() || b.IsEmpty()) LogicError("Scale: one of the input matrices is empty."); const int m = (int) a.GetNumRows(); const int n = (int) a.GetNumCols(); const int k = (int) b.GetNumRows(); const int l = (int) b.GetNumCols(); assert(m > 0 && n > 0 && k > 0 && l > 0); // converting from size_t to int may cause overflow assert(m == k && n == l); // converting from size_t to int may cause overflow if (m != k || n != l) InvalidArgument("Matrices a and b should have same dimension."); c.RequireSize(nt + 1, n); if (true) { c.PrepareDevice(); dim3 thread_tail(DEFAULT_THREAD_PER_DIM, DEFAULT_THREAD_PER_DIM); dim3 block_tail((nt + 1 + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM, (n + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM); PROFILE_CUDA_STREAM("_innerProductWithShiftNeg", t_stream); _innerProductWithShiftNeg<<>>(c.Data(), a.Data(), b.Data(), m, n, shift, nt + 1); } } template GPUMatrix& GPUMatrix::GetARowByIndex(const GPUMatrix& a, const size_t m) { if (a.IsEmpty()) LogicError("GetARowByIndex: Matrix is empty."); RequireSize(1, a.GetNumCols()); int n = a.GetNumRows(); int P = a.GetNumCols(); if (m >= n) LogicError("GetARowByIndex: m is out of range."); int blocksPerGrid = (int) ceil(((double) P) / GridDim::maxThreadsPerBlock); a.PrepareDevice(); PROFILE_CUDA_STREAM("_getARowByIndex", t_stream); _getARowByIndex<<>>(Data(), a.Data(), n, P, m); // _assignElementProductOf << > >(Data(), a.Data(), b.Data(), nt); return *this; } template void GPUMatrix::ConductRowElementMultiplyWithShift(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c, const size_t shift, const bool isafixed) { if (a.GetComputeDeviceId() != b.GetComputeDeviceId() || b.GetComputeDeviceId() != c.GetComputeDeviceId()) // different GPUs InvalidArgument("All matrices must be on the same GPU"); if (a.IsEmpty() || b.IsEmpty()) LogicError("Scale: one of the input matrices is empty."); const int m = (int) a.GetNumRows(); const int n = (int) a.GetNumCols(); const int O = (int) b.GetNumRows(); const int P = (int) b.GetNumCols(); assert(m > 0 && n > 0 && O > 0 && P > 0); // converting from size_t to int 
may cause overflow if (m != 1 || n != P) InvalidArgument("Matrices a and b should have same dimension."); c.RequireSize(O, P); if (true) { c.PrepareDevice(); dim3 thread_tail(DEFAULT_THREAD_PER_DIM, DEFAULT_THREAD_PER_DIM); dim3 block_tail((O + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM, (P + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM); PROFILE_CUDA_STREAM("_conductRowElementMultiplyWithShift", t_stream); _conductRowElementMultiplyWithShift<<>>(c.Data(), a.Data(), b.Data(), O, P, shift, isafixed); } } template GPUMatrix& GPUMatrix::AssignElementProductOfWithShift(const GPUMatrix& a, const GPUMatrix& b, const size_t shift) { if (a.IsEmpty() || b.IsEmpty()) LogicError("AssignElementProductOfWithShift: Matrix is empty."); assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) InvalidArgument("The input matrix dimensions do not match."); // int O = a.GetNumRows(); int P = a.GetNumCols(); RequireSize(1, P); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(((double) N) / GridDim::maxThreadsPerBlock); a.PrepareDevice(); PROFILE_CUDA_STREAM("_assignElementProductOfWithShift", t_stream); _assignElementProductOfWithShift<<>>(Data(), a.Data(), b.Data(), shift, N); return *this; } //sequence training template GPUMatrix& GPUMatrix::DropFrame(const GPUMatrix& label, const GPUMatrix& gamma, const ElemType& threshhold) { if (IsEmpty()) LogicError("DropFrame: Matrix is empty."); PrepareDevice(); long N = (long) GetNumCols(); // one kernel per column int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock); PROFILE_CUDA_STREAM("_DropFrame", t_stream); _DropFrame<<>>(Data(), label.Data(), gamma.Data(), threshhold, (long) m_numCols, (long) m_numRows); return *this; } template GPUMatrix& GPUMatrix::AssignSequenceError(const ElemType hsmoothingWeight, const GPUMatrix& label, const GPUMatrix& dnnoutput, const GPUMatrix& gamma, ElemType alpha) { if (IsEmpty()) LogicError("AssignSequenceError: Matrix is empty."); PrepareDevice(); long N = (LONG64) label.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PROFILE_CUDA_STREAM("_AssignSequenceError", t_stream); _AssignSequenceError<<>>(hsmoothingWeight, Data(), label.Data(), dnnoutput.Data(), gamma.Data(), alpha, N); return *this; } #pragma endregion Static BLAS Functions /// f = logadd(f, vec) to get the logadd sum of vector elments template ElemType GPUMatrix::LogSumOfElements() const { if (IsEmpty()) LogicError("SumOfElements: Matrix is empty"); ElemType* d_sum = TracingGPUMemoryAllocator::Allocate(GetComputeDeviceId(), 1); ElemType h_sum; CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(((double) N) / GridDim::maxThreadsPerBlock); { // BUGBUG: Should this use CUDA streams? 
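        // _reductionLogAddSum reduces all N elements in log space, i.e. it computes
        // log(sum_i exp(x_i)) rather than a plain sum, which is what the "logadd" in the doc
        // comment above refers to; the scalar result lands in d_sum and is copied back to the
        // host right after the launch.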
PROFILE_CUDA("_reductionLogAddSum"); _reductionLogAddSum<<>>(Data(), d_sum, 1, N); } CUDA_CALL(cudaMemcpy(&h_sum, d_sum, sizeof(ElemType), cudaMemcpyDeviceToHost)); TracingGPUMemoryAllocator::Free(GetComputeDeviceId(), d_sum); return h_sum; } template void GPUMatrix::RCRFBackwardCompute( const GPUMatrix& alpha, GPUMatrix& beta, const GPUMatrix& /*lbls*/, const GPUMatrix& pos_scores, const GPUMatrix& pair_scores, const int shift) { if (alpha.IsEmpty() || pos_scores.IsEmpty() || pair_scores.IsEmpty()) LogicError("RCRFBackwardCompute: one of the input matrices is empty."); if (alpha.GetNumRows() != pos_scores.GetNumRows() || alpha.GetNumCols() != pos_scores.GetNumCols()) LogicError("RCRFBackwardCompute: matrix dimensions mismatched."); size_t iNumLab = alpha.GetNumRows(); size_t iNumPos = alpha.GetNumCols(); alpha.PrepareDevice(); beta.RequireSize(iNumLab, iNumPos); ElemType* d_zeta = TracingGPUMemoryAllocator::Allocate(alpha.GetComputeDeviceId(), iNumLab); CUDA_LONG N = iNumLab; int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); size_t szMemSize; for (int t = iNumPos - 1; t >= 0; t--) { szMemSize = sizeof(ElemType) * iNumLab; { // BUGBUG: Should this use CUDA streams? PROFILE_CUDA("_rcrfBackwardComputeZeta"); _rcrfBackwardComputeZeta<<>>(t, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, shift); } szMemSize = iNumLab * 3; szMemSize *= sizeof(ElemType); { // BUGBUG: Should this use CUDA streams? PROFILE_CUDA("_rcrfBackwardCompute"); _rcrfBackwardCompute<<>>(t, iNumPos, alpha.Data(), beta.Data(), d_zeta, pair_scores.Data(), iNumLab, shift); } } /* error = cudaGetErrorString(cudaPeekAtLastError()); printf("%s\n", error); error = cudaGetErrorString(cudaThreadSynchronize()); printf("%s\n", error); */ TracingGPUMemoryAllocator::Free(alpha.GetComputeDeviceId(), d_zeta); } /** Compute the gradient for the first order Markov transition probabilities It uses equations derived in R. Collobert's paper "Natural language processing (almost) from scratch" */ template void GPUMatrix::RCRFTransGrdCompute(const GPUMatrix& lbls, const GPUMatrix& alpha, const GPUMatrix& beta, const GPUMatrix& pair_scores, GPUMatrix& grd, const int startLbl, const int shift) { assert(shift == 1); int iNumPos = alpha.GetNumCols(); int iNumLab = alpha.GetNumRows(); ElemType* d_zeta = TracingGPUMemoryAllocator::Allocate(alpha.GetComputeDeviceId(), iNumLab); CUDA_LONG N = iNumLab; int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); size_t szMemSize; for (int t = 0; t < iNumPos; t++) { szMemSize = sizeof(ElemType) * iNumLab; { // BUGBUG: Should this use CUDA streams? PROFILE_CUDA("_rcrfTransGrdComputeZeta"); _rcrfTransGrdComputeZeta<<>>(t - 1, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, startLbl, shift); } szMemSize = iNumLab * 3; szMemSize *= sizeof(ElemType); { // BUGBUG: Should this use CUDA streams? PROFILE_CUDA("_rcrfTransGrdCompute"); _rcrfTransGrdCompute<<>>(t, startLbl, alpha.Data(), beta.Data(), d_zeta, pair_scores.Data(), lbls.Data(), grd.Data(), iNumPos, iNumLab, shift); } } TracingGPUMemoryAllocator::Free(alpha.GetComputeDeviceId(), d_zeta); }; // ----------------------------------------------------------------------- // TensorView entry points from Matrix.cpp // ----------------------------------------------------------------------- // helper to provide a vector of ones of at least the given number of elements // TODO: Use this to implement ComputationNode::ConstOnes? Or do we even need that anymore? 
template static shared_ptr> GetOnesVector(size_t N, DEVICEID_TYPE deviceId) { // using an array of shared_ptrs because those are thread-safe. The objects themselves are immutable. // And using a plain array so this will never get freed, avoiding free-after-DLL-unload issues. static shared_ptr> onesCache[32]; // cache of objects if (deviceId >= _countof(onesCache)) LogicError("GetOnesVector: onesCache[] too small (%d entries), increase (you need %d) and recompile.", (int) _countof(onesCache), (int) deviceId + 1); auto p = onesCache[deviceId]; if (!p || p->GetNumRows() < N) // must (re-)allocate { p = make_shared>(GPUMatrix::Ones(N, 1, deviceId)); onesCache[deviceId] = p; // this will replace the pointer thread-safely (although weird race conditions may happen where a larger entry is overwritten by a smaller one; will still run correctly) } return p; } // perform unary operation 'op' on a giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides // This binds the N-ariness to a template parameter N, and gets the data pointers out from the matrix objects. template void GPUMatrix::TensorOp(ElemType beta, const GPUMatrix& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, const array& offsets, const SmallVector& regularOpDims, const array, 2>& regularStrides, const SmallVector& reducingOpDims, const array, 2>& reducingStrides) { if (reductionOp != ElementWiseOperator::opSum) // TODO: enable the reduction ops InvalidArgument("TensorOp: Unary reduction operations other than opSum not yet implemented."); a.PrepareDevice(); if (a.GetComputeDeviceId() != GetComputeDeviceId()) InvalidArgument("All matrices must be on the same GPU"); // special case: linear processing // The case statement has measurable impact for unary ops (but not for binary ops it seems, due to double mem access). // Linear gap-free unary ops happen so regularly that we will eliminate the case statement from the CUDA kernel, and instead expand all. if (regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && reducingOpDims.size() == 0) { // special case: for copy, use cudaMemcpy() instead, or cublas_axpy() // TODO: We should observe if these actually make a speed difference, and if not, remove these special cases. if (op == ElementWiseOperator::opCopy && beta == 0 && alpha == 1) return CUDA_CALL(cudaMemcpy(Data()+ offsets[1], a.Data()+ offsets[0], sizeof(ElemType) * regularOpDims[0], cudaMemcpyDeviceToDevice)); else if (op == ElementWiseOperator::opCopy && beta == 1) { cublasHandle_t handle = GetCublasHandle(GetComputeDeviceId()); PROFILE_CUDA_STREAM("cublas_axpy", t_stream); CUBLAS_CALL(cublas_axpy(handle, (int)regularOpDims[0], &alpha, a.Data() + offsets[0], 1, Data() + offsets[1], 1)); return; } else return LaunchUnaryTensorOp(beta, a.Data()+ offsets[0], Data()+ offsets[1], alpha, op, regularOpDims[0]); } // special case: reducing a matrix onto a column vector; can be done with SGEMM // Note: A minor risk is that with this, our own reduction function will rarely be used. // That function was tested to give the same results with 'double', and nearly the same with 'float' (different summation order matters). 
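    // The GEMM special case below exploits the identity that summing the columns of an
    // (ARows x ACols) block equals multiplying it by an ACols x 1 vector of ones:
    //
    //   this = beta * this + alpha * A * ones(ACols, 1)
    //
    // so a single cublas_gemm call with the cached GetOnesVector() column as the right-hand
    // operand performs the whole reduction.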
else if (op == ElementWiseOperator::opCopy && // we are just adding to target without any further operation #ifdef _DEBUG sizeof(ElemType) == sizeof(float) && // in debug don't shortcut 'double' so we have some test of our own codepath #endif regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && // we are processing a column reducingOpDims.size() == 1 && reducingStrides[0][0] >= (ptrdiff_t) regularOpDims[0]) // reducing across columns and no overlap { assert(reducingStrides[1][0] == 0); auto ARows = regularOpDims[0]; // vertical steps auto ACols = reducingOpDims[0]; // horizontal steps (reduction) auto ALd = reducingStrides[0][0]; // horizontal step width through matrix cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); PROFILE_CUDA_STREAM("cublas_gemm", t_stream); CUBLAS_CALL(cublas_gemm(cuHandle, CUBLAS_OP_N, CUBLAS_OP_N, (int) /*CRows=*/ARows, /*CCols=*/1, (int) ACols, &alpha, /*A00=*/a.Data()+ offsets[0], (int) ALd, /*B00=*/GetOnesVector(ACols, a.GetComputeDeviceId())->Data(), (int) /*BRows=*/ACols, &beta, /*C00=*/Data()+ offsets[1], (int) /*CRows=*/ARows)); return; } // TODO: Add a special case for tensor bias reduction. cudnn is ~7% faster on Image/QuickE2E. // regular case else return TensorOpN(beta, array{a.Data(), Data()}, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } // perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides template void GPUMatrix::TensorOp(ElemType beta, const GPUMatrix& a, const GPUMatrix& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, const array& offsets, const SmallVector& regularOpDims, const array, 3>& regularStrides, const SmallVector& reducingOpDims, const array, 3>& reducingStrides) { if (reductionOp != ElementWiseOperator::opSum) InvalidArgument("TensorOp: The only permitted binary reduction operation is opSum."); a.PrepareDevice(); if (a.GetComputeDeviceId() != GetComputeDeviceId() || b.GetComputeDeviceId() != GetComputeDeviceId()) InvalidArgument("All matrices must be on the same GPU"); return TensorOpN(beta, array{a.Data(), b.Data(), Data()}, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } // perform ternary operation 'op' on a, and c giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides template void GPUMatrix::TensorOp(ElemType beta, const GPUMatrix& a, const GPUMatrix& b, const GPUMatrix& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, const array& offsets, const SmallVector& regularOpDims, const array, 4>& regularStrides, const SmallVector& reducingOpDims, const array, 4>& reducingStrides) { if (reductionOp != ElementWiseOperator::opSum) InvalidArgument("TensorOp: The only permitted ternary reduction operation is opSum."); a.PrepareDevice(); if (a.GetComputeDeviceId() != GetComputeDeviceId() || b.GetComputeDeviceId() != GetComputeDeviceId() || c.GetComputeDeviceId() != GetComputeDeviceId()) InvalidArgument("All matrices must be on the same GPU"); return TensorOpN(beta, array{a.Data(), b.Data(), c.Data(), Data()}, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } // ======================================================================= // explicit instantiations business // ======================================================================= template class GPUMatrix; template class GPUMatrix; template class 
DeviceBoundNumber<float>;
template class DeviceBoundNumber<double>;

template <class ElemType>
cublasHandle_t GPUMatrix<ElemType>::s_cuHandle[GPUMatrix<ElemType>::MaxGpus] = {0};

template <class ElemType>
void* GPUMatrix<ElemType>::s_curandGenerator = NULL;

// We use Matrix<char> as the backing store for QuantizedMatrix
// Let's explicitly instantiate the methods we need for that purpose
template GPUMatrix<char>::GPUMatrix(const size_t numRows, const size_t numCols, int deviceId);
template GPUMatrix<char>::GPUMatrix(const size_t numRows, const size_t numCols, int deviceId, char* pArray, const size_t matrixFlags);
template GPUMatrix<char>::GPUMatrix(const GPUMatrix<char>&);
template GPUMatrix<char>::GPUMatrix(GPUMatrix<char>&&);
template char* GPUMatrix<char>::CopyToArray() const;
template void GPUMatrix<char>::ChangeDeviceTo(int);
template void GPUMatrix<char>::Resize(size_t, size_t, bool);
template void GPUMatrix<char>::RequireSize(size_t, size_t, bool);
template GPUMatrix<char>::~GPUMatrix();
template GPUMatrix<char> GPUMatrix<char>::ColumnSlice(size_t startColumn, size_t numCols) const;
template GPUMatrix<char>& GPUMatrix<char>::operator=(GPUMatrix<char>&&);
template GPUMatrix<char>::GPUMatrix(int);
template void GPUMatrix<char>::SetValue(const char);
template void GPUMatrix<char>::SetValue(const size_t numRows, const size_t numCols, int deviceId, char* pArray, size_t matrixFlags);
template void GPUMatrix<char>::SetValue(GPUMatrix<char> const&);

template GPUMatrix<int>::GPUMatrix(const size_t, const size_t, int, int*, const size_t);
template GPUMatrix<int>::~GPUMatrix();

template int* TracingGPUMemoryAllocator::Allocate<int>(int, size_t);
template size_t* TracingGPUMemoryAllocator::Allocate<size_t>(int, size_t);
template long* TracingGPUMemoryAllocator::Allocate<long>(int, size_t);
template char* TracingGPUMemoryAllocator::Allocate<char>(int, size_t);
template float* TracingGPUMemoryAllocator::Allocate<float>(int, size_t);
template double* TracingGPUMemoryAllocator::Allocate<double>(int, size_t);

template void TracingGPUMemoryAllocator::Free(int, int*, bool);
template void TracingGPUMemoryAllocator::Free(int, size_t*, bool);
template void TracingGPUMemoryAllocator::Free(int, char*, bool);
template void TracingGPUMemoryAllocator::Free(int, float*, bool);
template void TracingGPUMemoryAllocator::Free(int, double*, bool);

}}}
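// Note on the explicit instantiations above: only GPUMatrix<float> and GPUMatrix<double> are
// instantiated as whole classes; for the char and int element types only the members listed
// above exist, so code such as the (hypothetical) snippet below compiles but fails at link time
// if it touches anything else:
//
//   GPUMatrix<char> q(16, 16, /*deviceId=*/0); // OK: constructor instantiated above
//   q.SetValue('x');                           // OK: SetValue(const char) instantiated above
//   // q.InplaceSqrt();                        // would not link: not instantiated for char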
// !!!!This is from helper_cuda.h which comes with CUDA samples!!!! Consider if it is beneficial to just include all helper_cuda.h
// TODO: This is duplicated in BestGpu.cpp

// Beginning of GPU Architecture definitions
int _ConvertSMVer2Cores(int major, int minor)
{
    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)
    typedef struct
    {
        int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
        int Cores;
    } sSMtoCores;

    sSMtoCores nGpuArchCoresPerSM[] = {
        {0x10, 8},   // Tesla Generation (SM 1.0) G80 class
        {0x11, 8},   // Tesla Generation (SM 1.1) G8x class
        {0x12, 8},   // Tesla Generation (SM 1.2) G9x class
        {0x13, 8},   // Tesla Generation (SM 1.3) GT200 class
        {0x20, 32},  // Fermi Generation (SM 2.0) GF100 class
        {0x21, 48},  // Fermi Generation (SM 2.1) GF10x class
        {0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
        {0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
        {-1, -1}};

    int index = 0;
    while (nGpuArchCoresPerSM[index].SM != -1)
    {
        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
        {
            return nGpuArchCoresPerSM[index].Cores;
        }
        index++;
    }
    return nGpuArchCoresPerSM[7].Cores;
};
// end of GPU Architecture definitions

//inline CUDA_LONG _GetFreeMemoryOnCUDADevice(int devId)
//{
//    CUdevice cudaDevice;
//    CUresult result = cuDeviceGet(&cudaDevice, devId);
//    if (result != CUDA_SUCCESS)
//    {
//        return 0;
//    }
//
//    // create cuda context
//    CUcontext cudaContext;
//    result = cuCtxCreate(&cudaContext, CU_CTX_SCHED_AUTO, cudaDevice);
//    if (result != CUDA_SUCCESS)
//    {
//        return 0;
//    }
//
//    // get the amount of free memory on the graphics card
//    size_t free;
//    size_t total;
//    result = cuMemGetInfo(&free, &total);
//    if (result != CUDA_SUCCESS)
//    {
//        return 0;
//    }
//    else
//        return (CUDA_LONG) free;
//}

#endif // CPUONLY