//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include "Basics.h"
#include "BestGpu.h"

#ifndef CPUONLY

#include "GPUMatrix.h"
#include "GPUMatrixCUDAKernels.cuh"
//#include "GPUSparseMatrix.h"
#include "GPUTensor.h"
#include "CommonMatrix.h"
#define TENSOR_OPS_DECL __device__ __host__
#include "TensorOps.h"
#include "device_launch_parameters.h"
// NOTE: the names of the angle-bracket system includes were lost in extraction; the headers
// below are an assumed reconstruction covering what this translation unit's code requires.
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand.h>
#include <curand_kernel.h>
#include <assert.h>
#include "cublas_v2.h"
#include <memory>
#include <vector>
#include "CntkBatchNormalization.cuh"
#include "Convolution.cuh"
#include "CuDnnRNN.h"

#pragma comment(lib, "cudart.lib") // instruct linker to reference these libs
#pragma comment(lib, "cublas.lib")
#pragma comment(lib, "cusparse.lib")
#pragma comment(lib, "curand.lib")

#pragma warning(disable : 4267) // conversion from 'size_t' to 'unsigned int'; happens in CUDA <<<a, b>>> syntax if a and b are size_t
#pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
#pragma warning(disable : 4702) // unreachable code; triggered for unknown reasons

#define DEFAULT_THREAD_PER_DIM 16

#define UNCONST(t, c, uc) GPUMatrix<t>& uc = const_cast<GPUMatrix<t>&>(c);

#ifdef _WIN32
// thread local storage to access the current stream, initialize to default stream
__declspec(thread)
#endif
cudaStream_t t_stream = cudaStreamDefault;

extern int _ConvertSMVer2Cores(int major, int minor); // forward declaration

// SetStream - set the stream that will be used by the GPU routines
void MATH_API SetStream(cudaStream_t stream)
{
    t_stream = stream;
}

// GetStream - get the stream that will be used by the GPU routines
cudaStream_t MATH_API GetStream()
{
    return t_stream;
}

// Helper macro patterns for elementwise methods
#define DEF_ELEMWISE_INPLACE_FUNC(f)                                    \
    template <class ElemType>                                           \
    GPUMatrix<ElemType>& GPUMatrix<ElemType>::Inplace##f()              \
    {                                                                   \
        performElementWiseFunction(ElementWiseOperator::op##f, Data()); \
        return *this;                                                   \
    }
#define DEF_ELEMWISE_ASSIGN_FUNC(f)                                                       \
    template <class ElemType>                                                             \
    GPUMatrix<ElemType>& GPUMatrix<ElemType>::Assign##f##Of(const GPUMatrix<ElemType>& a) \
    {                                                                                     \
        if (a.IsEmpty())                                                                  \
            LogicError("Assign##f##Of: Matrix a is empty.");                              \
        if (this != &a)                                                                   \
            RequireSize(a.GetNumRows(), a.GetNumCols());                                  \
        performElementWiseFunction(ElementWiseOperator::op##f, a.Data());                 \
        return *this;                                                                     \
    }

template <>
const char* CudaErrString<cudaError_t>(cudaError_t x)
{
    cudaDeviceSynchronize();
    return cudaGetErrorString(x);
}
template <>
const char* CudaErrString<cublasStatus_t>(cublasStatus_t e)
{
    cudaDeviceSynchronize();
    switch (e)
    {
    case CUBLAS_STATUS_SUCCESS:          return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:  return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:     return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:    return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:    return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:    return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:   return "CUBLAS_STATUS_INTERNAL_ERROR";
    case CUBLAS_STATUS_NOT_SUPPORTED:    return "CUBLAS_STATUS_NOT_SUPPORTED";
    case CUBLAS_STATUS_LICENSE_ERROR:    return "CUBLAS_STATUS_LICENSE_ERROR";
    default:                             return "(look for CUBLAS_STATUS_xxx in cublas_api.h)";
    }
}
template <>
const char* CudaErrString<curandStatus>(curandStatus)
{
    cudaDeviceSynchronize();
    return "(see curand.h & look for curandStatus or CURAND_STATUS_xxx)";
}

namespace Microsoft { namespace MSR { namespace CNTK {
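// For illustration only: a sketch of the code DEF_ELEMWISE_INPLACE_FUNC generates when
// instantiated as DEF_ELEMWISE_INPLACE_FUNC(Sigmoid) (mechanical expansion of the macro
// above, shown in a comment so it is not compiled a second time):
//
//   template <class ElemType>
//   GPUMatrix<ElemType>& GPUMatrix<ElemType>::InplaceSigmoid()
//   {
//       performElementWiseFunction(ElementWiseOperator::opSigmoid, Data());
//       return *this;
//   }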
/*static*/ std::vector<cudaDeviceProp> GridDim::s_cachedDeviceProps;
/*static*/ std::once_flag GridDim::s_cachedDevicePropsInitFlag;

/*static*/ bool SyncGuard::s_isSyncEnabled = false;

/*static*/ void SyncGuard::EnableSync()
{
    s_isSyncEnabled = true;
}

/*static*/ bool SyncGuard::IsSyncEnabled()
{
    return s_isSyncEnabled;
}

SyncGuard::SyncGuard(bool forceSync /*= false*/)
    : m_forceSync(forceSync)
{
    m_done = nullptr;
    if (m_forceSync || s_isSyncEnabled)
    {
        CUDA_CALL(cudaGetLastError());
        CUDA_CALL(cudaEventCreate(&m_done));
    }
}

SyncGuard::~SyncGuard()
{
    if (m_forceSync || s_isSyncEnabled)
    {
        // The regular use of this destructor is to synchronize the GPU, but also
        // to check for errors. So this destructor is where CUDA errors would be thrown.
        // If this destructor runs during stack unwinding, then a different error has
        // already happened that should be reported; so we only clean up the resource.
        if (std::uncaught_exception())
            cudaEventDestroy(m_done);
        else
        {
            // failures in a prior launch might be reported here
            CUDA_CALL(cudaEventRecord(m_done));
            CUDA_CALL(cudaEventSynchronize(m_done));
            CUDA_CALL(cudaEventDestroy(m_done));
        }
    }
}

template <typename AllocatedElemType>
AllocatedElemType* TracingGPUMemoryAllocator::Allocate(int deviceId, size_t numRows, size_t numCols)
{
    if (IsTraceEnabled())
    {
        auto freeAndTotalMemory = GetFreeAndTotalMemoryInMBs(deviceId);
        fprintf(stderr, "Allocating Matrix<%s> (Rows = %d, Cols = %d) buffer on DeviceId = %d; GPU Memory Free = %d MB of %d MB\n",
                typeid(AllocatedElemType).name(), (int)numRows, (int)numCols, (int)deviceId, (int)freeAndTotalMemory.first, (int)freeAndTotalMemory.second);
        Microsoft::MSR::CNTK::DebugUtil::PrintCallStack();
    }

    AllocatedElemType* deviceBufferPtr = AllocateNoTrace<AllocatedElemType>(deviceId, numRows * numCols);

    if (IsTraceEnabled())
    {
        fprintf(stderr, "Allocated DeviceData = %p\n", (void*) deviceBufferPtr);
    }

    return deviceBufferPtr;
}

template <typename AllocatedElemType>
AllocatedElemType* TracingGPUMemoryAllocator::Allocate(int deviceId, size_t numElements)
{
    if (IsTraceEnabled())
    {
        auto freeAndTotalMemory = GetFreeAndTotalMemoryInMBs(deviceId);
        fprintf(stderr, "Allocating array<%s> (NumElements = %d) on DeviceId = %d; GPU Memory Free = %d MB of %d MB\n",
                typeid(AllocatedElemType).name(), (int)numElements, (int)deviceId, (int)freeAndTotalMemory.first, (int)freeAndTotalMemory.second);
        Microsoft::MSR::CNTK::DebugUtil::PrintCallStack();
    }

    AllocatedElemType* deviceBufferPtr = AllocateNoTrace<AllocatedElemType>(deviceId, numElements);

    if (IsTraceEnabled())
    {
        fprintf(stderr, "Allocated DeviceData = %p\n", (void*)deviceBufferPtr);
    }

    return deviceBufferPtr;
}

template <typename AllocatedElemType>
void TracingGPUMemoryAllocator::Free(int deviceId, AllocatedElemType* bufferPtr, bool ignoreCUDARetCode /*= false*/)
{
    PrepareDevice(deviceId);
    if (ignoreCUDARetCode)
        cudaFree((void*) bufferPtr);
    else
        CUDA_CALL(cudaFree((void*) bufferPtr));

    if (IsTraceEnabled())
    {
        auto freeAndTotalMemory = GetFreeAndTotalMemoryInMBs(deviceId);
        fprintf(stderr, "Freed buffer<%s> DeviceData = %p on DeviceId = %d; GPU Memory Free = %d MB of %d MB\n",
                typeid(AllocatedElemType).name(), (void*) bufferPtr, (int) deviceId, (int) freeAndTotalMemory.first, (int) freeAndTotalMemory.second);
        Microsoft::MSR::CNTK::DebugUtil::PrintCallStack();
    }
}

template <typename AllocatedElemType>
AllocatedElemType* TracingGPUMemoryAllocator::AllocateNoTrace(int deviceId, size_t numElements)
{
    AllocatedElemType* deviceBufferPtr;

    PrepareDevice(deviceId);
    // In case numElements is odd we allocate a buffer with one more element. The reason is
    // we might call curandGenerateNormal (e.g. for Gaussian noise injection) which would fail
    // if the number of elements it needs to generate is odd.
    CUDA_CALL(cudaMalloc((void**) &deviceBufferPtr, sizeof(AllocatedElemType) * AsMultipleOf(numElements, 2)));

    return deviceBufferPtr;
}

std::pair<size_t, size_t> TracingGPUMemoryAllocator::GetFreeAndTotalMemoryInMBs(int deviceId)
{
    PrepareDevice(deviceId);

    size_t free, total;
    CUDA_CALL(cudaMemGetInfo(&free, &total));

    size_t numBytesPerMB = 1 << 20;
    return {free / numBytesPerMB, total / numBytesPerMB};
}

// PrepareDevice - Setup the correct cuda context for an operation
// deviceId - the device on which the operation will take place
void PrepareDevice(DEVICEID_TYPE deviceId)
{
    THREAD_LOCAL static DEVICEID_TYPE currentDevice = DEVICEID_NOTYETDETERMINED;
    // and if we last set the device to be this device we are good
    if (deviceId == currentDevice)
        return;
    CUDA_CALL(cudaSetDevice(deviceId));
    currentDevice = deviceId;
}

#pragma region DeviceBoundNumber class

template <class ElemType>
DeviceBoundNumber<ElemType>::DeviceBoundNumber(const DeviceBoundNumber<ElemType>& /*deepCopy*/)
{
    NOT_IMPLEMENTED;
}

template <class ElemType>
DeviceBoundNumber<ElemType>::DeviceBoundNumber(DeviceBoundNumber<ElemType>&& shallowCopy)
{
    ShallowCopyFrom(shallowCopy.m_data, shallowCopy.m_computeDevice);
    shallowCopy.m_data = NULL;
}

template <class ElemType>
void DeviceBoundNumber<ElemType>::ShallowCopyFrom(ElemType* newVal, int newValsDevceId)
{
    m_computeDevice = newValsDevceId;
    m_data = newVal;
}

template <class ElemType>
DeviceBoundNumber<ElemType>::~DeviceBoundNumber()
{
    if (m_data != NULL)
    {
        if (m_computeDevice < 0)
        {
            delete m_data;
            m_data = NULL;
        }
        else
        {
            TracingGPUMemoryAllocator::Free(m_computeDevice, m_data);
        }
    }
}

#pragma endregion DeviceBoundNumber class

#pragma region Helper functions
template <class ElemType>
cublasHandle_t _initCUBLAS(int devId)
{
    PrepareDevice((DEVICEID_TYPE) devId);
    cublasHandle_t cuHandle;
    CUBLAS_CALL(cublasCreate(&cuHandle));
    return cuHandle;
}

template <class ElemType>
void GPUMatrix<ElemType>::SetDevice(DEVICEID_TYPE deviceId)
{
    assert(deviceId >= 0);
    CUDA_CALL(cudaSetDevice(deviceId));
}

// PrepareDevice - Setup the correct cuda context for an operation
// deviceId - the device on which the operation will take place
// defaults to -1, which means use matrices current device
template <class ElemType>
DEVICEID_TYPE GPUMatrix<ElemType>::PrepareDevice(DEVICEID_TYPE deviceId /*=-1*/) const
{
    // if default value use current compute device
    DEVICEID_TYPE newId = deviceId >= 0 ?
deviceId : GetComputeDeviceId(); Microsoft::MSR::CNTK::PrepareDevice(newId); return newId; } template ElemType* GPUMatrix::CopyToArray() const { size_t numElements = GetNumElements(); if (numElements != 0) { PrepareDevice(); ElemType* pArray = new ElemType[numElements]; CUDA_CALL(cudaMemcpy(pArray, Data(), sizeof(ElemType) * m_numRows * m_numCols, cudaMemcpyDeviceToHost)); return pArray; } else { return NULL; } } //memory will be allocated by the callee if not enough but need to be deleted by the caller after it's done //return number of elements copied template size_t GPUMatrix::CopyToArray(ElemType*& arrayCopyTo, size_t& currentArraySize) const { size_t numElements = GetNumElements(); if (numElements > currentArraySize) { delete arrayCopyTo; arrayCopyTo = new ElemType[numElements]; currentArraySize = numElements; } if (numElements != 0) { PrepareDevice(); CUDA_CALL(cudaMemcpy(arrayCopyTo, Data(), sizeof(ElemType) * numElements, cudaMemcpyDeviceToHost)); } return numElements; } template void GPUMatrix::CopySection(size_t numRows, size_t numCols, ElemType* dst, size_t colStride) const { CUBLAS_CALL(cublasGetMatrix((int) numRows, (int) numCols, sizeof(ElemType), Data(), (int) GetNumRows(), dst, (int) colStride)); } template void GPUMatrix::ChangeDeviceTo(DEVICEID_TYPE to_id) { if (to_id == CPUDEVICE) LogicError("to_id must be valid GPU"); if (GetComputeDeviceId() == to_id) return; ElemType* d_dst = TracingGPUMemoryAllocator::Allocate(to_id, m_numRows, m_numCols); SetSizeAllocated(m_numRows * m_numCols); // check to make sure we have something to copy (on init we often have zero sized allocations) if (GetSizeAllocated() > 0) { #if 0 // see the backlog item # 1220 // IOMMU DMAR needs to be disabled for CUDA P2P, otherwise it will silently hang. // Unfortunately, cudaDeviceCanAccessPeer returns true irrespective of the IOMMU settings. // More details: https://bugzilla.kernel.org/show_bug.cgi?id=188271 // http://docs.nvidia.com/cuda/gpudirect-rdma/#supported-systems // TODO: enable UVA p2p access once this is fixed. 
// first try peer access int canAccessPeer = false; CUDA_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, to_id, GetComputeDeviceId())); if (canAccessPeer) { cudaError_t cudaStatus = cudaDeviceEnablePeerAccess(GetComputeDeviceId(), 0); if (cudaStatus != cudaErrorPeerAccessAlreadyEnabled) { CUDA_CALL(cudaStatus); } CUDA_CALL(cudaMemcpyPeer(d_dst, to_id, Data(), GetComputeDeviceId(), sizeof(ElemType) * m_numRows * m_numCols)); } else #endif { // peer access didn't work, just copy normal // make this more efficient by keeping some buffers available for each copy ElemType* h_dst = NULL; PrepareDevice(); CUDA_CALL(cudaMallocHost((void**) &h_dst, sizeof(ElemType) * m_numRows * m_numCols)); CUDA_CALL(cudaMemcpy(h_dst, Data(), sizeof(ElemType) * m_numRows * m_numCols, cudaMemcpyDeviceToHost)); PrepareDevice((DEVICEID_TYPE) to_id); CUDA_CALL(cudaMemcpy(d_dst, h_dst, sizeof(ElemType) * m_numRows * m_numCols, cudaMemcpyHostToDevice)); CUDA_CALL(cudaFreeHost(h_dst)); } } TracingGPUMemoryAllocator::Free(GetComputeDeviceId(), Buffer()); SetBuffer(d_dst, m_numRows * m_numCols * sizeof(ElemType)); PrepareDevice((DEVICEID_TYPE) to_id); SetComputeDeviceId(to_id); } template void GPUMatrix::performElementWiseFunction(ElementWiseOperator kind, const ElemType* src) { PrepareDevice(); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); SyncGuard syncGuard; switch (kind) { case ElementWiseOperator::opSigmoid: return _elementWiseSigmoidOnCuda<<>>(src, Data(), N); case ElementWiseOperator::opTanh: return _elementWiseTanhOnCuda<<>>(src, Data(), N); case ElementWiseOperator::opSqrt: return _elementWiseSqrtOnCuda<<>>(src, Data(), N); case ElementWiseOperator::opExp: return _elementWiseExpOnCuda<<>>(src, Data(), N); case ElementWiseOperator::opLog: return _elementWiseLogOnCuda<<>>(src, Data(), N); case ElementWiseOperator::opAbs: return _elementWiseAbsOnCuda<<>>(src, Data(), N); case ElementWiseOperator::opLinearRectifierDerivative: return _elementWiseLinRectDerivativeOnCuda<<>>(src, Data(), N); case ElementWiseOperator::opCosine: return _elementWiseCosineOnCuda<<>>(src, Data(), N); case ElementWiseOperator::opNegativeSine: return _elementWiseNegativeSineOnCuda<<>>(src, Data(), N); case ElementWiseOperator::opSigmoidDerivative: return _elementWiseSigmoidDerivativeOnCuda<<>>(src, Data(), N); default: LogicError("performElementWiseFunction: unexpected op code %d", (int)kind); } } #pragma endregion Helper functions #pragma region Constructors and Destructor // should only be used by constructors template void GPUMatrix::ZeroInit(int deviceId) { BaseMatrix::ZeroInit(); SetComputeDeviceId(deviceId); } template GPUMatrix::GPUMatrix(int deviceId) { ZeroInit(deviceId); }; template GPUMatrix::GPUMatrix(const size_t numRows, const size_t numCols, int deviceId) { ZeroInit(deviceId); m_numRows = numRows; m_numCols = numCols; SetSizeAllocated(GetNumElements()); if (GetNumElements() != 0) { SetBuffer(TracingGPUMemoryAllocator::Allocate(GetComputeDeviceId(), m_numRows, m_numCols), GetNumElements() * sizeof(ElemType)); CUDA_CALL(cudaMemset(Buffer(), 0, sizeof(ElemType) * GetSizeAllocated())); } }; template GPUMatrix::GPUMatrix(const size_t numRows, const size_t numCols, int deviceId, ElemType* pArray, const size_t matrixFlags) { ZeroInit(deviceId); SetValue(numRows, numCols, deviceId, pArray, matrixFlags); }; template GPUMatrix::GPUMatrix(const GPUMatrix& deepCopyFrom) { ZeroInit(); SetValue(deepCopyFrom); } template GPUMatrix::GPUMatrix(GPUMatrix&& moveFrom) { 
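    // Move construction: shallow-copy the storage bookkeeping from moveFrom so this matrix
    // refers to the same device buffer, then reset moveFrom's header values so the moved-from
    // object is left empty.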
ShallowCopyFrom(moveFrom); moveFrom.ZeroValues(); } //assignment operator, deep copy template GPUMatrix& GPUMatrix::operator=(const GPUMatrix& deepCopyFrom) { if (this != &deepCopyFrom) { SetValue(deepCopyFrom); } return *this; } //move assignment operator, shallow copy template GPUMatrix& GPUMatrix::operator=(GPUMatrix&& moveFrom) { if (this != &moveFrom) { ShallowCopyFrom(moveFrom); moveFrom.ZeroValues(); } return *this; } template GPUMatrix::~GPUMatrix(void) { } // TODO: This should be in the storage object. // Clear will clear your storage, zeroinit just drops it on the ground. template void GPUMatrix::Clear() { VerifyWritable(__FUNCTION__); //if (OwnBuffer() && m_pArray != NULL) if (m_sob != nullptr) { if (GetComputeDeviceId()>= 0) { // BUG: We do not check the CUDA return code for cudaFree here since this may get called // during processExit when cudaFree will fail. The destruction of CUDA objects during // process exit must be avoided ReleaseStorageMemory(); } } ZeroInit(GetComputeDeviceId()); } #pragma endregion Constructors and Destructor template std::unique_ptr> GPUMatrix::GetOrCreateWorkspace() const { // REVIEW alexeyk: not thread-safe, fine for now. if (m_workspace == nullptr) m_workspace = std::make_unique>>>(); assert(m_workspace != nullptr); auto deviceId = GetComputeDeviceId(); return m_workspace->pop_or_create([deviceId]() { return std::make_unique>(deviceId); }); } template void GPUMatrix::ReleaseWorkspace(std::unique_ptr> src) const { assert(m_workspace != nullptr); m_workspace->push(std::move(src)); } #pragma region Basic Operators template GPUMatrix GPUMatrix::ColumnSlice(size_t startColumn, size_t numCols) const { if (startColumn + numCols > GetNumCols()) InvalidArgument("The slice (%d+%d) is out of range of the source matrix (%d).", (int) startColumn, (int) numCols, (int) GetNumCols()); GPUMatrix slice(GetComputeDeviceId()); slice.ShallowCopyFrom(*this); slice.m_numCols = numCols; slice.m_sliceViewOffset = m_sliceViewOffset + startColumn * GetNumRows(); return slice; } template GPUMatrix& GPUMatrix::AssignColumnSlice(const GPUMatrix& fromMatrix, size_t startColumn, size_t numCols) { if (numCols == 0) LogicError("The slice cannot have 0 columns."); if (startColumn + numCols > fromMatrix.GetNumCols()) InvalidArgument("The slice (%d+%d) is out of range of the source matrix (%d).", (int) startColumn, (int) numCols, (int) fromMatrix.GetNumCols()); Clear(); ShallowCopyFrom(fromMatrix); m_numCols = numCols; m_sliceViewOffset = fromMatrix.m_sliceViewOffset + startColumn * GetNumRows(); return *this; } template GPUMatrix& GPUMatrix::SetColumnSlice(const GPUMatrix& fromMatrix, size_t startColumn, size_t numCols) { if (startColumn + numCols > GetNumCols()) LogicError("The slice is out of range of the destination matrix."); if (numCols > fromMatrix.GetNumCols()) InvalidArgument("The slice (%d) is out of range of the source matrix (%d).", (int) numCols, (int) fromMatrix.GetNumCols()); if (m_numRows != fromMatrix.m_numRows) LogicError("The number of rows in source and destination matrices do not match"); if (m_numRows * numCols > 0) // TODO: remove if unnecessary CUDA_CALL(cudaMemcpy(Data() + LocateColumn(startColumn), fromMatrix.Data(), sizeof(ElemType) * m_numRows * numCols, cudaMemcpyDeviceToDevice)); return *this; } template void GPUMatrix::CopyColumnsStrided(const GPUMatrix& fromMatrix, size_t numCols, size_t srcNumColsStride, size_t destNumColsStride) { if ((((numCols - 1) * srcNumColsStride) + 1) > fromMatrix.m_numCols) LogicError("The numCols to copy and srcNumColsStride 
specified is out of range of the source matrix."); if ((((numCols - 1) * destNumColsStride) + 1) > m_numCols) LogicError("The numCols to copy and srcNumColsStride specified is out of range of the destination matrix."); if (m_numRows != fromMatrix.m_numRows) LogicError("The number of rows in source and destination matrices do not match"); if ((m_numRows * numCols) > 0) { // Launch a kernel to do the strided copy CUDA_LONG N = (CUDA_LONG)(m_numRows * numCols); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); SyncGuard syncGuard; _copyColumnsStrided<<>>(Data(), fromMatrix.Data(), N, (CUDA_LONG) m_numRows, (CUDA_LONG) destNumColsStride, (CUDA_LONG) srcNumColsStride); } } //for each column of a, we assign all rows of a to this starting from startIndex template GPUMatrix& GPUMatrix::AssignToRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) { if (a.IsEmpty()) LogicError("AddToRowSliceValuesOf: input matrix a is empty."); if (a.GetNumRows() != numRows) LogicError("AddToRowSliceValuesOf: a.GetNumRows() != numRows."); if (startIndex + numRows > GetNumRows()) LogicError("AddToRowSliceValuesOf: startIndex + numRows exceeds GetNumRows()."); if (a.GetNumCols() != GetNumCols()) LogicError("AddToRowSliceValuesOf: columns does not match."); CUDA_LONG N = (CUDA_LONG) a.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); SyncGuard syncGuard; _assignToRowSliceValuesOf<<>>(Data(), a.Data(), N, (CUDA_LONG) startIndex, (CUDA_LONG) GetNumRows(), (CUDA_LONG) a.GetNumRows()); return *this; } //for each column of a, we assign numRows starting from startIndex to this template GPUMatrix& GPUMatrix::AssignRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) { if (a.IsEmpty()) LogicError("AssignRowSliceValuesOf: input matrix a is empty."); if (startIndex + numRows > a.GetNumRows()) LogicError("AssignRowSliceValuesOf: startIndex + numRows exceeds a.GetNumRows()."); RequireSize(numRows, a.GetNumCols()); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); SyncGuard syncGuard; _assignRowSliceValuesOf<<>>(Data(), a.Data(), N, (CUDA_LONG) startIndex, (CUDA_LONG) numRows, (CUDA_LONG) a.GetNumRows()); return *this; } //for the row slice of this starting from startIndex we add a to it. 
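// In index notation (a sketch of the intended semantics only, not the kernel implementation):
//   for each column j of a, and for i in [0, a.GetNumRows()):
//       (*this)(startIndex + i, j) += a(i, j)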
template GPUMatrix& GPUMatrix::AddToRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) { if (a.IsEmpty()) LogicError("AddToRowSliceValuesOf: input matrix a is empty."); if (a.GetNumRows() != numRows) LogicError("AddToRowSliceValuesOf: a.GetNumRows() != numRows."); if (startIndex + numRows > GetNumRows()) LogicError("AddToRowSliceValuesOf: startIndex + numRows exceeds GetNumRows()."); if (a.GetNumCols() != GetNumCols()) LogicError("AddToRowSliceValuesOf: columns does not match."); CUDA_LONG N = (CUDA_LONG) a.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); SyncGuard syncGuard; _addToRowSliceValuesOf<<>>(Data(), a.Data(), N, (CUDA_LONG) startIndex, (CUDA_LONG) GetNumRows(), (CUDA_LONG) a.GetNumRows()); return *this; } //for each column of this, we add row slice of a starting from startIndex template GPUMatrix& GPUMatrix::AddWithRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows) { if (a.IsEmpty()) LogicError("AddWithRowSliceValuesOf: input matrix a is empty."); if (GetNumRows() != numRows) LogicError("AddWithRowSliceValuesOf: GetNumRows() != numRows."); if (startIndex + numRows > a.GetNumRows()) LogicError("AddWithRowSliceValuesOf: startIndex + numRows exceeds a.GetNumRows()."); if (a.GetNumCols() != GetNumCols()) LogicError("AddWithRowSliceValuesOf: columns does not match."); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); SyncGuard syncGuard; _addWithRowSliceValuesOf<<>>(Data(), a.Data(), N, (CUDA_LONG) startIndex, (CUDA_LONG) GetNumRows(), (CUDA_LONG) a.GetNumRows()); return *this; } template GPUMatrix GPUMatrix::Diagonal() const { size_t m = GetNumRows(); size_t n = GetNumCols(); if (m != n) LogicError("Diagonal can be called only for square matrix. (rows=%d, cols=%d)", (int) m, (int) n); GPUMatrix diag(1, n, GetComputeDeviceId()); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); SyncGuard syncGuard; _assignToDiagonalValuesOf<<>>(diag.Data(), Data(), N, (CUDA_LONG) n); return diag; } // c = c - 1.0 for a specific position template void GPUMatrix::MinusOneAt(GPUMatrix& c, const size_t position) { assert(position < c.GetNumElements()); CUDA_LONG n = (CUDA_LONG) c.GetNumElements(); CUDA_LONG p = (CUDA_LONG) position; int blocksPerGrid = (int) ceil(1.0 * n / GridDim::maxThreadsPerBlock); // BUGBUG: PrepareDevice() missing? SyncGuard syncGuard; _minusOneAt<<>>(c.Data(), p, n); } template GPUMatrix& GPUMatrix::AssignRepeatOf(const GPUMatrix& a, const size_t numRowRepeats, const size_t numColRepeats) { if (this == &a) LogicError("AssignRepeatOf: a is the same as [this]. 
Does not support inplace repeat."); if (a.IsEmpty()) LogicError("AssignRepeatOf: Matrix a is empty."); RequireSize(a.GetNumRows() * numRowRepeats, a.GetNumCols() * numColRepeats); CUDA_LONG N = (CUDA_LONG) GetNumElements(); CUDA_LONG n = (CUDA_LONG) a.GetNumCols(), m = (CUDA_LONG) a.GetNumRows(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); SyncGuard syncGuard; _assignRepeatOf<<>>(Data(), a.Data(), N, m, n, (CUDA_LONG) GetNumRows()); return *this; } template GPUMatrix& GPUMatrix::AddToRowRepeatValuesOf(const GPUMatrix& a, const size_t numRepeats) { if (a.IsEmpty()) LogicError("AddToRowRepeatValuesOf: input matrix a is empty."); if (a.GetNumRows() != GetNumRows() * numRepeats) LogicError("AddToRowSliceValuesOf: a.GetNumRows() != GetNumRows() * numRepeats."); RequireSize(a.GetNumRows() / numRepeats, a.GetNumCols()); CUDA_LONG N = (CUDA_LONG) a.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); SyncGuard syncGuard; _addToRowRepeatValuesOf<<>>(Data(), a.Data(), N, (CUDA_LONG) a.GetNumRows(), (CUDA_LONG) a.GetNumCols(), (CUDA_LONG) GetNumRows()); return *this; } template GPUMatrix& GPUMatrix::AssignPositiveAndShiftedNegSample(const GPUMatrix& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber) { if (this == &a) LogicError("AssignPositiveAndShiftedNegSample: a is the same as [this]. Does not support inplace assignment."); if (a.IsEmpty()) LogicError("AssignPositiveAndShiftedNegSample: Matrix a is empty."); RequireSize(a.GetNumRows() * (posNumber + negNumber), a.GetNumCols()); CUDA_LONG N = (CUDA_LONG) GetNumElements(); CUDA_LONG n = (CUDA_LONG) a.GetNumCols(), m = (CUDA_LONG) a.GetNumRows(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); SyncGuard syncGuard; _assignPositiveAndShiftedNegSample<<>>(Data(), a.Data(), N, m, n, (CUDA_LONG) GetNumRows(), posNumber, shiftNumber); return *this; } template GPUMatrix& GPUMatrix::AddFoldedPositiveAndShiftedNegSample(const GPUMatrix& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber) { if (this == &a) LogicError("AddFoldedPositiveAndShiftedNegSample: a is the same as [this]. 
Does not support inplace assignment."); if (a.IsEmpty()) LogicError("AddFoldedPositiveAndShiftedNegSample: Matrix a is empty."); if (a.GetNumRows() != GetNumRows() * (posNumber + negNumber) || a.GetNumCols() != GetNumCols()) LogicError("AddFoldedPositiveAndShiftedNegSample: dimensions mismatch."); CUDA_LONG N = (CUDA_LONG) a.GetNumElements(); CUDA_LONG n = (CUDA_LONG) a.GetNumCols(), m = (CUDA_LONG) a.GetNumRows(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); SyncGuard syncGuard; _addFoldedPositiveAndShiftedNegSample<<>>(Data(), a.Data(), N, m, n, (CUDA_LONG) GetNumRows(), posNumber, shiftNumber); return *this; } template GPUMatrix GPUMatrix::Transpose() const { if (IsEmpty()) LogicError("Transpose: Matrix is empty."); GPUMatrix c(GetComputeDeviceId()); c.AssignTransposeOf(*this); return c; } // GetCublasHandle - get a cublas handle for the given GPU, should only need one per GPU // computeDevice - The compute device for which the cublas handle is desired // returns: cublas handle // NOTE: we currently don't bother to ever free the CUBLAS handle, it will be freed automatically by CUDA when the process ends template cublasHandle_t GPUMatrix::GetCublasHandle(int computeDevice /*=-1*/) { // if the compute device is not passed, get the current device from CUDA if (computeDevice < 0) cudaGetDevice(&computeDevice); if (computeDevice < 0 || computeDevice >= MaxGpus) LogicError("GetCublasHandle: Maximum GPU exceeded"); cublasHandle_t cuHandle = s_cuHandle[computeDevice]; if (cuHandle == NULL) { s_cuHandle[computeDevice] = cuHandle = _initCUBLAS(computeDevice); } CUBLAS_CALL(cublasSetStream(cuHandle, t_stream)); return cuHandle; } template GPUMatrix& GPUMatrix::AssignTransposeOf(const GPUMatrix& a) { if (this == &a) LogicError("AssignTransposeOf: a is the same as [this]. Does not support inplace transpose."); if (a.IsEmpty()) LogicError("AssignTransposeOf: Matrix a is empty."); if (GetNumRows() != a.GetNumCols() || GetNumCols() != a.GetNumRows()) RequireSize(a.GetNumCols(), a.GetNumRows()); cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); cublasOperation_t transA = CUBLAS_OP_T; cublasOperation_t transB = CUBLAS_OP_T; int m = (int) a.m_numCols; int n = (int) a.m_numRows; ElemType alpha = 1; ElemType beta = 0; cublasStatus_t st; if (sizeof(ElemType) == sizeof(float)) st = cublasSgeam(cuHandle, transA, transB, m, n, reinterpret_cast(&alpha), reinterpret_cast(a.Data()), (int) a.m_numRows, reinterpret_cast(&beta), reinterpret_cast(a.Data()), (int) a.m_numRows, reinterpret_cast(Data()), (int) m_numRows); else if (sizeof(ElemType) == sizeof(double)) st = cublasDgeam(cuHandle, transA, transB, m, n, reinterpret_cast(&alpha), reinterpret_cast(a.Data()), (int) a.m_numRows, reinterpret_cast(&beta), reinterpret_cast(a.Data()), (int) a.m_numRows, reinterpret_cast(Data()), (int) m_numRows); else RuntimeError("Unsupported template argument in GPUMatrix"); if (st != CUBLAS_STATUS_SUCCESS) RuntimeError("AssignTransposeOf failed"); m_numRows = a.m_numCols; m_numCols = a.m_numRows; return *this; } template __global__ void _doGatherColumnsOf(ElemType* us, size_t usStride, const ElemType beta, const ElemType* idx, size_t idxStride, const ElemType* a, size_t aStride, size_t aCols, const ElemType alpha, CUDA_LONG numElements) { CUDA_LONG id = GridDim::GetLinearThreadId(); if (id >= numElements) // note: there are no __syncthread() calls inside return; // id = i + jOut * usStride; // Each thread processes one element of the output matrix. 
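    // With column-major storage and usStride == number of rows of 'us', the linear id
    // decomposes as id = i + jOut * usStride, hence row i = id % usStride and output
    // column jOut = id / usStride below.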
CUDA_LONG i = id % usStride; // row index into 'us' and 'a' CUDA_LONG jOut = id / usStride; // col index into 'us' and 'idx' auto jInF = idx[jOut * idxStride]; // this is the column we need to get if (::isnan(jInF) || jInF < 0) // negative index means gap return; size_t jIn = (size_t)jInF; //if (jIn >= aCols) // return; // actually a failure const ElemType& ra = a[ i + jIn * aStride ]; ElemType& rus = us[id/*i + jOut * usStride*/]; ElemType res = ra * alpha; if (beta != 0) res += rus * beta; rus = res; } // *this[:,j] = a[:,idx[j]] * alpha + *this[:,j] * beta template GPUMatrix& GPUMatrix::DoGatherColumnsOf(ElemType beta, const GPUMatrix& idx, const GPUMatrix& a, ElemType alpha) { if (idx.GetNumRows() != 1) // index is 1-dimensional only InvalidArgument("DoGatherColumnsOf: Map must be a row vector."); if (beta == 0) RequireSize(a.GetNumRows(), idx.GetNumCols()); // output has same column format as a, but number of columns comes from idx else VerifySize(a.GetNumRows(), idx.GetNumCols()); if (idx.GetComputeDeviceId() != a.GetComputeDeviceId() || GetComputeDeviceId() != a.GetComputeDeviceId()) InvalidArgument("All matrices must be on the same GPU"); a.PrepareDevice(); // launch the kernel CUDA_LONG NN = (CUDA_LONG)GetNumElements(); // linear space identifying each individual input element SyncGuard syncGuard; GridDim grid(NN); _doGatherColumnsOf<<>>(Data(), GetNumRows(), beta, idx.Data(), idx.GetNumRows(), a.Data(), a.GetNumRows(), a.GetNumCols(), alpha, grid.m_N); // Note: The following fails silently (no error, immediate or delayed) for numcols = 10000 under CUDA 7.0. //_doGatherColumnsOf<<>>(Data(), GetNumRows(), beta, idx.Data(), idx.GetNumRows(), a.Data(), a.GetNumRows(), a.GetNumCols(), alpha); return *this; } // little helper for debugging template static void Peek(const GPUMatrix& m, const char* which) { size_t rows = m.GetNumRows(); size_t cols = m.GetNumCols(); ElemType buf[10000] = { 0 }; size_t n = min(rows * cols, _countof(buf)); CUDA_CALL(cudaMemcpy(buf, m.Data(), sizeof(ElemType) * n, cudaMemcpyDeviceToHost)); UNUSED(which); UNUSED(rows); UNUSED(cols); sin(1.0f); // set breakpoint here //CUDA_CALL(cudaMemcpy(const_cast(m.Data()), buf, sizeof(ElemType) * n, cudaMemcpyHostToDevice)); } #define ALLOW_ATOMIC_SCATTER // allow to disable this, until we know atomicAdd() works properly here template __global__ void _doScatterColumnsOf(ElemType* us, size_t usStride, size_t usCols, const ElemType* idx, size_t idxStride, const ElemType* a, size_t aStride, const ElemType alpha, CUDA_LONG numElements) { CUDA_LONG id = GridDim::GetLinearThreadId(); if (id >= numElements) // note: there are no __syncthread() calls inside return; // id = i + jIn * aStride // Each thread processes one element of a CUDA_LONG i = id % aStride; // row index into 'a' and 'us' CUDA_LONG jIn = id / aStride; // col index into 'a' and 'idx' auto jOutF = idx[jIn * idxStride]; // this is the column we copy/add into if (::isnan(jOutF) || jOutF < 0) // negative index means gap return; size_t jOut = (size_t)jOutF; //if (jOut >= usCols) // return; // actually a failure --TODO: This should not be necessary. Why is it? const ElemType& ra = a[id/*i + jIn * aStride*/]; ElemType& rus = us[ i + jOut * usStride ]; ElemType res = ra * alpha; if (res != 0) // avoid memory conflict if e.g. 
an entire column has no gradient #ifdef ALLOW_ATOMIC_SCATTER atomicAdd(&rus, res); // rus += res; #else rus += res; #endif // Note: atomicAdd() is supposed to be fast in case of no conflict (the simple case of Scatter()) } // *this[:,idx[j]] = a[:,j] * alpha + *this[:,idx[j]] * beta template GPUMatrix& GPUMatrix::DoScatterColumnsOf(ElemType beta, const GPUMatrix& idx, const GPUMatrix& a, ElemType alpha) { if (idx.GetNumRows() != 1) // index is 1-dimensional only InvalidArgument("DoScatterColumnsOf: Map must be a row vector."); if (idx.GetNumCols() != a.GetNumCols()) InvalidArgument("DoScatterColumnsOf: Map must have width of input vector."); if (a.GetNumRows() != GetNumRows()) InvalidArgument("DoScatterColumnsOf: Output must have same height as input vector."); if (idx.GetComputeDeviceId() != a.GetComputeDeviceId() || GetComputeDeviceId() != a.GetComputeDeviceId()) InvalidArgument("All matrices must be on the same GPU"); a.PrepareDevice(); auto& us = *this; #ifndef ALLOW_ATOMIC_SCATTER // verify that atomicAdd is not needed --this is not efficient { vector buf(idx.GetNumRows() * idx.GetNumCols()); // idx(,)are the column(s) we copy/add into CUDA_CALL(cudaMemcpy(buf.data(), idx.Data(), sizeof(ElemType) * buf.size(), cudaMemcpyDeviceToHost)); vector writtenTo(GetNumCols(), false); // remember whether an output column is in fact a target for (size_t i = 0; i < buf.size(); i++) { auto colF = buf[i]; if (std::isnan(colF) || colF < 0) continue; size_t col = (size_t)colF; if (col >= GetNumCols()) LogicError("DoScatterColumnsOf: Index value out of bounds."); if (writtenTo[col]) LogicError("DoScatterColumnsOf: #ifndef ALLOW_ATOMIC_SCATTER then columns must be unique. Column idx(%d,%d)=%d is used twice.", (int)(i % idx.GetNumCols()), (int)(i / idx.GetNumCols()), (int)col); else writtenTo[col] = true; } } #endif // pre-scale with beta upfront // Scatter may add more than one source column to the same target, so we must pre-scale with beta, and then just keep adding. 
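// Equivalent column-level semantics (sketch only, not the kernel):
//   us *= beta;                                        // pre-scale once, up front
//   for each column j of a:
//       if idx(0, j) is a valid (non-gap) column index
//           us(:, (size_t)idx(0, j)) += alpha * a(:, j);   // the same target column may be hit more than once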
Scale(beta, us); // if beta is 0, then this will be a memset() // launch the kernel CUDA_LONG NN = (CUDA_LONG)(a.GetNumElements()); // linear space identifying each individual input element SyncGuard syncGuard; GridDim grid(NN); _doScatterColumnsOf<<>>(Data(), GetNumRows(), GetNumCols(), idx.Data(), idx.GetNumRows(), a.Data(), a.GetNumRows(), alpha, NN); //SyncGuard syncGuard; //_doScatterColumnsOf<<>>(Data(), GetNumRows(), GetNumCols(), idx.Data(), idx.GetNumRows(), a.Data(), a.GetNumRows(), alpha, NN); return *this; } template void GPUMatrix::SetValue(const ElemType v) { if (IsEmpty()) return; CUDA_LONG N = (CUDA_LONG) GetNumElements(); // Check if value is zero, which can be set using cudaMemset bool isZero = true; const char* valArray = reinterpret_cast(&v); for (int i = 0; i < sizeof(ElemType); i++) { if (valArray[i] != 0) { isZero = false; break; } } if (isZero) { CUDA_CALL(cudaMemset(Data(), 0, N * sizeof(ElemType))); } else { int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); SyncGuard syncGuard; _setValue<<>>(Data(), v, N); } } template void GPUMatrix::SetValue(const ElemType* d_v) // d_v is pointer to the the value in GPU memory { if (IsEmpty()) LogicError("SetValue: Matrix is empty."); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); SyncGuard syncGuard; _setValue<<>>(Data(), d_v, N); } template void GPUMatrix::MaskColumnsValue(const GPUMatrix& columnsMask, ElemType val, size_t numColsPerMaskEntry) { if (GetNumCols() != (columnsMask.GetNumCols() * numColsPerMaskEntry)) RuntimeError("Matrix number of columns must equal 'number of columns in column mask * numColsPerMaskEntry'."); if (GetComputeDeviceId() != columnsMask.GetComputeDeviceId()) RuntimeError("Matrix and column mask must be on the same device"); int blocksPerGrid = (int)columnsMask.GetNumCols(); PrepareDevice(); SyncGuard syncGuard; _maskColumnsValue<<>>(Data(), columnsMask.Data(), (CUDA_LONG) GetNumCols(), (CUDA_LONG) GetNumRows(), val, numColsPerMaskEntry); } template void GPUMatrix::SetColumn(const ElemType* colPointer, size_t colInd) { if (IsEmpty()) LogicError("SetValue: Matrix is empty."); if (colPointer == NULL) return; CUDA_CALL(cudaMemcpy(Data() + LocateColumn(colInd), colPointer, sizeof(ElemType) * m_numRows, cudaMemcpyHostToDevice)); } template void GPUMatrix::SetColumn(const GPUMatrix& valMat, size_t colInd) { if (IsEmpty()) LogicError("SetColumn: Matrix is empty."); if (valMat.GetNumCols() != 1) LogicError("SetColumn: only support one column matrix now."); CUDA_CALL(cudaMemcpy(Data() + LocateColumn(colInd), valMat.Data(), sizeof(ElemType) * m_numRows, cudaMemcpyDeviceToDevice)); } template void GPUMatrix::SetValue(const GPUMatrix& deepCopyFrom) { if (this == &deepCopyFrom) return; SetValue(deepCopyFrom.GetNumRows(), deepCopyFrom.GetNumCols(), deepCopyFrom.GetComputeDeviceId(), deepCopyFrom.Data(), matrixFlagSetValueOnDevice); } #if 0 template void GPUMatrix::SetValue(const CPUMatrix& /*deepCopyFrom*/) { NOT_IMPLEMENTED; } template void GPUMatrix::SetValue(const CPUSparseMatrix& /*deepCopyFrom*/) { NOT_IMPLEMENTED; } template void GPUMatrix::SetValue(const GPUSparseMatrix& deepCopyFrom) { deepCopyFrom.CopyToDenseMatrix(*this); } #endif template void GPUMatrix::SetValue(const size_t numRows, const size_t numCols, int deviceId, ElemType* pArray, size_t matrixFlags, DataTransferer* transferer) { // handle externally managed case // BUGBUG: This is super super ugly, and needs to be fixed, but 
if matrixFlags has the right value, then we can't free anything, // and everything gets wonky. This should be fixed, and would go away if it is made a shared_ptr. if (matrixFlags & matrixFlagDontOwnBuffer) { // free the existing array if it used to be an owned array if ( Buffer() != NULL) { TracingGPUMemoryAllocator::Free(GetComputeDeviceId(), Buffer()); } m_numRows = numRows; m_numCols = numCols; SetBuffer(pArray, GetNumElements() * sizeof(ElemType), true); SetSizeAllocated(GetNumElements()); SetFormat(matrixFormatDense); SetComputeDeviceId(deviceId); } else { if (transferer && (matrixFlags & matrixFlagSetValueOnDevice)) RuntimeError("Asynchronous data copy from device to device is currently not supported."); // if the devices are different move it now if (GetComputeDeviceId() != deviceId && deviceId >= 0) { Clear(); ZeroInit(deviceId); } // now RequireSize/allocate as necessary RequireSize(numRows, numCols); // copy over the content to the buffer PrepareDevice(); if (pArray != NULL) { if (!(matrixFlags & matrixFormatRowMajor)) { if (transferer) transferer->CopyCPUToGPUAsync(pArray, GetNumElements(), sizeof(ElemType), Data()); else CUDA_CALL(cudaMemcpy(Data(), pArray, sizeof(ElemType) * GetNumElements(), (matrixFlags & matrixFlagSetValueOnDevice) ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice)); } else // row major: must transpose (this is not meant to be efficient, but very useful for defining inline matrices for test code) { vector transposed(GetNumElements()); for (size_t i = 0; i < numRows; i++) for (size_t j = 0; j < numCols; j++) transposed[i + numRows * j] = pArray[j + numCols * i]; if (transferer) transferer->CopyCPUToGPUAsync(transposed.data(), GetNumElements(), sizeof(ElemType), Data()); else CUDA_CALL(cudaMemcpy(Data(), transposed.data(), sizeof(ElemType) * GetNumElements(), (matrixFlags & matrixFlagSetValueOnDevice) ? 
cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice)); } } } SetFormat(matrixFormatDense); } template void GPUMatrix::SetDiagonalValue(const ElemType v) { CUDA_LONG N = (CUDA_LONG) GetNumRows(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); SyncGuard syncGuard; _setDiagonalValue<<>>(Data(), v, N, (CUDA_LONG) GetNumRows()); } template void GPUMatrix::SetDiagonalValue(const GPUMatrix& vector) { if (IsEmpty() || vector.IsEmpty()) LogicError("SetDiagonalValue: Matrix is empty."); if (GetNumRows() != GetNumCols()) LogicError("SetDiagonalValue: NumRows and NumCols do not agree."); if (vector.GetNumRows() != 1 && vector.GetNumCols() != 1) LogicError("SetDiagonalValue: input vector must be a vector."); if (vector.GetNumElements() == 1) // reduce to simple form SetDiagonalValue(vector.Data()[0]); else if (vector.GetNumRows() != GetNumRows() && vector.GetNumCols() != GetNumRows()) LogicError("SetDiagonalValue: input vector's dimension does not agree with [this]."); else { CUDA_LONG N = (CUDA_LONG) GetNumRows(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); SyncGuard syncGuard; _setDiagonalValueFromVector<<>>(Data(), vector.Data(), N); } } template void RescaleToRange(const GPUMatrix& matrix, const ElemType low, const ElemType high) { size_t N = matrix.GetNumElements(); size_t blocksPerGrid = (size_t)ceil(N / (double)GridDim::maxThreadsPerBlock); //Nobody is ever calling SetStream so all work is done one the same stream //Therefore we don't need to sync //SyncGuard syncGuard; _rescaleToRange << > > (matrix.Data(), N, low, high); } template void GPUMatrix::SetUniformRandomValue(const ElemType low, const ElemType high, unsigned long seed) { PrepareDevice(); CreateCurandObject(seed, __FUNCTION__); // TODO call ResetCurandObject() instead? { //Nobody is ever calling SetStream so all work is done one the same stream //Therefore we don't need to sync //SyncGuard syncGuard; if (sizeof(ElemType) == sizeof(float)) CURAND_CALL(curandGenerateUniform(((curandGenerator_t*) s_curandGenerator)[0], reinterpret_cast(Data()), GetNumElements())); else CURAND_CALL(curandGenerateUniformDouble(((curandGenerator_t*) s_curandGenerator)[0], reinterpret_cast(Data()), GetNumElements())); } RescaleToRange(*this, low, high); } template void GPUMatrix::SetUniformRandomValue(RNGHandle& rngHandle, const ElemType low, const ElemType high) { PrepareDevice(); GPURNGHandle* gpuRNGHandle = dynamic_cast(&rngHandle); assert(gpuRNGHandle != nullptr); { //Nobody is ever calling SetStream so all work is done one the same stream //Therefore we don't need to sync //SyncGuard syncGuard; if (sizeof(ElemType) == sizeof(float)) CURAND_CALL(curandGenerateUniform(gpuRNGHandle->Generator(), reinterpret_cast(Data()), GetNumElements())); else CURAND_CALL(curandGenerateUniformDouble(gpuRNGHandle->Generator(), reinterpret_cast(Data()), GetNumElements())); } RescaleToRange(*this, low, high); } template void SetNormalRandomValue(const GPUMatrix& matrix, const curandGenerator_t& generator, const ElemType mean, const ElemType stdev) { //Nobody is ever calling SetStream so all work is done one the same stream //Therefore we don't need to sync //SyncGuard syncGuard; // curandGenerateNormal can return the error CURAND_STATUS_LENGTH_NOT_MULTIPLE if GetNumElements() is odd. // To avoid this we always allocate a buffer of even size and potentially generate one more random element. 
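    // AsMultipleOf(n, k) is assumed here to round n up to the next multiple of k, e.g.
    //   size_t AsMultipleOf(size_t n, size_t k) { return ((n + k - 1) / k) * k; }
    // For an odd element count we therefore request one extra random value from cuRAND;
    // AllocateNoTrace() over-allocates by the same rule, so that extra element is backed
    // by real device memory.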
auto n = AsMultipleOf(matrix.GetNumElements(), 2); if (sizeof(ElemType) == sizeof(float)) CURAND_CALL(curandGenerateNormal(generator, reinterpret_cast(matrix.Data()), n, (float)mean, (float)stdev)); else CURAND_CALL(curandGenerateNormalDouble(generator, reinterpret_cast(matrix.Data()), n, (double)mean, (double)stdev)); } template void GPUMatrix::SetGaussianRandomValue(RNGHandle& rngHandle, const ElemType mean, const ElemType stdev) { PrepareDevice(); GPURNGHandle* gpuRNGHandle = dynamic_cast(&rngHandle); assert(gpuRNGHandle != nullptr); SetNormalRandomValue(*this, gpuRNGHandle->Generator(), mean, stdev); } template void GPUMatrix::SetGumbelRandomValue(RNGHandle& rngHandle, const ElemType loc, const ElemType scale) { PrepareDevice(); GPURNGHandle* gpuRNGHandle = dynamic_cast(&rngHandle); assert(gpuRNGHandle != nullptr); { //Nobody is ever calling SetStream so all work is done one the same stream //Therefore we don't need to sync //SyncGuard syncGuard; if (sizeof(ElemType) == sizeof(float)) CURAND_CALL(curandGenerateUniform(gpuRNGHandle->Generator(), reinterpret_cast(Data()), GetNumElements())); else CURAND_CALL(curandGenerateUniformDouble(gpuRNGHandle->Generator(), reinterpret_cast(Data()), GetNumElements())); } size_t N = GetNumElements(); size_t blocksPerGrid = (size_t)ceil(N / (double)GridDim::maxThreadsPerBlock); { //Nobody is ever calling SetStream so all work is done one the same stream //Therefore we don't need to sync //SyncGuard syncGuard; _gumbelFromUniform << > > (Data(), N, loc, scale); } } template void GPUMatrix::SetGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed) { PrepareDevice(); CreateCurandObject(seed, __FUNCTION__); // TODO call ResetCurandObject() instead? SetNormalRandomValue(*this, ((curandGenerator_t*)s_curandGenerator)[0], mean, sigma); } template void GPUMatrix::SetTruncatedNormalRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed) { // We use the method described in https://en.wikipedia.org/wiki/Truncated_normal_distribution // i.e. generate uniform, scale it to the right range, pass it through the inverse cdf, scale by sigma, and add the mean PrepareDevice(); CreateCurandObject(seed, __FUNCTION__); // TODO call ResetCurandObject() instead? { //Nobody is ever calling SetStream so all work is done one the same stream //Therefore we don't need to sync //SyncGuard syncGuard; if (sizeof(ElemType) == sizeof(float)) CURAND_CALL(curandGenerateUniform(((curandGenerator_t*)s_curandGenerator)[0], reinterpret_cast(Data()), GetNumElements())); else CURAND_CALL(curandGenerateUniformDouble(((curandGenerator_t*)s_curandGenerator)[0], reinterpret_cast(Data()), GetNumElements())); } size_t N = GetNumElements(); size_t blocksPerGrid = (size_t)ceil(N / (double)GridDim::maxThreadsPerBlock); { //Nobody is ever calling SetStream so all work is done one the same stream //Therefore we don't need to sync //SyncGuard syncGuard; _truncated_normal_transform << > > (Data(), N, mean, sigma); } } //maskRate: percentage of values masked out (similar to dropout rate) //scaleValue: which scale value to set to the left ones (unmasked items). template void GPUMatrix::SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, RNGHandle& rngHandle) { PrepareDevice(); GPURNGHandle* gpuRNGHandle = dynamic_cast(&rngHandle); assert(gpuRNGHandle != nullptr); cudaEvent_t done = nullptr; CUDA_CALL(cudaEventCreate(&done)); // TODO: why not condition on do_sync, so that we can use SyncGuard? 
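    // Fill the matrix with uniform random draws; _setMaskAndScale below is then expected to
    // turn each draw into either 0 (masked, with probability ~maskRate) or scaleValue (kept),
    // i.e. a dropout-style mask.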
if (sizeof(ElemType) == sizeof(float)) CURAND_CALL(curandGenerateUniform(gpuRNGHandle->Generator(), reinterpret_cast(Data()), GetNumElements())); else CURAND_CALL(curandGenerateUniformDouble(gpuRNGHandle->Generator(), reinterpret_cast(Data()), GetNumElements())); CUDA_CALL(cudaEventRecord(done)); CUDA_CALL(cudaEventSynchronize(done)); CUDA_CALL(cudaEventDestroy(done)); size_t N = GetNumElements(); size_t blocksPerGrid = (size_t) ceil(N / (double) GridDim::maxThreadsPerBlock); SyncGuard syncGuard; _setMaskAndScale<<>>(Data(), N, maskRate, scaleValue); } template ElemType GPUMatrix::Adagrad(GPUMatrix& gradients, const bool needAveMultiplier) { size_t numColsNeeded = gradients.GetNumCols(); if (needAveMultiplier) numColsNeeded += gradients.GetNumCols(); if (IsEmpty() || GetNumCols() < numColsNeeded) { RequireSize(gradients.GetNumRows(), numColsNeeded); SetValue(0.0); } assert(GetNumRows() == gradients.GetNumRows() && GetNumCols() == numColsNeeded); size_t n = gradients.GetNumElements(); ElemType* multipliers = nullptr; if (needAveMultiplier) multipliers = Data() + n; // temp memory used to store multipliers, int blocksPerGrid = (n + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock; _adagrad<<>>(Data(), gradients.Data(), n, multipliers); if (!needAveMultiplier) return 1; cublasHandle_t cuHandle = GetCublasHandle(GetComputeDeviceId()); if (sizeof(ElemType) == sizeof(float)) { float aveMultiplier = 0; CUBLAS_CALL(cublasSasum(cuHandle, (CUDA_LONG) n, reinterpret_cast(multipliers), 1, &aveMultiplier)); return (ElemType) aveMultiplier / n; } else { double aveMultiplier = 0; CUBLAS_CALL(cublasDasum(cuHandle, (CUDA_LONG) n, reinterpret_cast(multipliers), 1, &aveMultiplier)); return (ElemType) aveMultiplier / n; } } template void GPUMatrix::FSAdagrad(GPUMatrix& gradients, GPUMatrix& functionValues, ElemType learnRatePerSample, ElemType momentum, ElemType adaWeight, ElemType adaMul, bool unitGainMomentum) { size_t numColsNeeded = 2 * gradients.GetNumCols(); if (IsEmpty() || (GetNumCols() < numColsNeeded)) { RequireSize(gradients.GetNumRows(), numColsNeeded); SetValue(0.0); } assert((GetNumRows() == gradients.GetNumRows()) && (GetNumCols() == numColsNeeded)); size_t n = gradients.GetNumElements(); int blocksPerGrid = (n + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock; _fsadagrad<<>>(n, gradients.Data(), Data(), Data()+ n, functionValues.Data(), learnRatePerSample, momentum, adaWeight, adaMul, unitGainMomentum); } template void GPUMatrix::Adam(GPUMatrix& gradients, GPUMatrix& functionValues, ElemType learnRatePerSample, ElemType momentum, ElemType adaWeight, ElemType adaMul, ElemType epsilon, bool unitGainMomentum, bool adamax) { size_t numColsNeeded = 2 * gradients.GetNumCols(); if (IsEmpty() || (GetNumCols() < numColsNeeded)) { RequireSize(gradients.GetNumRows(), numColsNeeded); SetValue(0.0); } assert((GetNumRows() == gradients.GetNumRows()) && (GetNumCols() == numColsNeeded)); size_t n = gradients.GetNumElements(); int blocksPerGrid = (n + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock; _adam << > >(n, gradients.Data(), Data(), Data() + n, functionValues.Data(), learnRatePerSample, momentum, adaWeight, adaMul, epsilon, unitGainMomentum, adamax); } template ElemType GPUMatrix::RmsProp(GPUMatrix& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier, const bool initialized) { const ElemType floor = 1e-6f; static ElemType* upd_gpu = (ElemType*) 0; size_t n = 
gradients.GetNumElements(); int blocksPerGrid = (GetNumElements() + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock; size_t numColsNeeded = gradients.GetNumCols() * 3; if (needAveMultiplier) numColsNeeded += gradients.GetNumCols(); if (IsEmpty() || GetNumCols() < numColsNeeded || !initialized) { RequireSize(gradients.GetNumRows(), numColsNeeded); SetValue(0.0); ElemType* avars = Data(); // accumulated variances for RMS scaling ElemType* signs = Data() + n; // sign of previous gradient ElemType* steps = Data() + 2 * n; // current step size // Data()+3*n is temp memory used to store multipliers, no need to initialize _rmsprop_init<<>>(avars, signs, steps, gradients.Data(), n); } assert(GetNumRows() == gradients.GetNumRows() && GetNumCols() == numColsNeeded); ElemType* avars = Data(); // accumulated variances for RMS scaling ElemType* signs = Data() + n; // sign of previous gradient ElemType* steps = Data() + 2 * n; // current step size ElemType* multipliers = nullptr; if (needAveMultiplier) multipliers = Data() + 3 * n; // temp memory used to store multipliers, if (!upd_gpu) { const ElemType upd[] = { 2, 2, 0, 2, 2, 0, 1, 1, 1, 2, 2, 0, 1, 2, 1, 0, 2, 2, 1, 1, 1, 0, 2, 2, 0, 2, 2, }; upd_gpu = TracingGPUMemoryAllocator::Allocate(GetComputeDeviceId(), 27); CUDA_CALL(cudaMemcpy(upd_gpu, upd, sizeof(ElemType) * _countof(upd), cudaMemcpyHostToDevice)); } _rmsprop<<>>(avars, signs, steps, gradients.Data(), n, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, floor, upd_gpu, multipliers); if (!needAveMultiplier) return 1; cublasHandle_t cuHandle = GetCublasHandle(GetComputeDeviceId()); if (sizeof(ElemType) == sizeof(float)) { float aveMultiplier = 0; CUBLAS_CALL(cublasSasum(cuHandle, (CUDA_LONG) n, reinterpret_cast(multipliers), 1, &aveMultiplier)); return aveMultiplier / n; } else { double aveMultiplier = 0; CUBLAS_CALL(cublasDasum(cuHandle, (CUDA_LONG) n, reinterpret_cast(multipliers), 1, &aveMultiplier)); return (ElemType) aveMultiplier / n; } } template void GPUMatrix::AdaDelta(GPUMatrix& gradients, GPUMatrix& functionValues, ElemType learningRate, ElemType rho, ElemType epsilon) { size_t numColsNeeded = 2 * gradients.GetNumCols(); if (IsEmpty() || (GetNumCols() < numColsNeeded)) { RequireSize(gradients.GetNumRows(), numColsNeeded); SetValue(0.0); } assert((GetNumRows() == gradients.GetNumRows()) && (GetNumCols() == numColsNeeded)); size_t n = gradients.GetNumElements(); int blocksPerGrid = (n + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock; _adadelta << > >(n, gradients.Data(), Data(), Data() + n, functionValues.Data(), learningRate, rho, epsilon); } template void GPUMatrix::Reshape(const size_t numRows, const size_t numCols) { assert(numRows * numCols == GetNumElements()); if (numRows * numCols != GetNumElements()) InvalidArgument("Reshape: total number of elements does not match."); m_numRows = numRows; m_numCols = numCols; } template void GPUMatrix::RequireSize(const size_t numRows, const size_t numCols, bool growOnly) { if (GetNumRows() != numRows || GetNumCols() != numCols) Resize(numRows, numCols, growOnly); } template void GPUMatrix::Resize(const size_t numRows, const size_t numCols, bool growOnly) { if (GetNumRows() == numRows && GetNumCols() == numCols) return; VerifyResizable(__FUNCTION__); size_t numElements = numRows * numCols; if (numElements > GetSizeAllocated() || // grow allocation (!growOnly && numElements != GetSizeAllocated())) // shrink allocation if not growOnly { // If the buffer exists, free it before allocate if (Buffer()) { 
TracingGPUMemoryAllocator::Free(GetComputeDeviceId(), Buffer()); } // reallocate buffer if numElements > 0 ElemType* pArray = nullptr; if (numElements > 0) { pArray = TracingGPUMemoryAllocator::Allocate(GetComputeDeviceId(), numRows, numCols); } SetBuffer(pArray, numElements * sizeof(ElemType)); SetSizeAllocated(numElements); } // success m_sliceViewOffset = 0; m_numRows = numRows; m_numCols = numCols; } template size_t GPUMatrix::LocateElement(const size_t row, const size_t col) const { assert(row < m_numRows && col < m_numCols); return LocateColumn(col) + row; // matrix in column-wise storage } template size_t GPUMatrix::LocateColumn(const size_t col) const { assert(col < GetNumCols()); return col * m_numRows; // matrix in column-wise storage } template ElemType GPUMatrix::Get00Element() const { ElemType res = 0; CUDA_CALL(cudaMemcpy(&res, Data(), sizeof(ElemType), cudaMemcpyDeviceToHost)); return res; } #pragma endregion Basic Operators #pragma region Member BLAS Functions template GPUMatrix& GPUMatrix::operator+=(ElemType alpha) { if (IsEmpty()) LogicError("operator+=: Matrix is empty."); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); SyncGuard syncGuard; _addValue<<>>(Data(), alpha, N); return *this; } template GPUMatrix GPUMatrix::operator+(ElemType alpha) const { if (IsEmpty()) LogicError("operator+: Matrix is empty."); GPUMatrix c(*this); c += alpha; return c; } template GPUMatrix& GPUMatrix::AssignSumOf(const ElemType alpha, const GPUMatrix& a) { SetValue(a); (*this) += alpha; return (*this); } template GPUMatrix& GPUMatrix::operator+=(const GPUMatrix& a) { ScaleAndAdd(1, a, *this); return *this; } template GPUMatrix GPUMatrix::operator+(const GPUMatrix& a) const { if (GetNumElements() == 1) { GPUMatrix c(a); c += Get00Element(); return c; } else if (a.GetNumElements() == 1) { GPUMatrix c(*this); c += a.Get00Element(); return c; } else { GPUMatrix c(*this); // this implementation will introduce a copy overhead. but make resue of the code c += a; return c; } } template GPUMatrix& GPUMatrix::AssignSumOf(const GPUMatrix& a, const GPUMatrix& b) { SetValue(a); (*this) += b; return (*this); } template GPUMatrix& GPUMatrix::operator-=(ElemType alpha) { if (IsEmpty()) LogicError("operato-=: Matrix is empty."); return operator+=(-1 * alpha); } template GPUMatrix GPUMatrix::operator-(ElemType alpha) const { if (IsEmpty()) LogicError("operator-: Matrix is empty."); return operator+(-1 * alpha); } template GPUMatrix& GPUMatrix::AssignDifferenceOf(const ElemType alpha, const GPUMatrix& a) { RequireSize(a.m_numRows, a.m_numCols); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); a.PrepareDevice(); SyncGuard syncGuard; _assignDifferenceOf1<<>>(Data(), alpha, a.Data(), N); return *this; } template GPUMatrix& GPUMatrix::AssignDifferenceOf(const GPUMatrix& a, const ElemType alpha) { RequireSize(a.m_numRows, a.m_numCols); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); a.PrepareDevice(); SyncGuard syncGuard; _assignDifferenceOf2<<>>(Data(), alpha, a.Data(), N); return *this; } template GPUMatrix& GPUMatrix::operator-=(const GPUMatrix& a) { ScaleAndAdd(-1, a, *this); return *this; } template GPUMatrix GPUMatrix::operator-(const GPUMatrix& a) const { GPUMatrix c(*this); // this implementation will introduce a copy overhead. 
but make resue of the code c -= a; return c; } template GPUMatrix& GPUMatrix::AssignDifferenceOf(const GPUMatrix& a, const GPUMatrix& b) { if (this != &a) { RequireSize(a.GetNumRows(), a.GetNumCols()); SetValue(a); } (*this) -= b; return *this; } template GPUMatrix& GPUMatrix::operator*=(ElemType alpha) { Scale(alpha, *this); return *this; } template GPUMatrix GPUMatrix::operator*(ElemType alpha) const { GPUMatrix c(GetNumRows(), GetNumCols(), GetComputeDeviceId()); Scale(alpha, *this, c); return c; } template GPUMatrix& GPUMatrix::AssignProductOf(const ElemType alpha, const GPUMatrix& a) { Scale(alpha, a, *this); return *this; } template GPUMatrix& GPUMatrix::AssignProductOf(const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB) { if (a.GetNumElements() == 1) { if (transposeB) AssignTransposeOf(b); (*this) *= a.Get00Element(); } else if (b.GetNumElements() == 1) { if (transposeA) AssignTransposeOf(a); (*this) *= b.Get00Element(); } else Multiply(a, transposeA, b, transposeB, *this); return *this; } template GPUMatrix GPUMatrix::operator*(const GPUMatrix& a) const { const GPUMatrix& us = *this; if (GetNumElements() == 1) { GPUMatrix c(GetComputeDeviceId()); c.AssignProductOf(Get00Element(), a); return c; } else if (a.GetNumElements() == 1) { GPUMatrix c(GetComputeDeviceId()); c.AssignProductOf(a.Get00Element(), us); return c; } else { GPUMatrix c(GetNumRows(), a.GetNumCols(), GetComputeDeviceId()); Multiply(*this, a, c); return c; } } template GPUMatrix& GPUMatrix::operator/=(ElemType alpha) { (*this) *= 1 / alpha; return (*this); } template GPUMatrix GPUMatrix::operator/(ElemType alpha) const { return ((*this) * (1 / alpha)); } //element-wise power template GPUMatrix& GPUMatrix::operator^=(ElemType alpha) { GPUMatrix& us = *this; ElementWisePower(alpha, us, us); return us; } template GPUMatrix GPUMatrix::operator^(ElemType alpha) const { GPUMatrix c(GetNumRows(), GetNumCols(), GetComputeDeviceId()); ElementWisePower(alpha, *this, c); return c; } template GPUMatrix& GPUMatrix::AssignElementPowerOf(const GPUMatrix& a, const ElemType power) { ElementWisePower(power, a, *this); return *this; } template GPUMatrix& GPUMatrix::AddElementProductOf(const GPUMatrix& a, const GPUMatrix& b) { if (a.IsEmpty() || b.IsEmpty()) LogicError("AddElementProductOf: Matrix is empty."); assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) InvalidArgument("The input matrix dimensions do not match."); if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == GetNumCols())) InvalidArgument("The input matrix dimensions do not match [this]."); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); a.PrepareDevice(); SyncGuard syncGuard; _addElementProductOf<<>>(Data(), a.Data(), b.Data(), N); return *this; } template GPUMatrix& GPUMatrix::ColumnElementMultiplyWith(const GPUMatrix& a) { if (a.IsEmpty() || IsEmpty()) LogicError("ColumnElementMultiplyWith: Matrix is empty."); if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == 1)) InvalidArgument("ColumnElementMultiplyWith: The input matrix should be a col vector and match [this]'s rows."); CUDA_LONG N = (CUDA_LONG) a.GetNumRows(); CUDA_LONG M = (CUDA_LONG) GetNumCols(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); a.PrepareDevice(); SyncGuard syncGuard; _columnElementMultiplyWith<<>>(Data(), a.Data(), N, M); return *this; } template GPUMatrix& 
GPUMatrix::RowElementMultiplyWith(const GPUMatrix& a) { if (a.IsEmpty() || IsEmpty()) LogicError("RowElementMultiplyWith: Matrix is empty."); if (!(a.GetNumRows() == 1 && a.GetNumCols() == GetNumCols())) InvalidArgument("RowElementMultiplyWith: The input matrix should be a row vector and match [this]'s columns."); CUDA_LONG N = (CUDA_LONG) GetNumRows(); CUDA_LONG M = (CUDA_LONG) a.GetNumCols(); int blocksPerGrid = (int) ceil(1.0 * M / GridDim::maxThreadsPerBlock); a.PrepareDevice(); SyncGuard syncGuard; _rowElementMultiplyWith<<>>(Data(), a.Data(), N, M); return *this; } template GPUMatrix& GPUMatrix::RowElementDivideBy(const GPUMatrix& a) { if (a.IsEmpty() || IsEmpty()) LogicError("RowElementDivideBy: Matrix is empty."); if (!(a.GetNumRows() == 1 && a.GetNumCols() == GetNumCols())) InvalidArgument("RowElementDivideBy: The input matrix should be a row vector and match [this]'s columns."); CUDA_LONG N = (CUDA_LONG) GetNumRows(); CUDA_LONG M = (CUDA_LONG) a.GetNumCols(); int blocksPerGrid = (int) ceil(1.0 * M / GridDim::maxThreadsPerBlock); a.PrepareDevice(); SyncGuard syncGuard; _rowElementDivideBy<<>>(Data(), a.Data(), N, M); return *this; } template GPUMatrix& GPUMatrix::ColumnElementDivideBy(const GPUMatrix& a) { if (a.IsEmpty() || IsEmpty()) LogicError("ColumnElementDivideBy: Matrix is empty."); if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == 1)) InvalidArgument("ColumnElementDivideBy: The input matrix should be a col vector and match [this]'s rows."); CUDA_LONG N = (CUDA_LONG) a.GetNumRows(); CUDA_LONG M = (CUDA_LONG) GetNumCols(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); a.PrepareDevice(); SyncGuard syncGuard; _ColumnElementDivideBy<<>>(Data(), a.Data(), N, M); return *this; } template GPUMatrix& GPUMatrix::ElementInverse() { if (IsEmpty()) LogicError("ElementInverse: Matrix is empty."); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); SyncGuard syncGuard; _elemInverse<<>>(Data(), N); return *this; } template GPUMatrix& GPUMatrix::AssignElementInverseOf(const GPUMatrix& a) { SetValue(a); return ElementInverse(); } DEF_ELEMWISE_INPLACE_FUNC(Sigmoid) template GPUMatrix& GPUMatrix::AssignSigmoidOf(const GPUMatrix& a) { RequireSize(a.GetNumRows(), a.GetNumCols()); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); PrepareDevice(); SyncGuard syncGuard; // _elementWIseSigmoidOnCuda has an implementation that avoids possible overflow errors, but has a slight accuracy regression. 
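// A standard way to get that overflow safety is to keep exp()'s argument non-positive; below is a
// minimal host-side sketch of the usual formulation (illustrative only, and possibly different in
// detail from what _elementWiseSigmoidOnCuda actually does):
#if 0
static double StableSigmoid(double x)
{
    // x >= 0: 1 / (1 + e^(-x));  x < 0: e^x / (1 + e^x). Both branches keep exp()'s argument <= 0.
    if (x >= 0)
        return 1.0 / (1.0 + exp(-x));
    double e = exp(x);
    return e / (1.0 + e);
}
#endif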
#if 0 _elementWiseSigmoidOnCuda<<>>(a.Data(), Data(), N); #else _assignSigmoidOf<<>>(a.Data(), Data(), N); #endif return *this; } DEF_ELEMWISE_INPLACE_FUNC(SigmoidDerivative) DEF_ELEMWISE_ASSIGN_FUNC(SigmoidDerivative) template void GPUMatrix::AssignNoiseContrastiveEstimation(const GPUMatrix& a, const GPUMatrix& b, const GPUMatrix& bias, size_t sampleCount, GPUMatrix& tmp, GPUMatrix& c) //this: samples+probs // a : hidden // b : embedding // tmp: softmax // c : loglikelihood { UNCONST(ElemType, a, my_a); UNCONST(ElemType, b, my_b); UNCONST(ElemType, bias, my_bias); SyncGuard syncGuard; // a: dim * minibatch // b: dim * |vocab| int p = 512; int width = a.GetNumRows(); // dimension of hidden vector while (p / 2 > width) p = p / 2; // note: kernel has hard-coded dimension of 512 _computeNceOutputMax512Threads << > >( Data(), sampleCount, m_numRows / 2, my_a.Data(), // a a.GetNumRows(), my_b.Data(), // b my_bias.Data(), tmp.Data()); // tmp p = 512; while (p / 2 > GetNumElements() / 2) p = p / 2; // summing up objective must be done in one block // note: kernel has hard-coded dimension of 512 _assignNoiseContrastiveEstimationMax512Threads << <1, p >> >( Data(), sampleCount, m_numRows / 2, my_a.Data(), a.GetNumCols(), my_b.Data(), tmp.Data(), c.Data()); } template void GPUMatrix::AssignNCEDerivative(GPUMatrix& tmp, const GPUMatrix& a, const GPUMatrix& b, size_t inputIndex, GPUMatrix& c) { UNCONST(ElemType, a, my_a); UNCONST(ElemType, b, my_b); SyncGuard syncGuard; int p = 512; int width = a.GetNumRows(); while (p / 2 > width) p = p / 2; _assignNceDerivativeNew<<<(tmp.GetNumElements() + p - 1) / p, p>>>( Data(), tmp.GetNumCols(), m_numRows / 2, my_a.Data(), a.GetNumRows(), my_b.Data(), tmp.Data(), c.Data(), inputIndex); } template void GPUMatrix::AssignSoftmaxSum(const GPUMatrix& a, GPUMatrix& c) { UNCONST(ElemType, a, my_a); SyncGuard syncGuard; int p = 512; int width = a.GetNumRows(); while (p / 2 > width) p = p / 2; // note: kernel has hard-coded dimension of 512 _assignSoftmaxSumMax512Threads << <1, p >> >( my_a.Data(), width, Data(), c.Data()); } template void GPUMatrix::AssignNCEUnnormalizedEval(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) { assert(a.GetComputeDeviceId() == b.GetComputeDeviceId()); assert(GetNumRows() == a.GetNumRows()); assert(GetNumCols() == b.GetNumRows()); assert(a.GetNumCols() == b.GetNumRows()); UNUSED(a); UNUSED(b); UNUSED(c); // TODO: this function seems like a stub /* EnsureAuxMemory(); int p = 512; int width = a.GetNumCols(); while (p / 2 > width) p = p / 2; // this kernel need be launched in nnz blocks _sparseInnerProductDenseTimesDense << > >( m_dVal, m_buf, m_dCol, m_nz, GetNumRows(), a.Buffer(), b.Buffer(), b.GetNumRows(), m_res); // sum up the results _reductionSum32 << <1, 32 >> >(m_res, c.Buffer(), m_nz);*/ } DEF_ELEMWISE_INPLACE_FUNC(Tanh) DEF_ELEMWISE_ASSIGN_FUNC(Tanh) template GPUMatrix& GPUMatrix::InplaceLogSoftmax(const bool isColWise) { if (IsEmpty()) LogicError("InplaceLogSoftmax: Matrix is empty."); PrepareDevice(); if (isColWise) { CUDA_LONG N = (CUDA_LONG) GetNumCols(); // one kernel per column int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock); SyncGuard syncGuard; _logSoftMaxColWise<<>>(Data(), (CUDA_LONG) m_numCols, (CUDA_LONG) m_numRows); } else { CUDA_LONG N = (CUDA_LONG) GetNumRows(); // one kernel per column int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock); SyncGuard syncGuard; _logSoftMaxRowWise<<>>(Data(), (CUDA_LONG) m_numCols, (CUDA_LONG) m_numRows); } return *this; } template GPUMatrix& 
GPUMatrix::AssignLogSoftmaxOf(const GPUMatrix& a, const bool isColWise) { RequireSize(a.GetNumRows(), a.GetNumCols()); if (isColWise) { PrepareDevice(); CUDA_LONG N = (CUDA_LONG) GetNumCols(); CUDA_LONG M = (CUDA_LONG) GetNumRows(); SyncGuard syncGuard; // note: kernel uses hard-coded thread dimension _assignColumnwiseLogSoftmaxOf512Threads<<>>(a.Data(), Data(), N, M); } else { NOT_IMPLEMENTED; } return *this; } template GPUMatrix& GPUMatrix::InplaceHardmax(const bool isColWise) { return AssignHardmaxOf(*this, isColWise); } template GPUMatrix& GPUMatrix::AssignHardmaxOf(const GPUMatrix& a, const bool isColWise) { RequireSize(a.GetNumRows(), a.GetNumCols()); if (isColWise) { PrepareDevice(); CUDA_LONG N = (CUDA_LONG) GetNumCols(); CUDA_LONG M = (CUDA_LONG) GetNumRows(); SyncGuard syncGuard; // note: kernel uses hard-coded thread dimension _assignColumnwiseHardmaxOf512Threads << > >(a.Data(), Data(), N, M); } else { NOT_IMPLEMENTED; } return *this; } DEF_ELEMWISE_INPLACE_FUNC(Sqrt) DEF_ELEMWISE_ASSIGN_FUNC(Sqrt) DEF_ELEMWISE_INPLACE_FUNC(Exp) DEF_ELEMWISE_ASSIGN_FUNC(Exp) DEF_ELEMWISE_INPLACE_FUNC(Log) DEF_ELEMWISE_ASSIGN_FUNC(Log) DEF_ELEMWISE_INPLACE_FUNC(Abs) DEF_ELEMWISE_ASSIGN_FUNC(Abs) DEF_ELEMWISE_INPLACE_FUNC(LinearRectifierDerivative) DEF_ELEMWISE_ASSIGN_FUNC(LinearRectifierDerivative) DEF_ELEMWISE_INPLACE_FUNC(Cosine) DEF_ELEMWISE_ASSIGN_FUNC(Cosine) DEF_ELEMWISE_INPLACE_FUNC(NegativeSine) DEF_ELEMWISE_ASSIGN_FUNC(NegativeSine) template GPUMatrix& GPUMatrix::InplaceTruncateBottom(const ElemType threshold) { return AssignTruncateBottomOf(*this, threshold); } template GPUMatrix& GPUMatrix::AssignTruncateBottomOf(const GPUMatrix& a, const ElemType threshold) { if (a.IsEmpty()) LogicError("AssignTruncateBottomOf: Matrix a is empty."); if (this != &a) { RequireSize(a.GetNumRows(), a.GetNumCols()); } CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock); PrepareDevice(); SyncGuard syncGuard; _assignTruncateBottom<<>>(Data(), a.Data(), threshold, N); return *this; } template GPUMatrix& GPUMatrix::InplaceTruncateTop(const ElemType threshold) { return AssignTruncateTopOf(*this, threshold); } template GPUMatrix& GPUMatrix::AssignTruncateTopOf(const GPUMatrix& a, const ElemType threshold) { if (a.IsEmpty()) LogicError("AssignTruncateTopOf: Matrix a is empty."); if (this != &a) { RequireSize(a.GetNumRows(), a.GetNumCols()); } CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock); a.PrepareDevice(); SyncGuard syncGuard; _assignTruncateTop<<>>(Data(), a.Data(), threshold, N); return *this; } template GPUMatrix& GPUMatrix::InplaceTruncate(const ElemType threshold) { if (IsEmpty()) LogicError("InplaceTruncate: Matrix is empty."); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock); PrepareDevice(); SyncGuard syncGuard; _inplaceTruncate<<>>(Data(), threshold, N); return *this; } template GPUMatrix& GPUMatrix::InplaceSoftThreshold(const ElemType threshold) { if (IsEmpty()) LogicError("InplaceSoftThreshold: Matrix is empty."); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock); PrepareDevice(); SyncGuard syncGuard; _inplaceSoftThreshold<<>>(Data(), threshold, N); return *this; } template GPUMatrix& GPUMatrix::SetToZeroIfAbsLessThan(const ElemType threshold) { if (IsEmpty()) LogicError("SetToZeroIfAbsLessThan: Matrix is empty."); CUDA_LONG N = (CUDA_LONG) 
GetNumElements();
    int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock);
    PrepareDevice();
    SyncGuard syncGuard;
    _setToZeroIfAbsLessThan<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(Data(), threshold, N);
    return *this;
}

template <class ElemType>
ElemType GPUMatrix<ElemType>::SumOfAbsElements() const
{
    if (IsEmpty())
        LogicError("SumOfAbsElements: Matrix is empty");

    cublasHandle_t cuHandle = GetCublasHandle(GetComputeDeviceId());
    if (sizeof(ElemType) == sizeof(float))
    {
        float res = 0;
        CUBLAS_CALL(cublasSasum(cuHandle, (CUDA_LONG) GetNumElements(), reinterpret_cast<float*>(Data()), 1, &res));
        return res;
    }
    else
    {
        double res = 0;
        CUBLAS_CALL(cublasDasum(cuHandle, (CUDA_LONG) GetNumElements(), reinterpret_cast<double*>(Data()), 1, &res));
        return ElemType(res);
    }
}

template <class ElemType>
ElemType GPUMatrix<ElemType>::SumOfElements() const
{
    if (IsEmpty())
        LogicError("SumOfElements: Matrix is empty");

    ElemType* d_sum = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), 1);
    ElemType h_sum;
    // WARNING: THIS kernel is not the most efficient way!
    // note: kernel has hard-coded dimension of 1024
    _reductionSum1024Threads<<<1, 1024, 0, t_stream>>>(Data(), d_sum, (CUDA_LONG) GetNumElements());
    CUDA_CALL(cudaMemcpy(&h_sum, d_sum, sizeof(ElemType), cudaMemcpyDeviceToHost));
    TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), d_sum);
    return h_sum;
}

template <class ElemType>
GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignSumOfElements(const GPUMatrix<ElemType>& a)
{
    if (a.IsEmpty())
        LogicError("AssignSumOfElements: Matrix a is empty");

    RequireSize(1, 1);
    PrepareDevice();
    SyncGuard syncGuard;
    // WARNING: THIS kernel is not the most efficient way!
    // note: kernel has hard-coded dimension of 1024
    _reductionSumAndAssign1024Threads<<<1, 1024>>>(Data(), a.Data(), (CUDA_LONG) a.GetNumElements(), (CUDA_LONG) GetNumElements());
    return (*this);
}

template <class ElemType>
DeviceBoundNumber<ElemType> GPUMatrix<ElemType>::Sum_AsDeviceBoundNum() const
{
    if (IsEmpty())
        LogicError("Matrix is empty");

    ElemType* d_sum = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), 1);
    // WARNING: THIS kernel is not the most efficient way!
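// Note: these reductions are "not the most efficient way" because a single block of 1024 threads
// strides over the entire array and then combines the per-thread partial sums with a shared-memory
// tree reduction, so only one SM does the work. Illustrative shape of such a kernel (a sketch with
// a hypothetical name, not the actual _reductionSum1024Threads implementation):
#if 0
template <class ElemType>
__global__ void ReduceSumSketch(const ElemType* data, ElemType* result, CUDA_LONG n)
{
    __shared__ ElemType partial[1024];
    ElemType sum = 0;
    for (CUDA_LONG i = threadIdx.x; i < n; i += blockDim.x) // block-stride accumulation
        sum += data[i];
    partial[threadIdx.x] = sum;
    __syncthreads();
    for (int stride = blockDim.x / 2; stride > 0; stride /= 2) // shared-memory tree reduction
    {
        if (threadIdx.x < stride)
            partial[threadIdx.x] += partial[threadIdx.x + stride];
        __syncthreads();
    }
    if (threadIdx.x == 0)
        *result = partial[0];
}
#endif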
// note: kernel has hard-coded dimension of 1024 _reductionSum1024Threads << <1, 1024, 0, t_stream >> >(Data(), d_sum, (CUDA_LONG)GetNumElements()); DeviceBoundNumber result; result.ShallowCopyFrom(d_sum, GetComputeDeviceId()); return result; } template ElemType GPUMatrix::AbsoluteMax() const { cublasHandle_t cuHandle = GetCublasHandle(GetComputeDeviceId()); ElemType res; if (sizeof(ElemType) == sizeof(float)) { int resInd = 0; cublasIsamax(cuHandle, (CUDA_LONG)GetNumElements(), reinterpret_cast(Data()), 1, &resInd); resInd--; CUDA_CALL(cudaMemcpy(reinterpret_cast(&res), reinterpret_cast(Data() + resInd), sizeof(float), cudaMemcpyDeviceToHost)); return res; } else { int resInd = 0; cublasIdamax(cuHandle, (CUDA_LONG)GetNumElements(), reinterpret_cast(Data()), 1, &resInd); resInd--; CUDA_CALL(cudaMemcpy(reinterpret_cast(&res), Data() + resInd, sizeof(double), cudaMemcpyDeviceToHost)); return res; } } template GPUMatrix& GPUMatrix::ElementMultiplyWith(const GPUMatrix& a) { if (IsEmpty() || a.IsEmpty()) LogicError("ElementMultiplyWith: Matrix is empty."); GPUMatrix& us = *this; assert(us.GetNumRows() == a.GetNumRows() && us.GetNumCols() == a.GetNumCols()); if (us.GetNumRows() != a.GetNumRows() || us.GetNumCols() != a.GetNumCols()) InvalidArgument("The matrix dimensions do not match."); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(((double) N) / GridDim::maxThreadsPerBlock); a.PrepareDevice(); SyncGuard syncGuard; _elemMul<<>>(Data(), a.Data(), N); return *this; } template GPUMatrix& GPUMatrix::AssignElementProductOf(const GPUMatrix& a, const GPUMatrix& b) { if (a.IsEmpty() || b.IsEmpty()) LogicError("AssignElementProductOf: Matrix is empty."); assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) InvalidArgument("The input matrix dimensions do not match."); RequireSize(a.GetNumRows(), a.GetNumCols()); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(((double) N) / GridDim::maxThreadsPerBlock); a.PrepareDevice(); SyncGuard syncGuard; _assignElementProductOf<<>>(Data(), a.Data(), b.Data(), N); return *this; } template GPUMatrix& GPUMatrix::ElementDivideBy(const GPUMatrix& a) { return AssignElementDivisionOf(*this, a); } template GPUMatrix& GPUMatrix::AssignElementDivisionOf(const GPUMatrix& a, const GPUMatrix& b) { if (a.IsEmpty() || b.IsEmpty()) LogicError("AssignElementDivisionOf: Matrix is empty."); assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) InvalidArgument("The input matrix dimensions do not match."); RequireSize(a.GetNumRows(), a.GetNumCols()); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(((double) N) / GridDim::maxThreadsPerBlock); a.PrepareDevice(); SyncGuard syncGuard; _assignElementDivisionOf<<>>(Data(), a.Data(), b.Data(), N); return *this; } template bool GPUMatrix::IsEqualTo(const GPUMatrix& a, const ElemType threshold /*= 1e-8*/) const { return AreEqual(*this, a, threshold); } template void GPUMatrix::VectorSum(const GPUMatrix& a, GPUMatrix& c, const bool isColWise) { if (a.GetComputeDeviceId() != c.GetComputeDeviceId()) { InvalidArgument("All matrices must be on the same GPU"); } a.PrepareDevice(); if (a.IsEmpty()) LogicError("VectorSum: Input matrix is empty."); const CUDA_LONG n = (CUDA_LONG) a.GetNumRows(); const CUDA_LONG m = (CUDA_LONG) a.GetNumCols(); assert(m > 0 && n > 0); // 
converting from size_t to int may cause overflow int blocksPerGrid = 0; if (isColWise) // col-wise { c.RequireSize(1, m); blocksPerGrid = (int) ceil(1.0 * m / GridDim::maxThreadsPerBlock); } else { c.RequireSize(n, 1); blocksPerGrid = (int) ceil(1.0 * n / GridDim::maxThreadsPerBlock); } SyncGuard syncGuard; _vectorSum<<>>(c.Data(), a.Data(), n, m, isColWise); } template void GPUMatrix::VectorNorm1(GPUMatrix& c, const bool isColWise) const { if (IsEmpty()) LogicError("VectorNorm1: Matrix is empty."); const CUDA_LONG n = (CUDA_LONG) GetNumRows(); const CUDA_LONG m = (CUDA_LONG) GetNumCols(); assert(m > 0 && n > 0); // converting from size_t to int may cause overflow PrepareDevice(); c.ChangeDeviceTo(GetComputeDeviceId()); int blocksPerGrid = 0; if (isColWise) // col-wise { c.RequireSize(1, m); blocksPerGrid = (int) ceil(1.0 * m / GridDim::maxThreadsPerBlock); } else { c.RequireSize(n, 1); blocksPerGrid = (int) ceil(1.0 * n / GridDim::maxThreadsPerBlock); } SyncGuard syncGuard; _vectorNorm1<<>>(c.Data(), Data(), n, m, isColWise); } template GPUMatrix& GPUMatrix::AssignVectorNorm1Of(GPUMatrix& a, const bool isColWise) { a.VectorNorm1(*this, isColWise); return *this; } template void GPUMatrix::VectorNorm2(GPUMatrix& c, const bool isColWise) const { if (IsEmpty()) LogicError("VectorNorm2: Matrix is empty."); const CUDA_LONG n = (CUDA_LONG) GetNumRows(); const CUDA_LONG m = (CUDA_LONG) GetNumCols(); assert(m > 0 && n > 0); // converting from size_t to int may cause overflow PrepareDevice(); c.ChangeDeviceTo(GetComputeDeviceId()); int blocksPerGrid = 0; if (isColWise) // col-wise { c.RequireSize(1, m); blocksPerGrid = (int) ceil(1.0 * m / GridDim::maxThreadsPerBlock); } else { c.RequireSize(n, 1); c.ChangeDeviceTo(GetComputeDeviceId()); blocksPerGrid = (int) ceil(1.0 * n / GridDim::maxThreadsPerBlock); } SyncGuard syncGuard; _vectorNorm2<<>>(c.Data(), Data(), n, m, isColWise); } template GPUMatrix& GPUMatrix::AssignVectorNorm2Of(GPUMatrix& a, const bool isColWise) { a.VectorNorm2(*this, isColWise); return *this; } template void GPUMatrix::VectorNormInf(GPUMatrix& c, const bool isColWise) const { if (IsEmpty()) LogicError("VectorMax: Matrix is empty."); // this implementation is not efficient GPUMatrix tmp(GetComputeDeviceId()); GPUMatrix tmp1(GetComputeDeviceId()); tmp.AssignAbsOf((*this)); tmp.VectorMax(tmp1, c, isColWise); } template GPUMatrix& GPUMatrix::AssignVectorNormInfOf(GPUMatrix& a, const bool isColWise) { a.VectorNormInf(*this, isColWise); return *this; } template GPUMatrix& GPUMatrix::AssignInnerProductOf(const GPUMatrix& a, const GPUMatrix& b, const bool isColWise) { InnerProduct(a, b, *this, isColWise); return *this; } template GPUMatrix& GPUMatrix::AssignKhatriRaoProductOf(const GPUMatrix& a, const GPUMatrix& b) { if (a.IsEmpty() || b.IsEmpty()) LogicError("AssignKhatriRaoProductOf: Matrix is empty."); CUDA_LONG cols = a.GetNumCols(); assert(cols == b.GetNumCols()); if (!(cols == b.GetNumCols())) InvalidArgument("AssignKhatriRaoProductOf: The input matrix dimensions do not match."); CUDA_LONG rowsA = (CUDA_LONG) a.GetNumRows(); CUDA_LONG rowsB = (CUDA_LONG) b.GetNumRows(); RequireSize(rowsA * rowsB, cols); float N = (float) GetNumElements(); int blocksPerGrid = (int) ceil(N / GridDim::maxThreadsPerBlock); a.PrepareDevice(); SyncGuard syncGuard; _assignKhatriRaoProductOf<<>>(Data(), a.Data(), b.Data(), rowsA, rowsB, cols); return *this; } //column-wise reshaped product. 
Used to compute KhatriRaoProduct Gradient // this = reshape each column of a from (K1xK2,1) to (K1, K2) // if each column of a is not transposed, each (K1, K2) times each column of b (K2, frames). // the output is a (K1, frames) matrix // if each column of a is tranposed, each (K1, K2)^T times each column of b(K1, frames) and output is (K2, frames) template GPUMatrix& GPUMatrix::AddColumnReshapeProductOf(const GPUMatrix& a, const GPUMatrix& b, const bool transposeAColumn) { if (a.IsEmpty() || b.IsEmpty()) LogicError("AddColumnReshapeProductOf: Matrix is empty."); CUDA_LONG cols = a.GetNumCols(); assert(cols == b.GetNumCols()); if (!(cols == b.GetNumCols())) InvalidArgument("AddColumnReshapeProductOf: The input matrix dimensions do not match."); CUDA_LONG rowsA = (CUDA_LONG) a.GetNumRows(); CUDA_LONG rowsB = (CUDA_LONG) b.GetNumRows(); if (rowsA % rowsB != 0) InvalidArgument("AddColumnReshapeProductOf: number of rows in a should be multiples of that in b."); CUDA_LONG rowsC = rowsA / rowsB; if (rowsC != GetNumRows() || cols != GetNumCols()) InvalidArgument("AddColumnReshapeProductOf: This matrix does not have the right size."); float N = (float) GetNumElements(); int blocksPerGrid = (int) ceil(N / GridDim::maxThreadsPerBlock); a.PrepareDevice(); SyncGuard syncGuard; _addColumnReshapeProductOf<<>>(Data(), a.Data(), b.Data(), rowsB, rowsC, cols, transposeAColumn); return *this; } template GPUMatrix& GPUMatrix::AddWithScaleOf(ElemType alpha, const GPUMatrix& a) { ScaleAndAdd(alpha, a, *this); return *this; } template ElemType GPUMatrix::FrobeniusNorm() const { if (IsEmpty()) LogicError("FrobeniusNorm: Matrix is empty."); ElemType* d_sum = TracingGPUMemoryAllocator::Allocate(GetComputeDeviceId(), 1); ElemType h_sum = 0; // WARNING: THIS kernel is not the most efficient way! // note: kernel has hard-coded dimension of 1024 _reductionSum21024Threads << <1, 1024, 0, t_stream >> >(Data(), d_sum, (CUDA_LONG)GetNumElements(), true); CUDA_CALL(cudaMemcpy(&h_sum, d_sum, sizeof(ElemType), cudaMemcpyDeviceToHost)); TracingGPUMemoryAllocator::Free(GetComputeDeviceId(), d_sum); return (h_sum); } template GPUMatrix& GPUMatrix::AssignFrobeniusNormOf(const GPUMatrix& a) { if (a.IsEmpty()) LogicError("AssignFrobeniusNormOf: Matrix a is empty."); RequireSize(1, 1); PrepareDevice(); // WARNING: THIS kernel is not the most efficient way! // note: kernel has hard-coded dimension of 1024 _reductionSum21024Threads << <1, 1024, 0, t_stream >> >(a.Data(), Data(), (CUDA_LONG)a.GetNumElements(), true); return *this; } template ElemType GPUMatrix::MatrixNormInf() const { if (IsEmpty()) LogicError("MatrixNormInf: Matrix is empty."); ElemType* d_maxAbs = TracingGPUMemoryAllocator::Allocate(GetComputeDeviceId(), 1); ElemType h_maxAbs = 0; // WARNING: THIS kernel is not the most efficient way! // note: kernel has hard-coded dimension of 1024 _reductionMatrixNormInf1024Threads << <1, 1024, 0, t_stream >> >(Data(), d_maxAbs, (CUDA_LONG)GetNumElements()); CUDA_CALL(cudaMemcpy(&h_maxAbs, d_maxAbs, sizeof(ElemType), cudaMemcpyDeviceToHost)); TracingGPUMemoryAllocator::Free(GetComputeDeviceId(), d_maxAbs); return h_maxAbs; } template ElemType GPUMatrix::MatrixNorm1() const { if (IsEmpty()) LogicError("MatrixNorm1: Matrix is empty."); return SumOfAbsElements(); } template ElemType GPUMatrix::MatrixNorm0() const { if (IsEmpty()) LogicError("MatrixNorm0: Matrix is empty."); ElemType* d_nz = TracingGPUMemoryAllocator::Allocate(GetComputeDeviceId(), 1); ElemType h_nz = 0; // WARNING: THIS kernel is not the most efficient way! 
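// Note: for reference, MatrixNormInf above computes max_i |x_i|, MatrixNorm1 computes sum_i |x_i|
// (via SumOfAbsElements), and MatrixNorm0 below counts the nonzero entries; all three rely on the
// same single-block, 1024-thread reduction pattern flagged in the warnings.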
// note: kernel has hard-coded dimension of 1024 _reductionMatrixNorm01024Threads << <1, 1024, 0, t_stream >> >(Data(), d_nz, (CUDA_LONG)GetNumElements()); CUDA_CALL(cudaMemcpy(&h_nz, d_nz, sizeof(ElemType), cudaMemcpyDeviceToHost)); TracingGPUMemoryAllocator::Free(GetComputeDeviceId(), d_nz); return h_nz; } template GPUMatrix& GPUMatrix::AssignSignOf(const GPUMatrix& a) { if (a.IsEmpty()) LogicError("AssignSignOf: Matrix a is empty."); if (this != &a) RequireSize(a.GetNumRows(), a.GetNumCols()); PrepareDevice(); int blocksPerGrid = (int) ceil(1.0 * GetNumElements() / GridDim::maxThreadsPerBlock); SyncGuard syncGuard; _assignSignOf<<>>(Data(), a.Data(), (CUDA_LONG) GetNumElements()); return *this; } template GPUMatrix& GPUMatrix::AddSignOf(const GPUMatrix& a) { if (a.IsEmpty()) LogicError("AddSignOf: Matrix a is empty."); if (this != &a) RequireSize(a.GetNumRows(), a.GetNumCols()); PrepareDevice(); int blocksPerGrid = (int) ceil(1.0 * GetNumElements() / GridDim::maxThreadsPerBlock); SyncGuard syncGuard; _addSignOf<<>>(Data(), a.Data(), (CUDA_LONG) GetNumElements()); return *this; } template void GPUMatrix::VectorMax(GPUMatrix& maxIndexes, GPUMatrix& maxValues, const bool isColWise) const { if (IsEmpty()) LogicError("VectorMax: Matrix is empty."); const GPUMatrix& us = *this; const CUDA_LONG m = (CUDA_LONG) GetNumRows(); const CUDA_LONG n = (CUDA_LONG) GetNumCols(); assert(m > 0 && n > 0); // converting from size_t to int may cause overflow PrepareDevice(); SyncGuard syncGuard; if (isColWise) { maxValues.RequireSize(1, n); maxIndexes.RequireSize(1, n); int blocksPerGrid = n; // we'll have 1 block processing 1 column // note: kernel has hard-coded dimension of 512 _vectorMaxMinReduce512Threads<<>>(us.Data(), maxIndexes.Data(), maxValues.Data(), m, n); /*int blocksPerGrid=(int)ceil(1.0*n/GridDim::maxThreadsPerBlock); _vectorMax<<>>(us.Data(),maxIndexes.Data(),maxValues.Data(),m,n,isColWise);*/ } else { maxValues.RequireSize(m, 1); maxIndexes.RequireSize(m, 1); int blocksPerGrid = (int) ceil(1.0 * m / GridDim::maxThreadsPerBlock); _vectorMax<<>>(us.Data(), maxIndexes.Data(), maxValues.Data(), m, n, isColWise); } } __global__ void _initIndicesForSort(uint64_t* indexes, CUDA_LONG crow, CUDA_LONG ccol) { CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x; if (id >= crow * ccol) return; uint32_t irow = id % crow; uint32_t icol = id / crow; indexes[id] = (static_cast(irow) << 32) | icol; } template void GPUMatrix::VectorMax(GPUMatrix& maxIndexes, GPUMatrix& maxValues, const bool isColWise, int topK) const { if (IsEmpty()) LogicError("VectorMax: Matrix is empty."); if (topK == 1) { VectorMax(maxIndexes, maxValues, isColWise); return; } if (!isColWise) RuntimeError("Row-wise TopK max is not supported."); const GPUMatrix& us = *this; const CUDA_LONG m = (CUDA_LONG) GetNumRows(); const CUDA_LONG n = (CUDA_LONG) GetNumCols(); assert(topK <= m); assert(m > 0 && n > 0); // converting from size_t to int may cause overflow PrepareDevice(); SyncGuard syncGuard; maxValues.RequireSize(topK, n); maxIndexes.RequireSize(topK, n); // To sort matrix columns we use 2-pass _stable_ sort algorithm: // 1. Sort by values (descending) with corresponding row/col indexes. // 2. Sort by col indices (ascending) with corresponding values/row indices. // Indices are stored as 64-bit ints where low 32 bits represent column and high 32 bits - row index. // On the second pass only first 32 bits of the index are used in sorting, so SortPairs has // begin_bit and end_bit set accordingly. 
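// A small host-side illustration of the key packing produced by _initIndicesForSort and consumed by
// the two cub::DeviceRadixSort passes below (helper names are hypothetical, for clarity only):
#if 0
inline uint64_t PackRowCol(uint32_t row, uint32_t col)
{
    return (static_cast<uint64_t>(row) << 32) | col; // high 32 bits: row, low 32 bits: column
}
inline uint32_t UnpackRow(uint64_t key) { return static_cast<uint32_t>(key >> 32); }
inline uint32_t UnpackCol(uint64_t key) { return static_cast<uint32_t>(key & 0xffffffffu); }
// Pass 1: SortPairsDescending on the values carries the packed (row, col) keys along.
// Pass 2: SortPairs on bits [0, 32) of the key regroups entries by column; because radix sort is
//         stable, the descending value order from pass 1 is preserved within each column.
#endif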
CUDA_LONG celt = static_cast(GetNumElements()); ElemType* inVal = us.Data(); ElemType* outVal1 = nullptr; ElemType* outVal2 = nullptr; uint64_t* inIdx = nullptr; uint64_t* outIdx = nullptr; // Determine temp buffer size needed for SortPairsDescending to sort values on the first pass. size_t cbtemp = 0; // If first param is nullptr then no actual work is done except writing result to cbtemp. CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending(nullptr, cbtemp, inVal, outVal1, inIdx, outIdx, celt, 0, sizeof(ElemType) * 8, t_stream)); size_t ctemp1 = (cbtemp + sizeof(ElemType) - 1) / sizeof(ElemType); // Determine temp buffer size needed for SortPairs to sort indices on the second pass. cbtemp = 0; CUDA_CALL(cub::DeviceRadixSort::SortPairs(nullptr, cbtemp, outIdx, inIdx, outVal1, outVal2, celt, 0, 32, t_stream)); size_t ctemp2 = (cbtemp + sizeof(ElemType) - 1) / sizeof(ElemType); size_t ctemp = std::max(ctemp1, ctemp2); cbtemp = ctemp * sizeof(ElemType); // ElemType count needed to store indices, accounting for natural alignment for uint64_t type. size_t cidx = ((celt + 1) * sizeof(uint64_t) - 1 + sizeof(ElemType) - 1) / sizeof(ElemType); // Get temp workspace. auto workspace = GetOrCreateWorkspace(); // RequireSize to store: output values for the 1st and 2nd passes, input indices, output indices, and temp storage. workspace->RequireSize(m, 2 * n + (2 * cidx + ctemp + m - 1) / m); outVal1 = workspace->Data(); outVal2 = outVal1 + celt; inIdx = reinterpret_cast(outVal2 + celt); // Align indices pointer if needed. size_t cbAlign = reinterpret_cast(inIdx) % sizeof(uint64_t); if (cbAlign != 0) reinterpret_cast(inIdx) += sizeof(uint64_t) - cbAlign; outIdx = inIdx + celt; void* ptmp = outIdx + celt; assert(reinterpret_cast(reinterpret_cast(ptmp) + cbtemp) <= workspace->Data() + workspace->GetNumElements()); // Initialize indices. const int ThreadsPerBlock = 128; int cblock = (celt + ThreadsPerBlock - 1) / ThreadsPerBlock; _initIndicesForSort<<>>(inIdx, m, n); // Sort by values. CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending(ptmp, cbtemp, inVal, outVal1, inIdx, outIdx, celt, 0, sizeof(ElemType) * 8, t_stream)); // Sort by column indices. outIdx contains indices after the first pass so it's used as an input. CUDA_CALL(cub::DeviceRadixSort::SortPairs(ptmp, cbtemp, outIdx, inIdx, outVal1, outVal2, celt, 0, 32, t_stream)); // Copy results. 
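// After the stable second pass each column's m entries are contiguous and still sorted by
// descending value, so the top-K items of column j are simply the first topK entries of its
// segment; _copyTopKResults below gathers those values together with the row indices recovered
// from the packed 64-bit keys.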
cblock = (topK * n + ThreadsPerBlock - 1) / ThreadsPerBlock; _copyTopKResults<<>>(inIdx, outVal2, maxIndexes.Data(), maxValues.Data(), m, n, topK); ReleaseWorkspace(std::move(workspace)); } template void GPUMatrix::VectorMin(GPUMatrix& minIndexes, GPUMatrix& minValues, const bool isColWise) const { if (IsEmpty()) LogicError("VectorMax: Matrix is empty."); const GPUMatrix& us = *this; const int m = (int) GetNumRows(); const int n = (int) GetNumCols(); assert(m > 0 && n > 0); // converting from size_t to int may cause overflow PrepareDevice(); SyncGuard syncGuard; if (isColWise) { minValues.RequireSize(1, n); minIndexes.RequireSize(1, n); int blocksPerGrid = n; // we'll have 1 block processing 1 column // note: kernel has hard-coded dimension of 512 _vectorMaxMinReduce512Threads << > >(us.Data(), minIndexes.Data(), minValues.Data(), m, n); /* int blocksPerGrid=(int)ceil(1.0*n/GridDim::maxThreadsPerBlock); _vectorMin<<>>(us.Data(),minIndexes.Data(),minValues.Data(),m,n,isColWise);*/ } else { minValues.RequireSize(m, 1); minIndexes.RequireSize(m, 1); int blocksPerGrid = (int) ceil(1.0 * m / GridDim::maxThreadsPerBlock); _vectorMin<<>>(us.Data(), minIndexes.Data(), minValues.Data(), m, n, isColWise); } } template GPUMatrix& GPUMatrix::AssignNumOfDiff(const GPUMatrix& a, const GPUMatrix& b, bool searchInCol) { if (a.GetNumCols() != b.GetNumCols()) InvalidArgument("AssignNumOfDiff: a and b must have the same number of columns."); if (!searchInCol && a.GetNumRows() != b.GetNumRows()) InvalidArgument("AssignNumOfDiff: a and b must have the same number of rows."); RequireSize(1, 1); // result should be one element PrepareDevice(); SyncGuard syncGuard; if (!searchInCol) { // int blocksPerGrid=(int)ceil(1.0*a.GetNumElements()/GridDim::maxThreadsPerBlock); // _assignNumOfDiff1024Threads<<>>(a.Data(), b.Data(), Data(), a.GetNumElements()); // note: kernel has hard-coded dimension of 1024 _assignNumOfDiff1024Threads << <1, 1024, 0, t_stream >> >(a.Data(), b.Data(), Data(), (CUDA_LONG)a.GetNumElements()); } else { const int blockSize = 1024; _assignNumOfDiffCol<<<1, blockSize, 0, t_stream>>>(a.Data(), b.Data(), Data(), static_cast(b.GetNumRows()), static_cast(a.GetNumCols())); } return *this; } #pragma endregion Member BLAS Functions #pragma region Other helper functions template void GPUMatrix::Print(const char* /*matrixName*/, size_t /*rowStart*/, size_t /*rowEnd*/, size_t /*colStart*/, size_t /*colEnd*/) const { NOT_IMPLEMENTED; } template void GPUMatrix::Print(const char* matrixName /*=nullptr*/) const { size_t elemCount = GetNumRows() * GetNumCols(); vector localCopy(elemCount); cudaMemcpy(localCopy.data(), Data(), elemCount * sizeof(ElemType), cudaMemcpyDeviceToHost); fprintf(stderr, "\n###### "); if (matrixName != nullptr) fprintf(stderr, "%s ", matrixName); fprintf(stderr, "(%lu, %lu) ######\n\n", (unsigned long)GetNumRows(), (unsigned long)GetNumCols()); if (IsEmpty()) { fprintf(stderr, "(empty)\n"); return; } // CNTK is using column-major storage for (size_t i = 0; i < GetNumRows(); i++) { for (size_t j = 0; j < GetNumCols(); j++) { fprintf(stderr, "%.10f\t", localCopy[i + j * GetNumRows()]); } fprintf(stderr, "\n"); } } //helpfer function used for convolution neural network template GPUMatrix& GPUMatrix::AssignPackedConvolutionInput(const GPUMatrix& inputSubBatch, const size_t inputWidth, const size_t inputHeight, const size_t inputChannels, const size_t outputWidth, const size_t outputHeight, const size_t outputChannels, const size_t kernelWidth, const size_t kernelHeight, const size_t 
horizontalSubsample, const size_t verticalSubsample, const bool zeroPadding) { assert(verticalSubsample <= kernelHeight && horizontalSubsample <= kernelWidth); size_t packedInputRows = kernelWidth * kernelHeight * inputChannels; size_t packedInputColsPerSample = outputWidth * outputHeight; size_t smallBatchSize = inputSubBatch.GetNumCols(); RequireSize(packedInputRows, packedInputColsPerSample * smallBatchSize); if (zeroPadding) SetValue((ElemType) 0); PrepareDevice(); int numThreadPerBlock = GridDim::maxThreadsPerBlock; #if 1 int blocksPerGrid = (smallBatchSize * inputWidth * inputHeight * inputChannels + numThreadPerBlock - 1) / numThreadPerBlock; #else dim3 blocksPerGrid((inputWidth * inputHeight * inputChannels + numThreadPerBlock - 1) / numThreadPerBlock, smallBatchSize); #endif SyncGuard syncGuard; _assignPackedConvolutionInput<<>>(Data(), inputSubBatch.Data(), smallBatchSize, inputWidth, inputHeight, inputChannels, outputWidth, outputHeight, outputChannels, kernelWidth, kernelHeight, horizontalSubsample, verticalSubsample, zeroPadding); return *this; } //helpfer function used for convolution neural network template GPUMatrix& GPUMatrix::UnpackConvolutionInput(GPUMatrix& inputSubBatch, const size_t inputWidth, const size_t inputHeight, const size_t inputChannels, const size_t outputWidth, const size_t outputHeight, const size_t outputChannels, const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample, const bool zeroPadding) const { assert(verticalSubsample <= kernelHeight && horizontalSubsample <= kernelWidth); size_t smallBatchSize = inputSubBatch.GetNumCols(); PrepareDevice(); int numThreadPerBlock = GridDim::maxThreadsPerBlock; #if 1 int blocksPerGrid = (smallBatchSize * inputWidth * inputHeight * inputChannels + numThreadPerBlock - 1) / numThreadPerBlock; #else dim3 blocksPerGrid((inputWidth * inputHeight * inputChannels + numThreadPerBlock - 1) / numThreadPerBlock, smallBatchSize); #endif SyncGuard syncGuard; _unpackConvolutionInput<<>>(Data(), inputSubBatch.Data(), smallBatchSize, inputWidth, inputHeight, inputChannels, outputWidth, outputHeight, outputChannels, kernelWidth, kernelHeight, horizontalSubsample, verticalSubsample, zeroPadding); return inputSubBatch; } template GPUMatrix& GPUMatrix::AssignMaxPoolingResult(const GPUMatrix& inputBatch, const size_t channels, const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) { assert(verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); unsigned int batchSize = inputBatch.GetNumCols(); RequireSize(outputSizePerSample, batchSize); int numThreadPerBlock = GridDim::maxThreadsPerBlock; int blocksPerGrid = (batchSize * outputSizePerSample + numThreadPerBlock - 1) / numThreadPerBlock; PrepareDevice(); SyncGuard syncGuard; _assignMaxPoolingResult<<>>(Data(), inputBatch.Data(), batchSize, channels, inputWidth, inputHeight, inputSizePerSample, outputWidth, outputHeight, outputSizePerSample, windowWidth, windowHeight, horizontalSubsample, verticalSubsample); return *this; } template GPUMatrix& GPUMatrix::AddMaxPoolingGradient(const GPUMatrix& outputGradientBatch, const GPUMatrix& inputBatch, const GPUMatrix& outputBatch, const size_t channels, const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, 
const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) { assert(verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); unsigned int batchSize = outputGradientBatch.GetNumCols(); int numThreadPerBlock = GridDim::maxThreadsPerBlock; PrepareDevice(); SyncGuard syncGuard; int blocksPerGrid = (batchSize * inputSizePerSample + numThreadPerBlock - 1) / numThreadPerBlock; _addMaxPoolingGradient<<>>(Data(), outputGradientBatch.Data(), inputBatch.Data(), outputBatch.Data(), batchSize, channels, inputWidth, inputHeight, inputSizePerSample, outputWidth, outputHeight, outputSizePerSample, windowWidth, windowHeight, horizontalSubsample, verticalSubsample); return *this; } template GPUMatrix& GPUMatrix::AssignAveragePoolingResult(const GPUMatrix& inputBatch, const size_t channels, const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) { assert(verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); unsigned int batchSize = inputBatch.GetNumCols(); RequireSize(outputSizePerSample, batchSize); int numThreadPerBlock = GridDim::maxThreadsPerBlock; int blocksPerGrid = (batchSize * outputSizePerSample + numThreadPerBlock - 1) / numThreadPerBlock; PrepareDevice(); SyncGuard syncGuard; _assignAveragePoolingResult<<>>(Data(), inputBatch.Data(), batchSize, channels, inputWidth, inputHeight, inputSizePerSample, outputWidth, outputHeight, outputSizePerSample, windowWidth, windowHeight, horizontalSubsample, verticalSubsample); return *this; } template GPUMatrix& GPUMatrix::AddAveragePoolingGradient(const GPUMatrix& outputGradientBatch, const size_t channels, const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample, const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) { assert(verticalSubsample <= windowHeight && horizontalSubsample <= windowWidth); size_t batchSize = outputGradientBatch.GetNumCols(); int numThreadPerBlock = GridDim::maxThreadsPerBlock; PrepareDevice(); SyncGuard syncGuard; size_t blocksPerGrid = (batchSize * inputSizePerSample + numThreadPerBlock - 1) / numThreadPerBlock; _addAveragePoolingGradient<<>>(Data(), outputGradientBatch.Data(), (CUDA_LONG) batchSize, channels, inputWidth, inputHeight, inputSizePerSample, outputWidth, outputHeight, outputSizePerSample, windowWidth, windowHeight, horizontalSubsample, verticalSubsample); return *this; } #pragma endregion Other helper functions template void GPUMatrix::ConvolutionForward(const GPUMatrix& kernel, const GPUMatrix& mpRowCol, const GPUMatrix& mpRowIwht, const GPUMatrix& mpRowRun, const GPUMatrix& runs, GPUMatrix& output) const { const int BlockSize = 128; auto gdim = dim3((output.GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535)); PrepareDevice(); SyncGuard syncGuard; kConvolutionForward<<>>((int)GetNumCols(), kernel.Data(), mpRowCol.Data(), mpRowIwht.Data(), mpRowRun.Data(), runs.Data(), Data(), (int)GetNumRows(), output.Data(), (int)output.GetNumRows()); } template void GPUMatrix::ConvolutionBackwardData(const GPUMatrix& kernel, 
const GPUMatrix& mpRowCol, const GPUMatrix& mpRowIwht, const GPUMatrix& mpRowRun, const GPUMatrix& runs, GPUMatrix& grad) const { const int BlockSize = 128; auto gdim = dim3((GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535)); PrepareDevice(); SyncGuard syncGuard; kConvolutionBackwardData<<>>((int)GetNumCols(), kernel.Data(), mpRowCol.Data(), mpRowIwht.Data(), mpRowRun.Data(), runs.Data(), Data(), (int)GetNumRows(), grad.Data(), (int)grad.GetNumRows()); } template void GPUMatrix::ConvolutionBackwardKernel(const GPUMatrix& in, const GPUMatrix& mpRowCol, const GPUMatrix& mpRowIwht, const GPUMatrix& mpRowRun, const GPUMatrix& runs, GPUMatrix& kernelGrad) const { const int BlockSize = 128; auto gdim = dim3((GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535)); PrepareDevice(); SyncGuard syncGuard; kConvolutionBackwardKernel<<>>((int)GetNumCols(), (int)in.GetNumRows(), (int)GetNumRows(), in.Data(), mpRowCol.Data(), mpRowIwht.Data(), mpRowRun.Data(), runs.Data(), Data(), kernelGrad.Data()); } template void GPUMatrix::MaxPoolingForward(const GPUMatrix& mpRowCol, const GPUMatrix& mpRowIndices, const GPUMatrix& indices, GPUMatrix& output) const { const int BlockSize = 128; auto gdim = dim3((output.GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535)); PrepareDevice(); SyncGuard syncGuard; kMaxPoolingForward<<>>((int)GetNumCols(), mpRowCol.Data(), mpRowIndices.Data(), indices.Data(), Data(), (int)GetNumRows(), output.Data(), (int)output.GetNumRows()); } template void GPUMatrix::MaxPoolingBackward(const GPUMatrix& out, const GPUMatrix& in, const GPUMatrix& mpRowCol, const GPUMatrix& mpRowIndices, const GPUMatrix& indices, GPUMatrix& grad) const { const int BlockSize = 128; auto gdim = dim3((GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535)); PrepareDevice(); SyncGuard syncGuard; kMaxPoolingBackward<<>>((int)GetNumCols(), out.Data(), in.Data(), mpRowCol.Data(), mpRowIndices.Data(), indices.Data(), Data(), (int)GetNumRows(), grad.Data(), (int)grad.GetNumRows()); } template void GPUMatrix::MaxROIPoolingForward(const size_t numRois, const size_t numImg, const size_t channels, const size_t width, const size_t height, const size_t pooledWidth, const size_t pooledHeight, const GPUMatrix& roiData, GPUMatrix& output, GPUMatrix& argmax, double spatialScale) const { PrepareDevice(); SyncGuard syncGuard; int count = numRois * numImg * channels * pooledHeight * pooledWidth; const int blockSize = GridDim::maxThreadsPerBlock; auto numThreads = dim3((int)floor((double)(count + blockSize - 1) / blockSize)); kMaxROIPoolingForward<<>>(count, numRois, numImg, channels, width, height, pooledWidth, pooledHeight, Data(), roiData.Data(), output.Data(), argmax.Data(), spatialScale); } template void GPUMatrix::MaxROIPoolingBackward(const size_t numRois, const size_t numImg, const size_t channels, const size_t width, const size_t height, const size_t pooledWidth, const size_t pooledHeight, const GPUMatrix& roiData, GPUMatrix& grad, GPUMatrix& argmax, double spatialScale) const { PrepareDevice(); SyncGuard syncGuard; int count = numImg * channels * height * width; const int blockSize = GridDim::maxThreadsPerBlock; auto numThreads = dim3((int)floor((double)(count + blockSize - 1) / blockSize)); kMaxROIPoolingBackward<<>>(count, numRois, numImg, channels, width, height, pooledWidth, pooledHeight, Data(), roiData.Data(), grad.Data(), argmax.Data(), spatialScale); } template void GPUMatrix::MaxUnpooling(const GPUMatrix& mpRowCol, 
const GPUMatrix& mpRowIndices, const GPUMatrix& indices, const GPUMatrix& poolInput, GPUMatrix& input) const { const int BlockSize = 128; auto gdim = dim3((GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535)); PrepareDevice(); SyncGuard syncGuard; kMaxUnpooling<<>>((int)GetNumCols(), mpRowCol.Data(), mpRowIndices.Data(), indices.Data(), Data(), poolInput.Data(), (int)GetNumRows(), input.Data(), (int)input.GetNumRows()); } template void GPUMatrix::AveragePoolingForward(const GPUMatrix& mpRowCol, const GPUMatrix& mpRowIndices, const GPUMatrix& indices, GPUMatrix& output) const { const int BlockSize = 128; auto gdim = dim3((output.GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535)); PrepareDevice(); SyncGuard syncGuard; kAveragePoolingForward<<>>((int)GetNumCols(), mpRowCol.Data(), mpRowIndices.Data(), indices.Data(), Data(), (int)GetNumRows(), output.Data(), (int)output.GetNumRows()); } template void GPUMatrix::AveragePoolingBackward(const GPUMatrix& mpRowCol, const GPUMatrix& mpRowIndices, const GPUMatrix& indices, GPUMatrix& grad) const { const int BlockSize = 128; auto gdim = dim3((GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535)); PrepareDevice(); SyncGuard syncGuard; kAveragePoolingBackward<<>>((int)GetNumCols(), mpRowCol.Data(), mpRowIndices.Data(), indices.Data(), Data(), (int)GetNumRows(), grad.Data(), (int)grad.GetNumRows()); } // returns savedMean/savedInvStdDev which are the actual values used to perform the normalization, except for blendFactor 1, in which case they are unused and set to empty template void GPUMatrix::BatchNormalizationForward(const GPUMatrix& scale, const GPUMatrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, GPUMatrix& runMean, GPUMatrix& runVariance, GPUMatrix& out, double epsilon, GPUMatrix& savedMean, GPUMatrix& savedInvStdDev) const { assert((GetNumRows() % scale.GetNumRows()) == 0); bool spatial = GetNumRows() != scale.GetNumRows(); size_t vectorSize = GetNumRows(); size_t spatialSize = spatial ? (GetNumRows() / scale.GetNumRows()) : 1; size_t batchSize = GetNumCols(); bool normalizeRunningStats; assert(0 < vectorSize && vectorSize <= std::numeric_limits::max()); assert(0 < batchSize && batchSize <= std::numeric_limits::max()); SyncGuard syncGuard; if (inferenceOnly) { // Pick running statistics for normalizing. No update reuqired, and // saved statistics do not need to be produced. assert(expAvgFactor == 0 && blendFactor == 1); normalizeRunningStats = true; savedMean.RequireSize(0, 0); savedInvStdDev.RequireSize(0, 0); } else { // Compute data mean and inverse standard deviation (into savedMean and // savedInvStdDev), and update running mean and variance. // TODO expAvgFactor == 0 && blendFactor == 1 can be optimized (no need for update). normalizeRunningStats = false; savedMean.RequireSize(runMean); savedInvStdDev.RequireSize(runMean); if (spatial) { Call(spatialSize, vectorSize, spatialSize, batchSize, Data(), expAvgFactor, blendFactor, runMean.Data(), runVariance.Data(), epsilon, savedMean.Data(), savedInvStdDev.Data(), GetStream()); } else { Call(vectorSize, vectorSize, batchSize, Data(), expAvgFactor, blendFactor, runMean.Data(), runVariance.Data(), epsilon, savedMean.Data(), savedInvStdDev.Data(), GetStream()); } } Call(spatial ? 
spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial, normalizeRunningStats, epsilon, Data(), out.Data(), scale.Data(), bias.Data(), runMean.Data(), runVariance.Data(), savedMean.Data(), savedInvStdDev.Data(), GetStream()); } // savedMean/savedInvStdDev are the interpolated mean/inverse standard deviation as used in ForwardProp(). // For blendFactor=1, they are not used and can be uninitialized or empty. template void GPUMatrix::BatchNormalizationBackward(const GPUMatrix& in, GPUMatrix& grad, const GPUMatrix& scale, double blendFactor, const GPUMatrix& savedMean, const GPUMatrix& savedInvStdDev, GPUMatrix& scaleGrad, GPUMatrix& biasGrad) const { assert((GetNumRows() % scale.GetNumRows()) == 0); bool spatial = GetNumRows() != scale.GetNumRows(); size_t vectorSize = GetNumRows(); size_t spatialSize = spatial ? (GetNumRows() / scale.GetNumRows()) : 1; size_t batchSize = GetNumCols(); assert(0 < vectorSize && vectorSize <= std::numeric_limits::max()); assert(0 < batchSize && batchSize <= std::numeric_limits::max()); SyncGuard syncGuard; if (spatial) { Call(spatialSize, vectorSize, spatialSize, batchSize, in.Data(), Data(), scaleGrad.Data(), biasGrad.Data(), savedMean.Data(), savedInvStdDev.Data(), GetStream()); } else { Call(vectorSize, vectorSize, batchSize, in.Data(), Data(), scaleGrad.Data(), biasGrad.Data(), savedMean.Data(), savedInvStdDev.Data(), GetStream()); } ElemType mbStatsWeight = (ElemType)(1 - blendFactor); // weight for contribution from actual MB stats (0 if none, e.g. locked BN node) Call(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial, in.Data(), Data(), grad.Data(), scale.Data(), mbStatsWeight, scaleGrad.Data(), biasGrad.Data(), savedMean.Data(), savedInvStdDev.Data(), GetStream()); } #pragma region RNN Functions template void GPUMatrix::RNNForward(const GPUMatrix &inputX, const GPUMatrix ¶mW, size_t xDim, size_t yDim, const vector& numSequencesForFrame, const RnnAttributes& rnnAttributes, GPUMatrix& reserve, GPUMatrix& workspace) { // numLayers, hiddenSize are input parameters if (!m_rnnExecutor) m_rnnExecutor = std::make_unique>(xDim, yDim, rnnAttributes); m_rnnExecutor->ForwardCore(paramW, inputX, *this, numSequencesForFrame, rnnAttributes, reserve, workspace); } template void GPUMatrix::RNNBackwardData(const GPUMatrix& outputDY, const GPUMatrix& paramW, GPUMatrix& outputDX, const RnnAttributes& rnnAttributes, GPUMatrix& reserve, GPUMatrix& workspace) { if (!m_rnnExecutor) LogicError("RNNBackwardData called, but RNNWrapper object is not yet initialized"); m_rnnExecutor->BackwardDataCore(*this, outputDY, paramW, outputDX, rnnAttributes, reserve, workspace); } template void GPUMatrix::RNNBackwardWeights(const GPUMatrix& inputX, const GPUMatrix& outputY, GPUMatrix& dw, const RnnAttributes& rnnAttributes, GPUMatrix& reserve, GPUMatrix& workspace) { if (!m_rnnExecutor) LogicError("RNNBackwardWeights called, but RNNWrapper object is not yet initialized"); m_rnnExecutor->BackwardWeightsCore(inputX, outputY, dw, rnnAttributes, reserve, workspace); } #pragma region Static BLAS Functions // float/double overloads of cublasSgemm()/cublasDgemm() static cublasStatus_t cublas_gemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float* alpha, const float* A, int lda, const float* B, int ldb, const float* beta, float* C, int ldc) { return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } static cublasStatus_t cublas_gemm(cublasHandle_t handle, 
cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double* alpha, const double* A, int lda, const double* B, int ldb, const double* beta, double* C, int ldc) { return cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } static cublasStatus_t cublas_axpy(cublasHandle_t handle, int n, const float* alpha, const float* x, int incx, float* y, int incy) { return cublasSaxpy(handle, n, alpha, x, incx, y, incy); } static cublasStatus_t cublas_axpy(cublasHandle_t handle, int n, const double* alpha, const double* x, int incx, double* y, int incy) { return cublasDaxpy(handle, n, alpha, x, incx, y, incy); } template void GPUMatrix::MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, ElemType beta, GPUMatrix& c) { a.PrepareDevice(); if ((a.GetComputeDeviceId() != b.GetComputeDeviceId()) || (b.GetComputeDeviceId() != c.GetComputeDeviceId())) // different GPUs InvalidArgument("All matrices must be on the same GPU"); cublasHandle_t cuHandle = GetCublasHandle(b.GetComputeDeviceId()); cublasOperation_t transA = transposeA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t transB = transposeB ? CUBLAS_OP_T : CUBLAS_OP_N; int m = int(transposeA ? a.m_numCols : a.m_numRows); int n = int(transposeB ? b.m_numRows : b.m_numCols); int k = int(transposeA ? a.m_numRows : a.m_numCols); int l = int(transposeB ? b.m_numCols : b.m_numRows); if (beta == 0) c.RequireSize(m, n); else c.VerifySize(m, n); // Can't resize if beta != 0 if (!(m > 0 && k > 0 && l > 0 && n > 0)) RuntimeError("!(m>0 && k>0 && l>0 && n>0)"); // converting from size_t to int may cause overflow if (k != l) RuntimeError("matrix dim mismatch in MultiplyAndWeightedAdd"); CUBLAS_CALL(cublas_gemm(cuHandle, transA, transB, m, n, k, &alpha, a.Data(), (int) a.m_numRows, b.Data(), (int) b.m_numRows, &beta, c.Data(), (int) c.m_numRows)); } template void GPUMatrix::Multiply1x1AndWeightedAdd(ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, ElemType beta, GPUMatrix& c) { a.PrepareDevice(); if ((a.GetComputeDeviceId() != b.GetComputeDeviceId()) || (b.GetComputeDeviceId() != c.GetComputeDeviceId())) // different GPUs InvalidArgument("All matrices must be on the same GPU"); CUDA_LONG N = (CUDA_LONG) c.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); SyncGuard syncGuard; _multiply1x1AndWeightedAdd<<>>(alpha, a.Data(), b.Data(), beta, c.Data(), N); } template void GPUMatrix::MultiplyAndAdd(const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, GPUMatrix& c) { return GPUMatrix::MultiplyAndWeightedAdd(1, a, transposeA, b, transposeB, 1, c); } template void GPUMatrix::Multiply(const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, GPUMatrix& c) { return GPUMatrix::MultiplyAndWeightedAdd(1, a, transposeA, b, transposeB, 0, c); } template void GPUMatrix::Multiply(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) { return GPUMatrix::MultiplyAndWeightedAdd(1, a, false, b, false, 0, c); } template void GPUMatrix::ColumnwiseScaleAndWeightedAdd(ElemType alpha, const GPUMatrix& a, const GPUMatrix& v, ElemType beta, GPUMatrix& c) { if (v.GetNumRows() != 1 && v.GetNumCols() != 1) InvalidArgument("the argument v must be a vector"); // v is a vector if (beta == 0) c.RequireSize(a.GetNumRows(), a.GetNumCols()); else c.VerifySize(a.GetNumRows(), a.GetNumCols()); // Can't resize if beta != 0 int blocksPerGrid = (int)ceil(1.0 * c.GetNumElements() / 
GridDim::maxThreadsPerBlock); SyncGuard syncGuard; _columnwiseScaleAndWeightedAdd<<>>(alpha, a.Data(), v.Data(), beta, c.Data(), a.GetNumRows(), a.GetNumCols()); } /// Matrix-scalar multiply with col-major matrices: c = alpha * a + c /// if a is a column vector, add to all columns of c /// if a is a row vector, add to all rows of c /// if a is a scalar, add to all elements of c /// Scalar /// Input matrix /// Resulting matrix, user is responsible for allocating this template /*static*/ void GPUMatrix::ScaleAndAdd(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) { if (a.GetComputeDeviceId() != c.GetComputeDeviceId()) { InvalidArgument("All matrices must be on the same GPU"); } else { if (a.IsEmpty() && c.IsEmpty()) return; a.PrepareDevice(); if (a.IsEmpty() || c.IsEmpty()) LogicError("ScaleAndAdd: one of the input matrices is empty."); // if (a.GetNumRows() != 1 && a.GetNumCols() != 1) // a is not a col or row vector if (a.GetNumRows() == c.GetNumRows() && a.GetNumCols() == c.GetNumCols()) // dimensions match { const int m = (int) a.GetNumRows(); const int n = (int) a.GetNumCols(); const int len = m * n; const int incx = 1; const int incy = 1; assert(m > 0 && n > 0 && len > 0); // converting from size_t to int may cause overflow assert((int) c.GetNumRows() == m && (int) c.GetNumCols() == n); if ((int) c.GetNumRows() != m || (int) c.GetNumCols() != n) InvalidArgument("dimension of matrix c does not match dimension of matrix a."); cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); // TODO: Overload the call to cublas_axpy to remove these ugly if/else statements. if (sizeof(ElemType) == sizeof(float)) { CUBLAS_CALL(cublasSaxpy(cuHandle, len, reinterpret_cast(&alpha), reinterpret_cast(a.Data()), incx, reinterpret_cast(c.Data()), incy)); } else if (sizeof(ElemType) == sizeof(double)) { CUBLAS_CALL(cublasDaxpy(cuHandle, len, reinterpret_cast(&alpha), reinterpret_cast(a.Data()), incx, reinterpret_cast(c.Data()), incy)); } else { RuntimeError("Unsupported template argument in GPUMatrix"); } } else if (a.GetNumElements() == 1) { CUDA_LONG N = (CUDA_LONG) c.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); c.PrepareDevice(); SyncGuard syncGuard; _scaleAndAddScalar<<>>(c.Data(), N, alpha, a.Data(), c.Data()); } else if (a.GetNumCols() == 1) // col vector, add it to all columns { CUDA_LONG m = (CUDA_LONG) c.GetNumRows(); CUDA_LONG n = (CUDA_LONG) c.GetNumCols(); if (m != (CUDA_LONG) a.GetNumRows()) InvalidArgument("To add column vector, rows should match."); int blocksPerGrid = (int) (ceil(1.0 * m * n / GridDim::maxThreadsPerBlock)); SyncGuard syncGuard; #ifdef VALIDATION printf(">>>> CUDA compute device is %d\n", a.GetComputeDeviceId()); printf(">>>> a.Data()= %p, c.Data()= %p, alpha = %f, m = %ld, n = %ld\n", a.Data(), c.Data(), alpha, m, n); for (int i = 0; i < 2; i++) { ElemType buffer[10] = {-1.234f}; cudaError_t error = cudaMemcpy(buffer, !i ? 
a.Data(): c.Data(), sizeof(buffer), cudaMemcpyKind::cudaMemcpyDeviceToHost); if (error == cudaError::cudaSuccess) printf("buffer valid\n"); } #endif _matrixVectorColumnWiseAddWithThreadPerElem<<>>(a.Data(), c.Data(), c.Data(), alpha, m, n); } else if (a.GetNumRows() == 1) // row vector, add it to all rows { cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); int m = (int) c.GetNumRows(); int n = (int) c.GetNumCols(); assert(n == (int) a.GetNumCols()); if (n != (int) a.GetNumCols()) InvalidArgument("To add row vector, cols should match."); // TODO: Overload the call to cublas_axpy to remove these ugly if/else statements. if (sizeof(ElemType) == sizeof(double)) { foreach_row (i, c) { CUBLAS_CALL(cublasDaxpy(cuHandle, n, reinterpret_cast(&alpha), reinterpret_cast(a.Data()), 1, reinterpret_cast(c.Data()+ i), m)); } } else { foreach_row (i, c) { CUBLAS_CALL(cublasSaxpy(cuHandle, n, reinterpret_cast(&alpha), reinterpret_cast(a.Data()), 1, reinterpret_cast(c.Data()+ i), m)); } } } else InvalidArgument("dimension of matrix c does not match dimension of matrix a."); } } /// Matrix-scalar multiply with col-major matrices: c = alpha * a + b /// if a is a column vector, add to all columns of b /// if a is a row vector, add to all rows of b /// if a is a scalar, add to all elements of b /// Scalar /// Input matrix /// Input matrix /// Resulting matrix, user is responsible for allocating this template /*static*/ void GPUMatrix::ScaleAndAdd(ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) { if (a.GetComputeDeviceId() != c.GetComputeDeviceId() || a.GetComputeDeviceId() != b.GetComputeDeviceId()) { InvalidArgument("All matrices must be on the same GPU"); } else { if (a.IsEmpty() && b.IsEmpty()) return; a.PrepareDevice(); if (a.IsEmpty() || b.IsEmpty()) LogicError("ScaleAndAdd: One of the input matrices is empty."); c.RequireSize(b.GetNumRows(), b.GetNumCols()); // if (a.GetNumRows() != 1 && a.GetNumCols() != 1) // a is not a col or row vector if (a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()) // dimensions match { /* const int m = (int)a.GetNumRows(); const int n = (int)a.GetNumCols(); const int len = m * n; const int incx = 1; const int incy = 1; assert (m>0 && n>0 && len>0); // converting from size_t to int may cause overflow */ CUDA_LONG N = (CUDA_LONG) c.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); c.PrepareDevice(); SyncGuard syncGuard; _matrixMatrixAddOnCuda<<>>(alpha, a.Data(), b.Data(), c.Data(), N); } else if (a.GetNumElements() == 1) { CUDA_LONG N = (CUDA_LONG) c.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); c.PrepareDevice(); SyncGuard syncGuard; _scaleAndAddScalar<<>>(c.Data(), N, alpha, a.Data(), b.Data()); } else if (a.GetNumCols() == 1) // col vector, add it to all columns { CUDA_LONG m = (CUDA_LONG) c.GetNumRows(); CUDA_LONG n = (CUDA_LONG) c.GetNumCols(); if (m != (CUDA_LONG) a.GetNumRows()) InvalidArgument("To add column vector, rows should match."); int blocksPerGrid = (int) (ceil(1.0 * m * n / GridDim::maxThreadsPerBlock)); SyncGuard syncGuard; _matrixVectorColumnWiseAddWithThreadPerElem<<>>(a.Data(), b.Data(), c.Data(), alpha, m, n); } else if (a.GetNumRows() == 1) // row vector, add it to all rows { CUDA_LONG m = (CUDA_LONG) c.GetNumRows(); CUDA_LONG n = (CUDA_LONG) c.GetNumCols(); if (m != (CUDA_LONG) a.GetNumRows()) InvalidArgument("To add column vector, rows should match."); int blocksPerGrid = (int) (ceil(1.0 * m * n / 
GridDim::maxThreadsPerBlock)); SyncGuard syncGuard; _matrixVectorRowWiseAddWithThreadPerElem<<>>(a.Data(), b.Data(), c.Data(), alpha, m, n); } else InvalidArgument("Dimension of matrix c does not match dimension of matrix a."); } } /// c += alpha * (a-b) /// if a, b, c must have same dim /// Scalar /// Input matrix /// Input matrix /// Resulting matrix, user is responsible for allocating this template void GPUMatrix::AddScaledDifference(const ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) { if (a.GetComputeDeviceId() != c.GetComputeDeviceId()) { InvalidArgument("All matrices must be on the same GPU"); } else { a.PrepareDevice(); assert(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols()); if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols())) { InvalidArgument("AddScaledDifference: a, b, and c must have same dimension."); } if (a.IsEmpty()) LogicError("AddScaledDifference: Input matrix a is empty."); CUDA_LONG n = (CUDA_LONG) a.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * n / GridDim::maxThreadsPerBlock); SyncGuard syncGuard; _addScaledDifference<<>>(alpha, a.Data(), b.Data(), c.Data(), n); } } /// c = alpha * (a-b) /// if a, b, c must have same dim /// Scalar /// Input matrix /// Input matrix /// Resulting matrix, user is responsible for allocating this template void GPUMatrix::AssignScaledDifference(const ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) { if (a.GetComputeDeviceId() != c.GetComputeDeviceId()) { InvalidArgument("All matrices must be on the same GPU"); } else { a.PrepareDevice(); assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) InvalidArgument("AssignScaledDifference: a, b must have same dimension."); if (a.IsEmpty()) LogicError("AssignScaledDifference: Input matrix a is empty."); if (&c != &a && &c != &b) c.RequireSize(a.GetNumRows(), a.GetNumCols()); CUDA_LONG n = (CUDA_LONG) a.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * n / GridDim::maxThreadsPerBlock); SyncGuard syncGuard; _assignScaledDifference<<>>(alpha, a.Data(), b.Data(), c.Data(), n); } } /// c += alpha * (a-b) /// if a, b, c must have same dim /// 1X1 matrix /// Input matrix /// Input matrix /// Resulting matrix, user is responsible for allocating this template void GPUMatrix::AddScaledDifference(const GPUMatrix& alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c) { assert(alpha.GetNumElements() == 1); if (!(alpha.GetNumElements() == 1)) InvalidArgument("AddScaledDifference: alpha must be a 1X1 matrix."); if (a.GetComputeDeviceId() != c.GetComputeDeviceId()) { InvalidArgument("All matrices must be on the same GPU"); } else { a.PrepareDevice(); assert(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols()); if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() && a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols())) { InvalidArgument("AddScaledDifference: a, b, and c must have same dimension."); } if (a.IsEmpty()) LogicError("AddScaledDifference: Input matrix a is empty."); CUDA_LONG n = (CUDA_LONG) a.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * n / GridDim::maxThreadsPerBlock); SyncGuard syncGuard; 
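        // Note: unlike the host-scalar overload above, alpha is a 1x1 device-resident matrix here,
        // so the kernel dereferences alpha.Data() on the GPU and no device-to-host copy of the
        // scale factor is needed before launching.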
        _addScaledDifference<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(alpha.Data(), a.Data(), b.Data(), c.Data(), n);
    }
}

/// <summary>c = alpha * (a-b)</summary>
/// a, b, and c must have the same dimensions
/// <param name="alpha">1X1 matrix holding the scale factor (device-resident)</param>
/// <param name="a">Input matrix</param>
/// <param name="b">Input matrix</param>
/// <param name="c">Resulting matrix, user is responsible for allocating this</param>
template <class ElemType>
void GPUMatrix<ElemType>::AssignScaledDifference(const GPUMatrix<ElemType>& alpha, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c)
{
    assert(alpha.GetNumElements() == 1);
    if (!(alpha.GetNumElements() == 1))
        InvalidArgument("AssignScaledDifference: alpha must be a 1X1 matrix.");

    if (a.GetComputeDeviceId() != c.GetComputeDeviceId())
    {
        InvalidArgument("All matrices must be on the same GPU");
    }
    else
    {
        a.PrepareDevice();

        assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols());
        if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()))
        {
            InvalidArgument("AssignScaledDifference: a, b must have same dimension.");
        }

        if (a.IsEmpty())
            LogicError("AssignScaledDifference: Input matrix a is empty.");

        c.RequireSize(a.GetNumRows(), a.GetNumCols());

        CUDA_LONG n = (CUDA_LONG) a.GetNumElements();
        int blocksPerGrid = (int) ceil(1.0 * n / GridDim::maxThreadsPerBlock);
        SyncGuard syncGuard;
        _assignScaledDifference<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(alpha.Data(), a.Data(), b.Data(), c.Data(), n);
    }
}

// c[ci,cj] = beta * c[ci,cj] + a[ai,aj]
template <class ElemType>
void GPUMatrix<ElemType>::AddElementToElement(ElemType beta, const GPUMatrix<ElemType>& a, const size_t ai, const size_t aj, GPUMatrix<ElemType>& c, const size_t ci, const size_t cj)
{
    if (ai >= a.GetNumRows() || aj >= a.GetNumCols() ||
        ci >= c.GetNumRows() || cj >= c.GetNumCols())
        InvalidArgument("AddElementToElement: Index out of range.");

    a.PrepareDevice();
    SyncGuard syncGuard;
    _addElementToElement<<<1, 1, 0, t_stream>>>(beta, a.Data(), (CUDA_LONG) a.LocateElement(ai, aj), c.Data(), (CUDA_LONG) c.LocateElement(ci, cj));
}

template <class ElemType>
/*static*/ void GPUMatrix<ElemType>::Scale(ElemType alpha, GPUMatrix<ElemType>& a)
{
    if (alpha == 0) // if 0 then do not access the value, so that we can use this to multiply uninitialized matrices with beta=0
    {
        CUDA_CALL(cudaMemset(a.Data(), 0, a.m_numRows * a.m_numCols * sizeof(ElemType)));
        return;
    }

    cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId());
    // TODO: Add a cublas_scal overload (like cublas_axpy above) to remove these ugly if/else statements.
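    // A minimal sketch of such an overload (hypothetical, not yet defined in this file), mirroring the
    // cublas_gemm/cublas_axpy wrappers near the top of this file; with it, the if/else below would
    // collapse into a single CUBLAS_CALL(cublas_scal(cuHandle, n, &alpha, a.Data(), 1)):
    //
    //     static cublasStatus_t cublas_scal(cublasHandle_t handle, int n, const float* alpha, float* x, int incx)
    //     {
    //         return cublasSscal(handle, n, alpha, x, incx);
    //     }
    //     static cublasStatus_t cublas_scal(cublasHandle_t handle, int n, const double* alpha, double* x, int incx)
    //     {
    //         return cublasDscal(handle, n, alpha, x, incx);
    //     }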
if (sizeof(ElemType) == sizeof(float)) { float alph = (float) alpha; CUBLAS_CALL(cublasSscal(cuHandle, int(a.m_numRows * a.m_numCols), &alph, (float*) a.Data(), 1)); } else if (sizeof(ElemType) == sizeof(double)) { double alph = alpha; CUBLAS_CALL(cublasDscal(cuHandle, int(a.m_numRows * a.m_numCols), &alph, (double*) a.Data(), 1)); } else { RuntimeError("Unsupported template argument in GPUMatrix"); } } template /*static*/ void GPUMatrix::Scale(GPUMatrix& alpha, GPUMatrix& a) { if (alpha.GetNumElements() != 1) { RuntimeError("Matrix alpha must be 1x1"); } cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); if (sizeof(ElemType) == sizeof(float)) { CUBLAS_CALL(cublasSscal(cuHandle, int(a.m_numRows * a.m_numCols), (float*) alpha.Data(), (float*) a.Data(), 1)); } else if (sizeof(ElemType) == sizeof(double)) { CUBLAS_CALL(cublasDscal(cuHandle, int(a.m_numRows * a.m_numCols), (double*) alpha.Data(), (double*) a.Data(), 1)); } else { cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); RuntimeError("Unsupported template argument in GPUMatrix"); } cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); } template // c = alpha * a /*static*/ void GPUMatrix::Scale(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) { c = a; Scale(alpha, c); } template void GPUMatrix::InnerProduct(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c, const bool isColWise) { if (a.GetComputeDeviceId() != b.GetComputeDeviceId() || b.GetComputeDeviceId() != c.GetComputeDeviceId()) // different GPUs InvalidArgument("All matrices must be on the same GPU"); if (a.IsEmpty() || b.IsEmpty()) LogicError("Scale: one of the input matrices is empty."); const int m = (int) a.GetNumRows(); const int n = (int) a.GetNumCols(); const int k = (int) b.GetNumRows(); const int l = (int) b.GetNumCols(); assert(m > 0 && n > 0 && k > 0 && l > 0); // converting from size_t to int may cause overflow assert(m == k && n == l); // converting from size_t to int may cause overflow if (m != k || n != l) InvalidArgument("Matrices a and b should have same dimension."); if (isColWise) c.RequireSize(1, n); else c.RequireSize(m, 1); if ((isColWise && m == 1) || (!isColWise && n == 1)) // in this case it's equivalent to element-wise product { c.AssignElementProductOf(a, b); } else { c.PrepareDevice(); int blocksPerGrid = 0; if (isColWise) // col-wise { c.RequireSize(1, n); blocksPerGrid = (int) ceil(1.0 * n / GridDim::maxThreadsPerBlock); } else { c.RequireSize(m, 1); blocksPerGrid = (int) ceil(1.0 * m / GridDim::maxThreadsPerBlock); } SyncGuard syncGuard; _innerProduct<<>>(c.Data(), a.Data(), b.Data(), m, n, isColWise); } } template ElemType GPUMatrix::InnerProductOfMatrices(const GPUMatrix& a, const GPUMatrix& b) { if (a.IsEmpty() || b.IsEmpty()) LogicError("InnerProductOfMatrices: one of the input matrices is empty."); const int m = (int) a.GetNumRows(); const int n = (int) a.GetNumCols(); const int k = (int) b.GetNumRows(); const int l = (int) b.GetNumCols(); assert(m > 0 && n > 0 && k > 0 && l > 0); // converting from size_t to int may cause overflow assert(m == k && n == l); // converting from size_t to int may cause overflow if (m != k || n != l) InvalidArgument("InnerProductOfMatrices: Matrices a and b should have same dimension."); cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); if (sizeof(ElemType) == sizeof(double)) { double tmp = 0; CUBLAS_CALL(cublasDdot(cuHandle, m * n, reinterpret_cast(a.Data()), 1, reinterpret_cast(b.Data()), 1, &tmp)); 
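        // cublasDdot runs in the default CUBLAS_POINTER_MODE_HOST here, so the call blocks until the
        // scalar result has been written into the host variable tmp; no explicit synchronization is needed.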
return ElemType(tmp); // return (ElemType)ddot((int)a.GetNumElements(), reinterpret_cast (a.Data()), 1, reinterpret_cast (b.Data()), 1); } else { float tmp = 0; CUBLAS_CALL(cublasSdot(cuHandle, m * n, reinterpret_cast(a.Data()), 1, reinterpret_cast(b.Data()), 1, &tmp)); return tmp; // return (ElemType)sdot((int)a.GetNumElements(), reinterpret_cast (a.Data()), 1, reinterpret_cast (b.Data()), 1); } } template GPUMatrix& GPUMatrix::AssignInnerProductOfMatrices(const GPUMatrix& a, const GPUMatrix& b) { if (a.IsEmpty() || b.IsEmpty()) LogicError("InnerProductOfMatrices: one of the input matrices is empty."); RequireSize(1, 1); const int m = (int) a.GetNumRows(); const int n = (int) a.GetNumCols(); const int k = (int) b.GetNumRows(); const int l = (int) b.GetNumCols(); assert(m > 0 && n > 0 && k > 0 && l > 0); // converting from size_t to int may cause overflow assert(m == k && n == l); // converting from size_t to int may cause overflow if (m != k || n != l) InvalidArgument("InnerProductOfMatrices: Matrices a and b should have same dimension."); cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); if (sizeof(ElemType) == sizeof(double)) { CUBLAS_CALL(cublasDdot(cuHandle, m * n, reinterpret_cast(a.Data()), 1, reinterpret_cast(b.Data()), 1, reinterpret_cast(Data()))); } else { CUBLAS_CALL(cublasSdot(cuHandle, m * n, reinterpret_cast(a.Data()), 1, reinterpret_cast(b.Data()), 1, reinterpret_cast(Data()))); } cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); return *this; } template void GPUMatrix::ElementWisePower(ElemType alpha, const GPUMatrix& a, GPUMatrix& c) { if (a.GetComputeDeviceId() != c.GetComputeDeviceId()) { InvalidArgument("All matrices must be on the same GPU"); } else { if (a.IsEmpty()) LogicError("ElementWisePower: The input matrix a is empty."); c.RequireSize(a.GetNumRows(), a.GetNumCols()); a.PrepareDevice(); SyncGuard syncGuard; CUDA_LONG N = (CUDA_LONG) a.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); _elementWisePowerOnCuda<<>>(alpha, a.Data(), c.Data(), N); } } template bool GPUMatrix::AreEqual(const GPUMatrix& a, const GPUMatrix& b, const ElemType threshold /*= 1e-8*/) { if (a.IsEmpty() || b.IsEmpty()) LogicError("AreEqual: one of the input matrices is empty."); if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols()) return false; bool bResult = false; long* res = new long[1]; res[0] = 1; long* d_res = TracingGPUMemoryAllocator::Allocate(a.GetComputeDeviceId(), 1); CUDA_CALL(cudaMemcpy(d_res, res, sizeof(long) * 1, cudaMemcpyHostToDevice)); CUDA_LONG N = (CUDA_LONG) a.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); _areEqual<<>>(a.Data(), b.Data(), N, threshold, d_res); CUDA_CALL(cudaMemcpy(res, d_res, sizeof(long) * 1, cudaMemcpyDeviceToHost)); TracingGPUMemoryAllocator::Free(a.GetComputeDeviceId(), d_res); if (res[0] != 0) bResult = true; delete[] res; return bResult; } // see Matrix::TensorShuffleScaleAndAdd() for comments template void GPUMatrix::TensorShuffleScaleAndAdd(ElemType keepWeight, const GPUMatrix& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const GPUMatrix& b, GPUMatrix& c) { CUDA_LONG N = (CUDA_LONG) c.GetNumElements(); assert(N == (CUDA_LONG) a.GetNumElements() && N == (CUDA_LONG) b.GetNumElements()); assert(a.GetComputeDeviceId() == c.GetComputeDeviceId() && b.GetComputeDeviceId() == c.GetComputeDeviceId()); a.PrepareDevice(); SyncGuard syncGuard; 
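    // One thread per output element: blocksPerGrid below is sized so that
    // blocksPerGrid * GridDim::maxThreadsPerBlock >= N. For example, with maxThreadsPerBlock == 1024
    // (the usual value), N = 1,000,000 elements yields ceil(1e6 / 1024) = 977 blocks.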
    int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock);
    _tensorShuffleScaleAndAdd<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(keepWeight, a.Data(), D, S, M, K, T, scaleFactor, b.Data(), c.Data());
}

template <class ElemType>
bool GPUMatrix<ElemType>::HasElement(const GPUMatrix<ElemType>& a, const ElemType v)
{
    if (a.IsEmpty())
        LogicError("HasElement: the input matrix is empty.");

    ElemType* res = new ElemType[2];
    res[0] = v; // the value to search for
    res[1] = 0; // result flag, set by the kernel if v is found
    ElemType* d_res = TracingGPUMemoryAllocator<ElemType>::Allocate(a.GetComputeDeviceId(), 2);
    CUDA_CALL(cudaMemcpy(d_res, res, sizeof(ElemType) * 2, cudaMemcpyHostToDevice));
    CUDA_LONG N = (CUDA_LONG) a.GetNumElements();
    int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock);
    _hasElement<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(a.Data(), N, d_res);
    CUDA_CALL(cudaMemcpy(res, d_res, sizeof(ElemType) * 2, cudaMemcpyDeviceToHost));
    TracingGPUMemoryAllocator<ElemType>::Free(a.GetComputeDeviceId(), d_res);
    bool bResult = (res[1] != 0);
    delete[] res;
    return bResult;
}

template <class ElemType>
void GPUMatrix<ElemType>::CreateCurandObject(unsigned long seed, const char* caller)
{
    assert(caller != nullptr);

    if (s_curandGenerator == NULL)
    {
        unsigned long long cudaSeed = (seed == USE_TIME_BASED_SEED) ? time(NULL) : seed;
        if (GetMathLibTraceLevel() > 0)
        {
            fprintf(stderr, "%s (GPU): creating curand object with seed %llu, sizeof(ElemType)==%lu\n",
                    caller, cudaSeed, (unsigned long) sizeof(ElemType));
        }
        s_curandGenerator = new curandGenerator_t;
        // Create pseudo-random number generator
        CURAND_CALL(curandCreateGenerator(&(((curandGenerator_t*) s_curandGenerator)[0]), CURAND_RNG_PSEUDO_XORWOW));
        CURAND_CALL(curandSetPseudoRandomGeneratorSeed(((curandGenerator_t*) s_curandGenerator)[0], cudaSeed));
        CURAND_CALL(curandSetGeneratorOrdering(((curandGenerator_t*) s_curandGenerator)[0], CURAND_ORDERING_PSEUDO_SEEDED));
    }
}

template <class ElemType>
void GPUMatrix<ElemType>::ResetCurandObject(unsigned long seed, const char* caller)
{
    assert(caller != nullptr);

    if (s_curandGenerator && (seed != USE_TIME_BASED_SEED))
    {
        // Note: this might be slow.
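        // Re-seeding re-initializes the XORWOW generator state (hence the cost noted above); the offset
        // is also rewound to 0 below so that repeated runs with the same seed reproduce the same sequence.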
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(((curandGenerator_t*) s_curandGenerator)[0], seed)); CURAND_CALL(curandSetGeneratorOffset(((curandGenerator_t*) s_curandGenerator)[0], 0)); } else { CreateCurandObject(seed, caller); } } template GPUMatrix GPUMatrix::Ones(const size_t rows, const size_t cols, int deviceId) { GPUMatrix c(rows, cols, deviceId); // will initialize to 0 c.SetValue(1); return c; } template GPUMatrix GPUMatrix::Zeros(const size_t rows, const size_t cols, int deviceId) { GPUMatrix c(rows, cols, deviceId); // will initialize to 0 // c.SetValue(0); return c; } template GPUMatrix GPUMatrix::Eye(const size_t rows, int deviceId) { GPUMatrix c(rows, rows, deviceId); // will initialize to 0 c.SetDiagonalValue(1); return c; } template GPUMatrix GPUMatrix::RandomUniform(const size_t rows, const size_t cols, int deviceId, const ElemType low, const ElemType high, unsigned long seed) { GPUMatrix c(rows, cols, deviceId); // will initialize to 0 c.SetUniformRandomValue(low, high, seed); return c; } template GPUMatrix GPUMatrix::RandomGaussian(const size_t rows, const size_t cols, int deviceId, const ElemType mean, const ElemType sigma, unsigned long seed) { GPUMatrix c(rows, cols, deviceId); // will initialize to 0 c.SetGaussianRandomValue(mean, sigma, seed); return c; } template ElemType GPUMatrix::GetLearnRateForBlock_Helper(const GPUMatrix& Gradients, const GPUMatrix& SmoothedGradients) { ElemType* d_res = TracingGPUMemoryAllocator::Allocate(Gradients.GetComputeDeviceId(), 1); // Compute inner product of matrices and keep it on device const int m = (int) Gradients.GetNumRows(); const int n = (int) Gradients.GetNumCols(); const int k = (int) SmoothedGradients.GetNumRows(); const int l = (int) SmoothedGradients.GetNumCols(); assert(m > 0 && n > 0 && k > 0 && l > 0); // converting from size_t to int may cause overflow assert(m == k && n == l); // converting from size_t to int may cause overflow if (m != k || n != l) InvalidArgument("InnerProductOfMatrices: Matrices a and b should have same dimension."); if (sizeof(ElemType) == sizeof(double)) { cublasHandle_t cuHandle = GetCublasHandle(Gradients.GetComputeDeviceId()); cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); CUBLAS_CALL(cublasDdot(cuHandle, m * n, reinterpret_cast(Gradients.Data()), 1, reinterpret_cast(SmoothedGradients.Data()), 1, reinterpret_cast(d_res))); cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); } else { cublasHandle_t cuHandle = GetCublasHandle(Gradients.GetComputeDeviceId()); cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_DEVICE); CUBLAS_CALL(cublasSdot(cuHandle, m * n, reinterpret_cast(Gradients.Data()), 1, reinterpret_cast(SmoothedGradients.Data()), 1, reinterpret_cast(d_res))); cublasSetPointerMode(cuHandle, CUBLAS_POINTER_MODE_HOST); } // d_res[0] should now contain inner product of matrices // Compute squared Frobenius norms (squared sums of elements) // note: kernel has hard-coded dimension of 512 _lrHelper512Threads << <1, 512, 0, t_stream >> >(Gradients.Data(), SmoothedGradients.Data(), (CUDA_LONG)Gradients.GetNumElements(), d_res); ElemType res; CUDA_CALL(cudaMemcpy(&res, d_res, sizeof(ElemType), cudaMemcpyDeviceToHost)); TracingGPUMemoryAllocator::Free(Gradients.GetComputeDeviceId(), d_res); return res; } // The inputs are two row vectors [a1 a2 a3 a4] [b1 b2 b3 b4] // The outputs are one matrix of size (nt+1)*4 // The first row is just element multiplication // The rest rows will be with shift template GPUMatrix& GPUMatrix::AssignElementProductOfWithShiftNeg(const GPUMatrix& 
a, const GPUMatrix& b, const size_t shift, const size_t nt) { if (a.IsEmpty() || b.IsEmpty()) LogicError("AssignElementProductOf: Matrix is empty."); assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) InvalidArgument("The input matrix dimensions do not match."); if (!(a.GetNumRows() == 1)) InvalidArgument("The input matrix must be a row vector."); RequireSize(nt + 1, a.GetNumCols()); int BS = a.GetNumCols(); // the output matrix is of size (nt+1, BS) dim3 thread_tail(DEFAULT_THREAD_PER_DIM, DEFAULT_THREAD_PER_DIM); dim3 block_tail((nt + 1 + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM, (BS + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM); a.PrepareDevice(); SyncGuard syncGuard; _assignElementProductOfWithShiftNeg<<>>(Data(), a.Data(), b.Data(), shift, nt + 1, BS); // _assignElementProductOf << > >(Data(), a.Data(), b.Data(), nt); return *this; } template GPUMatrix& GPUMatrix::AssignOneHot(const GPUMatrix& a, vector& shape, size_t axis) { if (a.IsEmpty()) LogicError("AssignOneHot: Matrix a is empty."); if (axis >= shape.size()) LogicError("AssignOneHot: axis is not correct"); size_t item_size = 1; for (size_t i = 0; i < shape.size() && i < axis; i++) item_size *= shape[i]; size_t num_class = shape[axis]; auto nCols = a.GetNumCols(); auto nRows = num_class * a.GetNumRows(); this->RequireSize(nRows, nCols); this->PrepareDevice(); CUDA_CALL(cudaMemset(Data(), 0, nCols * nRows * sizeof(ElemType))); CUDA_LONG N = (CUDA_LONG)a.GetNumElements(); int blocksPerGrid = (int)ceil(((double)N) / GridDim::maxThreadsPerBlock); SyncGuard syncGuard; _assignOneHot << > > (a.Data(), Data(), num_class, item_size, N); return *this; } template GPUMatrix& GPUMatrix::GatherFromTarget(const GPUMatrix& indices, const GPUMatrix& target, size_t row_elements) { if (indices.IsEmpty() || target.IsEmpty()) LogicError("GatherFromTarget: input matrix is empty."); if (row_elements == 0) LogicError("GatherFromTarget: target matrix at least need 1 dim."); auto nCols = indices.GetNumCols(); auto nRows = indices.GetNumRows() * row_elements; this->RequireSize(nRows, nCols); this->PrepareDevice(); ElemType* indicesBufPtr = indices.Data(); ElemType* targetBufPtr = target.Data(); ElemType* buffer = Data(); size_t num_indices = indices.GetNumElements(); CUDA_LONG N = (CUDA_LONG)num_indices * row_elements; int blocksPerGrid = (int)ceil(((double)N) / GridDim::maxThreadsPerBlock); _gatherFromTarget <<>> (indicesBufPtr, targetBufPtr, buffer, row_elements, num_indices, N); return *this; } template GPUMatrix& GPUMatrix::ScatterToIndices(const GPUMatrix& values, const GPUMatrix& indices, size_t row_elements) { if (indices.IsEmpty() || values.IsEmpty()) LogicError("ScatterToIndices: input matrix is empty."); ElemType* indicesBufPtr = indices.Data(); ElemType* valueBufPtr = values.Data(); ElemType* buffer = Data(); size_t num_indices = indices.GetNumElements(); CUDA_LONG N = (CUDA_LONG)num_indices * row_elements; int blocksPerGrid = (int)ceil(((double)N) / GridDim::maxThreadsPerBlock); _scatterToIndices << > > (indicesBufPtr, valueBufPtr, buffer, row_elements, num_indices, N); return *this; } template void GPUMatrix::InnerProductWithShiftNeg(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c, const size_t shift, const size_t nt) { if (a.GetComputeDeviceId() != b.GetComputeDeviceId() || b.GetComputeDeviceId() != c.GetComputeDeviceId()) // different GPUs InvalidArgument("All matrices must be on the same GPU"); if (a.IsEmpty() 
|| b.IsEmpty()) LogicError("Scale: one of the input matrices is empty."); const int m = (int) a.GetNumRows(); const int n = (int) a.GetNumCols(); const int k = (int) b.GetNumRows(); const int l = (int) b.GetNumCols(); assert(m > 0 && n > 0 && k > 0 && l > 0); // converting from size_t to int may cause overflow assert(m == k && n == l); // converting from size_t to int may cause overflow if (m != k || n != l) InvalidArgument("Matrices a and b should have same dimension."); c.RequireSize(nt + 1, n); if (true) { c.PrepareDevice(); dim3 thread_tail(DEFAULT_THREAD_PER_DIM, DEFAULT_THREAD_PER_DIM); dim3 block_tail((nt + 1 + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM, (n + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM); SyncGuard syncGuard; _innerProductWithShiftNeg<<>>(c.Data(), a.Data(), b.Data(), m, n, shift, nt + 1); } } template GPUMatrix& GPUMatrix::GetARowByIndex(const GPUMatrix& a, const size_t m) { if (a.IsEmpty()) LogicError("GetARowByIndex: Matrix is empty."); RequireSize(1, a.GetNumCols()); int n = a.GetNumRows(); int P = a.GetNumCols(); if (m >= n) LogicError("GetARowByIndex: m is out of range."); int blocksPerGrid = (int) ceil(((double) P) / GridDim::maxThreadsPerBlock); a.PrepareDevice(); SyncGuard syncGuard; _getARowByIndex<<>>(Data(), a.Data(), n, P, m); // _assignElementProductOf << > >(Data(), a.Data(), b.Data(), nt); return *this; } // Calculate CTC score // prob (input): the posterior output from the network // alpha, beta (output): alpha and beta for forward-backward calculation. // phoneSeq (input): phone ID sequence for each utterance in this minibatch, each col is one utterance // phoneBoundary (input): phone boundary (frame index) of each phone for each utterance in this minibatch, each col is one utterance // totalScore (output): total CTC score // uttToChanInd (input): map from utterance ID to minibatch channel ID. We need this because each channel may contain more than one utterance. // uttBeginFrame(input): the position of the first frame of each utterance in the minibatch channel. We need this because each channel may contain more than one utterance. // uttFrameNum (input): the frame number of each utterance. The size of this vector = the number of all utterances in this minibatch // uttPhoneNum (input): the phone number of each utterance. The size of this vector = the number of all utterances in this minibatch // numParallelSequences (input): channel number in this minibatch // maxFrameNum (input): the maximum channel frame number // delayConstraint -- label output delay constraint introduced during training that allows to have shorter delay during inference. // Alpha and Beta scores outside of the delay boundary are set to zero. 
// Setting this parameter smaller will result in shorted delay between label output during decoding, yet may hurt accuracy // delayConstraint=-1 means no constraint template GPUMatrix& GPUMatrix::AssignCTCScore(const GPUMatrix& prob, GPUMatrix& alpha, GPUMatrix& beta, const GPUMatrix phoneSeq, const GPUMatrix phoneBoundary, GPUMatrix &totalScore, const std::vector& uttToChanInd, const std::vector & uttBeginFrame, const std::vector & uttFrameNum, const std::vector & uttPhoneNum, const size_t numParallelSequences, const size_t maxFrameNum, const size_t blankTokenId, const int delayConstraint, const bool isColWise) { if (isColWise) { PrepareDevice(); // Total number of phones long totalPhoneNum = prob.GetNumRows(); size_t uttNum = uttFrameNum.size(); // Max number of phones in utterances in this minibatch size_t maxPhoneNum = phoneSeq.GetNumRows(); size_t *gpuFrameNum; CUDA_CALL(cudaMalloc((void **)&gpuFrameNum, uttNum * sizeof(size_t))); CUDA_CALL(cudaMemcpy(gpuFrameNum, uttFrameNum.data(), uttNum * sizeof(size_t), cudaMemcpyHostToDevice)); size_t *gpuPhoneNum; CUDA_CALL(cudaMalloc((void **)&gpuPhoneNum, uttNum * sizeof(size_t))); CUDA_CALL(cudaMemcpy(gpuPhoneNum, uttPhoneNum.data(), uttNum * sizeof(size_t), cudaMemcpyHostToDevice)); size_t *gpuBeginFrame; CUDA_CALL(cudaMalloc((void **)&gpuBeginFrame, uttNum * sizeof(size_t))); CUDA_CALL(cudaMemcpy(gpuBeginFrame, uttBeginFrame.data(), uttNum * sizeof(size_t), cudaMemcpyHostToDevice)); size_t *gpuUttToChanInd; CUDA_CALL(cudaMalloc((void **)&gpuUttToChanInd, uttNum * sizeof(size_t))); CUDA_CALL(cudaMemcpy(gpuUttToChanInd, uttToChanInd.data(), uttNum * sizeof(size_t), cudaMemcpyHostToDevice)); cudaEvent_t done = nullptr; CUDA_CALL(cudaEventCreate(&done)); dim3 thread_tail(DEFAULT_THREAD_PER_DIM, DEFAULT_THREAD_PER_DIM); // x dimension is for utterances // y dimention is for phone sequence in each utterance // Ensure that we allocate correct number of blocks for given number of utterances and max number of phones in those utterances dim3 block_tail((uttNum + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM, (maxPhoneNum + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM); for (long t = 0; t < maxFrameNum; t++) { _assignAlphaScore << > >(prob.Data(), alpha.Data(), phoneSeq.Data(), phoneBoundary.Data(), gpuUttToChanInd, gpuFrameNum, gpuBeginFrame, gpuPhoneNum, numParallelSequences, uttNum, t, maxPhoneNum, totalPhoneNum, blankTokenId, delayConstraint); } for (long t = maxFrameNum - 1; t >= 0; t--) { _assignBetaScore << > >(prob.Data(), beta.Data(), phoneSeq.Data(), phoneBoundary.Data(), gpuUttToChanInd, gpuFrameNum, gpuBeginFrame, gpuPhoneNum, numParallelSequences, uttNum, t, maxPhoneNum, totalPhoneNum, blankTokenId, delayConstraint); } ElemType zerVar = 0.0; totalScore.SetColumn(&zerVar, 0); _assignTotalScore << > > (beta.Data(), totalScore.Data(), uttNum, gpuUttToChanInd, gpuBeginFrame, numParallelSequences, maxPhoneNum); dim3 block_tail_2((uttNum + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM, (maxFrameNum + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM); _assignCTCScore << < block_tail_2, thread_tail, 0, t_stream >> >(Data(), prob.Data(), alpha.Data(), beta.Data(), phoneSeq.Data(), uttNum, gpuUttToChanInd, gpuBeginFrame, gpuPhoneNum, gpuFrameNum, numParallelSequences, maxPhoneNum, totalPhoneNum); CUDA_CALL(cudaFree(gpuFrameNum)); CUDA_CALL(cudaFree(gpuPhoneNum)); CUDA_CALL(cudaFree(gpuBeginFrame)); CUDA_CALL(cudaFree(gpuUttToChanInd)); CUDA_CALL(cudaEventRecord(done)); CUDA_CALL(cudaEventSynchronize(done)); 
CUDA_CALL(cudaEventDestroy(done)); } else { NOT_IMPLEMENTED; } return *this; } template void GPUMatrix::ConductRowElementMultiplyWithShift(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c, const size_t shift, const bool isafixed) { if (a.GetComputeDeviceId() != b.GetComputeDeviceId() || b.GetComputeDeviceId() != c.GetComputeDeviceId()) // different GPUs InvalidArgument("All matrices must be on the same GPU"); if (a.IsEmpty() || b.IsEmpty()) LogicError("Scale: one of the input matrices is empty."); const int m = (int) a.GetNumRows(); const int n = (int) a.GetNumCols(); const int O = (int) b.GetNumRows(); const int P = (int) b.GetNumCols(); assert(m > 0 && n > 0 && O > 0 && P > 0); // converting from size_t to int may cause overflow if (m != 1 || n != P) InvalidArgument("Matrices a and b should have same dimension."); c.RequireSize(O, P); if (true) { c.PrepareDevice(); dim3 thread_tail(DEFAULT_THREAD_PER_DIM, DEFAULT_THREAD_PER_DIM); dim3 block_tail((O + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM, (P + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM); SyncGuard syncGuard; _conductRowElementMultiplyWithShift<<>>(c.Data(), a.Data(), b.Data(), O, P, shift, isafixed); } } template GPUMatrix& GPUMatrix::AssignElementProductOfWithShift(const GPUMatrix& a, const GPUMatrix& b, const size_t shift) { if (a.IsEmpty() || b.IsEmpty()) LogicError("AssignElementProductOfWithShift: Matrix is empty."); assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()); if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols())) InvalidArgument("The input matrix dimensions do not match."); // int O = a.GetNumRows(); int P = a.GetNumCols(); RequireSize(1, P); CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(((double) N) / GridDim::maxThreadsPerBlock); a.PrepareDevice(); SyncGuard syncGuard; _assignElementProductOfWithShift<<>>(Data(), a.Data(), b.Data(), shift, N); return *this; } //sequence training template GPUMatrix& GPUMatrix::DropFrame(const GPUMatrix& label, const GPUMatrix& gamma, const ElemType& threshhold) { if (IsEmpty()) LogicError("DropFrame: Matrix is empty."); PrepareDevice(); long N = (long) GetNumCols(); // one kernel per column int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock); SyncGuard syncGuard; _DropFrame<<>>(Data(), label.Data(), gamma.Data(), threshhold, (long) m_numCols, (long) m_numRows); return *this; } template GPUMatrix& GPUMatrix::AssignSequenceError(const ElemType hsmoothingWeight, const GPUMatrix& label, const GPUMatrix& dnnoutput, const GPUMatrix& gamma, ElemType alpha) { if (IsEmpty()) LogicError("AssignSequenceError: Matrix is empty."); PrepareDevice(); SyncGuard syncGuard; long N = (LONG64) label.GetNumElements(); int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock); _AssignSequenceError<<>>(hsmoothingWeight, Data(), label.Data(), dnnoutput.Data(), gamma.Data(), alpha, N); return *this; } #pragma endregion Static BLAS Functions /// f = logadd(f, vec) to get the logadd sum of vector elments template ElemType GPUMatrix::LogSumOfElements() const { if (IsEmpty()) LogicError("SumOfElements: Matrix is empty"); ElemType* d_sum = TracingGPUMemoryAllocator::Allocate(GetComputeDeviceId(), 1); ElemType h_sum; CUDA_LONG N = (CUDA_LONG) GetNumElements(); int blocksPerGrid = (int) ceil(((double) N) / GridDim::maxThreadsPerBlock); _reductionLogAddSum<<>>(Data(), d_sum, 1, N); CUDA_CALL(cudaMemcpy(&h_sum, d_sum, sizeof(ElemType), cudaMemcpyDeviceToHost)); 
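    // Note: the reduction is carried out in log space, i.e. logadd(x, y) = log(exp(x) + exp(y)),
    // computed stably as max(x, y) + log1p(exp(-|x - y|)), which avoids the overflow a naive
    // log(sum(exp(x_i))) would incur.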
TracingGPUMemoryAllocator::Free(GetComputeDeviceId(), d_sum); return h_sum; } template void GPUMatrix::RCRFBackwardCompute( const GPUMatrix& alpha, GPUMatrix& beta, const GPUMatrix& /*lbls*/, const GPUMatrix& pos_scores, const GPUMatrix& pair_scores, const int shift) { if (alpha.IsEmpty() || pos_scores.IsEmpty() || pair_scores.IsEmpty()) LogicError("RCRFBackwardCompute: one of the input matrices is empty."); if (alpha.GetNumRows() != pos_scores.GetNumRows() || alpha.GetNumCols() != pos_scores.GetNumCols()) LogicError("RCRFBackwardCompute: matrix dimensions mismatched."); size_t iNumLab = alpha.GetNumRows(); size_t iNumPos = alpha.GetNumCols(); alpha.PrepareDevice(); beta.RequireSize(iNumLab, iNumPos); ElemType* d_zeta = TracingGPUMemoryAllocator::Allocate(alpha.GetComputeDeviceId(), iNumLab); CUDA_LONG N = iNumLab; // TODO: change all three '512' to 'GridDim::maxThreadsPerBlock' (not doing this now since I cannot test it) int blocksPerGrid = (int) ceil(1.0 * N / 512); size_t szMemSize; for (int t = iNumPos - 1; t >= 0; t--) { szMemSize = sizeof(ElemType) * iNumLab; // This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab. assert(iNumLab <= 1024); _rcrfBackwardComputeZetaMax1024Labels << > >(t, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, shift); szMemSize = iNumLab * 3; szMemSize *= sizeof(ElemType); // This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == 3 * iNumLab. assert(iNumLab <= 1024); _rcrfBackwardComputeMax1024Labels << > >(t, iNumPos, alpha.Data(), beta.Data(), d_zeta, pair_scores.Data(), iNumLab, shift); } /* error = cudaGetErrorString(cudaPeekAtLastError()); printf("%s\n", error); error = cudaGetErrorString(cudaThreadSynchronize()); printf("%s\n", error); */ TracingGPUMemoryAllocator::Free(alpha.GetComputeDeviceId(), d_zeta); } /** Compute the gradient for the first order Markov transition probabilities It uses equations derived in R. Collobert's paper "Natural language processing (almost) from scratch" */ template void GPUMatrix::RCRFTransGrdCompute(const GPUMatrix& lbls, const GPUMatrix& alpha, const GPUMatrix& beta, const GPUMatrix& pair_scores, GPUMatrix& grd, const int startLbl, const int shift) { assert(shift == 1); int iNumPos = alpha.GetNumCols(); int iNumLab = alpha.GetNumRows(); ElemType* d_zeta = TracingGPUMemoryAllocator::Allocate(alpha.GetComputeDeviceId(), iNumLab); CUDA_LONG N = iNumLab; // TODO: change all three '512' to 'GridDim::maxThreadsPerBlock' (not doing this now since I cannot test it) int blocksPerGrid = (int)ceil(1.0 * N / 512); size_t szMemSize; for (int t = 0; t < iNumPos; t++) { szMemSize = sizeof(ElemType) * iNumLab; // This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab. assert(iNumLab <= 1024); // BUGBUG: This is launched with 512 threads per block, but allocates shared mem as if there is only one block. Likewise for all 4 of these functions. _rcrfTransGrdComputeZetaMax1024Labels << > >(t - 1, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, startLbl, shift); szMemSize = iNumLab * 3; szMemSize *= sizeof(ElemType); // This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab. 
assert(iNumLab <= 1024); _rcrfTransGrdComputeMax1024Labels << > >(t, startLbl, alpha.Data(), beta.Data(), d_zeta, pair_scores.Data(), lbls.Data(), grd.Data(), iNumPos, iNumLab, shift); } TracingGPUMemoryAllocator::Free(alpha.GetComputeDeviceId(), d_zeta); }; // ----------------------------------------------------------------------- // TensorView entry points from Matrix.cpp // ----------------------------------------------------------------------- // helper to provide a vector of ones of at least the given number of elements // TODO: Use this to implement ComputationNode::ConstOnes? Or do we even need that anymore? template static shared_ptr> GetOnesVector(size_t N, DEVICEID_TYPE deviceId) { // using a dynamically allocated array so this will never get freed, avoiding free-after-DLL-unload issues. // and using shared_ptrs since we don't want to leak more than CacheSize elements // when using a plain array we would have to control lifetime of the object and destructor would be called for every element in the array at the end const int CacheSize = 32; static shared_ptr> * onesCache = new shared_ptr>[CacheSize]; // cache of objects if (deviceId >= CacheSize){ LogicError("GetOnesVector: onesCache[] too small (%d entries), increase (you need %d) and recompile.", CacheSize, (int)deviceId + 1); } auto p = onesCache[deviceId]; if (!p || p->GetNumRows() < N) // must (re-)allocate { p = make_shared>(GPUMatrix::Ones(N, 1, deviceId)); onesCache[deviceId] = p; // this will replace the pointer thread-safely (although weird race conditions may happen where a larger entry is overwritten by a smaller one; will still run correctly) } return p; } // perform unary operation 'op' on a giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides // This binds the N-ariness to a template parameter N, and gets the data pointers out from the matrix objects. template void GPUMatrix::TensorOp(ElemType beta, const GPUMatrix& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, const array& offsets, const SmallVector& regularOpDims, const array, 2>& regularStrides, const SmallVector& reducingOpDims, const array, 2>& reducingStrides) { if (reductionOp != ElementWiseOperator::opSum && reductionOp != ElementWiseOperator::opLogSum && reductionOp != ElementWiseOperator::opMin && reductionOp != ElementWiseOperator::opMax && reductionOp != ElementWiseOperator::opElementwiseProduct) InvalidArgument("TensorOp: Unary reduction operations other than opMax, opMin, opSum, and opLogSum are not implemented."); a.PrepareDevice(); if (a.GetComputeDeviceId() != GetComputeDeviceId()) InvalidArgument("All matrices must be on the same GPU"); // special case: linear processing // The case statement has measurable impact for unary ops (but not for binary ops it seems, due to double mem access). // Linear gap-free unary ops happen so regularly that we will eliminate the case statement from the CUDA kernel, and instead expand all. if (regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && reducingOpDims.size() == 0) { // special case: for copy, use cudaMemcpy() instead, or cublas_axpy() // TODO: We should observe if these actually make a speed difference, and if not, remove these special cases. 
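        // For example: a gap-free copy of regularOpDims[0] contiguous elements with beta == 0 and
        // alpha == 1 takes the plain cudaMemcpy path below, while beta == 1 turns the same copy into
        // an axpy accumulation into the target; everything else falls through to LaunchUnaryTensorOp.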
        if (op == ElementWiseOperator::opCopy && beta == 0 && alpha == 1)
            return CUDA_CALL(cudaMemcpy(Data() + offsets[1], a.Data() + offsets[0], sizeof(ElemType) * regularOpDims[0], cudaMemcpyDeviceToDevice));
        else if (op == ElementWiseOperator::opCopy && beta == 1)
            return CUBLAS_CALL(cublas_axpy(GetCublasHandle(GetComputeDeviceId()), (int) regularOpDims[0], &alpha, a.Data() + offsets[0], 1, Data() + offsets[1], 1));
        else
            return LaunchUnaryTensorOp<ElemType>(beta, a.Data() + offsets[0], Data() + offsets[1], alpha, op, regularOpDims[0]);
    }
    // special case: sum-reducing a matrix onto a column vector; can be done with SGEMM
    // Note: A minor risk is that with this, our own reduction function will rarely be used.
    // That function was tested to give the same results with 'double', and nearly the same with 'float' (different summation order matters).
    else if (op == ElementWiseOperator::opCopy && // we are just adding to target without any further operation
             reductionOp == ElementWiseOperator::opSum &&
#ifdef _DEBUG
             sizeof(ElemType) == sizeof(float) && // in debug don't shortcut 'double' so we have some test of our own codepath
#endif
             regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && // we are processing a column
             reducingOpDims.size() == 1 && reducingStrides[0][0] >= (ptrdiff_t) regularOpDims[0])   // reducing across columns and no overlap
    {
        assert(reducingStrides[1][0] == 0);
        auto ARows = regularOpDims[0];    // vertical steps
        auto ACols = reducingOpDims[0];   // horizontal steps (reduction)
        auto ALd = reducingStrides[0][0]; // horizontal step width through matrix
        cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId());
        CUBLAS_CALL(cublas_gemm(cuHandle, CUBLAS_OP_N, CUBLAS_OP_N, (int) /*CRows=*/ARows, /*CCols=*/1, (int) ACols, &alpha,
                                /*A00=*/a.Data() + offsets[0], (int) ALd,
                                /*B00=*/GetOnesVector<ElemType>(ACols, a.GetComputeDeviceId())->Data(), (int) /*BRows=*/ACols, &beta,
                                /*C00=*/Data() + offsets[1], (int) /*CRows=*/ARows));
        return;
    }
    // TODO: Add a special case for tensor bias reduction. cudnn is ~7% faster on Image/QuickE2E.
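    // The branch above folds the column-sum reduction into a single GEMM against a cached ones vector:
    // for each output row i,  C[i] = beta * C[i] + alpha * sum_j A[i, j],
    // which is exactly  C = alpha * A * ones(ACols x 1) + beta * C,  with A read using leading
    // dimension ALd (the stride between reduced columns). GetOnesVector() supplies the ones column.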
// regular case else return TensorOpN(beta, array{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } // perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides template void GPUMatrix::TensorOp(ElemType beta, const GPUMatrix& a, const GPUMatrix& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, const array& offsets, const SmallVector& regularOpDims, const array, 3>& regularStrides, const SmallVector& reducingOpDims, const array, 3>& reducingStrides) { if (reductionOp != ElementWiseOperator::opSum) InvalidArgument("TensorOp: The only permitted binary reduction operation is opSum."); a.PrepareDevice(); if (a.GetComputeDeviceId() != GetComputeDeviceId() || b.GetComputeDeviceId() != GetComputeDeviceId()) InvalidArgument("All matrices must be on the same GPU"); return TensorOpN(beta, array{a.Data(), b.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } // perform ternary operation 'op' on a, and c giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides template void GPUMatrix::TensorOp(ElemType beta, const GPUMatrix& a, const GPUMatrix& b, const GPUMatrix& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, const array& offsets, const SmallVector& regularOpDims, const array, 4>& regularStrides, const SmallVector& reducingOpDims, const array, 4>& reducingStrides) { if (reductionOp != ElementWiseOperator::opSum) InvalidArgument("TensorOp: The only permitted ternary reduction operation is opSum."); a.PrepareDevice(); if (a.GetComputeDeviceId() != GetComputeDeviceId() || b.GetComputeDeviceId() != GetComputeDeviceId() || c.GetComputeDeviceId() != GetComputeDeviceId()) InvalidArgument("All matrices must be on the same GPU"); return TensorOpN(beta, array{a.Data(), b.Data(), c.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } template void GPUMatrix::TensorArgOp(const GPUMatrix& a, ElementWiseOperator reductionOp, const array& offsets, const SmallVector& regularOpDims, const array, 2>& regularStrides, const SmallVector& reducingOpDims, const array, 2>& reducingStrides) { if (reductionOp != ElementWiseOperator::opArgmin && reductionOp != ElementWiseOperator::opArgmax) InvalidArgument("TensorOp: Arg reduction operations other than opArgmax, and opArgmin are not implemented."); a.PrepareDevice(); if (a.GetComputeDeviceId() != GetComputeDeviceId()) InvalidArgument("All matrices must be on the same GPU"); return TensorOpN((ElemType) 0, array{a.Data(), Data()}, (ElemType) 1, ElementWiseOperator::opCopy, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } // ======================================================================= // explicit instantiations business // ======================================================================= template class GPUMatrix; template class GPUMatrix; template class DeviceBoundNumber; template class DeviceBoundNumber; template cublasHandle_t GPUMatrix::s_cuHandle[GPUMatrix::MaxGpus] = {0}; template void* GPUMatrix::s_curandGenerator = NULL; // We use Matrix as the backing store for QuantizedMatrix // Let's explicitly instantiate the methods we need for that purpose template GPUMatrix::GPUMatrix(const size_t numRows, const size_t numCols, int deviceId); template 
GPUMatrix::GPUMatrix(const size_t numRows, const size_t numCols, int deviceId, char* pArray, const size_t matrixFlags); template GPUMatrix::GPUMatrix(const GPUMatrix&); template GPUMatrix::GPUMatrix(GPUMatrix&&); template char* GPUMatrix::CopyToArray() const; template void GPUMatrix::ChangeDeviceTo(int); template void GPUMatrix::Resize(size_t, size_t, bool); template void GPUMatrix::RequireSize(size_t, size_t, bool); template GPUMatrix::~GPUMatrix(); template GPUMatrix GPUMatrix::ColumnSlice(size_t startColumn, size_t numCols) const; template GPUMatrix& GPUMatrix::operator=(GPUMatrix&&); template GPUMatrix::GPUMatrix(int); template void GPUMatrix::SetValue(const char); template void GPUMatrix::SetValue(const size_t numRows, const size_t numCols, int deviceId, char* pArray, size_t matrixFlags, DataTransferer* transferer); //template void GPUMatrix::SetValue(CPUMatrix const&); template void GPUMatrix::SetValue(GPUMatrix const&); //template void GPUMatrix::SetValue(CPUSparseMatrix const&); //template void GPUMatrix::SetValue(GPUSparseMatrix const&); template void GPUMatrix::CopySection(size_t numRows, size_t numCols, char* dst, size_t colStride) const; template void GPUMatrix::Reshape(const size_t, const size_t); template GPUMatrix& GPUMatrix::operator*=(char); template DEVICEID_TYPE GPUMatrix::PrepareDevice(DEVICEID_TYPE deviceId) const; // Support template GPUMatrix::GPUMatrix(const size_t numRows, const size_t numCols, int deviceId); template GPUMatrix::GPUMatrix(const size_t numRows, const size_t numCols, int deviceId, short* pArray, const size_t matrixFlags); template GPUMatrix::GPUMatrix(const GPUMatrix&); template GPUMatrix::GPUMatrix(GPUMatrix&&); template short* GPUMatrix::CopyToArray() const; template void GPUMatrix::ChangeDeviceTo(int); template void GPUMatrix::Resize(size_t, size_t, bool); template void GPUMatrix::RequireSize(size_t, size_t, bool); template GPUMatrix::~GPUMatrix(); template GPUMatrix GPUMatrix::ColumnSlice(size_t startColumn, size_t numCols) const; template GPUMatrix& GPUMatrix::operator=(GPUMatrix&&); template GPUMatrix::GPUMatrix(int); template void GPUMatrix::SetValue(const short); template void GPUMatrix::SetValue(const size_t numRows, const size_t numCols, int deviceId, short* pArray, size_t matrixFlags, DataTransferer* transferer); //template void GPUMatrix::SetValue(CPUMatrix const&); template void GPUMatrix::SetValue(GPUMatrix const&); //template void GPUMatrix::SetValue(CPUSparseMatrix const&); //template void GPUMatrix::SetValue(GPUSparseMatrix const&); template void GPUMatrix::CopySection(size_t numRows, size_t numCols, short* dst, size_t colStride) const; template void GPUMatrix::Reshape(const size_t, const size_t); template GPUMatrix& GPUMatrix::operator*=(short); template DEVICEID_TYPE GPUMatrix::PrepareDevice(DEVICEID_TYPE deviceId) const; template GPUMatrix::GPUMatrix(const size_t, const size_t, int, int*, const size_t); template GPUMatrix::~GPUMatrix(); template int* TracingGPUMemoryAllocator::Allocate(int, size_t); template size_t* TracingGPUMemoryAllocator::Allocate(int, size_t); template long* TracingGPUMemoryAllocator::Allocate(int, size_t); template short* TracingGPUMemoryAllocator::Allocate(int, size_t); template char* TracingGPUMemoryAllocator::Allocate(int, size_t); template float* TracingGPUMemoryAllocator::Allocate(int, size_t); template double* TracingGPUMemoryAllocator::Allocate(int, size_t); template void TracingGPUMemoryAllocator::Free(int, int*, bool); template void TracingGPUMemoryAllocator::Free(int, size_t*, bool); template void 
TracingGPUMemoryAllocator::Free(int, short*, bool); template void TracingGPUMemoryAllocator::Free(int, char*, bool); template void TracingGPUMemoryAllocator::Free(int, float*, bool); template void TracingGPUMemoryAllocator::Free(int, double*, bool); }}} // !!!!This is from helper_cuda.h which comes with CUDA samples!!!! Consider if it is beneficial to just include all helper_cuda.h // TODO: This is duplicated in BestGpu.cpp // Beginning of GPU Architecture definitions int _ConvertSMVer2Cores(int major, int minor) { // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM typedef struct { int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version int Cores; } sSMtoCores; sSMtoCores nGpuArchCoresPerSM[] = { {0x10, 8}, // Tesla Generation (SM 1.0) G80 class {0x11, 8}, // Tesla Generation (SM 1.1) G8x class {0x12, 8}, // Tesla Generation (SM 1.2) G9x class {0x13, 8}, // Tesla Generation (SM 1.3) GT200 class {0x20, 32}, // Fermi Generation (SM 2.0) GF100 class {0x21, 48}, // Fermi Generation (SM 2.1) GF10x class {0x30, 192}, // Kepler Generation (SM 3.0) GK10x class {0x35, 192}, // Kepler Generation (SM 3.5) GK11x class {-1, -1}}; int index = 0; while (nGpuArchCoresPerSM[index].SM != -1) { if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { return nGpuArchCoresPerSM[index].Cores; } index++; } return nGpuArchCoresPerSM[7].Cores; }; // end of GPU Architecture definitions //inline CUDA_LONG _GetFreeMemoryOnCUDADevice(int devId) //{ // CUdevice cudaDevice; // CUresult result = cuDeviceGet(&cudaDevice, devId); // if(result!= CUDA_SUCCESS) // { // return 0; // } // // // create cuda context // CUcontext cudaContext; // result = cuCtxCreate(&cudaContext, CU_CTX_SCHED_AUTO, cudaDevice); // if(result != CUDA_SUCCESS) // { // return 0; // } // // // get the amount of free memory on the graphics card // size_t free; // size_t total; // result = cuMemGetInfo(&free, &total); // if (result!=CUDA_SUCCESS) // { // return 0; // } // else // return (CUDA_LONG)free; //} #endif // CPUONLY