//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#include "Basics.h"
#include "BestGpu.h"
#include "DebugUtil.h"

#ifndef CPUONLY

#include "GPUSparseMatrix.h"
#include "GPUMatrix.h"
#include <cuda_runtime.h>
#include <cusparse_v2.h>
#include "cublas_v2.h"
#include "GPUMatrixCUDAKernels.cuh"
#include <functional>
#include "CommonMatrix.h"
#include <iostream> // for cout/cerr
#include <assert.h>

typedef unsigned char byte;

#pragma warning(disable : 4267) // conversion from 'size_t' to 'unsigned int'; happens in CUDA <<<a, b>>> syntax if a and b are size_t
#pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this

#ifdef _WIN32
// thread local storage to access the current stream, initialize to default stream
extern __declspec(thread)
#else
static
#endif
    cudaStream_t t_stream;

template <>
const char* CudaErrString<cusparseStatus_t>(cusparseStatus_t)
{
    cudaDeviceSynchronize();
    return "(see cusparse.h & look for cusparseStatus_t or CUSPARSE_STATUS_xxx)";
}

namespace Microsoft { namespace MSR { namespace CNTK {

#pragma region Constructors and Destructor

#ifdef NO_SYNC
template <class ElemType>
bool GPUSparseMatrix<ElemType>::do_sync = false;
#else
template <class ElemType>
bool GPUSparseMatrix<ElemType>::do_sync = true;
#endif

template <class ElemType>
GPUSPARSE_INDEX_TYPE GPUSparseMatrix<ElemType>::SecondaryIndexValueAt(size_t idx) const
{
    GPUSPARSE_INDEX_TYPE value;
    CUDA_CALL(cudaMemcpy(&value, SecondaryIndexLocation() + idx, sizeof(GPUSPARSE_INDEX_TYPE), cudaMemcpyDeviceToHost));
    return value;
}
template <class ElemType>
void GPUSparseMatrix<ElemType>::ZeroInit(const MatrixFormat matrixFormat, const DEVICEID_TYPE computeDevice)
{
    if (matrixFormat != MatrixFormat::matrixFormatSparseCSC && matrixFormat != MatrixFormat::matrixFormatSparseCSR &&
        matrixFormat != MatrixFormat::matrixFormatSparseBlockCol && matrixFormat != MatrixFormat::matrixFormatSparseBlockRow)
    {
        LogicError("GPUSparseMatrix: unsupported sparse matrix format");
    }

    m_computeDevice = (computeDevice == AUTOPLACEMATRIX) ? GPUMatrix<ElemType>::GetBestGPUDeviceId() : computeDevice; // current GPU device Id
    m_computeDevice = EnforceOneGPUOnly(m_computeDevice); // see EnforceOneGPUOnly() for comment on what this is
    m_numRows = 0;
    m_numCols = 0;
    m_elemSizeAllocated = m_nz = 0; // Number of non-zero elements
    m_totalBufferSizeAllocated = 0;
    m_sliceViewOffset = 0;
    m_format = matrixFormat;
    m_externalBuffer = false;
    m_pArray = nullptr;
    m_matrixName = nullptr;
    m_blockSize = 0;
    m_rowToId = nullptr;
    m_tempHostBuffer = nullptr;
    m_tempHostBufferSize = 0;
}

template <class ElemType>
GPUSparseMatrix<ElemType>::GPUSparseMatrix(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat matrixFormat /*= MatrixFormat::matrixFormatSparseCSR*/, const DEVICEID_TYPE computeDevice /*= AUTOPLACEMATRIX*/)
{
    ZeroInit(matrixFormat, computeDevice);
    Resize(numRows, numCols, numNZ, true, false);
}

template <class ElemType>
GPUSparseMatrix<ElemType>::GPUSparseMatrix(const MatrixFormat matrixFormat /*= MatrixFormat::matrixFormatSparseCSR*/, const DEVICEID_TYPE computeDevice /*= AUTOPLACEMATRIX*/)
{
    ZeroInit(matrixFormat, computeDevice);
}

template <class ElemType>
GPUSparseMatrix<ElemType>::GPUSparseMatrix(const GPUMatrix<ElemType>& deepCopy, const MatrixFormat matrixFormat /*= MatrixFormat::matrixFormatSparseCSR*/)
{
    ZeroInit(matrixFormat, deepCopy.GetComputeDeviceId());
    if (!deepCopy.IsEmpty())
        SetValue(deepCopy, matrixFormat);
}

template <class ElemType>
GPUSparseMatrix<ElemType>::GPUSparseMatrix(const GPUSparseMatrix<ElemType>& deepCopy)
{
    ZeroInit(deepCopy.GetFormat(), deepCopy.GetComputeDeviceId());
    DeepCopy(deepCopy);
}
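// Example usage (illustrative sketch, not part of the library): the constructors above cover
// the common creation patterns. Assuming float elements and GPU device 0:
//
//     GPUSparseMatrix<float> a(10000, 2048, 50000, matrixFormatSparseCSC, 0); // reserve 50000 nz up front
//     GPUSparseMatrix<float> b(matrixFormatSparseCSR, 0);                     // empty; Resize()/SetValue() later
//     GPUSparseMatrix<float> c(denseInput, matrixFormatSparseCSC);            // denseInput: a GPUMatrix<float> (hypothetical)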
// PrepareDevice - Setup the correct cuda context for an operation
// deviceId - the device on which the operation will take place
//            defaults to -1, which means use the matrix's current device
template <class ElemType>
DEVICEID_TYPE GPUSparseMatrix<ElemType>::PrepareDevice(DEVICEID_TYPE deviceId /*=-1*/) const
{
    // if default value use current compute device
    DEVICEID_TYPE newId = deviceId >= 0 ? deviceId : m_computeDevice;
    Microsoft::MSR::CNTK::PrepareDevice(newId);
    return newId;
}

template <class ElemType>
void GPUSparseMatrix<ElemType>::DeepCopy(const GPUSparseMatrix<ElemType>& deepCopy)
{
    ChangeDeviceTo(deepCopy.m_computeDevice);
    deepCopy.PrepareDevice();

    Resize(deepCopy.m_numRows, deepCopy.m_numCols, deepCopy.GetNumNZElements(), deepCopy.m_format, true, false);
    m_nz = deepCopy.m_nz;
    m_sliceViewOffset = 0; // reset to zero as we only start copying the indices starting from the offset in the source matrix

    CUDA_CALL(cudaMemcpy(BufferPointer(), deepCopy.NzValues(), NzSize(), cudaMemcpyDeviceToDevice));
    CUDA_CALL(cudaMemcpy(MajorIndexLocation(), deepCopy.MajorIndexLocationWithSliceViewOffset(), MajorIndexSize(), cudaMemcpyDeviceToDevice));
    CUDA_CALL(cudaMemcpy(SecondaryIndexLocation(), deepCopy.SecondaryIndexLocation(), SecondaryIndexSize(), cudaMemcpyDeviceToDevice));

    if (deepCopy.m_sliceViewOffset > 0)
    {
        int blocksPerGrid = (int) ceil(1.0 * SecondaryIndexCount() / GridDim::maxThreadsPerBlock);
        cudaEvent_t done = nullptr;
        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
        _shiftColCSCIndexFromSliceViewToAbsolute<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(
            SecondaryIndexLocation(),
            SecondaryIndexCount(),
            GetNumNZElements());
        if (do_sync) CUDA_CALL(cudaEventRecord(done));
        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
    }

    m_externalBuffer = false;
    SetMatrixName(deepCopy.m_matrixName);

    // TODO: to copy other variables used only for class based LM
}

template <class ElemType>
void GPUSparseMatrix<ElemType>::SetValue(const GPUSparseMatrix<ElemType>& deepCopy)
{
    if (!OwnBuffer())
        LogicError("Cannot SetValue on Managed external matrix");

    DeepCopy(deepCopy);
}

// from CPU
template <class ElemType>
void GPUSparseMatrix<ElemType>::SetValue(const CPUSparseMatrix<ElemType>& deepCopy)
{
    if (!OwnBuffer())
        LogicError("Cannot SetValue on Managed external matrix");

    SetFormat(deepCopy.GetFormat());
    if (deepCopy.IsEmpty())
    {
        Reset();
        return;
    }

    if (deepCopy.GetFormat() == matrixFormatSparseCSR)
    {
        SetMatrixFromCSRFormat(deepCopy.RowLocation(), deepCopy.ColLocation(), deepCopy.BufferPointer(), deepCopy.GetNumElemAllocated(), deepCopy.GetNumRows(), deepCopy.GetNumCols());
    }
    else if (deepCopy.GetFormat() == matrixFormatSparseCSC)
    {
        SetMatrixFromCSCFormat(deepCopy.ColLocation(), deepCopy.RowLocation(), deepCopy.BufferPointer(), deepCopy.GetNumElemAllocated(), deepCopy.GetNumRows(), deepCopy.GetNumCols());
    }
    else
        NOT_IMPLEMENTED;
}

template <class ElemType>
void GPUSparseMatrix<ElemType>::CopyToCPUSparseMatrix(CPUSparseMatrix<ElemType>& cpuSparseMatrix) const
{
    cpuSparseMatrix.SetFormat(GetFormat());
    if (IsEmpty())
    {
        cpuSparseMatrix.Reset();
        return;
    }

    if (this->GetFormat() == matrixFormatSparseCSR)
    {
        // we need to do conversion because CPUSparseMatrix uses size_t for indexes while GPUSparseMatrix uses int
        cpuSparseMatrix.Resize(GetNumRows(), GetNumCols(), GetNumElemAllocated(), true, false);
        cpuSparseMatrix.SetNzCount(GetNumNZElements());

        PrepareDevice();

        if (sizeof(GPUSPARSE_INDEX_TYPE) == sizeof(CPUSPARSE_INDEX_TYPE))
        {
            CUDA_CALL(cudaMemcpy(cpuSparseMatrix.RowLocation(), RowLocation(), RowSize(), cudaMemcpyDeviceToHost));
            CUDA_CALL(cudaMemcpy(cpuSparseMatrix.ColLocation(), ColLocation(), ColSize(), cudaMemcpyDeviceToHost));
        }
        else
        {
            GPUSPARSE_INDEX_TYPE* h_CSRRow = (GPUSPARSE_INDEX_TYPE*) ReserveTempHostBuffer(RowSize());
            CUDA_CALL(cudaMemcpy(h_CSRRow, RowLocation(), RowSize(), cudaMemcpyDeviceToHost));
            CopyBuffer(cpuSparseMatrix.RowLocation(), h_CSRRow, SecondaryIndexCount());

            GPUSPARSE_INDEX_TYPE* h_Col = (GPUSPARSE_INDEX_TYPE*) ReserveTempHostBuffer(ColSize());
            CUDA_CALL(cudaMemcpy(h_Col, ColLocation(), ColSize(), cudaMemcpyDeviceToHost));
            CopyBuffer(cpuSparseMatrix.ColLocation(), h_Col, MajorIndexCount());
        }

        CUDA_CALL(cudaMemcpy(cpuSparseMatrix.BufferPointer(), BufferPointer(), GetSizeElemAllocated(), cudaMemcpyDeviceToHost));
    }
    else if (this->GetFormat() == matrixFormatSparseCSC)
    {
        // we need to do conversion because CPUSparseMatrix uses size_t for indexes while GPUSparseMatrix uses int
        cpuSparseMatrix.Resize(GetNumRows(), GetNumCols(), GetNumNZElements(), true, false);
        cpuSparseMatrix.SetNzCount(GetNumNZElements());

        PrepareDevice();
        if (sizeof(GPUSPARSE_INDEX_TYPE) == sizeof(CPUSPARSE_INDEX_TYPE))
        {
            CUDA_CALL(cudaMemcpy(cpuSparseMatrix.RowLocation(), RowLocation(), RowSize(), cudaMemcpyDeviceToHost));
            CUDA_CALL(cudaMemcpy(cpuSparseMatrix.ColLocation(), ColLocation(), ColSize(), cudaMemcpyDeviceToHost));
        }
        else
        {
            GPUSPARSE_INDEX_TYPE* h_CSCCol = (GPUSPARSE_INDEX_TYPE*) ReserveTempHostBuffer(ColSize());
            CUDA_CALL(cudaMemcpy(h_CSCCol, ColLocation(), ColSize(), cudaMemcpyDeviceToHost));
            CopyBuffer(cpuSparseMatrix.ColLocation(), h_CSCCol, SecondaryIndexCount());

            GPUSPARSE_INDEX_TYPE* h_Row = (GPUSPARSE_INDEX_TYPE*) ReserveTempHostBuffer(RowSize());
            CUDA_CALL(cudaMemcpy(h_Row, RowLocation(), RowSize(), cudaMemcpyDeviceToHost));
            CopyBuffer(cpuSparseMatrix.RowLocation(), h_Row, MajorIndexCount());
        }

        CUDA_CALL(cudaMemcpy(cpuSparseMatrix.BufferPointer(), BufferPointer(), GetSizeElemAllocated(), cudaMemcpyDeviceToHost));
    }
    else
        NOT_IMPLEMENTED;
}

template <class ElemType>
void GPUSparseMatrix<ElemType>::CopyToDenseMatrix(GPUMatrix<ElemType>& denseMatrix) const
{
    if (IsEmpty())
    {
        denseMatrix.Resize(0, 0);
        return;
    }

    PrepareDevice();
    cusparseHandle_t cusparseHandle = 0;
    CUSPARSE_CALL(cusparseCreate(&cusparseHandle));
    cusparseMatDescr_t descr = 0;
    CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

    denseMatrix.Resize(m_numRows, m_numCols);

    cudaEvent_t done = nullptr;
    if (do_sync) CUDA_CALL(cudaEventCreate(&done));
    CUSPARSE_CALL(cusparseSetStream(cusparseHandle, t_stream));
    if (m_format == MatrixFormat::matrixFormatSparseCSR)
    {
        if (sizeof(ElemType) == sizeof(float))
        {
            CUSPARSE_CALL(cusparseScsr2dense(cusparseHandle, int(m_numRows), int(m_numCols), descr, (float*) BufferPointer(), RowLocation(), ColLocation(), (float*) denseMatrix.BufferPointer(), int(m_numRows)));
        }
        else
        {
            CUSPARSE_CALL(cusparseDcsr2dense(cusparseHandle, int(m_numRows), int(m_numCols), descr, (double*) BufferPointer(), RowLocation(), ColLocation(), (double*) denseMatrix.BufferPointer(), int(m_numRows)));
        }
    }
    else if (m_format == MatrixFormat::matrixFormatSparseCSC)
    {
        if (sizeof(ElemType) == sizeof(float))
        {
            CUSPARSE_CALL(cusparseScsc2dense(cusparseHandle, int(m_numRows), int(m_numCols), descr, (float*) BufferPointer(), RowLocation(), ColLocation(), (float*) denseMatrix.BufferPointer(), int(m_numRows)));
        }
        else
        {
            CUSPARSE_CALL(cusparseDcsc2dense(cusparseHandle, int(m_numRows), int(m_numCols), descr, (double*) BufferPointer(), RowLocation(), ColLocation(), (double*) denseMatrix.BufferPointer(), int(m_numRows)));
        }
    }
    else
    {
        NOT_IMPLEMENTED;
    }

    if (do_sync) CUDA_CALL(cudaEventRecord(done));
    if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync) CUDA_CALL(cudaEventDestroy(done));
    CUSPARSE_CALL(cusparseDestroy(cusparseHandle));

    denseMatrix.SetMatrixName(m_matrixName);
}
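// Usage note (illustrative): CopyToDenseMatrix is the easiest way to inspect sparse contents,
// e.g. while debugging. Assuming `s` is a CSC GPUSparseMatrix<float> on device 0:
//
//     GPUMatrix<float> d(0);
//     s.CopyToDenseMatrix(d);                               // cusparse{S,D}csc2dense for CSC, {S,D}csr2dense for CSR
//     GPUSparseMatrix<float> s2(d, matrixFormatSparseCSC);  // round trip back to sparse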
template <class ElemType>
void GPUSparseMatrix<ElemType>::ConvertToSparseFormat(MatrixFormat newFormat, GPUSparseMatrix<ElemType>& outMatrix) const
{
    if (IsEmpty())
    {
        outMatrix.ZeroInit(newFormat, GetComputeDeviceId());
        return;
    }

    MatrixFormat oldFormat = GetFormat();
    if (oldFormat == newFormat)
    {
        outMatrix.SetValue(*this);
        return;
    }

    PrepareDevice();
    cusparseHandle_t cusparseHandle = 0;
    CUSPARSE_CALL(cusparseCreate(&cusparseHandle));

    cudaEvent_t done = nullptr;
    if (do_sync) CUDA_CALL(cudaEventCreate(&done));
    CUSPARSE_CALL(cusparseSetStream(cusparseHandle, t_stream));

    outMatrix.ChangeDeviceTo(GetComputeDeviceId());
    outMatrix.Resize(m_numRows, m_numCols, m_nz, newFormat, true, false);
    outMatrix.SetNzCount(m_nz);

    if ((oldFormat == matrixFormatSparseCSR && newFormat == matrixFormatSparseCSC) ||
        (oldFormat == matrixFormatSparseCSC && newFormat == matrixFormatSparseCSR))
    {
        if (sizeof(ElemType) == sizeof(float))
        {
            CUSPARSE_CALL(cusparseScsr2csc(cusparseHandle, int(m_numRows), int(m_numCols), int(m_elemSizeAllocated),
                                           (float*) BufferPointer(), RowLocation(), ColLocation(), (float*) outMatrix.BufferPointer(),
                                           outMatrix.RowLocation(), outMatrix.ColLocation(), CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO));
        }
        else
        {
            CUSPARSE_CALL(cusparseDcsr2csc(cusparseHandle, int(m_numRows), int(m_numCols), int(m_elemSizeAllocated),
                                           (double*) BufferPointer(), RowLocation(), ColLocation(), (double*) outMatrix.BufferPointer(),
                                           outMatrix.RowLocation(), outMatrix.ColLocation(), CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO));
        }
    }
    else
    {
        NOT_IMPLEMENTED;
    }

    if (do_sync) CUDA_CALL(cudaEventRecord(done));
    if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync) CUDA_CALL(cudaEventDestroy(done));
    CUSPARSE_CALL(cusparseDestroy(cusparseHandle));
}

template <class ElemType>
void GPUSparseMatrix<ElemType>::ConvertToSparseFormat(MatrixFormat newFormat)
{
    if (IsEmpty())
    {
        SetFormat(newFormat);
        return;
    }

    MatrixFormat oldFormat = GetFormat();
    if (oldFormat == newFormat)
        return;

    GPUSparseMatrix<ElemType> tempMatrix(newFormat, GetComputeDeviceId());
    ConvertToSparseFormat(newFormat, tempMatrix);

    *this = std::move(tempMatrix);
}

template <class ElemType>
GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::CopyToDenseMatrix() const
{
    GPUMatrix<ElemType> res(GetComputeDeviceId());
    if (!IsEmpty())
        CopyToDenseMatrix(res);
    return res;
}

template <class ElemType>
void GPUSparseMatrix<ElemType>::ChangeDeviceTo(DEVICEID_TYPE to_id)
{
    if (!OwnBuffer())
        LogicError("Cannot change device on Managed external matrix");
    if (to_id == CPUDEVICE)
        LogicError("to_id must be valid GPU");
    if (m_computeDevice == to_id)
        return;

    if (m_totalBufferSizeAllocated == 0) // nothing to move
    {
        assert(m_pArray == nullptr);
    }
    else
    {
        ElemType* d_dst = TracingGPUMemoryAllocator::Allocate<ElemType>(to_id, m_totalBufferSizeAllocated / sizeof(ElemType));

        // first try peer access
        int canAccessPeer = false;
        CUDA_CALL(cudaDeviceCanAccessPeer(&canAccessPeer, to_id, m_computeDevice));
        if (canAccessPeer)
        {
            cudaError_t cudaStatus = cudaDeviceEnablePeerAccess(m_computeDevice, 0);
            if (cudaStatus != cudaErrorPeerAccessAlreadyEnabled)
            {
                CUDA_CALL(cudaStatus);
            }
            CUDA_CALL(cudaMemcpyPeer(d_dst, to_id, m_pArray, m_computeDevice, m_totalBufferSizeAllocated));
        }
        else
        {
            // peer access didn't work, just copy normal
            // make this more efficient by keeping some buffers available for each copy
            ElemType* h_dst = NULL;
            PrepareDevice();
            CUDA_CALL(cudaMallocHost((void**) &h_dst, m_totalBufferSizeAllocated));
            CUDA_CALL(cudaMemcpy(h_dst, m_pArray, m_totalBufferSizeAllocated, cudaMemcpyDeviceToHost));
            PrepareDevice((DEVICEID_TYPE) to_id);
            CUDA_CALL(cudaMemcpy(d_dst, h_dst, m_totalBufferSizeAllocated, cudaMemcpyHostToDevice));
            CUDA_CALL(cudaFreeHost(h_dst));
        }

        TracingGPUMemoryAllocator::Free<ElemType>(m_computeDevice, m_pArray);
        m_pArray = d_dst;
    }

    SetComputeDeviceId(PrepareDevice(to_id));
}
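// Design note on ChangeDeviceTo: cudaMemcpyPeer moves the buffer directly between GPUs when
// cudaDeviceCanAccessPeer reports support (PCIe/NVLink); otherwise the code stages the copy
// through pinned host memory (cudaMallocHost), which works between any two devices at the
// cost of an extra host round trip.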
LogicError("Cannot SetValue on Managed external matrix"); SetValue(denseMatrix, GetFormat()); } template void GPUSparseMatrix::SetValue(const GPUMatrix& denseMatrix, const MatrixFormat matrixFormat) { if (!OwnBuffer()) LogicError("Cannot SetValue on Managed external matrix"); if (matrixFormat != matrixFormatSparseCSR && matrixFormat != matrixFormatSparseCSC) { NOT_IMPLEMENTED; } PrepareDevice(); cusparseHandle_t cusparseHandle = 0; CUSPARSE_CALL(cusparseCreate(&cusparseHandle)); cusparseMatDescr_t descr = 0; CUSPARSE_CALL(cusparseCreateMatDescr(&descr)); cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO); int numRows = (int) denseMatrix.GetNumRows(); // m int numCols = (int) denseMatrix.GetNumCols(); // n int* nnzPerRowOrCol = TracingGPUMemoryAllocator::Allocate(m_computeDevice, ((matrixFormat & matrixFormatRowMajor) ? numRows : numCols)); int nnzTotalDevHostPtr = -1; cudaEvent_t done = nullptr; if (do_sync) CUDA_CALL(cudaEventCreate(&done)); if (sizeof(ElemType) == sizeof(float)) { CUSPARSE_CALL(cusparseSnnz(cusparseHandle, (matrixFormat & matrixFormatRowMajor) ? CUSPARSE_DIRECTION_ROW : CUSPARSE_DIRECTION_COLUMN, (int) numRows, (int) numCols, descr, reinterpret_cast(denseMatrix.BufferPointer()), (int) numRows, nnzPerRowOrCol, &nnzTotalDevHostPtr)); } else { CUSPARSE_CALL(cusparseDnnz(cusparseHandle, (matrixFormat & matrixFormatRowMajor) ? CUSPARSE_DIRECTION_ROW : CUSPARSE_DIRECTION_COLUMN, (int) numRows, (int) numCols, descr, reinterpret_cast(denseMatrix.BufferPointer()), (int) numRows, nnzPerRowOrCol, &nnzTotalDevHostPtr)); } if (do_sync) CUDA_CALL(cudaEventRecord(done)); if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); if (do_sync) CUDA_CALL(cudaEventDestroy(done)); Resize(numRows, numCols, nnzTotalDevHostPtr, matrixFormat, true, false); SetNzCount(nnzTotalDevHostPtr); if (do_sync) CUDA_CALL(cudaEventCreate(&done)); if (m_format == MatrixFormat::matrixFormatSparseCSR) { if (sizeof(ElemType) == sizeof(float)) { CUSPARSE_CALL(cusparseSdense2csr(cusparseHandle, (int) m_numRows, (int) m_numCols, descr, reinterpret_cast(denseMatrix.BufferPointer()), (int) m_numRows, nnzPerRowOrCol, reinterpret_cast(BufferPointer()), RowLocation(), ColLocation())); } else { CUSPARSE_CALL(cusparseDdense2csr(cusparseHandle, (int) m_numRows, (int) m_numCols, descr, reinterpret_cast(denseMatrix.BufferPointer()), (int) m_numRows, nnzPerRowOrCol, reinterpret_cast(BufferPointer()), RowLocation(), ColLocation())); } } else if (m_format == MatrixFormat::matrixFormatSparseCSC) { if (sizeof(ElemType) == sizeof(float)) { CUSPARSE_CALL(cusparseSdense2csc(cusparseHandle, (int) m_numRows, (int) m_numCols, descr, reinterpret_cast(denseMatrix.BufferPointer()), (int) m_numRows, nnzPerRowOrCol, reinterpret_cast(BufferPointer()), RowLocation(), ColLocation())); } else { CUSPARSE_CALL(cusparseDdense2csc(cusparseHandle, (int) m_numRows, (int) m_numCols, descr, reinterpret_cast(denseMatrix.BufferPointer()), (int) m_numRows, nnzPerRowOrCol, reinterpret_cast(BufferPointer()), RowLocation(), ColLocation())); } } if (do_sync) CUDA_CALL(cudaEventRecord(done)); if (do_sync) CUDA_CALL(cudaEventSynchronize(done)); if (do_sync) CUDA_CALL(cudaEventDestroy(done)); SetMatrixName(denseMatrix.GetMatrixName()); } template GPUSparseMatrix& GPUSparseMatrix::operator=(const GPUSparseMatrix& deepCopy) { Clear(); if (this != &deepCopy) SetValue(deepCopy); SetMatrixName(deepCopy.m_matrixName); return *this; } template GPUSparseMatrix::GPUSparseMatrix(GPUSparseMatrix&& moveFrom) { 
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator=(const GPUSparseMatrix<ElemType>& deepCopy)
{
    Clear();
    if (this != &deepCopy)
        SetValue(deepCopy);

    SetMatrixName(deepCopy.m_matrixName);
    return *this;
}

template <class ElemType>
GPUSparseMatrix<ElemType>::GPUSparseMatrix(GPUSparseMatrix<ElemType>&& moveFrom)
{
    m_computeDevice = moveFrom.m_computeDevice;
    m_numRows = moveFrom.m_numRows;
    m_numCols = moveFrom.m_numCols;
    m_nz = moveFrom.m_nz;
    m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
    m_totalBufferSizeAllocated = moveFrom.m_totalBufferSizeAllocated;
    m_pArray = moveFrom.m_pArray;
    m_sliceViewOffset = moveFrom.m_sliceViewOffset;
    m_format = moveFrom.m_format;
    m_externalBuffer = moveFrom.m_externalBuffer;
    m_matrixName = moveFrom.m_matrixName;

    m_blockSize = moveFrom.m_blockSize;
    m_rowToId = moveFrom.m_rowToId;

    m_tempHostBuffer = moveFrom.m_tempHostBuffer;
    m_tempHostBufferSize = moveFrom.m_tempHostBufferSize;

    moveFrom.ZeroInit(moveFrom.m_format, moveFrom.m_computeDevice); // so that memory in moveFrom is not freed
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator=(GPUSparseMatrix<ElemType>&& moveFrom)
{
    if (this != &moveFrom)
    {
        if (OwnBuffer())
            ReleaseMemory(); // always delete the data pointer since we will use the pointer from moveFrom

        m_computeDevice = moveFrom.m_computeDevice;
        m_numRows = moveFrom.m_numRows;
        m_numCols = moveFrom.m_numCols;
        m_nz = moveFrom.m_nz;
        m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
        m_totalBufferSizeAllocated = moveFrom.m_totalBufferSizeAllocated;
        m_pArray = moveFrom.m_pArray;
        m_sliceViewOffset = moveFrom.m_sliceViewOffset;
        m_format = moveFrom.m_format;
        m_externalBuffer = moveFrom.m_externalBuffer;
        m_matrixName = moveFrom.m_matrixName;

        m_blockSize = moveFrom.m_blockSize;
        m_rowToId = moveFrom.m_rowToId;

        m_tempHostBuffer = moveFrom.m_tempHostBuffer;
        m_tempHostBufferSize = moveFrom.m_tempHostBufferSize;

        moveFrom.ZeroInit(moveFrom.m_format, moveFrom.m_computeDevice);
    }

    return *this;
}

template <class ElemType>
GPUSparseMatrix<ElemType>::~GPUSparseMatrix()
{
    ReleaseMemory();
}
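// Usage note (illustrative): the move constructor/assignment transfer buffer ownership without
// any device copy, so returning a GPUSparseMatrix by value or writing
//
//     GPUSparseMatrix<float> m = std::move(other);
//
// is cheap. The moved-from object is reset via ZeroInit, so its destructor frees nothing.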
template <class ElemType>
void GPUSparseMatrix<ElemType>::ReleaseMemory()
{
    // If OwnBuffer() is false then this matrix
    // is simply a view over another matrix. In that
    // case we shouldn't free anything.
    if (OwnBuffer())
    {
        delete[] m_matrixName;
        m_matrixName = nullptr;

        delete[](byte*) m_tempHostBuffer;
        m_tempHostBuffer = nullptr;

        TracingGPUMemoryAllocator::Free<ElemType>(m_computeDevice, m_pArray);
        m_pArray = nullptr;

        TracingGPUMemoryAllocator::Free<GPUSPARSE_INDEX_TYPE>(m_computeDevice, m_rowToId);
        m_rowToId = nullptr;
    }

    ZeroInit(m_format, m_computeDevice);
}

// ResizeAsAndCopyIndexFrom - Resize this sparse matrix to have the same element structure as the passed matrix
// a - sparse matrix whose structure we want to clone
// remark: this was done for element wise operations where the structure will be identical after an operation
template <class ElemType>
void GPUSparseMatrix<ElemType>::ResizeAsAndCopyIndexFrom(const GPUSparseMatrix<ElemType>& a, const bool growOnly /*= true*/)
{
    Resize(a.m_numRows, a.m_numCols, a.m_nz, a.m_format, growOnly, false);
    SetNzCount(a.m_nz);

    CUDA_CALL(cudaMemcpy(MajorIndexLocation(), a.MajorIndexLocation(), MajorIndexSize(), cudaMemcpyDeviceToDevice));
    CUDA_CALL(cudaMemcpy(SecondaryIndexLocation(), a.SecondaryIndexLocation(), SecondaryIndexSize(), cudaMemcpyDeviceToDevice));
}

//-------------------------------------------------------------------------
// Start of new GPU Sparse Matrix code
//-------------------------------------------------------------------------
template <class ElemType>
void GPUSparseMatrix<ElemType>::Reshape(const size_t numRows, const size_t numCols)
{
    if (!OwnBuffer())
        LogicError("GPUSparseMatrix::Reshape: Cannot Reshape since the buffer is managed externally.");

    if (m_numRows == numRows && m_numCols == numCols)
        return;

    if (m_format != MatrixFormat::matrixFormatSparseCSC)
        NOT_IMPLEMENTED;

    if (m_numRows * m_numCols != numRows * numCols)
        LogicError("GPUSparseMatrix::Reshape: new matrix size does not match current size, can't be reshaped. Did you mean to resize?");

    size_t bufferSizeNeeded = BufferSizeNeeded(numRows, numCols, m_elemSizeAllocated, m_format);

    ElemType* pArray = TracingGPUMemoryAllocator::Allocate<ElemType>(m_computeDevice, bufferSizeNeeded / sizeof(ElemType));

    if (m_pArray != nullptr)
    {
        CUDA_CALL(cudaMemcpy(pArray, BufferPointer(), GetSizeElemAllocated(), cudaMemcpyDeviceToDevice));

        GPUSPARSE_INDEX_TYPE* majorIndexInNewBuffer = (GPUSPARSE_INDEX_TYPE*) (pArray + m_elemSizeAllocated);
        GPUSPARSE_INDEX_TYPE* secondaryIndexInNewBuffer = majorIndexInNewBuffer + MajorIndexCount(numRows, numCols, m_elemSizeAllocated, m_format);

        int blocksPerGrid = (int) ceil(1.0 * numCols / GridDim::maxThreadsPerBlock);
        cudaEvent_t done = nullptr;
        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
        _reshape<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(
            m_numRows,                // old row count
            m_numCols,                // old col count
            numRows,                  // new row count
            numCols,                  // new col count
            MajorIndexLocation(),     // old row index array
            SecondaryIndexLocation(), // old column index array
            majorIndexInNewBuffer,    // new row index array
            secondaryIndexInNewBuffer // new column index array
            );
        if (do_sync) CUDA_CALL(cudaEventRecord(done));
        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync) CUDA_CALL(cudaEventDestroy(done));

        TracingGPUMemoryAllocator::Free<ElemType>(m_computeDevice, m_pArray);
    }

    m_pArray = pArray;
    m_numRows = numRows;
    m_numCols = numCols;
    m_totalBufferSizeAllocated = bufferSizeNeeded;

    // following are generated dynamically and no need to save
    if (m_rowToId != nullptr)
        TracingGPUMemoryAllocator::Free<GPUSPARSE_INDEX_TYPE>(m_computeDevice, m_rowToId);

    m_rowToId = TracingGPUMemoryAllocator::Allocate<GPUSPARSE_INDEX_TYPE>(m_computeDevice, m_elemSizeAllocated);
}
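// Worked example for Reshape (CSC only; a sketch of the expected semantics): reshape preserves
// the column-major linear order of elements, so an entry at (row i, col j) has linear index
// p = i + j * oldRows and lands at (p % newRows, p / newRows) in the new shape. E.g. reshaping
// 4x2 -> 2x4 sends element (3, 1) to p = 3 + 1*4 = 7, i.e. position (1, 3); the _reshape kernel
// rewrites the row index and column pointer arrays accordingly.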
template <class ElemType>
void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const bool growOnly, bool keepExistingValues)
{
    Resize(numRows, numCols, numNZElemToReserve, GetFormat(), growOnly, keepExistingValues);
}

// WARNING: When memory is reallocated, existing information will be lost. The workaround is to allocate enough memory from the start.
// TODO: add keepExistingValues (default to true) argument so that the existing values are kept even after reallocation
template <class ElemType>
void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const MatrixFormat matrixFormat, const bool growOnly /*= true*/, bool keepExistingValues /*= true*/)
{
    if (!OwnBuffer())
        LogicError("Cannot Resize since the buffer is managed externally.");

    if (matrixFormat != m_format || m_numRows != numRows || m_numCols != numCols)
        keepExistingValues = false;

    size_t bufferSizeNeeded = BufferSizeNeeded(numRows, numCols, numNZElemToReserve, matrixFormat);
    bool reallocate = (m_totalBufferSizeAllocated < bufferSizeNeeded || (!growOnly && m_totalBufferSizeAllocated > bufferSizeNeeded));

    if (reallocate)
    {
        ElemType* pArray = TracingGPUMemoryAllocator::Allocate<ElemType>(m_computeDevice, bufferSizeNeeded / sizeof(ElemType));

        if (m_pArray != nullptr)
        {
            if (keepExistingValues)
            {
                if (m_nz > numNZElemToReserve || m_totalBufferSizeAllocated > bufferSizeNeeded)
                    LogicError("Resize: To keep values m_nz should <= numNZElemToReserve.");

                CUDA_CALL(cudaMemcpy(pArray, BufferPointer(), GetSizeElemAllocated(), cudaMemcpyDeviceToDevice));

                GPUSPARSE_INDEX_TYPE* majorIndexInNewBuffer = (GPUSPARSE_INDEX_TYPE*) (pArray + numNZElemToReserve);
                CUDA_CALL(cudaMemcpy(majorIndexInNewBuffer, MajorIndexLocation(), MajorIndexSize(), cudaMemcpyDeviceToDevice));

                GPUSPARSE_INDEX_TYPE* secondaryIndexInNewBuffer = majorIndexInNewBuffer + MajorIndexCount(numRows, numCols, numNZElemToReserve, matrixFormat);
                CUDA_CALL(cudaMemcpy(secondaryIndexInNewBuffer, SecondaryIndexLocation(), SecondaryIndexSize(), cudaMemcpyDeviceToDevice));
            }
            else
                m_nz = 0;

            TracingGPUMemoryAllocator::Free<ElemType>(m_computeDevice, m_pArray);
        }

        m_pArray = pArray;

        // following are generated dynamically and no need to save
        if (m_rowToId != nullptr)
            TracingGPUMemoryAllocator::Free<GPUSPARSE_INDEX_TYPE>(m_computeDevice, m_rowToId);

        m_rowToId = TracingGPUMemoryAllocator::Allocate<GPUSPARSE_INDEX_TYPE>(m_computeDevice, numNZElemToReserve);

        m_totalBufferSizeAllocated = bufferSizeNeeded;
        m_elemSizeAllocated = numNZElemToReserve;
    }
    else // if requested size is smaller, keeping original values does not make sense
    {
        m_elemSizeAllocated = ElemCountFromBufferSize(numRows, numCols, matrixFormat, m_totalBufferSizeAllocated);
    }

    m_numRows = numRows;
    m_numCols = numCols;
    m_format = matrixFormat;
}

// Reset matrix so it can be reused
template <class ElemType>
void GPUSparseMatrix<ElemType>::Reset()
{
    m_nz = 0;
    m_blockSize = 0;
}
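// Buffer layout note (worked example): for CSC, one allocation holds nz values, nz row indices,
// and numCols+1 column pointers, so BufferSizeNeeded(r, c, nz, CSC) is roughly
// nz * (sizeof(ElemType) + sizeof(GPUSPARSE_INDEX_TYPE)) + (c + 1) * sizeof(GPUSPARSE_INDEX_TYPE).
// E.g. float elements, 4-byte indices, c = 100, nz = 1000: 1000*8 + 101*4 = 8404 bytes
// (before any alignment padding the implementation may add).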
// copy features to GPU
template <class ElemType>
void GPUSparseMatrix<ElemType>::SetMatrixFromCSRFormat(const GPUSPARSE_INDEX_TYPE* h_CSRRow, const GPUSPARSE_INDEX_TYPE* h_Col, const ElemType* h_Val,
                                                       const size_t nz, const size_t numRows, const size_t numCols, const bool IsOnDevice /*= false*/, const DEVICEID_TYPE devId /*= -1*/)
{
    if (!OwnBuffer())
        LogicError("Cannot Set since the buffer is managed externally.");

    if (h_CSRRow == nullptr || h_Col == nullptr || h_Val == nullptr)
        LogicError("SetMatrixFromCSRFormat: nullptr passed in.");

    SetComputeDeviceId(PrepareDevice(devId));

    m_format = matrixFormatSparseCSR;
    Resize(numRows, numCols, nz, true, false);
    SetNzCount(nz);

    cudaMemcpyKind kind = IsOnDevice ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice;
    CUDA_CALL(cudaMemcpy(BufferPointer(), h_Val, NzSize(), kind));

    if (sizeof(CPUSPARSE_INDEX_TYPE) == sizeof(GPUSPARSE_INDEX_TYPE))
    {
        CUDA_CALL(cudaMemcpy(RowLocation(), h_CSRRow, RowSize(), kind));
        CUDA_CALL(cudaMemcpy(ColLocation(), h_Col, ColSize(), kind));
    }
    else
    {
        GPUSPARSE_INDEX_TYPE* pCol = (GPUSPARSE_INDEX_TYPE*) ReserveTempHostBuffer(RowSize() + ColSize());
        CopyBuffer(pCol, h_Col, MajorIndexCount());

        GPUSPARSE_INDEX_TYPE* pRow = pCol + MajorIndexCount();
        CopyBuffer(pRow, h_CSRRow, SecondaryIndexCount());

        CUDA_CALL(cudaMemcpy(RowLocation(), pRow, RowSize(), kind));
        CUDA_CALL(cudaMemcpy(ColLocation(), pCol, ColSize(), kind));
    }
}

// this function will allocate memory while the caller needs to release it
template <class ElemType>
void GPUSparseMatrix<ElemType>::GetMatrixFromCSRFormat(CPUSPARSE_INDEX_TYPE*& h_CSRRow, CPUSPARSE_INDEX_TYPE*& h_Col, ElemType*& h_Val, size_t& numElemAllocated, size_t& nz, size_t& numRows, size_t& numCols) const
{
    if (!OwnBuffer())
        LogicError("Cannot Set since the buffer is managed externally.");

    if (h_CSRRow != nullptr || h_Col != nullptr || h_Val != nullptr)
        LogicError("GetMatrixFromCSRFormat: Passed pointers must be nullptr");

    numElemAllocated = GetNumElemAllocated();
    nz = GetNumNZElements();
    numRows = GetNumRows();
    numCols = GetNumCols();

    if (IsEmpty() || nz == 0)
        return;
    else
    {
        h_Val = new ElemType[numElemAllocated];
        h_CSRRow = new CPUSPARSE_INDEX_TYPE[m_numRows + 1];
        h_Col = new CPUSPARSE_INDEX_TYPE[nz];

        PrepareDevice();
        CUDA_CALL(cudaMemcpy(h_Val, BufferPointer(), GetSizeElemAllocated(), cudaMemcpyDeviceToHost));

        if (sizeof(CPUSPARSE_INDEX_TYPE) == sizeof(GPUSPARSE_INDEX_TYPE))
        {
            CUDA_CALL(cudaMemcpy(h_CSRRow, RowLocation(), RowSize(), cudaMemcpyDeviceToHost));
            CUDA_CALL(cudaMemcpy(h_Col, ColLocation(), ColSize(), cudaMemcpyDeviceToHost));
        }
        else
        {
            GPUSPARSE_INDEX_TYPE* pCol = (GPUSPARSE_INDEX_TYPE*) ReserveTempHostBuffer(RowSize() + ColSize());
            GPUSPARSE_INDEX_TYPE* pRow = pCol + MajorIndexCount();

            CUDA_CALL(cudaMemcpy(pRow, RowLocation(), RowSize(), cudaMemcpyDeviceToHost));
            CUDA_CALL(cudaMemcpy(pCol, ColLocation(), ColSize(), cudaMemcpyDeviceToHost));

            CopyBuffer(h_Col, pCol, MajorIndexCount());
            CopyBuffer(h_CSRRow, pRow, SecondaryIndexCount());
        }
    }
}
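// Worked CSR example (illustrative): the host arrays passed to SetMatrixFromCSRFormat use the
// standard zero-based CSR layout. For the 2x3 matrix [[1 0 2], [0 3 0]]:
//
//     GPUSPARSE_INDEX_TYPE csrRow[] = {0, 2, 3}; // numRows+1 entries; row i spans [csrRow[i], csrRow[i+1])
//     GPUSPARSE_INDEX_TYPE col[]    = {0, 2, 1}; // column of each stored value
//     float                val[]    = {1, 2, 3};
//     m.SetMatrixFromCSRFormat(csrRow, col, val, 3, 2, 3); // nz=3, rows=2, cols=3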
template <class ElemType>
void GPUSparseMatrix<ElemType>::SetMatrixFromCSCFormat(const CPUSPARSE_INDEX_TYPE* h_CSCCol, const CPUSPARSE_INDEX_TYPE* h_Row, const ElemType* h_Val,
                                                       const size_t nz, const size_t numRows, const size_t numCols, const bool IsOnDevice /*= false*/, const DEVICEID_TYPE devId /*= -1*/)
{
    if (!OwnBuffer())
        LogicError("Cannot Set since the buffer is managed externally.");

    if (h_CSCCol == nullptr || h_Row == nullptr || h_Val == nullptr)
        LogicError("SetMatrixFromCSCFormat: nullptr passed in.");

    SetComputeDeviceId(PrepareDevice(devId));

    m_format = matrixFormatSparseCSC;
    Resize(numRows, numCols, nz, true, false);
    SetNzCount(nz);

    cudaMemcpyKind kind = IsOnDevice ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice;
    CUDA_CALL(cudaMemcpy(BufferPointer(), h_Val, NzSize(), kind));

    if (sizeof(CPUSPARSE_INDEX_TYPE) == sizeof(GPUSPARSE_INDEX_TYPE))
    {
        CUDA_CALL(cudaMemcpy(RowLocation(), h_Row, RowSize(), kind));
        CUDA_CALL(cudaMemcpy(ColLocation(), h_CSCCol, ColSize(), kind));
    }
    else
    {
        GPUSPARSE_INDEX_TYPE* pCol = (GPUSPARSE_INDEX_TYPE*) ReserveTempHostBuffer(RowSize() + ColSize());
        GPUSPARSE_INDEX_TYPE* pRow = pCol + SecondaryIndexCount();

        CopyBuffer(pCol, h_CSCCol, SecondaryIndexCount());
        CopyBuffer(pRow, h_Row, MajorIndexCount());

        CUDA_CALL(cudaMemcpy(RowLocation(), pRow, RowSize(), kind));
        CUDA_CALL(cudaMemcpy(ColLocation(), pCol, ColSize(), kind));
    }
}

// this function will allocate memory while the caller needs to release it
template <class ElemType>
void GPUSparseMatrix<ElemType>::GetMatrixFromCSCFormat(GPUSPARSE_INDEX_TYPE*& h_CSCCol, GPUSPARSE_INDEX_TYPE*& h_Row, ElemType*& h_Val, size_t& numElemAllocated, size_t& nz, size_t& numRows, size_t& numCols) const
{
    if (h_CSCCol != nullptr || h_Row != nullptr || h_Val != nullptr)
        LogicError("GetMatrixFromCSCFormat: Passed pointers must be nullptr");

    numElemAllocated = GetNumElemAllocated();
    nz = GetNumNZElements();
    numRows = GetNumRows();
    numCols = GetNumCols();

    if (IsEmpty())
        return;
    else
    {
        h_Val = new ElemType[numElemAllocated];
        h_CSCCol = new GPUSPARSE_INDEX_TYPE[m_numCols + 1];
        h_Row = new GPUSPARSE_INDEX_TYPE[nz];

        PrepareDevice();
        CUDA_CALL(cudaMemcpy(h_Val, BufferPointer(), GetSizeElemAllocated(), cudaMemcpyDeviceToHost));

        if (sizeof(CPUSPARSE_INDEX_TYPE) == sizeof(GPUSPARSE_INDEX_TYPE))
        {
            CUDA_CALL(cudaMemcpy(h_Row, RowLocation(), RowSize(), cudaMemcpyDeviceToHost));
            CUDA_CALL(cudaMemcpy(h_CSCCol, ColLocation(), ColSize(), cudaMemcpyDeviceToHost));
        }
        else
        {
            GPUSPARSE_INDEX_TYPE* pCol = (GPUSPARSE_INDEX_TYPE*) ReserveTempHostBuffer(RowSize() + ColSize());
            GPUSPARSE_INDEX_TYPE* pRow = pCol + SecondaryIndexCount();

            CUDA_CALL(cudaMemcpy(pRow, RowLocation(), RowSize(), cudaMemcpyDeviceToHost));
            CUDA_CALL(cudaMemcpy(pCol, ColLocation(), ColSize(), cudaMemcpyDeviceToHost));

            CopyBuffer(h_CSCCol, pCol, SecondaryIndexCount());
            CopyBuffer(h_Row, pRow, MajorIndexCount());
        }
    }
}
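// Worked CSC example (illustrative): SetMatrixFromCSCFormat takes the transpose-dual layout.
// For the same 2x3 matrix [[1 0 2], [0 3 0]]:
//
//     CPUSPARSE_INDEX_TYPE cscCol[] = {0, 1, 2, 3}; // numCols+1 entries; col j spans [cscCol[j], cscCol[j+1])
//     CPUSPARSE_INDEX_TYPE row[]    = {0, 1, 0};    // row of each stored value
//     float                val[]    = {1, 3, 2};    // values in column-major order
//     m.SetMatrixFromCSCFormat(cscCol, row, val, 3, 2, 3); // nz=3, rows=2, cols=3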
#pragma endregion Constructors and Destructor

#pragma region Static BLAS Functions

// dense X sparse = dense
template <class ElemType>
void GPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA,
                                                       const GPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c)
{
    if (lhs.GetComputeDeviceId() != rhs.GetComputeDeviceId() || (lhs.GetComputeDeviceId() != c.GetComputeDeviceId()))
        RuntimeError("GPUSparseMatrix::MultiplyAndWeightedAdd: All matrices must be on the same GPU");

    if (lhs.IsEmpty() || rhs.IsEmpty())
        LogicError("GPUSparseMatrix::MultiplyAndWeightedAdd: one of the input matrices is empty.");

    int m = transposeA ? (int) lhs.GetNumCols() : (int) lhs.GetNumRows();
    int k = transposeA ? (int) lhs.GetNumRows() : (int) lhs.GetNumCols();
    int l = transposeB ? (int) rhs.GetNumCols() : (int) rhs.GetNumRows();
    int n = transposeB ? (int) rhs.GetNumRows() : (int) rhs.GetNumCols();

    assert(m > 0 && k > 0 && l > 0 && n > 0); // converting from size_t to int may cause overflow
    assert(k == l);
    if (k != l)
    {
        InvalidArgument("GPUSparseMatrix::MultiplyAndWeightedAdd: The inner dimensions of a and b must match.");
    }

    if (beta == 0)
        c.Resize(m, n);
    else
        c.VerifySize(m, n); // Can't resize if beta != 0

    c.PrepareDevice();
    if (rhs.m_format == MatrixFormat::matrixFormatSparseCSC)
    {
        ConvolveAndWeightedAdd(alpha, lhs, transposeA, rhs, transposeB, beta, c, 1, 1, false, false);
    }
    else if (rhs.m_format == matrixFormatSparseCSR)
    {
        GPUSparseMatrix<ElemType> tempMatrix(matrixFormatSparseCSC, rhs.GetComputeDeviceId());
        rhs.ConvertToSparseFormat(matrixFormatSparseCSC, tempMatrix);
        MultiplyAndWeightedAdd(alpha, lhs, transposeA, tempMatrix, transposeB, beta, c);
    }
    else
    {
        NOT_IMPLEMENTED;
    }
}
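// Usage sketch (illustrative): computes c = alpha * op(lhs) * op(rhs) + beta * c with a dense
// lhs and a sparse CSC rhs (CSR inputs are converted to CSC first). For example, with
// hypothetical matrices w (dense m x k), x (sparse CSC k x n), y (dense m x n):
//
//     GPUSparseMatrix<float>::MultiplyAndWeightedAdd(1.0f, w, false, x, false, 0.0f, y); // y = w * x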
template <class ElemType>
void GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA,
                                                       const GPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c,
                                                       size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise)
{
    if (lhs.GetComputeDeviceId() != rhs.GetComputeDeviceId() || (lhs.GetComputeDeviceId() != c.GetComputeDeviceId()))
        RuntimeError("GPUSparseMatrix::ConvolveAndWeightedAdd: All matrices must be on the same GPU");

    if (lhs.IsEmpty() || rhs.IsEmpty())
        LogicError("GPUSparseMatrix::ConvolveAndWeightedAdd: one of the input matrices is empty.");

    int m = transposeA ? (int) lhs.GetNumCols() : (int) lhs.GetNumRows();
    int k = transposeA ? (int) lhs.GetNumRows() : (int) lhs.GetNumCols();
    int l = transposeB ? (int) rhs.GetNumCols() : (int) rhs.GetNumRows();
    int n = transposeB ? (int) rhs.GetNumRows() : (int) rhs.GetNumCols();

    assert(m > 0 && k > 0 && l > 0 && n > 0); // converting from size_t to int may cause overflow

    int numSteps = 0;
    if (padding)
        numSteps = (int) ceil(1.0 * l / (horizontalSubsample * numChannels));
    else if (l >= k)
        numSteps = 1 + (l - k) / (horizontalSubsample * numChannels);

    if (numSteps == 0)
        LogicError("ConvolveAndWeightedAdd: number of steps is zero. Matrix dimensions are incorrect or set padding to true.");

    int cRows = m * numSteps;
    int cCols = n;

    if (beta == 0)
        c.Resize(cRows, cCols);
    else
        c.VerifySize(cRows, cCols); // Can't resize if beta != 0

    c.PrepareDevice();
    if (rhs.m_format == MatrixFormat::matrixFormatSparseCSC)
    {
        if (!transposeB)
        {
            int blocksPerGrid = (int) ceil(1.0 * cRows * cCols / GridDim::maxThreadsPerBlock);
            cudaEvent_t done = nullptr;
            if (do_sync) CUDA_CALL(cudaEventCreate(&done));
            _dense1DConvMultSparseCSCAndWeightedAddToDense<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(
                m,                   // rowDense
                k,                   // colDense
                n,                   // colSparse
                numChannels,         // number of input channels
                numSteps,            // convolution num steps
                horizontalSubsample, // convolution step size
                channelwise,         // channelwise or pixelwise multiplication
                alpha,
                reinterpret_cast<const ElemType*>(lhs.BufferPointer()), // dense
                transposeA,
                reinterpret_cast<const ElemType*>(rhs.BufferPointer()), // sparse nz values
                rhs.RowLocation(),
                rhs.ColLocation(),
                beta,
                reinterpret_cast<ElemType*>(c.BufferPointer()) // dense target
                );
            if (do_sync) CUDA_CALL(cudaEventRecord(done));
            if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
            if (do_sync) CUDA_CALL(cudaEventDestroy(done));
        }
        else
        {
            if (beta != 1.0)
            {
                RuntimeError("Only support c += alpha * a operation");
            }
            int blocksPerGrid = (int) ceil(1.0 * cRows / GridDim::maxThreadsPerBlock);
            cudaEvent_t done = nullptr;
            if (do_sync) CUDA_CALL(cudaEventCreate(&done));
            for (int rowInB = 0; rowInB < l; rowInB++)
            {
                _dense1DConvMultSparseCSCTransposeAndAddToDense<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(
                    m,                   // rowDense
                    k,                   // colDense
                    n,                   // colSparse
                    numChannels,         // number of input channels
                    numSteps,            // convolution num steps
                    horizontalSubsample, // convolution step size
                    channelwise,         // channelwise or pixelwise multiplication
                    rowInB,
                    alpha,
                    reinterpret_cast<const ElemType*>(lhs.BufferPointer()), // dense
                    transposeA,
                    reinterpret_cast<const ElemType*>(rhs.BufferPointer()), // sparse nz values
                    rhs.RowLocation(),
                    rhs.ColLocation(),
                    reinterpret_cast<ElemType*>(c.BufferPointer()) // dense target
                    );
            }
            if (do_sync) CUDA_CALL(cudaEventRecord(done));
            if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
            if (do_sync) CUDA_CALL(cudaEventDestroy(done));
        }
    }
    else
    {
        NOT_IMPLEMENTED;
    }
}
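// Worked example for the step count above: with kernel width k = 200, input length l = 1000,
// numChannels = 1, horizontalSubsample = 100 and no padding,
// numSteps = 1 + (l - k) / (horizontalSubsample * numChannels) = 1 + 800/100 = 9,
// so the output has m * 9 rows. With padding, numSteps = ceil(l / (horizontalSubsample * numChannels)) = 10.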
template <class ElemType>
void GPUSparseMatrix<ElemType>::TensorShuffleScaleAndAdd(ElemType keepWeight, const GPUSparseMatrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T,
                                                         ElemType scaleFactor, const GPUSparseMatrix<ElemType>& b, GPUSparseMatrix<ElemType>& c)
{
    if (!c.OwnBuffer())
        LogicError("Cannot modify externally managed matrix");
    if (a.GetComputeDeviceId() != c.GetComputeDeviceId() || b.GetComputeDeviceId() != c.GetComputeDeviceId())
        RuntimeError("GPUSparseMatrix::TensorShuffleScaleAndAdd: All matrices must be on the same GPU");
    if (a.m_format != MatrixFormat::matrixFormatSparseCSC || b.m_format != MatrixFormat::matrixFormatSparseCSC || c.m_format != MatrixFormat::matrixFormatSparseCSC)
        NOT_IMPLEMENTED;

    // Can't distribute the operations if we need to move values across columns
    if (a.GetNumCols() != T || keepWeight != 0 || scaleFactor != 1)
        NOT_IMPLEMENTED;

    if (a.GetNumRows() != D * S * M * K)
        LogicError("GPUSparseMatrix::TensorShuffleScaleAndAdd: tensor dimensions and underlying matrix dimensions don't match");

    c.Resize(a.GetNumRows(), a.GetNumCols(), a.GetNumElemAllocated(), true, false);
    c.SetNzCount(a.GetNumNZElements());

    if (c.m_nz > 0)
    {
        c.PrepareDevice();
        cudaEvent_t done = nullptr;
        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
        CUDA_LONG N = (CUDA_LONG) c.GetNumElemAllocated();
        int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock);
        _tensorShuffleScaleAndAddRowSparse<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(
            reinterpret_cast<const ElemType*>(a.BufferPointer()), // source nz values
            a.RowLocation(),
            a.ColLocation(),
            reinterpret_cast<ElemType*>(c.BufferPointer()), // target nz values
            c.RowLocation(),
            c.ColLocation(),
            D, S, M, K, T,
            c.GetNumElemAllocated());
        if (do_sync) CUDA_CALL(cudaEventRecord(done));
        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
    }
    else
    {
        CUDA_CALL(cudaMemset(c.BufferPointer(), 0, c.BufferSizeAllocated()));
    }
}

// backward pass from hidden layer to feature weight
// dense X sparse = sparse
template <class ElemType>
void GPUSparseMatrix<ElemType>::MultiplyAndAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA,
                                               const GPUSparseMatrix<ElemType>& rhs, const bool transposeB, GPUSparseMatrix<ElemType>& c)
{
    if (!c.OwnBuffer())
        LogicError("Cannot modify since the buffer is managed externally.");

    if (lhs.GetComputeDeviceId() != rhs.GetComputeDeviceId())
        RuntimeError("GPUSparseMatrix::MultiplyAndAdd: All matrices must be on the same GPU");

    int m = transposeA ? (int) lhs.GetNumCols() : (int) lhs.GetNumRows();
    int k = transposeA ? (int) lhs.GetNumRows() : (int) lhs.GetNumCols();
    int l = transposeB ? (int) rhs.GetNumCols() : (int) rhs.GetNumRows();
    int n = transposeB ? (int) rhs.GetNumRows() : (int) rhs.GetNumCols();

    assert(m > 0 && k > 0 && l > 0 && n > 0);
    (void) m;
    (void) n; // converting from size_t to int may cause overflow
    assert(k == l);
    if (k != l)
    {
        InvalidArgument("GPUSparseMatrix::MultiplyAndAdd: The inner dimensions of a and b must match.");
    }

    if (!transposeA && !transposeB)
    {
        NOT_IMPLEMENTED;
    }
    else if (!transposeA && transposeB)
    {
        if (rhs.GetFormat() != matrixFormatSparseCSC)
            NOT_IMPLEMENTED;

        c.SetFormat(matrixFormatSparseBlockCol);

        lhs.PrepareDevice();

        int blocksPerGrid = 0;
        cudaEvent_t done = nullptr;
        if (do_sync) CUDA_CALL(cudaEventCreate(&done));

        // based on the size of m_nz in rhs and numCols in the resulted matrix we use different approaches
        if (n * 10 < GridDim::maxThreadsPerBlock * rhs.m_nz)
        {
            c.Resize(m, n, 1, true, false); // reserve memory for BlockId2ColOrRow() and ColOrRow2BlockId()

            size_t* blockSize = TracingGPUMemoryAllocator::Allocate<size_t>(lhs.GetComputeDeviceId(), 1);
            CUDA_CALL(cudaMemset(blockSize, 0, sizeof(size_t)));

            CUDA_CALL(cudaMemset(c.BlockId2ColOrRow(), 0, sizeof(GPUSPARSE_INDEX_TYPE) * (n)));

            blocksPerGrid = (int) ceil(((double) rhs.m_nz) / GridDim::maxThreadsPerBlock);
            _findColsWithValues<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(
                rhs.RowLocation(), c.BlockId2ColOrRow(), rhs.m_nz);
            if (do_sync) CUDA_CALL(cudaEventRecord(done));
            if (do_sync) CUDA_CALL(cudaEventSynchronize(done));

            blocksPerGrid = (int) ceil(((double) n) / GridDim::maxThreadsPerBlock);
            _determineBlockIds<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(
                c.BlockId2ColOrRow(), c.ColOrRow2BlockId(), n, blockSize);
            if (do_sync) CUDA_CALL(cudaEventRecord(done));
            if (do_sync) CUDA_CALL(cudaEventSynchronize(done));

            CUDA_CALL(cudaMemcpy(&c.m_blockSize, blockSize, sizeof(size_t), cudaMemcpyDeviceToHost));
            TracingGPUMemoryAllocator::Free<size_t>(lhs.GetComputeDeviceId(), blockSize);

            size_t nnz = m * c.m_blockSize;
            c.Resize(m, n, nnz, true, true); // we need to keep the col2blockid and blockid2col info when resizing.
            c.m_nz = nnz;
            CUDA_CALL(cudaMemset(c.BufferPointer(), 0, sizeof(ElemType) * (c.m_elemSizeAllocated)));

            LONG64 N = (LONG64) lhs.GetNumElements(); // here we process for each row in lhs and each column in rhs (==columns in lhs)
            blocksPerGrid = (int) ceil(((double) N) / GridDim::maxThreadsPerBlock);
            _denseMulSparseCSCTransposeToSparseBlockCol2<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(
                alpha,
                lhs.BufferPointer(),
                m,
                l,
                rhs.BufferPointer(),
                rhs.RowLocation(),
                rhs.ColLocation(),
                c.ColOrRow2BlockId(),
                c.BufferPointer());
        }
        else
        {
            c.m_blockSize = rhs.IdentifyRowsWithValues();
            size_t nnz = m * c.m_blockSize;
            c.Resize(m, n, nnz, true, false);
            c.m_nz = nnz;
            CUDA_CALL(cudaMemset(c.BufferPointer(), 0, sizeof(ElemType) * (c.m_elemSizeAllocated)));
            CUDA_CALL(cudaMemset(c.BlockId2ColOrRow(), 0, sizeof(GPUSPARSE_INDEX_TYPE) * (c.m_blockSize)));

            LONG64 N = (LONG64) lhs.GetNumElements(); // here we process for each row in lhs and each column in rhs (==columns in lhs)
            blocksPerGrid = (int) ceil(((double) N) / GridDim::maxThreadsPerBlock);
            _denseMulSparseCSCTransposeToSparseBlockCol<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(
                alpha,
                lhs.BufferPointer(),
                m,
                l,
                rhs.BufferPointer(),
                rhs.RowLocation(),
                rhs.ColLocation(),
                rhs.m_rowToId,
                c.BufferPointer(),
                c.BlockId2ColOrRow());
        }

        if (do_sync) CUDA_CALL(cudaEventRecord(done));
        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
    }
    else if (transposeA && !transposeB)
    {
        NOT_IMPLEMENTED;
    }
    else
    {
        NOT_IMPLEMENTED;
    }
}

// find the rows of rhs with values
template <class ElemType>
size_t GPUSparseMatrix<ElemType>::IdentifyRowsWithValues() const
{
    if (GetFormat() != matrixFormatSparseCSC)
        NOT_IMPLEMENTED;

    std::map<size_t, GPUSPARSE_INDEX_TYPE> indexer;
    GPUSPARSE_INDEX_TYPE* rowToId = (GPUSPARSE_INDEX_TYPE*) ReserveTempHostBuffer(sizeof(GPUSPARSE_INDEX_TYPE) * m_nz * 2);
    GPUSPARSE_INDEX_TYPE* h_Row = rowToId + m_nz;
    CUDA_CALL(cudaMemcpy(h_Row, RowLocation(), sizeof(GPUSPARSE_INDEX_TYPE) * m_nz, cudaMemcpyDeviceToHost));

    for (size_t i = 0; i < m_nz; i++)
    {
        size_t row = h_Row[i];
        if (indexer.find(row) == indexer.end())
        {
            size_t id = indexer.size(); // We need to assign size to a temp variable due to difference in Linux and Windows
            indexer[row] = id;
        }
        rowToId[i] = indexer[row];
    }
    CUDA_CALL(cudaMemcpy(m_rowToId, rowToId, sizeof(GPUSPARSE_INDEX_TYPE) * m_nz, cudaMemcpyHostToDevice));
    return indexer.size();
}

// used for gradients update
template <class ElemType>
void GPUSparseMatrix<ElemType>::ScaleAndAdd(const ElemType alpha, const GPUSparseMatrix<ElemType>& lhs, GPUMatrix<ElemType>& rhs)
{
    if (lhs.GetNumRows() != rhs.GetNumRows() || lhs.GetNumCols() != rhs.GetNumCols())
        LogicError("ScaleAndAdd: dimension mismatch");

    if (lhs.GetComputeDeviceId() != rhs.GetComputeDeviceId())
        RuntimeError("GPUSparseMatrix::ScaleAndAdd: All matrices must be on the same GPU");

    if (lhs.m_format == matrixFormatSparseBlockCol || lhs.m_format == matrixFormatSparseBlockRow)
    {
        bool blockCol = (lhs.m_format == matrixFormatSparseBlockCol);

        cudaEvent_t done = nullptr;
        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
        LONG64 N = (LONG64) lhs.GetNumNZElements();
        int blocksPerGrid = (int) ceil(((double) N) / GridDim::maxThreadsPerBlock);
        _scaleSparseBlockAndAddToDense<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(
            alpha,
            blockCol,
            lhs.GetNumRows(),
            lhs.GetNumCols(),
            lhs.m_blockSize,
            lhs.BufferPointer(),
            lhs.BlockId2ColOrRow(),
            rhs.BufferPointer());
        if (do_sync) CUDA_CALL(cudaEventRecord(done));
        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
    }
    else
    {
        ScaleAndAdd(alpha, lhs, 1, rhs, rhs);
    }
}
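// Format note: matrixFormatSparseBlockCol/BlockRow store m_blockSize dense columns (or rows)
// back to back in the value buffer; BlockId2ColOrRow() maps a block slot to its original
// column/row index, and ColOrRow2BlockId() is the inverse. For example, a gradient with
// non-zeros only in columns {2, 7} of a 4-row matrix is stored as 2 blocks of 4 values with
// BlockId2ColOrRow = {2, 7}.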
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceTruncate(const ElemType threshold)
{
    if (!OwnBuffer())
        LogicError("Cannot modify since the buffer is managed externally.");

    CUDA_LONG N = (CUDA_LONG) GetNumNZElements();

    CUDA_LONG blocksPerGrid = (CUDA_LONG) ceil(N * 1.0 / GridDim::maxThreadsPerBlock);
    cudaEvent_t done = nullptr;
    if (do_sync) CUDA_CALL(cudaEventCreate(&done));
    ElemType* values = NzValues();
    _inplaceTruncate<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(values, threshold, N);
    if (do_sync) CUDA_CALL(cudaEventRecord(done));
    if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync) CUDA_CALL(cudaEventDestroy(done));
    return *this;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceSoftThreshold(const ElemType threshold)
{
    if (!OwnBuffer())
        LogicError("Cannot modify since the buffer is managed externally.");

    CUDA_LONG N = (CUDA_LONG) GetNumNZElements();

    CUDA_LONG blocksPerGrid = (CUDA_LONG) ceil(N * 1.0 / GridDim::maxThreadsPerBlock);
    cudaEvent_t done = nullptr;
    if (do_sync) CUDA_CALL(cudaEventCreate(&done));
    ElemType* values = NzValues();
    _inplaceSoftThreshold<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(values, threshold, N);
    if (do_sync) CUDA_CALL(cudaEventRecord(done));
    if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync) CUDA_CALL(cudaEventDestroy(done));
    return *this;
}
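// InplaceSoftThreshold applies the soft-thresholding (shrinkage) operator elementwise to the
// stored non-zeros: f(x) = sign(x) * max(|x| - threshold, 0). E.g. with threshold = 0.5, the
// values {0.3, -2.0, 0.9} become {0, -1.5, 0.4}. InplaceTruncate instead clamps each value
// into [-threshold, threshold], as used for gradient clipping.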
// normal update for smoothed gradients c and current gradients (this)
template <class ElemType>
void GPUSparseMatrix<ElemType>::NormalGrad(GPUMatrix<ElemType>& c, const ElemType momentum)
{
    if (!OwnBuffer())
        LogicError("Cannot modify since the buffer is managed externally.");

    if (c.IsEmpty())
    {
        c.Resize(GetNumRows(), GetNumCols());
        c.SetValue(0.0);
    }

    if (m_format == matrixFormatSparseBlockCol || m_format == matrixFormatSparseBlockRow)
    {
        bool isBlockCol = (m_format == MatrixFormat::matrixFormatSparseBlockCol);
        cudaEvent_t done = nullptr;
        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
        LONG64 N = (LONG64) GetNumNZElements();
        int blocksPerGrid = (int) ceil(((double) N) / GridDim::maxThreadsPerBlock);

        _normalGradForSparseBlock<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(
            momentum,
            isBlockCol,
            GetNumRows(),
            GetNumCols(),
            m_blockSize,
            BufferPointer(),
            BlockId2ColOrRow(),
            c.BufferPointer());
        if (do_sync) CUDA_CALL(cudaEventRecord(done));
        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
    }
    else
    {
        NOT_IMPLEMENTED;
    }
}

template <class ElemType>
ElemType GPUSparseMatrix<ElemType>::Adagrad(GPUMatrix<ElemType>& c, const bool needAveMultiplier)
{
    if (!OwnBuffer())
        LogicError("Cannot modify since the buffer is managed externally.");

    size_t numColsNeeded = GetNumCols();
    if (needAveMultiplier)
        numColsNeeded += GetNumCols();

    if (c.IsEmpty() || c.GetNumCols() < numColsNeeded)
    {
        c.Resize(GetNumRows(), numColsNeeded);
        c.SetValue(0.0);
    }

    assert(c.GetNumRows() == GetNumRows() && c.GetNumCols() == numColsNeeded);

    size_t n = this->GetNumElements();

    ElemType* multipliers = nullptr;
    if (needAveMultiplier)
        multipliers = c.GetArray() + n; // temp memory used to store multipliers

    if (m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR)
    {
        NOT_IMPLEMENTED;
    }
    else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
    {
        int blocksPerGrid = (m_nz + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock;
        bool colMajor = (m_format == MatrixFormat::matrixFormatSparseBlockCol ? true : false);
        size_t len = colMajor ? GetNumRows() : GetNumCols();
        _adagrad4BlockSparse<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(
            c.GetArray(), c.GetNumRows(), BufferPointer(), BlockId2ColOrRow(), multipliers, colMajor, len, m_nz);
    }
    else
        NOT_IMPLEMENTED;

    if (!needAveMultiplier)
        return 1;

    cublasHandle_t cuHandle = GPUMatrix<ElemType>::GetCublasHandle(GetComputeDeviceId());
    if (sizeof(ElemType) == sizeof(float))
    {
        float aveMultiplier = 0;
        CUBLAS_CALL(cublasSasum(cuHandle, (LONG64) m_nz, reinterpret_cast<float*>(multipliers), 1, &aveMultiplier));
        return (ElemType) aveMultiplier / m_nz;
    }
    else
    {
        double aveMultiplier = 0;
        CUBLAS_CALL(cublasDasum(cuHandle, (LONG64) m_nz, reinterpret_cast<double*>(multipliers), 1, &aveMultiplier));
        return (ElemType) aveMultiplier / m_nz;
    }
}

//-------------------------------------------------------------------------
// End of new GPU Sparse Matrix code
//-------------------------------------------------------------------------

// sparse X dense = dense
template <class ElemType>
void GPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const GPUSparseMatrix<ElemType>& a, const bool transposeA,
                                                       const GPUMatrix<ElemType>& b, const bool transposeD, ElemType beta, GPUMatrix<ElemType>& c)
{
    if (a.m_format != matrixFormatSparseCSR)
        NOT_IMPLEMENTED;

    if (transposeD)
        NOT_IMPLEMENTED;

    if (a.GetComputeDeviceId() != b.GetComputeDeviceId() || (b.GetComputeDeviceId() != a.GetComputeDeviceId()))
        RuntimeError("MultiplyAndWeightedAdd: All matrices must be on the same GPU");

    a.PrepareDevice();
    cusparseHandle_t cusparseHandle = 0;
    CUSPARSE_CALL(cusparseCreate(&cusparseHandle));
    cusparseMatDescr_t descr = 0;
    CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
    cusparseOperation_t oper = transposeA ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;

    int m = (int) a.GetNumRows();
    int n = (int) b.GetNumCols();
    assert(n == (int) c.GetNumCols());
    int k = (int) a.GetNumCols();

    cudaEvent_t done = nullptr;
    if (do_sync) CUDA_CALL(cudaEventCreate(&done));
    if (sizeof(ElemType) == sizeof(float))
    {
        CUSPARSE_CALL(cusparseScsrmm(cusparseHandle, oper, m, n, k, (int) a.GetNumElemAllocated(), reinterpret_cast<float*>(&alpha), descr, reinterpret_cast<const float*>(a.BufferPointer()),
                                     a.RowLocation(), a.ColLocation(), reinterpret_cast<const float*>(b.BufferPointer()), (int) b.GetNumRows(), reinterpret_cast<float*>(&beta),
                                     reinterpret_cast<float*>(c.BufferPointer()), (int) c.GetNumRows()));
    }
    else
    {
        CUSPARSE_CALL(cusparseDcsrmm(cusparseHandle, oper, m, n, k, (int) a.GetNumElemAllocated(), reinterpret_cast<double*>(&alpha), descr, reinterpret_cast<const double*>(a.BufferPointer()),
                                     a.RowLocation(), a.ColLocation(), reinterpret_cast<const double*>(b.BufferPointer()), (int) b.GetNumRows(), reinterpret_cast<double*>(&beta),
                                     reinterpret_cast<double*>(c.BufferPointer()), (int) c.GetNumRows()));
    }
    if (do_sync) CUDA_CALL(cudaEventRecord(done));
    if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync) CUDA_CALL(cudaEventDestroy(done));
    CUSPARSE_CALL(cusparseDestroy(cusparseHandle));
}

template <class ElemType>
void GPUSparseMatrix<ElemType>::Multiply(const GPUSparseMatrix<ElemType>& S, const GPUMatrix<ElemType>& D, GPUMatrix<ElemType>& C)
{
    C.Resize(S.GetNumRows(), D.GetNumCols());
    MultiplyAndWeightedAdd(1, S, false, D, false, 0, C);
}

template <class ElemType>
void GPUSparseMatrix<ElemType>::Multiply(const GPUMatrix<ElemType>& D, const GPUSparseMatrix<ElemType>& S, GPUMatrix<ElemType>& C)
{
    C.Resize(S.GetNumCols(), D.GetNumRows());
    MultiplyAndWeightedAdd(1, D, false, S, false, 0, C);
}
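// Note on the cusparse{S,D}csrmm call in the sparse X dense path above: it computes
// C = alpha * op(A) * B + beta * C with A sparse CSR (m x k), B dense (k x n, leading
// dimension b.GetNumRows()) and C dense (m x n, leading dimension c.GetNumRows());
// op(A) is A or A^T depending on `oper`.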
// ElemCountFromBufferSize - Return the elemCountAllocated for a particular buffer size
// totalBufferSize - total buffer we have to use
// return: size of allocated elements/index slots available
template <class ElemType>
size_t GPUSparseMatrix<ElemType>::ElemCountFromBufferSize(const size_t numRows, const size_t numCols, const MatrixFormat format, const size_t totalBufferSize) const
{
    size_t elemSizeAllocated;
    if (format == matrixFormatSparseCSC)
    {
        elemSizeAllocated = (totalBufferSize - sizeof(GPUSPARSE_INDEX_TYPE) * (numCols + 1)) / (sizeof(GPUSPARSE_INDEX_TYPE) + sizeof(ElemType));
    }
    else if (format == matrixFormatSparseCSR)
    {
        elemSizeAllocated = (totalBufferSize - sizeof(GPUSPARSE_INDEX_TYPE) * (numRows + 1)) / (sizeof(GPUSPARSE_INDEX_TYPE) + sizeof(ElemType));
    }
    else if (format == matrixFormatSparseBlockCol)
    {
        elemSizeAllocated = (totalBufferSize - sizeof(GPUSPARSE_INDEX_TYPE) * 2 * numCols) / sizeof(ElemType);
    }
    else if (format == matrixFormatSparseBlockRow)
    {
        elemSizeAllocated = (totalBufferSize - sizeof(GPUSPARSE_INDEX_TYPE) * 2 * numRows) / sizeof(ElemType);
    }
    else // uncompressed COO format
    {
        elemSizeAllocated = totalBufferSize / (2 * sizeof(GPUSPARSE_INDEX_TYPE) + sizeof(ElemType));
    }
    return elemSizeAllocated;
}

template <class ElemType>
size_t GPUSparseMatrix<ElemType>::ElemCountFromBufferSize() const
{
    return ElemCountFromBufferSize(m_numRows, m_numCols, m_format, m_totalBufferSizeAllocated);
}
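// Worked example (the inverse of the layout arithmetic in the Resize note above): for CSC with
// float elements, 4-byte indices, numCols = 100 and totalBufferSize = 8404 bytes,
// elemSizeAllocated = (8404 - 4*101) / (4 + 4) = 1000 element slots.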
// PrepareBuffer - Get the dimensions start buffer, computes the starting row/column of each value
// m - rows in the source
// n - cols in the source
// canReuseBuffer - target matrix can be reused for temporary space
// func - function to call to count elements in the result (returns count, and fills csrRowPtr array)
template <class ElemType>
void GPUSparseMatrix<ElemType>::PrepareBuffer(size_t m, size_t n, bool canReuseBuffer, std::function<size_t(GPUSPARSE_INDEX_TYPE* csrRowPtrC)> func)
{
    if (!OwnBuffer())
        LogicError("Cannot modify since the buffer is managed externally.");

    if (this->m_format != matrixFormatSparseCSR)
        NOT_IMPLEMENTED;

    PrepareDevice();

    GPUSPARSE_INDEX_TYPE* csrRowPtrC = nullptr;
    GPUSparseMatrix<ElemType>& c = *this;
    size_t cSize = c.BufferSizeAllocated();
    size_t rowBufferRequired = (m + 1) * sizeof(GPUSPARSE_INDEX_TYPE);
    bool allocatedBuffer = false;

    // do we have enough memory to store just the row buffer?
    if (cSize >= rowBufferRequired && c.BufferPointer() != nullptr && canReuseBuffer)
    {
        csrRowPtrC = (GPUSPARSE_INDEX_TYPE*) c.BufferPointer();
    }
    else
    {
        csrRowPtrC = TracingGPUMemoryAllocator::Allocate<GPUSPARSE_INDEX_TYPE>(m_computeDevice, rowBufferRequired / sizeof(GPUSPARSE_INDEX_TYPE));
        allocatedBuffer = true;
    }

    // get the non-zero count from the function (which also fills the csrRowPtr array)
    size_t nnzC = func(csrRowPtrC);

    // now we know the number of Non-zeros in the result set, set the output size
    c.Resize(m, n, nnzC, true, false);
    c.m_nz = nnzC;

    CUDA_CALL(cudaMemcpy(c.SecondaryIndexLocation(), csrRowPtrC, c.SecondaryIndexSize(), cudaMemcpyDeviceToDevice));

    // if we allocated the buffer, free it here
    if (allocatedBuffer)
        TracingGPUMemoryAllocator::Free<GPUSPARSE_INDEX_TYPE>(m_computeDevice, csrRowPtrC);
}

// Multiply - multiply one sparse matrix by another sparse matrix
// S1 - first sparse matrix
// transposeS1 - transpose first matrix?
// S2 - second sparse matrix
// transposeS2 - transpose second matrix?
// c - result matrix
// NOTE: if c has enough space allocated, it will be reused, otherwise it will be freed and a new memory block used
template <class ElemType>
void GPUSparseMatrix<ElemType>::Multiply(const GPUSparseMatrix<ElemType>& S1, bool transposeS1, const GPUSparseMatrix<ElemType>& S2, bool transposeS2, GPUSparseMatrix<ElemType>& c)
{
    if (!c.OwnBuffer())
        LogicError("Cannot modify since the buffer is managed externally.");

    if (S1.m_format != matrixFormatSparseCSR || S2.m_format != matrixFormatSparseCSR || c.m_format != matrixFormatSparseCSR)
        NOT_IMPLEMENTED;

    if (S1.GetComputeDeviceId() != S2.GetComputeDeviceId())
        RuntimeError("Sparse matrix multiply: both matrices must be on the same device");

    S1.PrepareDevice();
    cusparseHandle_t cusparseHandle = 0;
    CUSPARSE_CALL(cusparseCreate(&cusparseHandle));
    cusparseMatDescr_t descrA = 0, descrB = 0, descrC = 0;
    CUSPARSE_CALL(cusparseCreateMatDescr(&descrA));
    CUSPARSE_CALL(cusparseCreateMatDescr(&descrB));
    CUSPARSE_CALL(cusparseCreateMatDescr(&descrC));
    cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ZERO);
    cusparseOperation_t operA = transposeS1 ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
    cusparseOperation_t operB = transposeS2 ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;

    int m = int(transposeS1 ? S1.GetNumCols() : S1.GetNumRows());
    int n = int(transposeS2 ? S2.GetNumRows() : S2.GetNumCols());
    int k = int(transposeS1 ? S1.GetNumRows() : S1.GetNumCols());
    int l = int(transposeS2 ? S2.GetNumCols() : S2.GetNumRows());
    if (k != l)
        RuntimeError("Sparse matrix multiply: dimensionality mismatch");

    int nnzA = (int) S1.GetNumNZElements();
    int nnzB = (int) S2.GetNumNZElements();

    cudaEvent_t done = nullptr;
    if (do_sync) CUDA_CALL(cudaEventCreate(&done));
    // Step 1
    c.PrepareBuffer(m, n, false, // false means we cannot reuse the "c" buffer if it exists for temporaries
                    [&](GPUSPARSE_INDEX_TYPE* csrRowPtrC) -> size_t
                    {
                        int nnzTotal = -1;
                        CUSPARSE_CALL(cusparseXcsrgemmNnz(cusparseHandle, operA, operB, m, n, k, descrA, nnzA, S1.RowLocation(), S1.ColLocation(), descrB, nnzB,
                                                          S2.RowLocation(), S2.ColLocation(), descrC, csrRowPtrC, &nnzTotal));
                        return nnzTotal;
                    });

    // Step 2
    if (sizeof(float) == sizeof(ElemType))
    {
        CUSPARSE_CALL(cusparseScsrgemm(cusparseHandle, operA, operB, m, n, k, descrA, nnzA, (const float*) S1.BufferPointer(), S1.RowLocation(), S1.ColLocation(),
                                       descrB, nnzB, (const float*) S2.BufferPointer(), S2.RowLocation(), S2.ColLocation(),
                                       descrC, (float*) c.BufferPointer(), c.RowLocation(), c.ColLocation()));
    }
    else
    {
        CUSPARSE_CALL(cusparseDcsrgemm(cusparseHandle, operA, operB, m, n, k, descrA, nnzA, (const double*) S1.BufferPointer(), S1.RowLocation(), S1.ColLocation(),
                                       descrB, nnzB, (const double*) S2.BufferPointer(), S2.RowLocation(), S2.ColLocation(),
                                       descrC, (double*) c.BufferPointer(), c.RowLocation(), c.ColLocation()));
    }
    if (do_sync) CUDA_CALL(cudaEventRecord(done));
    if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync) CUDA_CALL(cudaEventDestroy(done));
    cusparseDestroy(cusparseHandle);
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignProductOf(const GPUSparseMatrix<ElemType>& a, const bool transposeA, const GPUSparseMatrix<ElemType>& b, const bool transposeB)
{
    Multiply(a, transposeA, b, transposeB, *this);
    return *this;
}
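// Note on the sparse * sparse product above: cusparse requires the classic two-step csrgemm
// protocol, which PrepareBuffer wraps: (1) cusparseXcsrgemmNnz fills the output row-pointer
// array and returns nnzC so the output can be sized, then (2) cusparse{S,D}csrgemm fills the
// column-index and value arrays. The same pattern (XcsrgeamNnz / {S,D}csrgeam) drives the
// sparse matrix addition in ScaleAndAdd below.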
template <class ElemType>
void GPUSparseMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const GPUSparseMatrix<ElemType>& a, ElemType beta, const GPUSparseMatrix<ElemType>& b, GPUSparseMatrix<ElemType>& c)
{
    if (!c.OwnBuffer())
        LogicError("Cannot modify since the buffer is managed externally.");

    if (a.m_format != matrixFormatSparseCSR || b.m_format != matrixFormatSparseCSR || c.m_format != matrixFormatSparseCSR)
    {
        NOT_IMPLEMENTED;
    }

    if (a.GetNumCols() != b.GetNumCols() || a.GetNumRows() != b.GetNumRows())
        RuntimeError("Dimensions mismatch in ScaleAndAdd");
    if (a.GetComputeDeviceId() != b.GetComputeDeviceId())
        RuntimeError("ScaleAndAdd: matrices must be on the same device");

    int m = (int) a.GetNumRows();
    int n = (int) a.GetNumCols();
    int nnzA = (int) a.GetNumNZElements();
    int nnzB = (int) b.GetNumNZElements();

    a.PrepareDevice();
    cusparseHandle_t cusparseHandle = 0;
    CUSPARSE_CALL(cusparseCreate(&cusparseHandle));
    cusparseMatDescr_t descrA = 0, descrB = 0, descrC = 0;
    CUSPARSE_CALL(cusparseCreateMatDescr(&descrA));
    CUSPARSE_CALL(cusparseCreateMatDescr(&descrB));
    CUSPARSE_CALL(cusparseCreateMatDescr(&descrC));
    cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ZERO);

    cudaEvent_t done = nullptr;
    if (do_sync)
        CUDA_CALL(cudaEventCreate(&done));
    // Step 1
    bool inOutParameter = (&b == &c);
    c.PrepareBuffer(m, n, !inOutParameter,
                    [&](GPUSPARSE_INDEX_TYPE* csrRowPtrC) -> size_t
                    {
                        int nnzTotal = -1;
                        CUSPARSE_CALL(cusparseXcsrgeamNnz(cusparseHandle, m, n, descrA, nnzA, a.RowLocation(), a.ColLocation(),
                                                          descrB, nnzB, b.RowLocation(), b.ColLocation(), descrC, csrRowPtrC, &nnzTotal));
                        return nnzTotal;
                    });

    // Step 2
    if (sizeof(ElemType) == sizeof(float))
    {
        CUSPARSE_CALL(cusparseScsrgeam(cusparseHandle, m, n, reinterpret_cast<const float*>(&alpha), descrA, nnzA, reinterpret_cast<const float*>(a.BufferPointer()), a.RowLocation(), a.ColLocation(),
                                       reinterpret_cast<const float*>(&beta), descrB, nnzB, reinterpret_cast<const float*>(b.BufferPointer()), b.RowLocation(), b.ColLocation(),
                                       descrC, reinterpret_cast<float*>(c.BufferPointer()), c.RowLocation(), c.ColLocation()));
    }
    else
    {
        CUSPARSE_CALL(cusparseDcsrgeam(cusparseHandle, m, n, reinterpret_cast<const double*>(&alpha), descrA, nnzA, reinterpret_cast<const double*>(a.BufferPointer()), a.RowLocation(), a.ColLocation(),
                                       reinterpret_cast<const double*>(&beta), descrB, nnzB, reinterpret_cast<const double*>(b.BufferPointer()), b.RowLocation(), b.ColLocation(),
                                       descrC, reinterpret_cast<double*>(c.BufferPointer()), c.RowLocation(), c.ColLocation()));
    }
    if (do_sync)
        CUDA_CALL(cudaEventRecord(done));
    if (do_sync)
        CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync)
        CUDA_CALL(cudaEventDestroy(done));
    cusparseDestroy(cusparseHandle);
}
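// Usage sketch (illustrative): ScaleAndAdd computes c = alpha * a + beta * b
// for CSR inputs of identical shape, again counting the result's non-zeros
// first (csrgeamNnz) before sizing and filling the output (csrgeam).
//
//     GPUSparseMatrix<float> diff(matrixFormatSparseCSR, deviceId);
//     GPUSparseMatrix<float>::ScaleAndAdd(1.0f, A, -1.0f, B, diff); // diff = A - B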
template <class ElemType>
void GPUSparseMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const GPUSparseMatrix<ElemType>& a, ElemType beta, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c)
{
    if (a.m_format != matrixFormatSparseCSR)
        NOT_IMPLEMENTED;

    if (a.GetNumRows() != b.GetNumRows() || a.GetNumRows() != c.GetNumRows() || a.GetNumCols() != b.GetNumCols() || a.GetNumCols() != c.GetNumCols())
        LogicError("ScaleAndAdd: dimension mismatch");
    if (a.GetComputeDeviceId() != b.GetComputeDeviceId() || a.GetComputeDeviceId() != c.GetComputeDeviceId())
        RuntimeError("ScaleAndAdd: matrices must be on the same device");

    b.PrepareDevice();
    // copy b to c
    CUDA_CALL(cudaMemcpy(c.BufferPointer(), b.BufferPointer(), sizeof(ElemType) * b.GetNumElements(), cudaMemcpyDeviceToDevice));
    if (beta != 1)
    {
        c *= beta;
    }
    cudaEvent_t done = nullptr;
    if (do_sync)
        CUDA_CALL(cudaEventCreate(&done));
    CUDA_LONG M = (CUDA_LONG) a.GetNumRows();
    int blocksPerGrid = (int) ceil(1.0 * M / GridDim::maxThreadsPerBlock);
    _sparseCSRPlusDense<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(alpha, a.BufferPointer(), a.RowLocation(), a.ColLocation(), c.BufferPointer(), M);
    if (do_sync)
        CUDA_CALL(cudaEventRecord(done));
    if (do_sync)
        CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync)
        CUDA_CALL(cudaEventDestroy(done));
}

template <class ElemType>
void GPUSparseMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const GPUMatrix<ElemType>& a, ElemType beta, const GPUSparseMatrix<ElemType>& b, GPUMatrix<ElemType>& c)
{
    ScaleAndAdd(beta, b, alpha, a, c);
}

template <class ElemType>
void GPUSparseMatrix<ElemType>::Scale(ElemType alpha, GPUSparseMatrix<ElemType>& a)
{
    if (!a.OwnBuffer())
        LogicError("Cannot modify since the buffer is managed externally.");

    if (a.IsEmpty())
        return;

    CUDA_LONG N = (CUDA_LONG) a.GetNumNZElements();
    int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock);
    cudaEvent_t done = nullptr;
    if (do_sync)
        CUDA_CALL(cudaEventCreate(&done));
    _scaleArray<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(alpha, a.NzValues(), N);
    if (do_sync)
        CUDA_CALL(cudaEventRecord(done));
    if (do_sync)
        CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync)
        CUDA_CALL(cudaEventDestroy(done));
}

template <class ElemType>
void GPUSparseMatrix<ElemType>::ElementWisePower(ElemType alpha, const GPUSparseMatrix<ElemType>& a, GPUSparseMatrix<ElemType>& c)
{
    if (!c.OwnBuffer())
        LogicError("Cannot modify since the buffer is managed externally.");

    if (a.GetComputeDeviceId() != c.GetComputeDeviceId())
    {
        InvalidArgument("All matrices must be on the same GPU");
    }
    else
    {
        if (a.IsEmpty())
            LogicError("ElementWisePower: The input matrix a is empty.");

        c.ResizeAsAndCopyIndexFrom(a);

        cudaEvent_t done = nullptr;
        if (do_sync)
            CUDA_CALL(cudaEventCreate(&done));
        a.PrepareDevice();
        CUDA_LONG N = (CUDA_LONG) a.GetNumNZElements();
        int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock);
        _elementWisePowerOnCuda<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(alpha, a.NzValues(), c.NzValues(), N);
        if (do_sync)
            CUDA_CALL(cudaEventRecord(done));
        if (do_sync)
            CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync)
            CUDA_CALL(cudaEventDestroy(done));
    }
}
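// Usage sketch (illustrative): Scale and ElementWisePower touch only the
// stored non-zero values, so the sparsity pattern is left unchanged.
//
//     GPUSparseMatrix<float>::Scale(0.5f, A);              // A *= 0.5 on the nz values
//     GPUSparseMatrix<float>::ElementWisePower(2.0f, A, C); // C = A .^ 2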
template <class ElemType>
ElemType GPUSparseMatrix<ElemType>::InnerProductOfMatrices(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b)
{
    if (a.m_format != matrixFormatSparseCSR && a.m_format != matrixFormatSparseCSC)
        NOT_IMPLEMENTED;

    if (a.GetComputeDeviceId() != b.GetComputeDeviceId())
        RuntimeError("a and b must be on the same device");

    int m = (int) a.GetNumRows();
    int n = (int) a.GetNumCols();
    int nnz = (int) a.GetNumNZElements();

    ElemType* cscValA = nullptr;
    GPUSPARSE_INDEX_TYPE* cscRowIndA = nullptr;
    GPUSPARSE_INDEX_TYPE* cscColPtrA = nullptr;

    cudaEvent_t done = nullptr;
    cusparseAction_t cpVals = CUSPARSE_ACTION_NUMERIC;
    cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
    cusparseHandle_t cusparseHandle = 0;
    CUSPARSE_CALL(cusparseCreate(&cusparseHandle)); // needed both for the CSR conversion below and for the dot product

    if (a.m_format == matrixFormatSparseCSR) // need to put a in ColumnMajor format
    {
        cscValA = TracingGPUMemoryAllocator::Allocate<ElemType>(a.GetComputeDeviceId(), nnz);
        cscRowIndA = TracingGPUMemoryAllocator::Allocate<GPUSPARSE_INDEX_TYPE>(a.GetComputeDeviceId(), nnz);
        cscColPtrA = TracingGPUMemoryAllocator::Allocate<GPUSPARSE_INDEX_TYPE>(a.GetComputeDeviceId(), (n + 1));

        if (do_sync)
            CUDA_CALL(cudaEventCreate(&done));
        if (sizeof(ElemType) == sizeof(float))
        {
            CUSPARSE_CALL(cusparseScsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const float*>(a.BufferPointer()), a.RowLocation(), a.ColLocation(),
                                           reinterpret_cast<float*>(cscValA), cscRowIndA, cscColPtrA, cpVals, idxBase));
        }
        else
        {
            CUSPARSE_CALL(cusparseDcsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const double*>(a.BufferPointer()), a.RowLocation(), a.ColLocation(),
                                           reinterpret_cast<double*>(cscValA), cscRowIndA, cscColPtrA, cpVals, idxBase));
        }
        if (do_sync)
            CUDA_CALL(cudaEventRecord(done));
        if (do_sync)
            CUDA_CALL(cudaEventSynchronize(done));
        if (do_sync)
            CUDA_CALL(cudaEventDestroy(done));
    }
    else if (a.m_format == matrixFormatSparseCSC)
    {
        cscValA = (ElemType*) a.BufferPointer();
        cscRowIndA = a.RowLocation();
        cscColPtrA = a.ColLocation();
    }
    else
    {
        NOT_IMPLEMENTED;
    }

    // Given sparse matrix in column major format, calculate indices for corresponding sparse vector
    GPUSPARSE_INDEX_TYPE* vectArray = TracingGPUMemoryAllocator::Allocate<GPUSPARSE_INDEX_TYPE>(a.GetComputeDeviceId(), a.m_nz);
    CUDA_LONG M = n;
    CUDA_LONG N = m;
    // GPUSPARSE_INDEX_TYPE* h_vectArray= new int[a.m_nz];
    int blocksPerGrid = (int) ceil(1.0 * M / GridDim::maxThreadsPerBlock);
    if (do_sync)
        CUDA_CALL(cudaEventCreate(&done));
    _getSparseVectorRepresntationForCSCMatrix<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(cscColPtrA, cscRowIndA, vectArray, M, N);
    if (do_sync)
        CUDA_CALL(cudaEventRecord(done));
    if (do_sync)
        CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync)
        CUDA_CALL(cudaEventDestroy(done));
    // free the column-major conversion temporaries only if we allocated them above (in the CSC case they alias a's own buffer)
    if (a.m_format == matrixFormatSparseCSR)
    {
        TracingGPUMemoryAllocator::Free<GPUSPARSE_INDEX_TYPE>(a.GetComputeDeviceId(), cscRowIndA);
        TracingGPUMemoryAllocator::Free<GPUSPARSE_INDEX_TYPE>(a.GetComputeDeviceId(), cscColPtrA);
    }
    // CUDA_CALL(cudaMemcpy(h_vectArray,vectArray,sizeof(GPUSPARSE_INDEX_TYPE)*a.m_nz,cudaMemcpyDeviceToHost));

    // Actual dot product
    ElemType res = 0;
    if (sizeof(ElemType) == sizeof(float))
    {
        CUSPARSE_CALL(cusparseSdoti(cusparseHandle, (int) a.m_nz, reinterpret_cast<float*>(cscValA), vectArray,
                                    reinterpret_cast<float*>(b.BufferPointer()), reinterpret_cast<float*>(&res), idxBase));
    }
    else
    {
        CUSPARSE_CALL(cusparseDdoti(cusparseHandle, (int) a.m_nz, reinterpret_cast<double*>(cscValA), vectArray,
                                    reinterpret_cast<double*>(b.BufferPointer()), reinterpret_cast<double*>(&res), idxBase));
    }
    TracingGPUMemoryAllocator::Free<GPUSPARSE_INDEX_TYPE>(a.GetComputeDeviceId(), vectArray);
    if (a.m_format == matrixFormatSparseCSR)
        TracingGPUMemoryAllocator::Free<ElemType>(a.GetComputeDeviceId(), cscValA);
    CUSPARSE_CALL(cusparseDestroy(cusparseHandle));
    return res;
}

template <class ElemType>
ElemType GPUSparseMatrix<ElemType>::InnerProductOfMatrices(const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b)
{
    return GPUSparseMatrix<ElemType>::InnerProductOfMatrices(b, a);
}
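// Usage sketch (illustrative): this is the Frobenius inner product
// sum_{i,j} A(i,j) * B(i,j), evaluated over the stored non-zeros of the
// sparse argument (implicit zeros contribute nothing).
//
//     float ip = GPUSparseMatrix<float>::InnerProductOfMatrices(S, D);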
// This is a utility function useful for debugging issues with sparse matrices.
// It just checks that the CSC format indices are not corrupted / pointing to invalid memory.
template <class ElemType>
bool GPUSparseMatrix<ElemType>::IsValid() const
{
    if (m_format != MatrixFormat::matrixFormatSparseCSC)
        NOT_IMPLEMENTED;

    long* res = new long[4];
    res[0] = 1;
    res[1] = 0;
    res[2] = 0;
    res[3] = 0;
    long* d_res = TracingGPUMemoryAllocator::Allocate<long>(m_computeDevice, 4);
    CUDA_CALL(cudaMemcpy(d_res, res, sizeof(long) * 4, cudaMemcpyHostToDevice));

    cudaEvent_t done = nullptr;
    if (do_sync)
        CUDA_CALL(cudaEventCreate(&done));
    int blocksPerGrid = (int) ceil((1.0 * SecondaryIndexCount()) / GridDim::maxThreadsPerBlock);
    _isValid<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(MajorIndexLocation(), SecondaryIndexLocation(), GetNumRows(), GetNumCols(), GetNumNZElements(), d_res);
    if (do_sync)
        CUDA_CALL(cudaEventRecord(done));
    if (do_sync)
        CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync)
        CUDA_CALL(cudaEventDestroy(done));

    CUDA_CALL(cudaMemcpy(res, d_res, sizeof(long) * 4, cudaMemcpyDeviceToHost));
    bool valid = (res[0] == 1);
    if (!valid)
        fprintf(stderr, "GPUSparseMatrix::IsValid returned false (additional info: %ld %ld %ld %ld)\n", res[0], res[1], res[2], res[3]);
    TracingGPUMemoryAllocator::Free<long>(m_computeDevice, d_res);
    delete[] res;
    return valid;
}

template <class ElemType>
bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b, const ElemType threshold)
{
    if (a.GetNumNZElements() != b.GetNumNZElements() || a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
        return false;

    if (a.m_format != b.m_format)
        NOT_IMPLEMENTED;

    long* res = new long[3];
    res[0] = 1;
    res[1] = 1;
    res[2] = 1;
    long* d_res = TracingGPUMemoryAllocator::Allocate<long>(a.GetComputeDeviceId(), 3);
    CUDA_CALL(cudaMemcpy(d_res, res, sizeof(long) * 3, cudaMemcpyHostToDevice));

    int blocksPerGrid = (int) ceil(1.0 * a.GetNumNZElements() / GridDim::maxThreadsPerBlock);
    _areEqual<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(a.NzValues(), b.NzValues(), (CUDA_LONG) a.GetNumNZElements(), threshold, d_res);
    _areEqual<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(a.ColLocation(), b.ColLocation(), (CUDA_LONG) a.GetNumNZElements(), (int) threshold, d_res + 1);
    blocksPerGrid = (int) ceil((1.0 * a.GetNumRows() + 1.0) / GridDim::maxThreadsPerBlock);
    _areEqual<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(a.RowLocation(), b.RowLocation(), (CUDA_LONG) a.GetNumRows() + 1, (int) threshold, d_res + 2);

    CUDA_CALL(cudaMemcpy(res, d_res, sizeof(long) * 3, cudaMemcpyDeviceToHost));
    bool equal = (res[0] * res[1] * res[2] == 1);
    TracingGPUMemoryAllocator::Free<long>(a.GetComputeDeviceId(), d_res);
    delete[] res;
    return equal;
}

template <class ElemType>
bool GPUSparseMatrix<ElemType>::AreEqual(const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b, const ElemType threshold)
{
    if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
        return false;
    GPUSparseMatrix<ElemType> c(b.GetFormat(), b.GetComputeDeviceId());
    c.SetValue(a);
    return AreEqual(c, b, threshold);
}

template <class ElemType>
bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, const ElemType threshold)
{
    if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
        return false;
    GPUSparseMatrix<ElemType> c(a.GetFormat(), a.GetComputeDeviceId());
    c.SetValue(b);
    return AreEqual(a, c, threshold);
}

template <class ElemType>
bool GPUSparseMatrix<ElemType>::IsEqualTo(const GPUSparseMatrix<ElemType>& a, const ElemType threshold) const
{
    return AreEqual(*this, a, threshold);
}

template <class ElemType>
bool GPUSparseMatrix<ElemType>::IsEqualTo(const GPUMatrix<ElemType>& a, const ElemType threshold) const
{
    return AreEqual(*this, a, threshold);
}
#pragma endregion Static BLAS Functions

#pragma region Member BLAS Functions

template <class ElemType>
DEVICEID_TYPE GPUSparseMatrix<ElemType>::GetComputeDeviceId() const
{
    // for externally managed memory the CUDA context will have the current device
    if (!OwnBuffer())
    {
        DEVICEID_TYPE devId;
        CUDA_CALL(cudaGetDevice(&devId));
        return devId;
    }
    else
        return m_computeDevice;
}
template <class ElemType>
GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::ElementProductOf(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b)
{
    if (!b.OwnBuffer())
        LogicError("Cannot modify since the buffer is managed externally.");

    if (a.m_format != matrixFormatSparseCSR)
        NOT_IMPLEMENTED;

    if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
        LogicError("ElementProductOf: matrix dimensions mismatch");

    b.PrepareDevice();
    GPUMatrix<ElemType> c(b.GetNumRows(), b.GetNumCols(), b.GetComputeDeviceId());

    cudaEvent_t done = nullptr;
    if (do_sync)
        CUDA_CALL(cudaEventCreate(&done));
    CUDA_LONG M = (CUDA_LONG) a.GetNumRows();
    int blocksPerGrid = (int) ceil(1.0 * M / GridDim::maxThreadsPerBlock);
    _sparseCSRElemMulDense<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(a.BufferPointer(), a.RowLocation(), a.ColLocation(), b.BufferPointer(), c.BufferPointer(), M);
    if (do_sync)
        CUDA_CALL(cudaEventRecord(done));
    if (do_sync)
        CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync)
        CUDA_CALL(cudaEventDestroy(done));
    return c;
}

template <class ElemType>
GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::ElementProductOf(const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b)
{
    return GPUSparseMatrix<ElemType>::ElementProductOf(b, a);
}

template <class ElemType>
GPUSparseMatrix<ElemType> GPUSparseMatrix<ElemType>::operator+(const GPUSparseMatrix<ElemType>& a) const
{
    GPUSparseMatrix<ElemType> res(GetFormat(), GetComputeDeviceId());
    GPUSparseMatrix<ElemType>::ScaleAndAdd(1, *this, 1, a, res);
    return res;
}

template <class ElemType>
GPUSparseMatrix<ElemType> GPUSparseMatrix<ElemType>::operator-(const GPUSparseMatrix<ElemType>& a) const
{
    GPUSparseMatrix<ElemType> res(GetFormat(), GetComputeDeviceId());
    GPUSparseMatrix<ElemType>::ScaleAndAdd(1, *this, -1, a, res);
    return res;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator^=(ElemType alpha)
{
    GPUSparseMatrix<ElemType>& us = *this;
    ElementWisePower(alpha, us, us);
    return us;
}

template <class ElemType>
GPUSparseMatrix<ElemType> GPUSparseMatrix<ElemType>::operator^(ElemType alpha) const
{
    GPUSparseMatrix<ElemType> c(GetFormat(), GetComputeDeviceId());
    ElementWisePower(alpha, *this, c);
    return c;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator*=(ElemType alpha)
{
    GPUSparseMatrix<ElemType>& us = *this;
    if (alpha != 1)
        Scale(alpha, us);
    return us;
}

template <class ElemType>
GPUSparseMatrix<ElemType> GPUSparseMatrix<ElemType>::operator*(ElemType alpha) const
{
    GPUSparseMatrix<ElemType> c(*this);
    if (alpha != 1)
        Scale(alpha, c);
    return c;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignElementPowerOf(const GPUSparseMatrix<ElemType>& a, const ElemType power)
{
    ElementWisePower(power, a, *this);
    return *this;
}
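// Usage sketch (illustrative): the operators above delegate to the static
// BLAS helpers and return a new matrix on the same device.
//
//     GPUSparseMatrix<float> D = A + B;    // ScaleAndAdd(1, A, 1, B, D)
//     GPUSparseMatrix<float> E = A * 0.1f; // Scale on a copy
//     GPUSparseMatrix<float> F = A ^ 2.0f; // element-wise power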
template <class ElemType>
GPUSparseMatrix<ElemType> GPUSparseMatrix<ElemType>::Transpose() const
{
    int m = (int) GetNumRows();
    int n = (int) GetNumCols();
    int nnz = (int) GetNumNZElements();
    cusparseAction_t cpVals = CUSPARSE_ACTION_NUMERIC;
    cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;

    assert(GetFormat() & matrixFormatCompressed); // for now this only supports compressed formats
    PrepareDevice();
    GPUSparseMatrix<ElemType> c(GetFormat(), GetComputeDeviceId());
    c.Resize(n, m, nnz, GetFormat(), true, false);
    c.m_nz = nnz;

    cusparseHandle_t cusparseHandle = 0;
    CUSPARSE_CALL(cusparseCreate(&cusparseHandle));
    cudaEvent_t done = nullptr;
    if (do_sync)
        CUDA_CALL(cudaEventCreate(&done));
    if (m_format == MatrixFormat::matrixFormatSparseCSR)
    {
        if (nnz > 0)
        {
            if (sizeof(ElemType) == sizeof(float))
            {
                CUSPARSE_CALL(cusparseScsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const float*>(this->BufferPointer()), this->RowLocation(), this->ColLocation(),
                                               reinterpret_cast<float*>(c.BufferPointer()), c.ColLocation(), c.RowLocation(), cpVals, idxBase));
            }
            else
            {
                CUSPARSE_CALL(cusparseDcsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const double*>(this->BufferPointer()), this->RowLocation(), this->ColLocation(),
                                               reinterpret_cast<double*>(c.BufferPointer()), c.ColLocation(), c.RowLocation(), cpVals, idxBase));
            }
        }
        else
        {
            CUDA_CALL(cudaMemset(c.BufferPointer(), 0, c.BufferSizeAllocated()));
        }
    }
    else if (m_format == matrixFormatSparseCSC)
    {
        if (nnz > 0)
        {
            // a CSC matrix is the CSR representation of its transpose, so csr2csc with
            // swapped dimensions and index arrays yields the CSC form of the transpose
            if (sizeof(ElemType) == sizeof(float))
            {
                CUSPARSE_CALL(cusparseScsr2csc(cusparseHandle, n, m, nnz, reinterpret_cast<const float*>(this->BufferPointer()), this->ColLocation(), this->RowLocation(),
                                               reinterpret_cast<float*>(c.BufferPointer()), c.RowLocation(), c.ColLocation(), cpVals, idxBase));
            }
            else
            {
                CUSPARSE_CALL(cusparseDcsr2csc(cusparseHandle, n, m, nnz, reinterpret_cast<const double*>(this->BufferPointer()), this->ColLocation(), this->RowLocation(),
                                               reinterpret_cast<double*>(c.BufferPointer()), c.RowLocation(), c.ColLocation(), cpVals, idxBase));
            }
        }
        else
        {
            CUDA_CALL(cudaMemset(c.BufferPointer(), 0, c.BufferSizeAllocated()));
        }
    }
    else
    {
        NOT_IMPLEMENTED;
    }
    if (do_sync)
        CUDA_CALL(cudaEventRecord(done));
    if (do_sync)
        CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync)
        CUDA_CALL(cudaEventDestroy(done));
    CUSPARSE_CALL(cusparseDestroy(cusparseHandle));
    return c;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignTransposeOf(const GPUSparseMatrix<ElemType>& a)
{
    if (!OwnBuffer())
        LogicError("Cannot modify since the buffer is managed externally.");

    if (this == &a)
        LogicError("AssignTransposeOf: a is the same as [this]. Does not support inplace transpose.");

    if (a.IsEmpty())
        LogicError("AssignTransposeOf: Matrix a is empty.");

    *this = a.Transpose();
    return *this;
}

template <class ElemType>
void GPUSparseMatrix<ElemType>::InplaceTranspose()
{
    if (IsEmpty())
        return;
    // transfer converted block over to this pointer
    *this = Transpose();
}

template <class ElemType>
GPUSparseMatrix<ElemType> GPUSparseMatrix<ElemType>::ColumnSlice(size_t startColumn, size_t numCols) const
{
    if (startColumn + numCols > m_numCols)
        InvalidArgument("The slice (%d+%d) is out of range of the source matrix (%d).", (int) startColumn, (int) numCols, (int) m_numCols);

    if (m_format != MatrixFormat::matrixFormatSparseCSC)
        NOT_IMPLEMENTED;

    GPUSparseMatrix<ElemType> slice;
    slice.m_computeDevice = m_computeDevice;
    slice.m_numRows = m_numRows;
    slice.m_numCols = numCols;
    slice.m_nz = (numCols == m_numCols) ? m_nz : SecondaryIndexValueAt(startColumn + numCols) - SecondaryIndexValueAt(startColumn);
    slice.m_elemSizeAllocated = m_elemSizeAllocated;
    slice.m_totalBufferSizeAllocated = m_totalBufferSizeAllocated;
    slice.m_pArray = m_pArray;
    slice.m_format = m_format;
    slice.m_externalBuffer = true;
    slice.m_matrixName = m_matrixName;
    slice.m_blockSize = m_blockSize;
    slice.m_rowToId = m_rowToId;
    slice.m_tempHostBuffer = m_tempHostBuffer;
    slice.m_tempHostBufferSize = m_tempHostBufferSize;
    slice.m_sliceViewOffset = startColumn; // Just shift the compressed index location to the new startColumn - that's it!
    return slice;
}
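// Usage sketch (illustrative): ColumnSlice is a zero-copy view. The slice
// shares the source's buffer (m_externalBuffer = true) and merely shifts the
// compressed column-pointer window by startColumn, so the view must not
// outlive the matrix it was taken from.
//
//     GPUSparseMatrix<float> view = S.ColumnSlice(/*startColumn=*/2, /*numCols=*/4);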
template <class ElemType>
GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const
{
    int m = (int) GetNumRows();
    int n = (int) GetNumCols();

    // if (numCols == 0)
    //    LogicError("The slice cannot have 0 columns.");

    if (startColumn + numCols > n)
        InvalidArgument("The slice (%d+%d) is out of range of the source matrix (%d).", (int) startColumn, (int) numCols, (int) n);

    if (m_format != MatrixFormat::matrixFormatSparseCSC)
        NOT_IMPLEMENTED;

    GPUMatrix<ElemType> slice(m, numCols, GetComputeDeviceId());

    PrepareDevice();
    cusparseHandle_t cusparseHandle = 0;
    CUSPARSE_CALL(cusparseCreate(&cusparseHandle));
    cusparseMatDescr_t descr = 0;
    CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

    cudaEvent_t done = nullptr;
    if (do_sync)
        CUDA_CALL(cudaEventCreate(&done));
    CUSPARSE_CALL(cusparseSetStream(cusparseHandle, t_stream));
    if (sizeof(ElemType) == sizeof(float))
    {
        CUSPARSE_CALL(cusparseScsc2dense(cusparseHandle, m, numCols, descr, (float*) BufferPointer(), RowLocation(), ColLocation() + startColumn, (float*) slice.BufferPointer(), m));
    }
    else
    {
        CUSPARSE_CALL(cusparseDcsc2dense(cusparseHandle, m, numCols, descr, (double*) BufferPointer(), RowLocation(), ColLocation() + startColumn, (double*) slice.BufferPointer(), m));
    }
    if (do_sync)
        CUDA_CALL(cudaEventRecord(done));
    if (do_sync)
        CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync)
        CUDA_CALL(cudaEventDestroy(done));
    CUSPARSE_CALL(cusparseDestroy(cusparseHandle));

    slice.SetMatrixName(m_matrixName);
    return slice;
}

template <class ElemType>
GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::DiagonalToDense() const
{
    int m = (int) GetNumRows();
    int n = (int) GetNumCols();

    if (m != n)
        LogicError("Diagonal can be called only for square matrix. (rows=%d, cols=%d)", m, n);

    if (m_format != MatrixFormat::matrixFormatSparseCSC)
        NOT_IMPLEMENTED;

    GPUMatrix<ElemType> tmp(m, n, GetComputeDeviceId());

    // TODO: Implement optimized diagonal functions for sparse matrices. For now copy to dense first.
    CopyToDenseMatrix(tmp);

    return tmp.Diagonal();
}

template <class ElemType>
ElemType GPUSparseMatrix<ElemType>::SumOfAbsElements() const
{
    if (IsEmpty())
        LogicError("SumOfAbsElements: Matrix is empty");

    cublasHandle_t cuHandle = GPUMatrix<ElemType>::GetCublasHandle(GetComputeDeviceId());
    if (sizeof(ElemType) == sizeof(float))
    {
        float res = 0;
        cublasSasum(cuHandle, (int) GetNumNZElements(), reinterpret_cast<float*>(NzValues()), 1, &res);
        return res;
    }
    else
    {
        double res = 0;
        cublasDasum(cuHandle, (int) GetNumNZElements(), reinterpret_cast<double*>(NzValues()), 1, &res);
        return ElemType(res);
    }
}

template <class ElemType>
ElemType GPUSparseMatrix<ElemType>::SumOfElements() const
{
    if (IsEmpty())
        LogicError("SumOfElements: Matrix is empty");

    ElemType* d_sum = TracingGPUMemoryAllocator::Allocate<ElemType>(m_computeDevice, 1);
    ElemType h_sum;
    // WARNING: THIS kernel is not the most efficient way!
    _reductionSum<<<1, 1024>>>(NzValues(), d_sum, (LONG64) GetNumNZElements());
    CUDA_CALL(cudaMemcpy(&h_sum, d_sum, sizeof(ElemType), cudaMemcpyDeviceToHost));
    TracingGPUMemoryAllocator::Free<ElemType>(m_computeDevice, d_sum);

    return h_sum;
}
template <class ElemType>
ElemType GPUSparseMatrix<ElemType>::FrobeniusNorm() const
{
    if (IsEmpty())
        LogicError("FrobeniusNorm: Matrix is empty.");

    ElemType* d_sum = TracingGPUMemoryAllocator::Allocate<ElemType>(m_computeDevice, 1);
    ElemType h_sum = 0;
    // WARNING: THIS kernel is not the most efficient way!
    _reductionSum2<<<1, 1024>>>(NzValues(), d_sum, (int) GetNumNZElements());
    CUDA_CALL(cudaMemcpy(&h_sum, d_sum, sizeof(ElemType), cudaMemcpyDeviceToHost));
    TracingGPUMemoryAllocator::Free<ElemType>(m_computeDevice, d_sum);

    if (sizeof(ElemType) == sizeof(float))
        return (ElemType) sqrtf((float) h_sum);
    else
        return (ElemType) sqrt((double) h_sum);
}

template <class ElemType>
ElemType GPUSparseMatrix<ElemType>::MatrixNormInf() const
{
    if (IsEmpty())
        LogicError("MatrixNormInf: Matrix is empty.");

    ElemType* d_maxAbs = TracingGPUMemoryAllocator::Allocate<ElemType>(m_computeDevice, 1);
    ElemType h_maxAbs = 0;
    // WARNING: THIS kernel is not the most efficient way!
    _reductionMatrixNormInf<<<1, 1024>>>(NzValues(), d_maxAbs, (int) GetNumNZElements());
    CUDA_CALL(cudaMemcpy(&h_maxAbs, d_maxAbs, sizeof(ElemType), cudaMemcpyDeviceToHost));
    TracingGPUMemoryAllocator::Free<ElemType>(m_computeDevice, d_maxAbs);

    return h_maxAbs;
}

template <class ElemType>
ElemType GPUSparseMatrix<ElemType>::MatrixNorm1() const
{
    if (IsEmpty())
        LogicError("MatrixNorm1: Matrix is empty.");
    return SumOfAbsElements();
}

#pragma endregion Member BLAS Functions

#pragma region Other Functions

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::ElementInverse()
{
    if (!OwnBuffer())
        LogicError("Cannot modify since the buffer is managed externally.");

    if (IsEmpty())
        LogicError("ElementInverse: Matrix is empty.");

    CUDA_LONG N = (CUDA_LONG) GetNumNZElements();
    int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock);
    cudaEvent_t done = nullptr;
    if (do_sync)
        CUDA_CALL(cudaEventCreate(&done));
    _elemInverse<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(NzValues(), N);
    if (do_sync)
        CUDA_CALL(cudaEventRecord(done));
    if (do_sync)
        CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync)
        CUDA_CALL(cudaEventDestroy(done));
    return *this;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignElementInverseOf(const GPUSparseMatrix<ElemType>& a)
{
    SetValue(a);
    return ElementInverse();
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceSigmoid()
{
    performElementWiseFunction(ElementWiseOperator::opSigmoid, *this);
    return *this;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignSigmoidOf(const GPUSparseMatrix<ElemType>& a)
{
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());
    performElementWiseFunction(ElementWiseOperator::opSigmoid, a);
    return *this;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceLinearRectifierDerivative()
{
    performElementWiseFunction(ElementWiseOperator::opLinearRectifierDerivative, *this);
    return *this;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignLinearRectifierDerivativeOf(const GPUSparseMatrix<ElemType>& a)
{
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());
    performElementWiseFunction(ElementWiseOperator::opLinearRectifierDerivative, a);
    return *this;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceTanh()
{
    performElementWiseFunction(ElementWiseOperator::opTanh, *this);
    return *this;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignTanhOf(const GPUSparseMatrix<ElemType>& a)
{
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());
    performElementWiseFunction(ElementWiseOperator::opTanh, a);
    return *this;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceSqrt()
{
    performElementWiseFunction(ElementWiseOperator::opSqrt, *this);
    return *this;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignSqrtOf(const GPUSparseMatrix<ElemType>& a)
{
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());
    performElementWiseFunction(ElementWiseOperator::opSqrt, a);
    return *this;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceExp()
{
    performElementWiseFunction(ElementWiseOperator::opExp, *this);
    return *this;
}
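// Note on semantics: these in-place/assign wrappers (via
// performElementWiseFunction below) apply the operator only to the stored
// non-zero values; implicit zeros are never remapped. That is exact for
// zero-preserving ops such as abs, sqrt, and tanh, but for ops like exp
// (exp(0) == 1) or sigmoid (sigmoid(0) == 0.5) the implicit zeros would
// densify the matrix and are deliberately left untouched.
//
//     S.InplaceAbs();    // |v| on each stored value
//     T.AssignSqrtOf(S); // sqrt of S's stored values into T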
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignExpOf(const GPUSparseMatrix<ElemType>& a)
{
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());
    performElementWiseFunction(ElementWiseOperator::opExp, a);
    return *this;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceLog()
{
    performElementWiseFunction(ElementWiseOperator::opLog, *this);
    return *this;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignLogOf(const GPUSparseMatrix<ElemType>& a)
{
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());
    performElementWiseFunction(ElementWiseOperator::opLog, a);
    return *this;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceAbs()
{
    performElementWiseFunction(ElementWiseOperator::opAbs, *this);
    return *this;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignAbsOf(const GPUSparseMatrix<ElemType>& a)
{
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());
    performElementWiseFunction(ElementWiseOperator::opAbs, a);
    return *this;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceTruncateBottom(const ElemType threshold)
{
    if (!OwnBuffer())
        LogicError("Cannot modify since the buffer is managed externally.");

    if (IsEmpty())
        LogicError("InplaceTruncateBottom: Matrix is empty.");

    CUDA_LONG N = (CUDA_LONG) GetNumNZElements();
    int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock);
    cudaEvent_t done = nullptr;
    if (do_sync)
        CUDA_CALL(cudaEventCreate(&done));
    _assignTruncateBottom<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(NzValues(), NzValues(), threshold, N);
    if (do_sync)
        CUDA_CALL(cudaEventRecord(done));
    if (do_sync)
        CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync)
        CUDA_CALL(cudaEventDestroy(done));
    return *this;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignTruncateBottomOf(const GPUSparseMatrix<ElemType>& a, const ElemType threshold)
{
    if (!OwnBuffer())
        LogicError("Cannot modify since the buffer is managed externally.");

    if (a.IsEmpty())
        LogicError("AssignTruncateBottomOf: Matrix a is empty.");

    if (this != &a)
    {
        // Resize(a.GetNumRows(), a.GetNumCols());
        ResizeAsAndCopyIndexFrom(a);
    }
    CUDA_LONG N = (CUDA_LONG) GetNumNZElements();
    int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock);
    cudaEvent_t done = nullptr;
    if (do_sync)
        CUDA_CALL(cudaEventCreate(&done));
    _assignTruncateBottom<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(NzValues(), a.NzValues(), threshold, N);
    if (do_sync)
        CUDA_CALL(cudaEventRecord(done));
    if (do_sync)
        CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync)
        CUDA_CALL(cudaEventDestroy(done));
    return *this;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceTruncateTop(const ElemType threshold)
{
    if (!OwnBuffer())
        LogicError("Cannot modify since the buffer is managed externally.");

    if (IsEmpty())
        LogicError("InplaceTruncateTop: Matrix is empty.");

    CUDA_LONG N = (CUDA_LONG) GetNumNZElements();
    int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock);
    cudaEvent_t done = nullptr;
    if (do_sync)
        CUDA_CALL(cudaEventCreate(&done));
    _assignTruncateTop<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(NzValues(), NzValues(), threshold, N);
    if (do_sync)
        CUDA_CALL(cudaEventRecord(done));
    if (do_sync)
        CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync)
        CUDA_CALL(cudaEventDestroy(done));
    return *this;
}
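// Usage sketch (illustrative; clipValue is a hypothetical name): the truncate
// pair clips the stored non-zero values, e.g. for gradient clipping:
//
//     G.InplaceTruncateBottom(-clipValue); // v = max(v, -clipValue)
//     G.InplaceTruncateTop(clipValue);     // v = min(v,  clipValue)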
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignTruncateTopOf(const GPUSparseMatrix<ElemType>& a, const ElemType threshold)
{
    if (!OwnBuffer())
        LogicError("Cannot modify since the buffer is managed externally.");

    if (a.IsEmpty())
        LogicError("AssignTruncateTopOf: Matrix a is empty.");

    if (this != &a)
    {
        ResizeAsAndCopyIndexFrom(a);
    }
    CUDA_LONG N = (CUDA_LONG) GetNumNZElements();
    int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock);
    cudaEvent_t done = nullptr;
    if (do_sync)
        CUDA_CALL(cudaEventCreate(&done));
    _assignTruncateTop<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(NzValues(), a.NzValues(), threshold, N);
    if (do_sync)
        CUDA_CALL(cudaEventRecord(done));
    if (do_sync)
        CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync)
        CUDA_CALL(cudaEventDestroy(done));
    return *this;
}

template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::SetToZeroIfAbsLessThan(const ElemType threshold)
{
    if (!OwnBuffer())
        LogicError("Cannot modify since the buffer is managed externally.");

    if (IsEmpty())
        LogicError("SetToZeroIfAbsLessThan: Matrix is empty.");

    CUDA_LONG N = (CUDA_LONG) GetNumNZElements();
    int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock);
    cudaEvent_t done = nullptr;
    if (do_sync)
        CUDA_CALL(cudaEventCreate(&done));
    _setToZeroIfAbsLessThan<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(NzValues(), threshold, N);
    if (do_sync)
        CUDA_CALL(cudaEventRecord(done));
    if (do_sync)
        CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync)
        CUDA_CALL(cudaEventDestroy(done));
    return *this;
}

#pragma endregion

#pragma region Helper Functions

// outBuffer should be allocated to be >= size by the caller
template <class ElemType>
template <class OutType, class InType>
void GPUSparseMatrix<ElemType>::CopyBuffer(OutType* outBuffer, const InType* inBuffer, const size_t size)
{
#pragma omp parallel for
    for (size_t i = 0; i < (size & ~3); i += 4)
    {
        outBuffer[i] = inBuffer[i];
        outBuffer[i + 1] = inBuffer[i + 1];
        outBuffer[i + 2] = inBuffer[i + 2];
        outBuffer[i + 3] = inBuffer[i + 3];
    }
    // handle the remaining elements
    for (size_t i = size & ~3; i < size; i++)
    {
        outBuffer[i] = inBuffer[i];
    }
}

template <class ElemType>
void* GPUSparseMatrix<ElemType>::ReserveTempHostBuffer(const size_t sizeInByte) const
{
    if (m_tempHostBufferSize < sizeInByte)
    {
        delete[](byte*) m_tempHostBuffer;
        m_tempHostBuffer = new byte[sizeInByte];
        m_tempHostBufferSize = sizeInByte;
    }
    return (void*) m_tempHostBuffer;
}

template <class ElemType>
void GPUSparseMatrix<ElemType>::performElementWiseFunction(ElementWiseOperator kind, const GPUSparseMatrix<ElemType>& src)
{
    if (!OwnBuffer())
        LogicError("Cannot modify since the buffer is managed externally.");

    CUDA_LONG N = (CUDA_LONG) GetNumNZElements();
    int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock);
    cudaEvent_t done = nullptr;
    if (do_sync)
        CUDA_CALL(cudaEventCreate(&done));
    switch (kind)
    {
    case ElementWiseOperator::opSigmoid:
        _elementWiseSigmoidOnCuda<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(src.NzValues(), NzValues(), N);
        break;
    case ElementWiseOperator::opTanh:
        _elementWiseTanhOnCuda<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(src.NzValues(), NzValues(), N);
        break;
    case ElementWiseOperator::opSqrt:
        _elementWiseSqrtOnCuda<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(src.NzValues(), NzValues(), N);
        break;
    case ElementWiseOperator::opExp:
        _elementWiseExpOnCuda<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(src.NzValues(), NzValues(), N);
        break;
    case ElementWiseOperator::opLog:
        _elementWiseLogOnCuda<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(src.NzValues(), NzValues(), N);
        break;
    case ElementWiseOperator::opAbs:
        _elementWiseAbsOnCuda<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(src.NzValues(), NzValues(), N);
        break;
    case ElementWiseOperator::opLinearRectifierDerivative:
        _elementWiseLinRectDerivativeOnCuda<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(src.NzValues(), NzValues(), N);
        break;
    default:
        NOT_IMPLEMENTED;
    }
    if (do_sync)
        CUDA_CALL(cudaEventRecord(done));
    if (do_sync)
        CUDA_CALL(cudaEventSynchronize(done));
    if (do_sync)
        CUDA_CALL(cudaEventDestroy(done));
}

#pragma endregion Helper Functions

template class MATH_API GPUSparseMatrix<float>;
template class MATH_API GPUSparseMatrix<double>;
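// Note (illustrative): only the members explicitly instantiated below exist
// for GPUSparseMatrix<char>; they cover just what QuantizedMatrix needs, and
// calling any other member for char would fail at link time. For example:
//
//     GPUSparseMatrix<char> q(matrixFormatSparseCSC, deviceId); // OK: instantiated below
//     // q.Transpose(); // would not link: not instantiated for char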
// We use Matrix<char> as the backing store for QuantizedMatrix
// Let's explicitly instantiate the methods we need for that purpose
template GPUSparseMatrix<char>::GPUSparseMatrix(const MatrixFormat matrixFormat, const DEVICEID_TYPE computeDevice);
template GPUSparseMatrix<char>::GPUSparseMatrix(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat matrixFormat, const DEVICEID_TYPE computeDevice);
template GPUSparseMatrix<char>::GPUSparseMatrix(GPUSparseMatrix<char> const&);
template GPUSparseMatrix<char>::GPUSparseMatrix(GPUSparseMatrix<char>&&);
template void GPUSparseMatrix<char>::SetValue(CPUSparseMatrix<char> const&);
template void GPUSparseMatrix<char>::SetValue(GPUSparseMatrix<char> const&);
template void GPUSparseMatrix<char>::SetValue(GPUMatrix<char> const&);
template void GPUSparseMatrix<char>::CopyToDenseMatrix(GPUMatrix<char>&) const;
template void GPUSparseMatrix<char>::CopyToCPUSparseMatrix(CPUSparseMatrix<char>&) const;
template void GPUSparseMatrix<char>::ChangeDeviceTo(int);
template void GPUSparseMatrix<char>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const bool growOnly, bool keepExistingValues);
template void GPUSparseMatrix<char>::Reset();
template GPUSparseMatrix<char>::~GPUSparseMatrix();
template GPUSparseMatrix<char> GPUSparseMatrix<char>::ColumnSlice(size_t startColumn, size_t numCols) const;
template GPUMatrix<char> GPUSparseMatrix<char>::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const;
template GPUSparseMatrix<char>& GPUSparseMatrix<char>::operator=(GPUSparseMatrix<char>&& deepCopy);

template <class ElemType>
MATH_API File& operator>>(File& stream, GPUSparseMatrix<ElemType>& us)
{
    stream.GetMarker(fileMarkerBeginSection, std::wstring(L"BMAT"));
    size_t elsize;
    stream >> elsize;
    if (sizeof(ElemType) != elsize)
        RuntimeError("Template argument size doesn't match those in file");
    std::wstring matrixName;

    // now prepare this header to receive the data being read
    size_t nz, colnum, rownum;
    int format;

    // read in the header information
    stream >> matrixName >> format >> nz >> colnum >> rownum;

    us.m_format = (MatrixFormat) format;
    if (us.m_format != matrixFormatSparseCSC && us.m_format != matrixFormatSparseCSR)
        NOT_IMPLEMENTED;

    us.Resize(rownum, colnum, nz, true, false);
    us.SetNzCount(nz);
    if (nz > 0)
    {
        size_t compressedSize = (us.m_format == matrixFormatSparseCSC) ? colnum + 1 : rownum + 1;
        ElemType* dataBuffer = new ElemType[nz];
        CPUSPARSE_INDEX_TYPE* unCompressedIndex = new CPUSPARSE_INDEX_TYPE[nz];
        CPUSPARSE_INDEX_TYPE* compressedIndex = new CPUSPARSE_INDEX_TYPE[compressedSize];

        // read in the sparse matrix info
        for (size_t i = 0; i < nz; ++i)
        {
            stream >> dataBuffer[i];
        }
        for (size_t i = 0; i < nz; ++i)
        {
            size_t val;
            stream >> val;
            unCompressedIndex[i] = val;
        }
        for (size_t i = 0; i < compressedSize; ++i)
        {
            size_t val;
            stream >> val;
            compressedIndex[i] = val;
        }

        if (us.m_format == matrixFormatSparseCSC)
            us.SetMatrixFromCSCFormat(compressedIndex, unCompressedIndex, dataBuffer, nz, rownum, colnum);
        else if (us.m_format == matrixFormatSparseCSR)
            us.SetMatrixFromCSRFormat(compressedIndex, unCompressedIndex, dataBuffer, nz, rownum, colnum);

        delete[] dataBuffer;
        delete[] unCompressedIndex;
        delete[] compressedIndex;
    }

    stream.GetMarker(fileMarkerEndSection, std::wstring(L"EMAT"));
    us.SetMatrixName(matrixName.c_str());

    return stream;
}

template MATH_API File& operator>>(File& stream, GPUSparseMatrix<float>& us);
template MATH_API File& operator>>(File& stream, GPUSparseMatrix<double>& us);

template <class ElemType>
MATH_API File& operator<<(File& stream, const GPUSparseMatrix<ElemType>& us)
{
    if (us.m_format != matrixFormatSparseCSC && us.m_format != matrixFormatSparseCSR)
        NOT_IMPLEMENTED;

    stream.PutMarker(fileMarkerBeginSection, std::wstring(L"BMAT"));
    stream << sizeof(ElemType);
    if (us.GetMatrixName() == nullptr)
    {
        std::wstring s(L"nnmatrix");
        stream << s;
    }
    else
    {
        stream << us.GetMatrixName();
    }

    size_t nz = us.GetNumNZElements(), numElemAllocated = us.GetNumElemAllocated(), numRows = us.GetNumRows(), numCols = us.GetNumCols();
    size_t compressedSize = us.SecondaryIndexCount();
    int format = us.GetFormat();

    stream << format << nz << numCols << numRows;

    if (nz > 0)
    {
        ElemType* dataBuffer = nullptr;
        CPUSPARSE_INDEX_TYPE* compressedIndex = nullptr;
        CPUSPARSE_INDEX_TYPE* unCompressedIndex = nullptr;

        if (us.m_format == matrixFormatSparseCSC)
            us.GetMatrixFromCSCFormat(compressedIndex, unCompressedIndex, dataBuffer, numElemAllocated, nz, numRows, numCols);
        else if (us.m_format == matrixFormatSparseCSR)
            us.GetMatrixFromCSRFormat(compressedIndex, unCompressedIndex, dataBuffer, numElemAllocated, nz, numRows, numCols);
        else
            NOT_IMPLEMENTED;

        for (size_t i = 0; i < nz; ++i)
        {
            stream << dataBuffer[i];
        }
        for (size_t i = 0; i < nz; ++i)
        {
            size_t val = unCompressedIndex[i];
            stream << val;
        }
        for (size_t i = 0; i < compressedSize; ++i)
        {
            size_t val = compressedIndex[i];
            stream << val;
        }

        delete[] dataBuffer;
        delete[] unCompressedIndex;
        delete[] compressedIndex;
    }
    stream.PutMarker(fileMarkerEndSection, std::wstring(L"EMAT"));

    return stream;
}

template MATH_API File& operator<<(File& stream, const GPUSparseMatrix<float>& us);
template MATH_API File& operator<<(File& stream, const GPUSparseMatrix<double>& us);

} } }

#endif // CPUONLY