// // Copyright (c) Microsoft. All rights reserved. // Licensed under the MIT license. See LICENSE.md file in the project root for full license information. // #pragma once #include "Platform.h" #include "File.h" #include "Helpers.h" #include "CommonMatrix.h" #include "TensorShape.h" // only for SmallVector; I was hoping to keep this out #include "DebugUtil.h" #include "BestGpu.h" // for CPUONLY macro #include "ConcStack.h" #include #include #include #include #include // for cout/cerr #include // for unique_ptr #include // for ULONG_MAX #ifndef _WIN32 #include #endif // predeclare cublasHandle_t struct cublasContext; typedef struct cublasContext* cublasHandle_t; struct CUstream_st; typedef struct CUstream_st* cudaStream_t; #ifdef _WIN32 #ifndef MATH_API #ifdef MATH_EXPORTS #define MATH_API __declspec(dllexport) #else #define MATH_API __declspec(dllimport) #endif #endif /* MATH_API */ #else // no DLLs in Linux #define MATH_API #endif #ifndef USE_TIME_BASED_SEED #define USE_TIME_BASED_SEED ULONG_MAX #endif // Stream management functions void MATH_API SetStream(cudaStream_t stream); cudaStream_t MATH_API GetStream(); namespace Microsoft { namespace MSR { namespace CNTK { // ----------------------------------------------------------------------- // DeviceBoundNumber -- This class represents a number which resides on a particular device. 
Use it to avoid unnecessary transfers between CPU and GPU // ----------------------------------------------------------------------- template class MATH_API DeviceBoundNumber { private: DEVICEID_TYPE m_computeDevice; ElemType* m_data; public: DeviceBoundNumber() { m_data = NULL; }; DeviceBoundNumber(const DeviceBoundNumber& deepCopy); DeviceBoundNumber(DeviceBoundNumber&& shallowCopy); ~DeviceBoundNumber(); DEVICEID_TYPE GetDeviceId() const { return m_computeDevice; } ElemType* ExposePointer2Value() const { return m_data; } // performs shallow copy only void ShallowCopyFrom(ElemType* newVal, int newValsDevceId); }; // ----------------------------------------------------------------------- // GPUMatrix // ----------------------------------------------------------------------- void PrepareDevice(DEVICEID_TYPE deviceId); template class MATH_API GPUMatrix : public BaseMatrix { typedef BaseMatrix B; using B::m_numRows; using B::m_numCols; using B::m_pArray; // without this, base members would require to use thi-> in GCC template friend class GPUMatrix; public: static const int MaxGpus = 8; // support up to 8 GPUs using BaseMatrix::m_computeDevice; using BaseMatrix::m_elemSizeAllocated; using BaseMatrix::m_matrixName; using BaseMatrix::m_format; using BaseMatrix::m_externalBuffer; using BaseMatrix::m_nz; using BaseMatrix::OwnBuffer; using BaseMatrix::GetNumElements; using BaseMatrix::IsEmpty; using BaseMatrix::GetArray; using BaseMatrix::GetNumRows; using BaseMatrix::GetNumCols; using BaseMatrix::SetMatrixName; private: static cublasHandle_t s_cuHandle[MaxGpus]; static void* s_curandGenerator; // Have to use disable the warning to avoid issues with __declspec(dllexport) on Windows (C4251). // Also, NVCC FE corresponding warning has to be disabled, see MathCUDA.vcxproj. // The only workaround is to use naked pointer. 
#pragma warning(push)
#pragma warning(disable : 4251)
    // Workspace storage, mutable so that const members can use it.
    // NOTE(review): the template argument list of this type is visibly damaged (`unique_ptr>>>`);
    // it looks like it was stripped during extraction -- restore from version control.
    mutable std::unique_ptr>>> m_workspace;
#pragma warning(pop)

private:
    // Internal helpers; their definitions are not part of this header.
    void performElementWiseFunction(const ElementWiseOperator kind, const ElemType* src);
    size_t LocateElement(const size_t i, const size_t j) const;
    size_t LocateColumn(const size_t j) const;
    void Clear();
    void ZeroInit(int deviceId);

    // Workspace acquire/release pair.
    // NOTE(review): same damaged template arguments as m_workspace (`unique_ptr>`).
    std::unique_ptr> GetOrCreateWorkspace() const;
    void ReleaseWorkspace(std::unique_ptr> src) const;

public:
    // Construction, copy and move. Per the original comments, copy is deep and move is shallow.
    explicit GPUMatrix(int deviceId);
    GPUMatrix(FILE* f, const char* matrixName, int deviceId);
    GPUMatrix(const size_t numRows, const size_t numCols, int deviceId);
    GPUMatrix(const size_t numRows, const size_t numCols, int deviceId, ElemType* pArray, const size_t matrixFlags = matrixFlagNormal);
    GPUMatrix(const GPUMatrix& deepCopyFrom);
    GPUMatrix& operator=(const GPUMatrix& deepCopyFrom); // assignment operator, deep copy
    GPUMatrix(GPUMatrix&& moveFrom);
    GPUMatrix& operator=(GPUMatrix&& moveFrom); // move assignment operator, shallow copy
    ~GPUMatrix(void);

    // Device selection and queries.
    static void SetDevice(DEVICEID_TYPE deviceId);
    int GetComputeDeviceId() const;
    DEVICEID_TYPE PrepareDevice(DEVICEID_TYPE deviceId = -1) const;

    static cublasHandle_t GetCublasHandle(int computeDevice = -1);

    // Copies to caller-visible arrays. The friend operator<< of this class releases the result
    // of the first overload with delete[].
    ElemType* CopyToArray() const; // allocated by the callee but needs to be deleted by the caller
    size_t CopyToArray(ElemType*& arrayCopyTo, size_t& currentArraySize) const; // allocated by the callee but needs to be deleted by the caller
    void CopySection(size_t numRows, size_t numCols, ElemType* dst, size_t colStride) const;

    void ChangeDeviceTo(DEVICEID_TYPE to_id);

public:
    // Column slicing and copying.
    GPUMatrix ColumnSlice(size_t startColumn, size_t numCols) const;
    GPUMatrix& AssignColumnSlice(const GPUMatrix& fromMatrix, size_t startColumn, size_t numCols);
    GPUMatrix& SetColumnSlice(const GPUMatrix& fromMatrix, size_t startColumn, size_t numCols);

    void CopyColumnsStrided(const GPUMatrix& fromMatrix, size_t numCols, size_t srcNumColsStride, size_t destNumColsStride);

    GPUMatrix Diagonal() const;

    // Size in bytes of m_numRows * m_numCols elements of ElemType.
    size_t BufferSize() const
    {
        return m_numRows * m_numCols * sizeof(ElemType);
    }
    // Returns the raw m_pArray pointer.
    ElemType* BufferPointer() const
    {
        return m_pArray;
    }

    // Gradient-update routines (presumably AdaGrad / FSAdaGrad / RMSProp, judging by the names;
    // the implementations are not visible here).
    ElemType Adagrad(GPUMatrix& gradients, const bool needAveMultiplier);
    void FSAdagrad(GPUMatrix& gradients, GPUMatrix& functionValues, ElemType learnRatePerSample, ElemType momentum, ElemType adaWeight, ElemType adaMul);
    ElemType RmsProp(GPUMatrix& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier);

    void Reshape(const size_t numRows, const size_t numCols);
    void Resize(const size_t numRows, const size_t numCols, bool growOnly = true); // by default we only reallocate if need to grow

    // Per-element access is not supported: both overloads only call LogicError() and have no
    // return statement, so LogicError is presumably non-returning -- confirm.
    ElemType& operator()(const size_t /*row*/, const size_t /*col*/)
    {
        LogicError("GPUMatrix doesn't support this");
    }
    const ElemType& operator()(const size_t /*row*/, const size_t /*col*/) const
    {
        LogicError("GPUMatrix doesn't support this");
    }
    ElemType Get00Element() const;

    // Value setters.
    void SetValue(const ElemType v);
    void SetValue(const ElemType* d_v); // d_v is a pointer to the value in GPU memory
    void SetColumn(const ElemType* colPointer, size_t colInd);
    void SetColumn(const GPUMatrix& valMat, size_t colInd);

    void MaskColumnsValue(const GPUMatrix& columnsMask, ElemType val);

    void SetValue(const GPUMatrix& deepCopyFrom);
    void SetValue(const size_t numRows, const size_t numCols, int deviceId, ElemType* pArray, size_t matrixFlags = matrixFlagNormal);

    void SetDiagonalValue(const ElemType v);
    void SetDiagonalValue(const GPUMatrix& vector);

    // Random fills; `seed` defaults to the USE_TIME_BASED_SEED macro.
    void SetUniformRandomValue(const ElemType low, const ElemType high, unsigned long seed = USE_TIME_BASED_SEED);
    void SetGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed = USE_TIME_BASED_SEED);
    void SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, unsigned long seed = USE_TIME_BASED_SEED);

    GPUMatrix Transpose() const;
    GPUMatrix& AssignTransposeOf(const GPUMatrix& a);

    // Addition.
    GPUMatrix& operator+=(const ElemType alpha);
    GPUMatrix operator+(const ElemType alpha) const;
    GPUMatrix& AssignSumOf(const ElemType alpha, const GPUMatrix& a);

    GPUMatrix& operator+=(const GPUMatrix& a);
    GPUMatrix operator+(const GPUMatrix& a) const;
    GPUMatrix& AssignSumOf(const GPUMatrix& a, const GPUMatrix& b);

    // Subtraction.
    GPUMatrix& operator-=(const ElemType alpha);
    GPUMatrix operator-(const ElemType alpha) const;
    GPUMatrix& AssignDifferenceOf(const ElemType alpha, const GPUMatrix& a);
    GPUMatrix& AssignDifferenceOf(const GPUMatrix& a, const ElemType alpha);

    GPUMatrix& operator-=(const GPUMatrix& a);
    GPUMatrix operator-(const GPUMatrix& a) const;
    GPUMatrix& AssignDifferenceOf(const GPUMatrix& a, const GPUMatrix& b);

    // Multiplication and division.
    GPUMatrix& operator*=(const ElemType alpha);
    GPUMatrix operator*(const ElemType alpha) const;
    GPUMatrix& AssignProductOf(const ElemType alpha, const GPUMatrix& a);

    GPUMatrix operator*(const GPUMatrix& a) const;
    GPUMatrix& AssignProductOf(const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB);

    GPUMatrix& operator/=(ElemType alpha);
    GPUMatrix operator/(ElemType alpha) const;

    GPUMatrix& operator^=(ElemType alpha);     // element-wise power
    GPUMatrix operator^(ElemType alpha) const; // element-wise power
    GPUMatrix& AssignElementPowerOf(const GPUMatrix& a, const ElemType power);

    // Element-wise products and quotients.
    GPUMatrix& ElementMultiplyWith(const GPUMatrix& a);
    GPUMatrix& AssignElementProductOf(const GPUMatrix& a, const GPUMatrix& b);
    GPUMatrix& AddElementProductOf(const GPUMatrix& a, const GPUMatrix& b);

    GPUMatrix& AssignElementDivisionOf(const GPUMatrix& a, const GPUMatrix& b);
    GPUMatrix& ElementDivideBy(const GPUMatrix& a);

    GPUMatrix& ColumnElementMultiplyWith(const GPUMatrix& a);
    GPUMatrix& RowElementMultiplyWith(const GPUMatrix& a);

    GPUMatrix& ColumnElementDivideBy(const GPUMatrix& a);
    GPUMatrix& RowElementDivideBy(const GPUMatrix& a);

    GPUMatrix& ElementInverse();
    GPUMatrix& AssignElementInverseOf(const GPUMatrix& a);

    // Element-wise functions. Naming convention visible in the signatures: `Inplace*` takes no
    // source matrix, `Assign*Of` takes the source matrix `a`.
    GPUMatrix& InplaceLinearRectifierDerivative();
    GPUMatrix& AssignLinearRectifierDerivativeOf(const GPUMatrix& a);

    GPUMatrix& InplaceSigmoidDerivative();
    GPUMatrix& AssignSigmoidDerivativeOf(const GPUMatrix& a);

    GPUMatrix& InplaceSigmoid();
    GPUMatrix& AssignSigmoidOf(const GPUMatrix& a);

    GPUMatrix& InplaceTanh();
    GPUMatrix& AssignTanhOf(const GPUMatrix& a);

    GPUMatrix& InplaceLogSoftmax(const bool isColWise);
    GPUMatrix& AssignLogSoftmaxOf(const GPUMatrix& a, const bool isColWise);

    GPUMatrix& InplaceHardmax(const bool isColWise);
    GPUMatrix& AssignHardmaxOf(const GPUMatrix& a, const bool isColWise);

    // sequence training
    GPUMatrix& DropFrame(const GPUMatrix& label, const GPUMatrix& gamma, const ElemType& threshhold);
    GPUMatrix& AssignSequenceError(const ElemType hsmoothingWeight, const GPUMatrix& label, const GPUMatrix& dnnoutput, const GPUMatrix& gamma, ElemType alpha);

    GPUMatrix& InplaceSqrt();
    GPUMatrix& AssignSqrtOf(const GPUMatrix& a);

    GPUMatrix& InplaceExp();
    GPUMatrix& AssignExpOf(const GPUMatrix& a);

    GPUMatrix& InplaceLog();
    GPUMatrix& AssignLogOf(const GPUMatrix& a);

    GPUMatrix& InplaceCosine();
    GPUMatrix& AssignCosineOf(const GPUMatrix& a);

    GPUMatrix& InplaceNegativeSine();
    GPUMatrix& AssignNegativeSineOf(const GPUMatrix& a);

    GPUMatrix& InplaceAbs();
    GPUMatrix& AssignAbsOf(const GPUMatrix& a);

    // Thresholding.
    GPUMatrix& InplaceTruncateBottom(const ElemType threshold);
    GPUMatrix& AssignTruncateBottomOf(const GPUMatrix& a, const ElemType threshold);
    GPUMatrix& InplaceTruncateTop(const ElemType threshold);
    GPUMatrix& AssignTruncateTopOf(const GPUMatrix& a, const ElemType threshold);
    GPUMatrix& InplaceTruncate(const ElemType threshold);
    GPUMatrix& InplaceSoftThreshold(const ElemType threshold);

    GPUMatrix& SetToZeroIfAbsLessThan(const ElemType threshold);

    // Reductions.
    // NOTE(review): `DeviceBoundNumber` appears without template arguments here; they were
    // presumably lost in extraction.
    DeviceBoundNumber Sum_AsDeviceBoundNum() const;
    ElemType SumOfAbsElements() const; // sum of all abs(elements)
    ElemType SumOfElements() const;    // sum of all elements
    GPUMatrix& AssignSumOfElements(const GPUMatrix& a);

    ElemType Max() const;
    bool IsEqualTo(const GPUMatrix& a, const ElemType threshold = 1e-8) const;

    // Vector sums and norms; `isColWise` selects the direction (its exact meaning is defined
    // by the implementation, not shown here).
    static void VectorSum(const GPUMatrix& a, GPUMatrix& c, const bool isColWise);

    void VectorNorm1(GPUMatrix& c, const bool isColWise) const;
    GPUMatrix& AssignVectorNorm1Of(GPUMatrix& a, const bool isColWise);

    void VectorNorm2(GPUMatrix& c, const bool isColWise) const;
    GPUMatrix& AssignVectorNorm2Of(GPUMatrix& a, const bool isColWise);

    void VectorNormInf(GPUMatrix& c, const bool isColWise) const;
    GPUMatrix& AssignVectorNormInfOf(GPUMatrix& a, const bool isColWise);

    GPUMatrix& AssignInnerProductOf(const GPUMatrix& a, const GPUMatrix& b, const bool isColWise);
    GPUMatrix& AssignKhatriRaoProductOf(const GPUMatrix& a, const GPUMatrix& b);
    GPUMatrix& AddColumnReshapeProductOf(const GPUMatrix& a, const GPUMatrix& b, const bool transposeAColumn);

    GPUMatrix& AddWithScaleOf(ElemType alpha, const GPUMatrix& a);

    // Matrix norms.
    ElemType FrobeniusNorm() const;
    GPUMatrix& AssignFrobeniusNormOf(const GPUMatrix& a);

    ElemType MatrixNormInf() const;
    ElemType MatrixNorm1() const;
    ElemType MatrixNorm0() const; // number of non-zero elements
    GPUMatrix& AssignSignOf(const GPUMatrix& a);
    GPUMatrix& AddSignOf(const GPUMatrix& a);

    // Row slices and repeats.
    GPUMatrix& AssignToRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows);
    GPUMatrix& AssignRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows);
    GPUMatrix& AddToRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows);
    GPUMatrix& AddWithRowSliceValuesOf(const GPUMatrix& a, const size_t startIndex, const size_t numRows);
    // GPUMatrix& AssignRowStackValuesOf(const std::vector*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols);

    GPUMatrix& AssignRepeatOf(const GPUMatrix& a, const size_t numRowRepeats, const size_t numColRepeats);
    GPUMatrix& AddToRowRepeatValuesOf(const GPUMatrix& a, const size_t numRowRepeats);

    GPUMatrix& AssignPositiveAndShiftedNegSample(const GPUMatrix& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);
// GPUMatrix members (continued): sampling, max/min search, NCE helpers, printing and file I/O.
    GPUMatrix& AddFoldedPositiveAndShiftedNegSample(const GPUMatrix& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);

    // Max/min search. Each call fills two output matrices (indexes and values); the second
    // VectorMax overload additionally takes `topK`.
    void VectorMax(GPUMatrix& maxIndexes, GPUMatrix& maxValues, const bool isColWise) const;
    void VectorMax(GPUMatrix& maxIndexes, GPUMatrix& maxValues, const bool isColWise, int topK) const;
    void VectorMin(GPUMatrix& minIndexes, GPUMatrix& minValues, const bool isColWise) const;

    GPUMatrix& AssignNumOfDiff(const GPUMatrix& a, const GPUMatrix& b, bool searchInCol = false);

    GPUMatrix& AssignInnerProductOfMatrices(const GPUMatrix& a, const GPUMatrix& b);

    // NCE routines (presumably noise-contrastive estimation, going by the first name;
    // the implementations are not visible here).
    void AssignNoiseContrastiveEstimation(const GPUMatrix& a, const GPUMatrix& b, const GPUMatrix& bias, size_t sampleCount, GPUMatrix& tmp, GPUMatrix& c);
    void AssignNCEDerivative(GPUMatrix& tmp, const GPUMatrix& a, const GPUMatrix& b, size_t inputIndex, GPUMatrix& c);
    void AssignNCEUnnormalizedEval(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c);
    void AssignSoftmaxSum(const GPUMatrix& a, GPUMatrix& softmax);

    // Printing.
    void Print(const char* matrixName, size_t rowStart, size_t rowEnd, size_t colStart, size_t colEnd) const;
    void Print(const char* matrixName = NULL) const; // print whole matrix. can be expensive

    // FILE-based I/O.
    void ReadFromFile(FILE* f, const char* matrixName); // matrixName is used to verify that correct matrix is read.
    // NOTE(review): the comment on WriteToFile duplicates the ReadFromFile one ("is read");
    // it may be a copy/paste leftover -- confirm the intended meaning for the write path.
    void WriteToFile(FILE* f, const char* matrixName);  // matrixName is used to verify that correct matrix is read.
// GPUMatrix members (continued): convolution/pooling, static BLAS-style helpers, tensor ops, factories.
    // Convolution packing and unpacking.
    GPUMatrix& AssignPackedConvolutionInput(const GPUMatrix& inputSubBatch,
                                            const size_t inputWidth, const size_t inputHeight, const size_t inputChannels,
                                            const size_t outputWidth, const size_t outputHeight, const size_t outputChannels,
                                            const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample,
                                            const bool zeroPadding = false);
    // Declared const and writes into the non-const `inputSubBatch` argument.
    GPUMatrix& UnpackConvolutionInput(GPUMatrix& inputSubBatch,
                                      const size_t inputWidth, const size_t inputHeight, const size_t inputChannels,
                                      const size_t outputWidth, const size_t outputHeight, const size_t outputChannels,
                                      const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample,
                                      bool zeroPadding = false) const;

    // Max and average pooling: one `Assign*Result` and one `Add*Gradient` declaration each.
    GPUMatrix& AssignMaxPoolingResult(const GPUMatrix& inputBatch, const size_t channels,
                                      const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
                                      const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
                                      const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
    GPUMatrix& AddMaxPoolingGradient(const GPUMatrix& outputGradientBatch, const GPUMatrix& inputBatch, const GPUMatrix& outputBatch,
                                     const size_t channels,
                                     const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
                                     const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
                                     const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
    GPUMatrix& AssignAveragePoolingResult(const GPUMatrix& inputBatch, const size_t channels,
                                          const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
                                          const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
                                          const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
    GPUMatrix& AddAveragePoolingGradient(const GPUMatrix& outputGradientBatch,
                                         const size_t channels,
                                         const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
                                         const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
                                         const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);

public:
    // static BLAS functions
    // In most of these the non-const matrix `c` is the last parameter, which suggests it is
    // the output -- the implementations are not visible here.
    static void MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, ElemType beta, GPUMatrix& c);
    static void MultiplyAndAdd(const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, GPUMatrix& c);
    static void Multiply(const GPUMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeB, GPUMatrix& c);
    static void Multiply(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c);
    static void Multiply1x1AndWeightedAdd(ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, ElemType beta, GPUMatrix& c);

    static void ScaleAndAdd(ElemType alpha, const GPUMatrix& a, GPUMatrix& c);
    static void ScaleAndAdd(ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c);
    static void AddScaledDifference(const ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c);
    static void AssignScaledDifference(const ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c);
    static void AddScaledDifference(const GPUMatrix& alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c);
    static void AssignScaledDifference(const GPUMatrix& alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c);

    static void AddElementToElement(const GPUMatrix& a, const size_t ai, const size_t aj, GPUMatrix& c, const size_t ci, const size_t cj);

    // minus one at a specific position
    static void MinusOneAt(GPUMatrix& c, const size_t position);

    static void Scale(ElemType alpha, const GPUMatrix& a, GPUMatrix& c);
    static void Scale(GPUMatrix& alpha, GPUMatrix& a); // In this case matrix alpha must be 1x1
    static void Scale(ElemType alpha, GPUMatrix& a);

    static void InnerProduct(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c, const bool isColWise);
    static ElemType InnerProductOfMatrices(const GPUMatrix& a, const GPUMatrix& b);
    static void ElementWisePower(ElemType alpha, const GPUMatrix& a, GPUMatrix& c);

    static bool AreEqual(const GPUMatrix& a, const GPUMatrix& b, const ElemType threshold = 1e-8);

    static void TensorShuffleScaleAndAdd(ElemType keepWeight, const GPUMatrix& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const GPUMatrix& b, GPUMatrix& c);

    // Tensor operations with one, two and three input matrices; the trailing array extents
    // that survive (2, 3, 4) grow with the number of inputs.
    // NOTE(review): the template arguments of std::array / SmallVector below are visibly damaged
    // (`std::array&`, `std::array, 2>&`, `SmallVector&`); they appear to have been stripped during
    // extraction and must be restored from version control.
    void TensorOp(ElemType beta, const GPUMatrix& a, ElemType alpha, ElementWiseOperator op,
                  const std::array& offsets,
                  const SmallVector& regularOpDims, const std::array, 2>& regularStrides,
                  const SmallVector& reducingOpDims, const std::array, 2>& reducingStrides);
    void TensorOp(ElemType beta, const GPUMatrix& a, const GPUMatrix& b, ElemType alpha, ElementWiseOperator op,
                  const std::array& offsets,
                  const SmallVector& regularOpDims, const std::array, 3>& regularStrides,
                  const SmallVector& reducingOpDims, const std::array, 3>& reducingStrides);
    void TensorOp(ElemType beta, const GPUMatrix& a, const GPUMatrix& b, const GPUMatrix& c, ElemType alpha, ElementWiseOperator op,
                  const std::array& offsets,
                  const SmallVector& regularOpDims, const std::array, 4>& regularStrides,
                  const SmallVector& reducingOpDims, const std::array, 4>& reducingStrides);

    // cuRAND generator management; `caller` is a C string supplied by the call site.
    static void CreateCurandObject(unsigned long seed, const char* caller);
    static void ResetCurandObject(unsigned long seed, const char* caller);

    // Factory functions returning a new matrix by value.
    static GPUMatrix Ones(const size_t rows, const size_t cols, int deviceId);
    static GPUMatrix Zeros(const size_t rows, const size_t cols, int deviceId);
    static GPUMatrix Eye(const size_t rows, int deviceId);
    static GPUMatrix RandomUniform(const size_t rows, const size_t cols, int deviceId, const ElemType low, const ElemType high, unsigned long seed = USE_TIME_BASED_SEED);
static GPUMatrix RandomGaussian(const size_t rows, const size_t cols, int deviceId, const ElemType mean, const ElemType sigma, unsigned long seed = USE_TIME_BASED_SEED); static bool HasElement(const GPUMatrix& a, const ElemType v = 0.0); static ElemType GetLearnRateForBlock_Helper(const GPUMatrix& Gradients, const GPUMatrix& SmoothedGradients); ElemType LogAddSumOfElements() const; public: GPUMatrix& AssignElementProductOfWithShiftNeg(const GPUMatrix& a, const GPUMatrix& b, const size_t shift, const size_t nt); static void InnerProductWithShiftNeg(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c, const size_t shift, const size_t nt); GPUMatrix& GetARowByIndex(const GPUMatrix& a, const size_t m); static void ConductRowElementMultiplyWithShift(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c, const size_t shift, const bool isafixed); GPUMatrix& AssignElementProductOfWithShift(const GPUMatrix& a, const GPUMatrix& b, const size_t shift); public: static void RCRFBackwardCompute( const GPUMatrix& alpha, GPUMatrix& beta, const GPUMatrix& lbls, const GPUMatrix& pos_scores, const GPUMatrix& pair_scores, const int shift = 1); static void RCRFTransGrdCompute(const GPUMatrix& lbls, const GPUMatrix& alpha, const GPUMatrix& beta, const GPUMatrix& pair_scores, GPUMatrix& grd, const int startLbl, // the time 0 start symbol in the output layer const int shift); public: friend File& operator>>(File& stream, GPUMatrix& us) { stream.GetMarker(fileMarkerBeginSection, std::wstring(L"BMAT")); size_t elsize; stream >> elsize; if (sizeof(ElemType) != elsize) LogicError("Template argument size doesn't match those in file"); std::wstring matrixName; size_t numRows, numCols; int format; stream >> matrixName >> format >> numRows >> numCols; ElemType* d_array = new ElemType[numRows * numCols]; for (size_t i = 0; i < numRows * numCols; ++i) stream >> d_array[i]; stream.GetMarker(fileMarkerEndSection, std::wstring(L"EMAT")); us.SetValue(numRows, numCols, us.GetComputeDeviceId(), d_array, 
matrixFlagNormal | format); delete[] d_array; us.m_matrixName = new wchar_t[matrixName.length() + 1]; wmemcpy(us.m_matrixName, matrixName.c_str(), matrixName.length() + 1); // us.m_matrixName = matrixName; return stream; } friend File& operator<<(File& stream, const GPUMatrix& us) { stream.PutMarker(fileMarkerBeginSection, std::wstring(L"BMAT")); stream << sizeof(ElemType); std::wstring s = (us.m_matrixName == NULL) ? std::wstring(L"unnamed") : std::wstring(us.m_matrixName); int format = us.m_format; stream << s << format; stream << us.m_numRows << us.m_numCols; ElemType* pArray = us.CopyToArray(); for (size_t i = 0; i < us.GetNumElements(); ++i) stream << pArray[i]; delete[] pArray; stream.PutMarker(fileMarkerEndSection, std::wstring(L"EMAT")); return stream; } }; typedef GPUMatrix GPUSingleMatrix; }}} #ifndef CPUONLY #include // Error handling template const char* CudaErrString(ERRTYPE x); // actual error function is defined inside .cu files template static void CudaCall(ERRTYPE retCode, const char* exprString, const char* libName, ERRTYPE successCode) { if (retCode != successCode) { try { #ifdef _WIN32 const char* hostname = getenv("COMPUTERNAME"); #else char hostname[HOST_NAME_MAX]; if (gethostname(hostname, HOST_NAME_MAX) != 0) strcpy(hostname, "?"); #endif int currentCudaDevice; cudaGetDevice(¤tCudaDevice); Microsoft::MSR::CNTK::RuntimeError("%s failure %d: %s ; GPU=%d ; hostname=%s ; expr=%s", libName, (int)retCode, CudaErrString(retCode), currentCudaDevice, hostname ? 
hostname : "?", exprString); } catch (const std::exception& e) // catch, log, and rethrow since CUDA code sometimes hangs in destruction, so we'd never get to see the error { std::cerr << e.what() << std::endl; throw; } } } #define CUDA_CALL(expr) (CudaCall((expr), #expr, "CUDA", cudaSuccess)) #define CUBLAS_CALL(expr) (CudaCall((expr), #expr, "CUBLAS", CUBLAS_STATUS_SUCCESS)) #define CUSPARSE_CALL(expr) (CudaCall((expr), #expr, "CUSPARSE", CUSPARSE_STATUS_SUCCESS)) #define CURAND_CALL(expr) (CudaCall((expr), #expr, "CURAND", CURAND_STATUS_SUCCESS)) #define CUDNN_CALL(expr) (CudaCall((expr), #expr, "cuDNN", CUDNN_STATUS_SUCCESS)) #endif