// // Copyright (c) Microsoft. All rights reserved. // Licensed under the MIT license. See LICENSE.md file in the project root for full license information. // #pragma once #include "Basics.h" // for RuntimeError() #include "Matrix.h" #include "File.h" #include "Helpers.h" #include "CommonMatrix.h" #include #include #include #include // NOTE NOTE NOTE: // use CPUSingleMatrix and CPUDoubleMatrix instead of using the template directly /////////////////////////////////////////////// // This class is exported from the Math.dll namespace Microsoft { namespace MSR { namespace CNTK { double logadd(double x, double y); //To compy with BLAS libraries matrices are stored in ColMajor. However, by default C/C++/C# use RowMajor //convertion is need when passing data between CPUMatrix and C++ matrices template class MATH_API CPUMatrix : public BaseMatrix { typedef BaseMatrix B; using B::m_numRows; using B::m_numCols; using B::m_pArray; using B::m_computeDevice; using B::m_elemSizeAllocated; using B::m_externalBuffer; using B::m_format; using B::m_matrixName; // without this, base members would require to use thi-> in GCC public: CPUMatrix(); CPUMatrix(FILE* f, const char* matrixName); // matrixName is used to verify that correct matrix is read. 
// Remaining constructors, assignment operators and the destructor (definitions are not in this header).
    CPUMatrix(const size_t numRows, const size_t numCols);
    CPUMatrix(const size_t numRows, const size_t numCols, ElemType* pArray, const size_t matrixFlags = matrixFlagNormal);
    CPUMatrix(const CPUMatrix& deepCopyFrom);            // copy constructor, deep copy
    CPUMatrix& operator=(const CPUMatrix& deepCopyFrom); // assignment operator, deep copy
    CPUMatrix(CPUMatrix&& moveFrom);                     // move constructor, shallow copy
    CPUMatrix& operator=(CPUMatrix&& moveFrom);          // move assignment operator, shallow copy

    ~CPUMatrix();

public:
    // Base-class member functions re-exported through this class.
    using B::OwnBuffer;
    using B::GetNumElements;
    using B::IsEmpty;
    using B::GetNumRows;
    using B::GetNumCols;
    using B::SetOwnBuffer;
    using B::SetMatrixName;

    // Returns m_numRows * m_numCols * sizeof(ElemType), i.e. the byte size of the current rows x cols extent.
    size_t BufferSize() const { return m_numRows * m_numCols * sizeof(ElemType); }
    // Returns the raw element pointer m_pArray (mutable pointer even though the method is const).
    ElemType* BufferPointer() const { return m_pArray; }

    // Column slicing / strided column copy.
    CPUMatrix ColumnSlice(size_t startColumn, size_t numCols) const;
    CPUMatrix& AssignColumnSlice(const CPUMatrix& fromMatrix, size_t startColumn, size_t numCols);
    CPUMatrix& SetColumnSlice(const CPUMatrix& fromMatrix, size_t startColumn, size_t numCols);

    void CopyColumnsStrided(const CPUMatrix& fromMatrix, size_t numCols, size_t srcNumColsStride, size_t destNumColsStride);

    CPUMatrix Diagonal() const;

    // Gradient-update routines (Adagrad / FSAdagrad / RmsProp); semantics are defined out of line.
    ElemType Adagrad(CPUMatrix& gradients, const bool needAveMultiplier);
    void FSAdagrad(CPUMatrix& gradients, CPUMatrix& functionValues, ElemType learnRatePerSample, ElemType momentum, ElemType adaWeight, ElemType adaMul);
    ElemType RmsProp(CPUMatrix& gradients,
                     ElemType RMS_GAMMA,
                     ElemType RMS_WGT_INC,
                     ElemType RMS_WGT_MAX,
                     ElemType RMS_WGT_DEC,
                     ElemType RMS_WGT_MIN,
                     const bool needAveMultiplier);

    void Reshape(const size_t numRows, const size_t numCols);
    void Resize(const size_t numRows, const size_t numCols, bool growOnly = true); // by default we only reallocate if need to grow
    ElemType* CopyToArray() const;                                              // allocated by the callee but need to be deleted by the caller
    size_t CopyToArray(ElemType*& arrayCopyTo, size_t& currentArraySize) const; // allocated by the callee but need to be deleted by the caller
    void CopySection(size_t numRows, size_t numCols, ElemType* dst, size_t colStride) const;

    // Element access: indexes m_pArray at the offset computed by LocateElement(row, col).
    inline ElemType& operator()(const size_t row, const size_t col)
    {
        return m_pArray[LocateElement(row, col)];
    }
    inline const ElemType& operator()(const size_t row, const size_t col) const
    {
        return m_pArray[LocateElement(row, col)];
    }
    // Returns m_pArray[0] by value; no emptiness check is performed here.
    inline ElemType Get00Element() const
    {
        return m_pArray[0];
    }

    // SetValue / SetColumn / SetDiagonalValue overloads.
    void SetValue(const ElemType v);
    void SetValue(const CPUMatrix& deepCopyFrom);
    void SetValue(const size_t numRows, const size_t numCols, ElemType* pArray, size_t matrixFlags = matrixFlagNormal);

    void MaskColumnsValue(const CPUMatrix& columnsMask, ElemType val);

    void SetColumn(const ElemType* colPointer, size_t colInd);
    void SetColumn(const CPUMatrix& valMat, size_t colInd);
    void SetColumn(const ElemType val, size_t j);

    void SetDiagonalValue(const ElemType v);
    void SetDiagonalValue(const CPUMatrix& vector);

    // Random initialisation; seed defaults to USE_TIME_BASED_SEED.
    void SetUniformRandomValue(const ElemType low, const ElemType high, unsigned long seed = USE_TIME_BASED_SEED);
    void SetGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed = USE_TIME_BASED_SEED);
    void SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, unsigned long seed = USE_TIME_BASED_SEED);
    void AddGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed = USE_TIME_BASED_SEED);

    // NOTE(review): Transpose() is declared non-const although it returns a new matrix by value -- confirm intent.
    CPUMatrix Transpose();
    CPUMatrix& AssignTransposeOf(const CPUMatrix& a);

    // operator+ / operator+= overloads (scalar and matrix operands) and AssignSumOf.
    CPUMatrix& operator+=(const ElemType alpha);
    CPUMatrix operator+(const ElemType alpha) const;
    CPUMatrix& AssignSumOf(const ElemType alpha, const CPUMatrix& a);

    CPUMatrix& operator+=(const CPUMatrix& a);
    CPUMatrix operator+(const CPUMatrix& a) const;
    CPUMatrix& AssignSumOf(const CPUMatrix& a, const CPUMatrix& b);

    // operator- / operator-= overloads with a scalar operand and AssignDifferenceOf (both operand orders).
    CPUMatrix& operator-=(const ElemType alpha);
    CPUMatrix operator-(const ElemType alpha) const;
    CPUMatrix& AssignDifferenceOf(const ElemType alpha, const CPUMatrix& a);
    CPUMatrix& AssignDifferenceOf(const CPUMatrix& a, const ElemType alpha);
// operator- / operator-= overloads with a matrix operand and the matrix-matrix AssignDifferenceOf.
    CPUMatrix& operator-=(const CPUMatrix& a);
    CPUMatrix operator-(const CPUMatrix& a) const;
    CPUMatrix& AssignDifferenceOf(const CPUMatrix& a, const CPUMatrix& b);

    // operator* / operator*= overloads and AssignProductOf (scalar and matrix operands).
    CPUMatrix& operator*=(const ElemType alpha);
    CPUMatrix operator*(const ElemType alpha) const;
    CPUMatrix& AssignProductOf(const ElemType alpha, const CPUMatrix& a);

    CPUMatrix operator*(const CPUMatrix& a) const;
    CPUMatrix& AssignProductOf(const CPUMatrix& a, const bool transposeA, const CPUMatrix& b, const bool transposeB);

    CPUMatrix& operator/=(ElemType alpha);
    CPUMatrix operator/(ElemType alpha) const;

    CPUMatrix& operator^=(ElemType alpha);     // element-wise power
    CPUMatrix operator^(ElemType alpha) const; // element-wise power
    CPUMatrix& AssignElementPowerOf(const CPUMatrix& a, const ElemType power);

    // Element / row / column multiply and divide members.
    CPUMatrix& ElementMultiplyWith(const CPUMatrix& a);
    CPUMatrix& AssignElementProductOf(const CPUMatrix& a, const CPUMatrix& b);
    CPUMatrix& AddElementProductOf(const CPUMatrix& a, const CPUMatrix& b);

    CPUMatrix& AssignElementDivisionOf(const CPUMatrix& a, const CPUMatrix& b);
    CPUMatrix& ElementDivideBy(const CPUMatrix& a);

    CPUMatrix& ColumnElementMultiplyWith(const CPUMatrix& a);
    CPUMatrix& RowElementMultiplyWith(const CPUMatrix& a);

    CPUMatrix& ColumnElementDivideBy(const CPUMatrix& a);
    CPUMatrix& RowElementDivideBy(const CPUMatrix& a);

    CPUMatrix& ElementInverse();
    CPUMatrix& AssignElementInverseOf(const CPUMatrix& a);

    // Inplace*/Assign*Of pairs: each function comes as a no-argument Inplace form and an
    // Assign form taking a source matrix a; all return CPUMatrix&.
    CPUMatrix& InplaceSigmoid();
    CPUMatrix& AssignSigmoidOf(const CPUMatrix& a);

    CPUMatrix& InplaceLinearRectifierDerivative();
    CPUMatrix& AssignLinearRectifierDerivativeOf(const CPUMatrix& a);

    CPUMatrix& InplaceSigmoidDerivative();
    CPUMatrix& AssignSigmoidDerivativeOf(const CPUMatrix& a);

    CPUMatrix& InplaceTanh();
    CPUMatrix& AssignTanhOf(const CPUMatrix& a);

    CPUMatrix& InplaceLogSoftmax(const bool isColWise);
    CPUMatrix& AssignLogSoftmaxOf(const CPUMatrix& a, const bool isColWise);

    CPUMatrix& InplaceHardmax(const bool isColWise);
    CPUMatrix& AssignHardmaxOf(const CPUMatrix& a, const bool isColWise);

    // sequence training
    CPUMatrix& DropFrame(const CPUMatrix& label, const CPUMatrix& gamma, const ElemType& threshhold);
    CPUMatrix& AssignSequenceError(const ElemType hsmoothingWeight, const CPUMatrix& label, const CPUMatrix& dnnoutput, const CPUMatrix& gamma, ElemType alpha);

    CPUMatrix& InplaceSqrt();
    CPUMatrix& AssignSqrtOf(const CPUMatrix& a);

    CPUMatrix& InplaceExp();
    CPUMatrix& AssignExpOf(const CPUMatrix& a);

    CPUMatrix& InplaceLog();
    CPUMatrix& AssignLogOf(const CPUMatrix& a);

    CPUMatrix& InplaceLog10();
    CPUMatrix& AssignLog10Of(const CPUMatrix& a);

    CPUMatrix& InplaceCosine();
    CPUMatrix& AssignCosineOf(const CPUMatrix& a);

    CPUMatrix& InplaceNegativeSine();
    CPUMatrix& AssignNegativeSineOf(const CPUMatrix& a);

    CPUMatrix& InplaceAbs();
    CPUMatrix& AssignAbsOf(const CPUMatrix& a);

    // Threshold-based members.
    CPUMatrix& InplaceTruncateBottom(const ElemType threshold);
    CPUMatrix& AssignTruncateBottomOf(const CPUMatrix& a, const ElemType threshold);
    CPUMatrix& InplaceTruncateTop(const ElemType threshold);
    CPUMatrix& AssignTruncateTopOf(const CPUMatrix& a, const ElemType threshold);
    CPUMatrix& InplaceTruncate(const ElemType threshold);
    CPUMatrix& InplaceSoftThreshold(const ElemType threshold);

    CPUMatrix& SetToZeroIfAbsLessThan(const ElemType threshold);

    ElemType SumOfAbsElements() const; // sum of all abs(elements)
    ElemType SumOfElements() const;    // sum of all elements
    CPUMatrix& AssignSumOfElements(const CPUMatrix& a);

    // The default threshold literal 1e-8 is a double; it is converted to ElemType at the call.
    bool IsEqualTo(const CPUMatrix& a, const ElemType threshold = 1e-8) const;

    static void VectorSum(const CPUMatrix& a, CPUMatrix& c, const bool isColWise);

    void VectorNorm1(CPUMatrix& c, const bool isColWise) const;
    CPUMatrix& AssignVectorNorm1Of(CPUMatrix& a, const bool isColWise);

    void VectorNorm2(CPUMatrix& c, const bool isColWise) const;
    CPUMatrix& AssignVectorNorm2Of(CPUMatrix& a, const bool isColWise);

    // Noise-contrastive-estimation (NCE) related members.
    void AssignNoiseContrastiveEstimation(const CPUMatrix& a, const CPUMatrix& b, const CPUMatrix& bias,
                                          CPUMatrix& tmp, CPUMatrix& c);

    void AssignSoftmaxSum(const CPUMatrix& a, CPUMatrix& softmax);

    void AssignNCEUnnormalizedEval(const CPUMatrix& a,
                                   const CPUMatrix& b, const CPUMatrix& bias, CPUMatrix& c);

    CPUMatrix& AssignNCEDerivative(const CPUMatrix& tmp, const CPUMatrix& a, const CPUMatrix& b, size_t inputIndex, CPUMatrix& c);

    void VectorNormInf(CPUMatrix& c, const bool isColWise) const;
    CPUMatrix& AssignVectorNormInfOf(CPUMatrix& a, const bool isColWise);

    CPUMatrix& AssignInnerProductOf(const CPUMatrix& a, const CPUMatrix& b, const bool isColWise);
    CPUMatrix& AssignKhatriRaoProductOf(const CPUMatrix& a, const CPUMatrix& b);
    CPUMatrix& AddColumnReshapeProductOf(const CPUMatrix& a, const CPUMatrix& b, const bool transposeAColumn);

    CPUMatrix& AddWithScaleOf(ElemType alpha, const CPUMatrix& a);

    // Whole-matrix norms.
    ElemType FrobeniusNorm() const;
    CPUMatrix& AssignFrobeniusNormOf(const CPUMatrix& a);

    ElemType MatrixNormInf() const;
    ElemType MatrixNorm1() const;
    ElemType MatrixNorm0() const; // number of non-zero elements
    CPUMatrix& AssignSignOf(const CPUMatrix& a);
    CPUMatrix& AddSignOf(const CPUMatrix& a);

    // Row-slice / repeat members.
    CPUMatrix& AssignRowSliceValuesOf(const CPUMatrix& a, const size_t startIndex, const size_t numRows);
    CPUMatrix& AddToRowSliceValuesOf(const CPUMatrix& a, const size_t startIndex, const size_t numRows);
    CPUMatrix& AddWithRowSliceValuesOf(const CPUMatrix& a, const size_t startIndex, const size_t numRows);
    // CPUMatrix& AssignRowStackValuesOf(const std::vector*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols);

    CPUMatrix& AssignToRowSliceValuesOf(const CPUMatrix& a, const size_t startIndex, const size_t numRows);

    CPUMatrix& AssignRepeatOf(const CPUMatrix& a, const size_t numRowRepeats, const size_t numColRepeats);
    CPUMatrix& AddToRowRepeatValuesOf(const CPUMatrix& a, const size_t numRowRepeats);

    CPUMatrix& AssignPositiveAndShiftedNegSample(const CPUMatrix& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);
    CPUMatrix& AddFoldedPositiveAndShiftedNegSample(const CPUMatrix& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);

    // VectorMax takes an extra topK argument (default 1); VectorMin has no such parameter.
    void VectorMax(CPUMatrix& maxIndexes, CPUMatrix& maxValues, const bool isColWise, int topK = 1) const;
    void VectorMin(CPUMatrix& minIndexes, CPUMatrix& minValues, const bool isColWise) const;

    CPUMatrix& AssignNumOfDiff(const CPUMatrix& a, const CPUMatrix& b, bool searchInCol = false);

    void Print(const char* matrixName, size_t rowStart, size_t rowEnd, size_t colStart, size_t colEnd) const;
    void Print(const char* matrixName = nullptr) const; // print whole matrix. can be expensive

    void ReadFromFile(FILE* f, const char* matrixName); // matrixName is used to verify that correct matrix is read.
    // NOTE(review): the comment below is identical to the one on ReadFromFile and talks about
    // reading; how matrixName is used when writing is not visible here -- TODO confirm and reword.
    void WriteToFile(FILE* f, const char* matrixName);  // matrixName is used to verify that correct matrix is read.

    // Convolution packing / unpacking. Note that UnpackConvolutionInput is const and takes
    // inputSubBatch by non-const reference.
    CPUMatrix& AssignPackedConvolutionInput(const CPUMatrix& inputSubBatch,
                                            const size_t inputWidth, const size_t inputHeight, const size_t inputChannels,
                                            const size_t outputWidth, const size_t outputHeight, const size_t outputChannels,
                                            const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample,
                                            const bool zeroPadding = false);
    CPUMatrix& UnpackConvolutionInput(CPUMatrix& inputSubBatch,
                                      const size_t inputWidth, const size_t inputHeight, const size_t inputChannels,
                                      const size_t outputWidth, const size_t outputHeight, const size_t outputChannels,
                                      const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample,
                                      const bool zeroPadding = false) const;

    // Max / average pooling: forward result and gradient accumulation.
    CPUMatrix& AssignMaxPoolingResult(const CPUMatrix& inputBatch, const size_t channels,
                                      const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
                                      const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
                                      const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
    CPUMatrix& AddMaxPoolingGradient(const CPUMatrix& outputGradientBatch, const CPUMatrix& inputBatch, const CPUMatrix& outputBatch,
                                     const size_t channels,
                                     const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
                                     const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
                                     const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
    CPUMatrix& AssignAveragePoolingResult(const CPUMatrix& inputBatch, const size_t channels,
                                          const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
                                          const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
                                          const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
    CPUMatrix& AddAveragePoolingGradient(const CPUMatrix& outputGradientBatch,
                                         const size_t channels,
                                         const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
                                         const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
                                         const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);

public:
    // note: this does not depend on ElemType (the signature uses only int), i.e. you can call it on any instantiation
    static int SetNumThreads(int numThreads);

    // static BLAS functions
    static void SVD(const CPUMatrix& A, CPUMatrix& SIGMA, CPUMatrix& U, CPUMatrix& VT, CPUMatrix& W);

    static void MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix& a, const bool transposeA, const CPUMatrix& b, const bool transposeB, ElemType beta, CPUMatrix& c);
    static void MultiplyAndAdd(const CPUMatrix& a, const bool transposeA, const CPUMatrix& b, const bool transposeB, CPUMatrix& c);
    static void Multiply(const CPUMatrix& a, const bool transposeA, const CPUMatrix& b, const bool transposeB, CPUMatrix& c);
    static void Multiply(const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& c);
    static void Multiply1x1AndWeightedAdd(ElemType alpha, const CPUMatrix& a, const CPUMatrix& b, ElemType beta, CPUMatrix& c);

    static void ScaleAndAdd(ElemType alpha, const CPUMatrix& a, CPUMatrix& c);
    static void AddScaledDifference(const ElemType alpha, const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& c);
    static void AssignScaledDifference(const ElemType alpha, const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& c);
    static void AddScaledDifference(const CPUMatrix& alpha, const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& c);    // alpha must be 1X1
    static void AssignScaledDifference(const CPUMatrix& alpha, const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& c); // alpha must be 1X1

    static void AddElementToElement(const CPUMatrix& a, const size_t ai, const size_t aj, CPUMatrix& c, const size_t ci, const size_t cj);
    // static void AddLogElementToElement(const CPUMatrix& a, const size_t ai, const size_t aj, CPUMatrix& c, const size_t ci, const size_t cj);
    static void AssignElementToElement(const CPUMatrix& a, const size_t ai, const size_t aj, CPUMatrix& c, const size_t ci, const size_t cj);

    static void MinusOneAt(CPUMatrix& c, const size_t position);

    static void Scale(ElemType alpha, CPUMatrix& a);
    // NOTE(review): alpha is taken by value here, unlike the const-reference 1x1 alpha overloads above -- confirm intent.
    static void Scale(CPUMatrix alpha, CPUMatrix& a); // In this case Matrix alpha must be 1x1
    static void Scale(ElemType alpha, const CPUMatrix& a, CPUMatrix& c);
    static void InnerProduct(const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& c, const bool isColWise);
    static ElemType InnerProductOfMatrices(const CPUMatrix& a, const CPUMatrix& b);
    static void ElementWisePower(ElemType alpha, const CPUMatrix& a, CPUMatrix& c);

    static bool AreEqual(const CPUMatrix& a, const CPUMatrix& b, const ElemType threshold = 1e-8);

    static void TensorShuffleScaleAndAdd(ElemType keepWeight, const CPUMatrix& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const CPUMatrix& b, CPUMatrix& c);

    // TensorOp overloads taking one, two and three input matrices; the stride arrays have 2, 3 and 4
    // entries respectively.
    // NOTE(review): the template arguments of std::array / SmallVector appear to be missing in this
    // copy of the file (e.g. "std::array, 2>") -- TODO restore from version control.
    void TensorOp(ElemType beta, const CPUMatrix& a, ElemType alpha, ElementWiseOperator op,
                  const std::array& offsets,
                  const SmallVector& regularOpDims, const std::array, 2>& regularStrides,
                  const SmallVector& reducingOpDims, const std::array, 2>& reducingStrides);
    void TensorOp(ElemType beta, const CPUMatrix& a, const CPUMatrix& b, ElemType alpha, ElementWiseOperator op,
                  const std::array& offsets,
                  const SmallVector& regularOpDims, const std::array, 3>& regularStrides,
                  const SmallVector& reducingOpDims, const std::array, 3>& reducingStrides);
    void TensorOp(ElemType beta, const CPUMatrix& a, const CPUMatrix& b, const CPUMatrix& c, ElemType alpha, ElementWiseOperator op,
                  const std::array& offsets,
                  const SmallVector& regularOpDims, const std::array, 4>& regularStrides,
                  const SmallVector& reducingOpDims, const std::array, 4>& reducingStrides);

    // Static factories returning a matrix by value.
    static CPUMatrix Ones(const size_t rows, const size_t cols);
    static CPUMatrix Zeros(const size_t rows, const size_t cols);
    static CPUMatrix Eye(const size_t rows);
    static CPUMatrix RandomUniform(const size_t rows, const size_t cols, const ElemType low, const ElemType high, unsigned long seed = USE_TIME_BASED_SEED);
    static CPUMatrix RandomGaussian(const size_t rows, const size_t cols, const ElemType mean, const ElemType sigma, unsigned long seed = USE_TIME_BASED_SEED);

    // return true if v is an element in matrix a
    static bool HasElement(const CPUMatrix& a, const ElemType v = 0.0);
public: CPUMatrix& AssignElementProductOfWithShiftNeg(const CPUMatrix& a, const CPUMatrix& b, size_t shift, size_t negnumber); static void InnerProductWithShiftNeg(const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& c, const bool isColWise, size_t shift, size_t negnumber); // extract out a row from a, assign it to [this]. CPUMatrix& GetARowByIndex(const CPUMatrix& a, const size_t index); static void ConductRowElementMultiplyWithShift(const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& c, const size_t shift, bool bFirstmatrixfixed); CPUMatrix& AssignElementProductOfWithShift(const CPUMatrix& a, const CPUMatrix& b, const size_t shift); public: friend File& operator>>(File& stream, CPUMatrix& us) { stream.GetMarker(fileMarkerBeginSection, std::wstring(L"BMAT")); size_t elsize; stream >> elsize; if (sizeof(ElemType) != elsize) RuntimeError("Template argument size doesn't match those in file"); std::wstring matrixName; size_t numRows, numCols; int format; stream >> matrixName >> format >> numRows >> numCols; ElemType* d_array = new ElemType[numRows * numCols]; for (size_t i = 0; i < numRows * numCols; ++i) stream >> d_array[i]; stream.GetMarker(fileMarkerEndSection, std::wstring(L"EMAT")); us.SetValue(numRows, numCols, d_array, matrixFlagNormal); if (us.m_matrixName) delete[] us.m_matrixName; us.m_matrixName = new wchar_t[matrixName.length() + 1]; wmemcpy(us.m_matrixName, matrixName.c_str(), matrixName.length() + 1); delete[] d_array; return stream; } friend File& operator<<(File& stream, const CPUMatrix& us) { stream.PutMarker(fileMarkerBeginSection, std::wstring(L"BMAT")); stream << sizeof(ElemType); std::wstring s = (us.m_matrixName == NULL) ? 
std::wstring(L"unnamed") : std::wstring(us.m_matrixName); int format = us.m_format; stream << s << format; stream << us.m_numRows << us.m_numCols; for (size_t i = 0; i < us.GetNumElements(); ++i) stream << us.m_pArray[i]; stream.PutMarker(fileMarkerEndSection, std::wstring(L"EMAT")); return stream; } public: ElemType LogAddSumOfElements() const; public: // for RCRF static void RCRFBackwardCompute(const CPUMatrix& alpha, CPUMatrix& beta, const CPUMatrix& lbls, const CPUMatrix& pair_scores); static void _rcrfBackwardCompute(size_t t, size_t k, const CPUMatrix& alpha, CPUMatrix& beta, const CPUMatrix& pair_scores); static void RCRFTransGrdCompute(const CPUMatrix& lbls, const CPUMatrix& alpha, const CPUMatrix& beta, const CPUMatrix& pair_scores, CPUMatrix& grd); static void _rcrfTransGrdCompute(size_t i, const CPUMatrix& lbls, const CPUMatrix& alpha, const CPUMatrix& beta, const CPUMatrix& pair_scores, CPUMatrix& grd, const size_t tPos // position ); protected: size_t LocateElement(const size_t i, const size_t j) const; size_t LocateColumn(const size_t j) const; private: void ZeroInit(); // should only be used by constructors. void Clear(); }; typedef CPUMatrix CPUSingleMatrix; typedef CPUMatrix CPUDoubleMatrix; } } }