https://github.com/Microsoft/CNTK
Raw File
Tip revision: 2003303aba820a91be4b2db9788d15db14cff6dd authored by Frank Seide on 08 September 2020, 19:38:44 UTC
some updates, Marian-related
Tip revision: 2003303
CPUMatrix.h
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "Basics.h" // for RuntimeError()
#include "Matrix.h"
#include "File.h"
#include "Helpers.h"
#include "CommonMatrix.h"
#include "CPURNGHandle.h"
#include <vector>
#include <stdio.h>
#include <ctime>
#include <limits.h>
#include "QuantizedOperations.h"

//#include "GPUMatrix.h"
//#include "CPUSparseMatrix.h"
//#include "GPUSparseMatrix.h"

// NOTE NOTE NOTE:
// use CPUSingleMatrix and CPUDoubleMatrix instead of using the template directly
///////////////////////////////////////////////

// This class is exported from the Math.dll
namespace Microsoft { namespace MSR { namespace CNTK {

double logadd(double x, double y);

// To comply with BLAS libraries matrices are stored in ColMajor. However, by default C/C++/C# use RowMajor
// conversion is need when passing data between CPUMatrix and C++ matrices
template <class ElemType>
class MATH_API CPUMatrix : public BaseMatrix<ElemType>
{
    typedef BaseMatrix<ElemType> Base;
    using Base::m_numRows;
    using Base::m_numCols;
    using Base::m_sliceViewOffset;
    using Base::HasExternalBuffer;
    using Base::ReleaseExternalBuffer;
    using Base::SetBuffer;
    using Base::SetComputeDeviceId;
    using Base::SetSizeAllocated;
    using Base::GetSizeAllocated;
    using Base::ZeroInit;
    using Base::ZeroValues;
    using Base::m_sob;
    using Base::ShallowCopyFrom;
    using Base::VerifyResizable;

public:
    using Base::VerifyWritable;
    using Base::GetComputeDeviceId;
    using Base::Buffer;
    using Base::GetNumRows;
    using Base::GetNumCols;
    using Base::GetNumElements;
    using Base::OwnBuffer;
    using Base::GetFormat;
    using Base::SetFormat;
    using Base::IsEmpty;
    using Base::VerifySize;

public:
    CPUMatrix();
    CPUMatrix(const CPUMatrix<ElemType>& shallowCopyFrom, bool shallow);     // copy constructor, shallow
    CPUMatrix(const size_t numRows, const size_t numCols);
    CPUMatrix(const size_t numRows, const size_t numCols, ElemType* pArray, const size_t matrixFlags = matrixFlagNormal, IBaseMatrixStorageExternalBufferDeleter* deleter = nullptr);
    CPUMatrix(const CPUMatrix<ElemType>& deepCopyFrom);                      // copy constructor, deep copy
    CPUMatrix<ElemType>& operator=(const CPUMatrix<ElemType>& deepCopyFrom); // assignment operator, deep copy
    CPUMatrix(CPUMatrix<ElemType>&& moveFrom);                               // move constructor, shallow copy
    CPUMatrix<ElemType>& operator=(CPUMatrix<ElemType>&& moveFrom);          // move assignment operator, shallow copy

public:
    size_t BufferSize() const
    {
        return m_numRows * m_numCols * sizeof(ElemType);
    }

    // Returns pointer into underlying data buffer corresponding to slice-view. This makes it different from method Buffer()
    ElemType* Data() const
    {
        return Buffer() + m_sliceViewOffset;
    }

    CPUMatrix<ElemType> ColumnSlice(size_t startColumn, size_t numCols, size_t sourceNumCols) const;
    CPUMatrix<ElemType>& AssignColumnSlice(const CPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols);
    CPUMatrix<ElemType>& SetColumnSlice(const CPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols);

    void CopyColumnsStrided(const CPUMatrix<ElemType>& fromMatrix, size_t numCols, size_t srcNumColsStride, size_t destNumColsStride);

    CPUMatrix<ElemType> Diagonal() const;

    ElemType Adagrad(CPUMatrix<ElemType>& gradients, const bool needAveMultiplier);
    
    void FSAdagrad(CPUMatrix<ElemType>& gradients, CPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample, 
                   ElemType momentum, ElemType adaWeight, ElemType adaMul, ElemType unitGainFactor);

    void Adam(CPUMatrix<ElemType>& gradients, CPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample,
              ElemType momentum, ElemType adaWeight, ElemType adaMul, ElemType epsilon, ElemType unitGainFactor, bool adamax=false);

    ElemType RmsProp(CPUMatrix<ElemType>& gradients,
                     ElemType RMS_GAMMA,
                     ElemType RMS_WGT_INC,
                     ElemType RMS_WGT_MAX,
                     ElemType RMS_WGT_DEC,
                     ElemType RMS_WGT_MIN,
                     const bool needAveMultiplier,
                     const bool initialized);

    void AdaDelta(CPUMatrix<ElemType>& gradients, CPUMatrix<ElemType>& functionValues, ElemType learningRate, ElemType rho, ElemType epsilon);

    void Reshape(const size_t numRows, const size_t numCols);


    // RequireSize is now the new preferred method of ensuring the correct size inside of the Matrix class. Since Resize will fail if the storage object has
    // multiple views, RequireSize will first check to see if Resize is required. If it is not, then it short-circuits and is a noop. Otherwise, RequireSize
    // will call Resize, which may fail if the matrix has multiple views.
    void RequireSize(const size_t numRows, const size_t numCols, bool growOnly = true); // by default we only reallocate if need to grow
    // Resize first checks to ensure that the caller has the authority to call Resize (i.e., it checks to ensure the underlying data is owned by only this matrix), and then
    // actually resizes the underlying matrix, doing any allocation as required.
    void Resize(const size_t numRows, const size_t numCols, bool growOnly = true); // by default we only reallocate if need to grow


    ElemType* CopyToArray() const;                                                 // allocated by the callee but need to be deleted by the caller
    size_t CopyToArray(ElemType*& arrayCopyTo, size_t& currentArraySize) const;    // allocated by the callee but need to be deleted by the caller
    void CopySection(size_t numRows, size_t numCols, ElemType* dst, size_t colStride) const; // copy GPU or CPU matrix data to a CPU-side rectangular buffer

    inline ElemType& operator()(const size_t row, const size_t col)
    {
        return Data()[LocateElement(row, col)];
    }
    inline const ElemType& operator()(const size_t row, const size_t col) const
    {
        return Data()[LocateElement(row, col)];
    }
    inline ElemType Get00Element() const
    {
        return operator()(0, 0);
    }

    void SetValue(const ElemType v);
    void SetValue(const CPUMatrix<ElemType>& deepCopyFrom);
    //void SetValue(const GPUMatrix<ElemType>& deepCopyFrom);
    //void SetValue(const CPUSparseMatrix<ElemType>& deepCopyFrom);
    //void SetValue(const GPUSparseMatrix<ElemType>& deepCopyFrom);
    void SetValue(const size_t numRows, const size_t numCols, ElemType* pArray, size_t matrixFlags = matrixFlagNormal, IBaseMatrixStorageExternalBufferDeleter* deleter = nullptr);

    void MaskColumnsValue(const CPUMatrix<char>& columnsMask, ElemType val, size_t numColsPerMaskEntry);

    void SetColumn(const ElemType* colPointer, size_t colInd);
    void SetColumn(const CPUMatrix<ElemType>& valMat, size_t colInd);
    void SetColumn(const ElemType val, size_t j);

    void AssignValues(const double* data, size_t size);

    void SetDiagonalValue(const ElemType v);
    void SetDiagonalValue(const CPUMatrix<ElemType>& vector);
    void SetUniformRandomValue(const ElemType low, const ElemType high, unsigned long seed = USE_TIME_BASED_SEED);
    void SetUniformRandomValue(RNGHandle& rngHandle, const ElemType low, const ElemType high);
    void SetGaussianRandomValue(RNGHandle& rngHandle, const ElemType mean, const ElemType stdev);
    void SetGumbelRandomValue(RNGHandle& rngHandle, const ElemType loc, const ElemType scale);
    void SetGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed = USE_TIME_BASED_SEED);
    void SetTruncatedNormalRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed = USE_TIME_BASED_SEED);
    void SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, RNGHandle& rngHandle);
    void AddGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed = USE_TIME_BASED_SEED);

    CPUMatrix<ElemType> Transpose();
    CPUMatrix<ElemType>& AssignTransposeOf(const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>& DoGatherColumnsOf (ElemType beta, const CPUMatrix<ElemType>& idx, const CPUMatrix<ElemType>& a, ElemType alpha);
    template <class ElemType2>
    CPUMatrix<ElemType>& DoScatterColumnsOf(ElemType beta, const CPUMatrix<ElemType2>& idx, const CPUMatrix<ElemType>& a, ElemType alpha);

    CPUMatrix<ElemType>& operator+=(const ElemType alpha);
    CPUMatrix<ElemType>  operator+(const ElemType alpha) const;
    CPUMatrix<ElemType>& AssignSumOf(const ElemType alpha, const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>& operator+=(const CPUMatrix<ElemType>& a);
    CPUMatrix<ElemType>  operator+(const CPUMatrix<ElemType>& a) const;
    CPUMatrix<ElemType>& AssignSumOf(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b);

    CPUMatrix<ElemType>& operator-=(const ElemType alpha);
    CPUMatrix<ElemType>  operator-(const ElemType alpha) const;
    CPUMatrix<ElemType>& AssignDifferenceOf(const ElemType alpha, const CPUMatrix<ElemType>& a);
    CPUMatrix<ElemType>& AssignDifferenceOf(const CPUMatrix<ElemType>& a, const ElemType alpha);

    CPUMatrix<ElemType>& operator-=(const CPUMatrix<ElemType>& a);
    CPUMatrix<ElemType>  operator-(const CPUMatrix<ElemType>& a) const;
    CPUMatrix<ElemType>& AssignDifferenceOf(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b);

    CPUMatrix<ElemType>& operator*=(const ElemType alpha);
    CPUMatrix<ElemType>  operator*(const ElemType alpha) const;
    CPUMatrix<ElemType>& AssignProductOf(const ElemType alpha, const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>  operator*(const CPUMatrix<ElemType>& a) const;
    CPUMatrix<ElemType>& AssignProductOf(const CPUMatrix<ElemType>& a, const bool transposeA, const CPUMatrix<ElemType>& b, const bool transposeB);

    CPUMatrix<ElemType>& operator/=(ElemType alpha);
    CPUMatrix<ElemType>  operator/(ElemType alpha) const;

    CPUMatrix<ElemType>& operator^=(ElemType alpha);     // element-wise power
    CPUMatrix<ElemType>  operator^(ElemType alpha) const; // element-wise power
    CPUMatrix<ElemType>& AssignElementPowerOf(const CPUMatrix<ElemType>& a, const ElemType power);

    CPUMatrix<ElemType>& ElementMultiplyWith(const CPUMatrix<ElemType>& a);
    CPUMatrix<ElemType>& AssignElementProductOf(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b);
    CPUMatrix<ElemType>& AddElementProductOf(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b);

    CPUMatrix<ElemType>& AssignElementDivisionOf(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b);
    CPUMatrix<ElemType>& ElementDivideBy(const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>& ColumnElementMultiplyWith(const CPUMatrix<ElemType>& a);
    CPUMatrix<ElemType>& RowElementMultiplyWith(const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>& ColumnElementDivideBy(const CPUMatrix<ElemType>& a);
    CPUMatrix<ElemType>& RowElementDivideBy(const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>& ElementInverse();
    CPUMatrix<ElemType>& AssignElementInverseOf(const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>& InplaceSigmoid();
    CPUMatrix<ElemType>& AssignSigmoidOf(const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>& InplaceLinearRectifierDerivative();
    CPUMatrix<ElemType>& AssignLinearRectifierDerivativeOf(const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>& InplaceSigmoidDerivative();
    CPUMatrix<ElemType>& AssignSigmoidDerivativeOf(const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>& InplaceTanh();
    CPUMatrix<ElemType>& AssignTanhOf(const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>& InplaceLogSoftmax(const bool isColWise);
    CPUMatrix<ElemType>& AssignLogSoftmaxOf(const CPUMatrix<ElemType>& a, const bool isColWise);

    CPUMatrix<ElemType>& InplaceHardmax(const bool isColWise);
    CPUMatrix<ElemType>& AssignHardmaxOf(const CPUMatrix<ElemType>& a, const bool isColWise);

    // sequence training
    CPUMatrix<ElemType>& DropFrame(const CPUMatrix<ElemType>& label, const CPUMatrix<ElemType>& gamma, const ElemType& threshhold);
    CPUMatrix<ElemType>& AssignSequenceError(const ElemType hsmoothingWeight, const CPUMatrix<ElemType>& label, const CPUMatrix<ElemType>& dnnoutput, const CPUMatrix<ElemType>& gamma, ElemType alpha);
    CPUMatrix<ElemType>& AssignCTCScore(const CPUMatrix<ElemType>& prob, CPUMatrix<ElemType>& alpha, CPUMatrix<ElemType>& beta, const CPUMatrix<ElemType>& phoneSeq, const CPUMatrix<ElemType>& phoneBoundary, CPUMatrix<ElemType>& totalScore, const vector<size_t>& uttMap, const vector<size_t> & uttBeginFrame, const vector<size_t> & uttFrameNum, const vector<size_t> & uttPhoneNum, const size_t samplesInRecurrentStep, const size_t maxFrameNum, const size_t blankTokenId, const int delayConstraint, const bool isColWise);
    CPUMatrix<ElemType>& InplaceSqrt();
    CPUMatrix<ElemType>& AssignSqrtOf(const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>& InplaceExp();
    CPUMatrix<ElemType>& AssignExpOf(const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>& InplaceLog();
    CPUMatrix<ElemType>& AssignLogOf(const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>& InplaceLog10();
    CPUMatrix<ElemType>& AssignLog10Of(const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>& InplaceCosine();
    CPUMatrix<ElemType>& AssignCosineOf(const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>& InplaceNegativeSine();
    CPUMatrix<ElemType>& AssignNegativeSineOf(const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>& InplaceAcos();
    CPUMatrix<ElemType>& AssignAcosOf(const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>& InplaceAsin();
    CPUMatrix<ElemType>& AssignAsinOf(const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>& InplaceCosh();
    CPUMatrix<ElemType>& AssignCoshOf(const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>& InplaceSinh();
    CPUMatrix<ElemType>& AssignSinhOf(const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>& InplaceAbs();
    CPUMatrix<ElemType>& AssignAbsOf(const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>& InplaceTruncateBottom(const ElemType threshold);
    CPUMatrix<ElemType>& AssignTruncateBottomOf(const CPUMatrix<ElemType>& a, const ElemType threshold);
    CPUMatrix<ElemType>& InplaceTruncateTop(const ElemType threshold);
    CPUMatrix<ElemType>& AssignTruncateTopOf(const CPUMatrix<ElemType>& a, const ElemType threshold);
    CPUMatrix<ElemType>& InplaceTruncate(const ElemType threshold);
    CPUMatrix<ElemType>& InplaceSoftThreshold(const ElemType threshold);

    CPUMatrix<ElemType>& SetToZeroIfAbsLessThan(const ElemType threshold);

    ElemType SumOfAbsElements() const; // sum of all abs(elements)
    ElemType SumOfElements() const;    // sum of all elements
    CPUMatrix<ElemType>& AssignSumOfElements(const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>& AssignOneHot(const CPUMatrix<ElemType>& a, vector<size_t>& shape, size_t axis);
    CPUMatrix<ElemType>& GatherFromTarget(const CPUMatrix<ElemType>& indices, const CPUMatrix<ElemType>& target, size_t row_elements);
    CPUMatrix<ElemType>& ScatterToIndices(const CPUMatrix<ElemType>& values, const CPUMatrix<ElemType>& indices, size_t row_elements);

    bool IsEqualTo(const CPUMatrix<ElemType>& a, const ElemType threshold = 1e-8) const;

    static void VectorSum(const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& c, const bool isColWise);

    void VectorNorm1(CPUMatrix<ElemType>& c, const bool isColWise) const;
    CPUMatrix<ElemType>& AssignVectorNorm1Of(CPUMatrix<ElemType>& a, const bool isColWise);

    void VectorNorm2(CPUMatrix<ElemType>& c, const bool isColWise) const;
    CPUMatrix<ElemType>& AssignVectorNorm2Of(CPUMatrix<ElemType>& a, const bool isColWise);

    void AssignNoiseContrastiveEstimation(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& bias,
                                          CPUMatrix<ElemType>& tmp, CPUMatrix<ElemType>& c);

    void AssignSoftmaxSum(const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& softmax);

    void AssignNCEUnnormalizedEval(const CPUMatrix<ElemType>& a,
                                   const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& bias, CPUMatrix<ElemType>& c);

    CPUMatrix<ElemType>& AssignNCEDerivative(const CPUMatrix<ElemType>& tmp, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, size_t inputIndex, CPUMatrix<ElemType>& c);

    void VectorNormInf(CPUMatrix<ElemType>& c, const bool isColWise) const;
    CPUMatrix<ElemType>& AssignVectorNormInfOf(CPUMatrix<ElemType>& a, const bool isColWise);

    CPUMatrix<ElemType>& AssignInnerProductOf(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const bool isColWise);
    CPUMatrix<ElemType>& AssignKhatriRaoProductOf(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b);
    CPUMatrix<ElemType>& AddColumnReshapeProductOf(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const bool transposeAColumn);

    CPUMatrix<ElemType>& AddWithScaleOf(ElemType alpha, const CPUMatrix<ElemType>& a);

    ElemType FrobeniusNorm() const;
    CPUMatrix<ElemType>& AssignFrobeniusNormOf(const CPUMatrix<ElemType>& a);

    ElemType MatrixNormInf() const;
    ElemType MatrixNorm1() const;
    ElemType MatrixNorm0() const; // number of non-zero elemets
    CPUMatrix<ElemType>& AssignSignOf(const CPUMatrix<ElemType>& a);
    CPUMatrix<ElemType>& AddSignOf(const CPUMatrix<ElemType>& a);

    CPUMatrix<ElemType>& AssignRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);
    CPUMatrix<ElemType>& AddToRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);
    CPUMatrix<ElemType>& AddWithRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);
    // CPUMatrix<ElemType>&  AssignRowStackValuesOf(const std::vector<const CPUMatrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols);

    CPUMatrix<ElemType>& AssignToRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows);

    CPUMatrix<ElemType>& AssignRepeatOf(const CPUMatrix<ElemType>& a, const size_t numRowRepeats, const size_t numColRepeats);
    CPUMatrix<ElemType>& AddToRowRepeatValuesOf(const CPUMatrix<ElemType>& a, const size_t numRowRepeats);

    CPUMatrix<ElemType>& AssignPositiveAndShiftedNegSample(const CPUMatrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);
    CPUMatrix<ElemType>& AddFoldedPositiveAndShiftedNegSample(const CPUMatrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);

    void VectorMax(CPUMatrix<ElemType>& maxIndexes, CPUMatrix<ElemType>& maxValues, const bool isColWise, int topK = 1) const;
    void VectorMin(CPUMatrix<ElemType>& minIndexes, CPUMatrix<ElemType>& minValues, const bool isColWise) const;

    CPUMatrix<ElemType>& AssignNumOfDiff(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, bool searchInCol = false);

    void Print(const char* matrixName, ptrdiff_t rowStart, ptrdiff_t rowEnd, ptrdiff_t colStart, ptrdiff_t colEnd) const;
    void Print(const char* matrixName = nullptr) const; // print whole matrix. can be expensive

    void ReadFromFile(FILE* f, const char* matrixName); // matrixName is used to verify that correct matrix is read.
    void WriteToFile(FILE* f, const char* matrixName);  // matrixName is used to verify that correct matrix is read.

    CPUMatrix<ElemType>& AssignPackedConvolutionInput(const CPUMatrix<ElemType>& inputSubBatch,
                                                      const size_t inputWidth, const size_t inputHeight, const size_t inputChannels,
                                                      const size_t outputWidth, const size_t outputHeight, const size_t outputChannels,
                                                      const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample,
                                                      const bool zeroPadding = false);
    CPUMatrix<ElemType>& UnpackConvolutionInput(CPUMatrix<ElemType>& inputSubBatch,
                                                const size_t inputWidth, const size_t inputHeight, const size_t inputChannels,
                                                const size_t outputWidth, const size_t outputHeight, const size_t outputChannels,
                                                const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample,
                                                const bool zeroPadding = false) const;
    CPUMatrix<ElemType>& AssignMaxPoolingResult(const CPUMatrix<ElemType>& inputBatch, const size_t channels,
                                                const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
                                                const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
                                                const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
    CPUMatrix<ElemType>& AddMaxPoolingGradient(const CPUMatrix<ElemType>& outputGradientBatch, const CPUMatrix<ElemType>& inputBatch, const CPUMatrix<ElemType>& outputBatch,
                                               const size_t channels,
                                               const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
                                               const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
                                               const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
    CPUMatrix<ElemType>& AssignAveragePoolingResult(const CPUMatrix<ElemType>& inputBatch, const size_t channels,
                                                    const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
                                                    const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
                                                    const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
    CPUMatrix<ElemType>& AddAveragePoolingGradient(const CPUMatrix<ElemType>& outputGradientBatch,
                                                   const size_t channels,
                                                   const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
                                                   const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
                                                   const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);

    void ConvolutionForward(const CPUMatrix<ElemType>& kernel, const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIwht,
                            const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const;
    void ConvolutionBackwardData(const CPUMatrix<ElemType>& kernel, const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIwht,
                                 const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& grad) const;
    void ConvolutionBackwardKernel(const CPUMatrix<ElemType>& in, const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIwht,
                                   const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& kernelGrad) const;

    void UnrollConvolutionInput(size_t unrollCols, size_t mapOutSize, const CPUMatrix<int>& mpRowCol,
                                const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const;
    void UnrollConvolutionOutput(size_t unrollCols, size_t mapInCount, size_t mapOutCount, const CPUMatrix<int>& mpRowCol,
                                 const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const;
    void UnrollConvolutionInputForKernelBackprop(size_t mapOutSize, const CPUMatrix<int>& mpRowCol,
                                                 const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const;

    void MaxPoolingForward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& output) const;
    void MaxPoolingBackward(const CPUMatrix<ElemType>& out, const CPUMatrix<ElemType>& in,
                            const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices,
                            CPUMatrix<ElemType>& grad, bool accumulateGradient) const;

    void MaxROIPoolingForward(const size_t numRois, const size_t numImg, const size_t channels, const size_t width, const size_t height,
                              const size_t pooledWidth, const size_t pooledHeight, const CPUMatrix<ElemType>& roiData, CPUMatrix<ElemType>& output, CPUMatrix<ElemType>& argmax, double spatialScale) const;

    void MaxROIPoolingBackward(const size_t numRois, const size_t numImg, const size_t channels, const size_t width, const size_t height,
                               const size_t pooledWidth, const size_t pooledHeight, const CPUMatrix<ElemType>& roiData, CPUMatrix<ElemType>& grad, CPUMatrix<ElemType>& argmax, double spatialScale) const;

    void MaxUnpooling(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, const CPUMatrix<ElemType>& poolInput, CPUMatrix<ElemType>& input) const;

    void AveragePoolingForward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& output, const bool poolIncludePad) const;
    void AveragePoolingBackward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices,
                                CPUMatrix<ElemType>& grad, const bool poolIncludePad, bool accumulateGradient) const;

    void BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runVariance,
                                   CPUMatrix<ElemType>& out, double epsilon, CPUMatrix<ElemType>& saveMean, CPUMatrix<ElemType>& saveInvStdDev) const;
    void BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, double blendFactor, const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
                                    CPUMatrix<ElemType>& scaleGrad, CPUMatrix<ElemType>& biasGrad) const;

public:
    // This functions do not depend on <ElemType>, i.e. you can call them on any <ElemType>
    static int SetNumThreads(int numThreads);
    static int GetMaxNumThreads();

    static void SetCompatibleMode();

    // static BLAS functions
    static void SVD(const CPUMatrix<ElemType>& A, CPUMatrix<ElemType>& SIGMA, CPUMatrix<ElemType>& U, CPUMatrix<ElemType>& VT, CPUMatrix<ElemType>& W);

    static void MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix<ElemType>& a, const bool transposeA, const CPUMatrix<ElemType>& b, const bool transposeB, ElemType beta, CPUMatrix<ElemType>& c, shared_ptr<QuantizedMultiplier<ElemType>> pQuantizedMultiplier=nullptr);
    static void MultiplyAndAdd(const CPUMatrix<ElemType>& a, const bool transposeA, const CPUMatrix<ElemType>& b, const bool transposeB, CPUMatrix<ElemType>& c);
    static void Multiply(const CPUMatrix<ElemType>& a, const bool transposeA, const CPUMatrix<ElemType>& b, const bool transposeB, CPUMatrix<ElemType>& c);
    static void Multiply(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c);
    static void Multiply1x1AndWeightedAdd(ElemType alpha, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, ElemType beta, CPUMatrix<ElemType>& c);

    static void ColumnwiseScaleAndWeightedAdd(ElemType alpha, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& v, ElemType beta, CPUMatrix<ElemType>& c);

    static void ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& c);
    static void AddScaledDifference(const ElemType alpha, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c);
    static void AssignScaledDifference(const ElemType alpha, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c);
    static void AddScaledDifference(const CPUMatrix<ElemType>& alpha, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c);    // alpha must be 1X1
    static void AssignScaledDifference(const CPUMatrix<ElemType>& alpha, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c); // alpha must be 1X1

    static void AddElementToElement(ElemType beta, const CPUMatrix<ElemType>& a, const size_t ai, const size_t aj, CPUMatrix<ElemType>& c, const size_t ci, const size_t cj);

    static void MinusOneAt(CPUMatrix<ElemType>& c, const size_t position);

    static void Scale(ElemType alpha, CPUMatrix<ElemType>& a);
    static void Scale(CPUMatrix<ElemType> alpha, CPUMatrix<ElemType>& a); // In this case Matrix alpha must be 1x1
    static void Scale(ElemType alpha, const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& c);
    static void InnerProduct(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c, const bool isColWise);
    static ElemType InnerProductOfMatrices(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b);
    static void ElementWisePower(ElemType alpha, const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& c);

    static bool AreEqual(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const ElemType threshold = 1e-8);

    static void TensorShuffleScaleAndAdd(ElemType keepWeight, const CPUMatrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c);

    template <size_t N>
    static void TensorOp(size_t arity, const std::array<ElemType*, N>& pointers,
                         ElementWiseOperator op, ElementWiseOperator reductionOp, ElemType alpha, ElemType beta,
                         const std::array<size_t, N>& offsets,
                         const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, N>& regularStrides,
                         const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, N>& reducingStrides)
    {
        assert(pointers.size() == arity + 1); arity;
        TensorOp(beta, pointers, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
    }
private:
    // TODO: these are the same as TensorOp above except for 'arity' and different order. This is the result of some refactoring. Clean this up further some day.
    static void TensorOp(ElemType beta, const std::array<ElemType*, 1>& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                  const std::array<size_t, 1>& offsets,
                  const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 1>& regularStrides,
                  const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 1>& reducingStrides);
    static void TensorOp(ElemType beta, const std::array<ElemType*, 2>& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                  const std::array<size_t, 2>& offsets,
                  const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& regularStrides,
                  const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
    static void TensorOp(ElemType beta, const std::array<ElemType*, 3>& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                  const std::array<size_t, 3>& offsets,
                  const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 3>& regularStrides,
                  const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
    static void TensorOp(ElemType beta, const std::array<ElemType*, 4>& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                  const std::array<size_t, 4>& offsets,
                  const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 4>& regularStrides,
                  const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 4>& reducingStrides);
    static void TensorOp(ElemType beta, const std::array<ElemType*, 5>& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                  const std::array<size_t, 5>& offsets,
                  const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 5>& regularStrides,
                  const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 5>& reducingStrides);
public:

    int Argmin() const;
    int Argmax() const;
    int ArgOp(ElementWiseOperator reductionOp) const;

    // TODO: merge this with TensorOp() above
    static void TensorArgOp(std::array<ElemType*, 2> pointers, ElementWiseOperator reductionOp,
                            const std::array<size_t, 2>& offsets,
                            const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& regularStrides,
                            const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& reducingStrides);

    static CPUMatrix<ElemType> Ones(const size_t rows, const size_t cols);
    static CPUMatrix<ElemType> Zeros(const size_t rows, const size_t cols);
    static CPUMatrix<ElemType> Eye(const size_t rows);
    static CPUMatrix<ElemType> RandomUniform(const size_t rows, const size_t cols, const ElemType low, const ElemType high, unsigned long seed = USE_TIME_BASED_SEED);
    static CPUMatrix<ElemType> RandomGaussian(const size_t rows, const size_t cols, const ElemType mean, const ElemType sigma, unsigned long seed = USE_TIME_BASED_SEED);

    // return true if v is an element in matrix c
    static bool HasElement(const CPUMatrix<ElemType>& a, const ElemType v = 0.0);

public:
    CPUMatrix<ElemType>& AssignElementProductOfWithShiftNeg(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, size_t shift, size_t negnumber);
    static void InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c, const bool isColWise, size_t shift, size_t negnumber);
    // extract out a row from a, assign it to [this].
    CPUMatrix<ElemType>& GetARowByIndex(const CPUMatrix<ElemType>& a, const size_t index);
    static void ConductRowElementMultiplyWithShift(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c, const size_t shift, bool bFirstmatrixfixed);
    CPUMatrix<ElemType>& AssignElementProductOfWithShift(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const size_t shift);

public:
    friend File& operator>>(File& stream, CPUMatrix<ElemType>& us)
    {
        stream.GetMarker(fileMarkerBeginSection, std::wstring(L"BMAT"));
        size_t elsize;
        stream >> elsize;
        if (sizeof(ElemType) != elsize)
            RuntimeError("Template argument size doesn't match those in file");
        std::wstring matrixName;
        size_t numRows, numCols;
        int format;
        stream >> matrixName >> format >> numRows >> numCols;
        ElemType* d_array = new ElemType[numRows * numCols];
        for (size_t i = 0; i < numRows * numCols; ++i)
            stream >> d_array[i];
        stream.GetMarker(fileMarkerEndSection, std::wstring(L"EMAT"));
        us.SetValue(numRows, numCols, d_array, matrixFlagNormal);

        delete[] d_array;
        return stream;
    }
    friend File& operator<<(File& stream, const CPUMatrix<ElemType>& us)
    {
        stream.PutMarker(fileMarkerBeginSection, std::wstring(L"BMAT"));
        stream << sizeof(ElemType);

        std::wstring s = std::wstring(L"unnamed");
        int format = us.GetFormat();
        stream << s << format;

        stream << us.m_numRows << us.m_numCols;
        for (size_t i = 0; i < us.GetNumElements(); ++i)
            stream << us.Data()[i];
        stream.PutMarker(fileMarkerEndSection, std::wstring(L"EMAT"));
        return stream;
    }

public:
    ElemType LogSumOfElements() const;

public:
    // for RCRF
    static void RCRFBackwardCompute(const CPUMatrix<ElemType>& alpha, CPUMatrix<ElemType>& beta,
                                    const CPUMatrix<ElemType>& lbls,
                                    const CPUMatrix<ElemType>& pair_scores);
    static void _rcrfBackwardCompute(size_t t, size_t k, const CPUMatrix<ElemType>& alpha,
                                     CPUMatrix<ElemType>& beta,
                                     const CPUMatrix<ElemType>& pair_scores);

    static void RCRFTransGrdCompute(const CPUMatrix<ElemType>& lbls,
                                    const CPUMatrix<ElemType>& alpha,
                                    const CPUMatrix<ElemType>& beta,
                                    const CPUMatrix<ElemType>& pair_scores,
                                    CPUMatrix<ElemType>& grd);

    static void _rcrfTransGrdCompute(size_t i,
                                     const CPUMatrix<ElemType>& lbls,
                                     const CPUMatrix<ElemType>& alpha,
                                     const CPUMatrix<ElemType>& beta,
                                     const CPUMatrix<ElemType>& pair_scores,
                                     CPUMatrix<ElemType>& grd,
                                     const size_t tPos // position
                                     );

protected:
    size_t LocateElement(const size_t i, const size_t j) const;
    size_t LocateColumn(const size_t j) const;

private:
    void Clear();

    template<class ElemType2>
    void ScatterValues(ElemType2* indices, ElemType* value, ElemType* data, ElemType alpha, size_t num_indices, size_t rows, size_t cols, size_t indices_step = 1);
};

typedef CPUMatrix<float> CPUSingleMatrix;
typedef CPUMatrix<double> CPUDoubleMatrix;

}}}
back to top