Content - 529a91c523eb9cbcb9d470bf4a0fcbd304400e24 - 5687ec7/Source/Math/Matrix.h

visit type:
Tip revision: 1fc15ed1058afc48ade968758b2d4d23974f156d authored by Guoli Ye on 21 February 2020, 06:34:44 UTC
multithread GPU code checkin, no speedup though due to the stream in kernel function
Tip revision: 1fc15ed
Matrix.h
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// TODO:
//  - remove empty-matrix checks: if an op is well-defined with empty matrices, then do it
//  - Resize() must be cheap if it does nothing  (I already did that for CPU; already done for GPU?)
//

#pragma once

#include "Basics.h"
#include "File.h"
#include "CommonMatrix.h"
#include "TensorShape.h" // only for SmallVector; I was hoping to keep this out
#include "RNGHandle.h"
#include "DataTransferer.h"
#include <limits.h>
#include <memory> // for shared_ptr
#include <array>
#include <initializer_list>
#include "QuantizedOperations.h"
#include "half.hpp"

// Forward declarations
namespace CNTK
{
    class NDArrayView;
    class Value;
}

// This class is exported from the Math.dll
namespace Microsoft { namespace MSR { namespace CNTK {

// Tells which copy of a matrix's data is current (see Matrix::m_currentDataLocation).
enum CurrentDataLocation
{
    NONE = 0,
    CPU = 1,  // picked by Matrix::CollapseDataLocation() when GetDeviceId() < 0
    GPU = 2,  // picked by Matrix::CollapseDataLocation() otherwise
    BOTH = 3  // set by a device transfer that keeps the source (isBeingMoved == false)
};

// Storage kind of a matrix.
enum MatrixType
{
    UNDETERMINED = 0, // default 'type' argument of Matrix::SetDataLocation()
    DENSE = 1,        // default 'matrixType' of the (numRows, numCols, deviceId, ...) constructor
    SPARSE = 2
};

// Forward declarations only: consumers of this class then do not need to pull in
// the header files of the concrete storage classes.

// dense storage
template <class ElemType>
class CPUMatrix;
template <class ElemType>
class GPUMatrix;

// sparse storage
template <class ElemType>
class CPUSparseMatrix;
template <class ElemType>
class GPUSparseMatrix;

template <class ElemType>
class DeviceBoundNumber;

// <ElemType>-agnostic base class
// Interface that declares the element-type-independent queries (device id, matrix type, format)
// and a converting assignment as pure virtual functions. Matrix<ElemType> derives from it and
// overrides them. Derives from std::enable_shared_from_this.
struct /*interface*/ MATH_API MatrixBase : public std::enable_shared_from_this<MatrixBase>
{
    // Device id of the matrix; Matrix<ElemType> documents this as -1 if CPU, otherwise GPU CUDA device id.
    virtual int GetDeviceId() const = 0;
    // Storage kind as a MatrixType value (UNDETERMINED, DENSE or SPARSE).
    virtual MatrixType GetMatrixType() const = 0;
    // Storage format as a MatrixFormat value (type declared outside this header).
    virtual MatrixFormat GetFormat() const = 0;
    // Assigns the values of 'other', which is passed as the type-agnostic base.
    virtual void CastAssignValuesOf(const MatrixBase& other) = 0; // allows for mixed assignment with conversion
    // TODO: Move more generic functions such as getting dims, resizing, and getting/setting as scalars in here.
    // Virtual destructor; only declared here, the definition is not in this part of the header.
    virtual ~MatrixBase();
};
// Shared-ownership handle to a MatrixBase.
typedef std::shared_ptr<MatrixBase> MatrixBasePtr;

// Note: To comply with BLAS libraries, matrices are stored in column-major order. However, by default C/C++/C# use the row-major convention.
// !!!WARNING!!! This class is NOT THREAD SAFE. Test and add necessary modifications if using in multi-threaded environment
template <class ElemType>
class MATH_API Matrix : public MatrixBase
{
    friend class ::CNTK::NDArrayView;
    friend class ::CNTK::Value;

    typedef MatrixBase Base;
private:
    mutable BaseMatrix<ElemType>*                 m_baseMatrix;
    mutable shared_ptr<GPUMatrix      <ElemType>> m_GPUMatrix;
    mutable shared_ptr<CPUMatrix      <ElemType>> m_CPUMatrix;
    mutable shared_ptr<GPUSparseMatrix<ElemType>> m_GPUSparseMatrix;
    mutable shared_ptr<CPUSparseMatrix<ElemType>> m_CPUSparseMatrix;

    mutable MatrixType m_matrixType;
    mutable CurrentDataLocation m_currentDataLocation; // Indicates which matrix is current

    mutable DEVICEID_TYPE m_preferredDeviceId;
    mutable size_t m_numTimesDeviceChanged;
    mutable size_t m_numTimesMatrixTypeChanged;
    mutable int m_devicesTransferedTo[2]; // TODO: what is this for? Seems only diagnostics

    // Moves matrix from device id_from to device with id_to. This method doesn't change preferred device Id
    void _transferFromDeviceToDevice(int id_from, int id_to, bool isBeingMoved = true, bool emptyTransfer = false) const;
    // Moves matrix from current device to device with id_to. This method doesn't change preferred device Id
    void _transferToDevice(int id_to, bool isBeingMoved = true, bool emptyTransfer = false) const;
    template <class ElemType2>
    static void DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType2>& b);
    static void DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c);
    static void DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, const Matrix<ElemType>& d);
    static void CopyElementsFromDenseToSparse(CPUMatrix<ElemType>& from, CPUSparseMatrix<ElemType>& dest);

public:
    // Constructors, destructors and other static matrix builders
    // Each constructor can take deviceId as parameter.
    // If deviceId<0 then the matrix will be based in RAM (CPUMatrix)
    // Elseif deviceId>=0 then the matrix will be based on GPU with specified deviceId
    explicit Matrix(DEVICEID_TYPE deviceId);
    // This constructor is not used, but it makes the ownership of baseMatrix ambiguous. If it's to be used, ensure that the semantics with external buffer are clear.
#if 0
    Matrix(shared_ptr<BaseMatrix<ElemType>> baseMatrix, ElemType* pArray, DEVICEID_TYPE deviceId);                                     // constructor for setting Matrix from a base matrix (externally managed buffer pArray)
#endif
    Matrix(const size_t numRows, const size_t numCols, DEVICEID_TYPE deviceId, const MatrixType matrixType = DENSE, const MatrixFormat matrixFormat = matrixFormatDense, const size_t nnz = 0);
    // TODO: Rewrite this constructor to eliminate the external buffers flag. Make a separate construction mechanism for Matrix objects that don't own their storage.
    Matrix(const size_t numRows, const size_t numCols, ElemType* pArray, DEVICEID_TYPE deviceId, const size_t matrixFlags = matrixFlagNormal, const size_t nnz = 0);
    Matrix(const Matrix<ElemType>& deepCopyFrom, DEVICEID_TYPE deviceId);
    Matrix(Matrix<ElemType>&& moveFrom);                                                    // move constructor, shallow copy
    Matrix<ElemType>& operator=(Matrix<ElemType>&& moveFrom);                               // move assignment operator, shallow copy

    Matrix<ElemType> DeepClone() const;

    // Disallow deep copy construction and assignment to avoid
    // inadvertent silent deep copying
    Matrix(const Matrix<ElemType>& deepCopyFrom) = delete;
    Matrix<ElemType>& operator=(const Matrix<ElemType>& deepCopyFrom) = delete;

    static Matrix<ElemType> Ones(const size_t rows, const size_t cols, DEVICEID_TYPE deviceId);
    static Matrix<ElemType> Zeros(const size_t rows, const size_t cols, DEVICEID_TYPE deviceId);
    static Matrix<ElemType> Eye(const size_t rows, DEVICEID_TYPE deviceId);

#define USE_TIME_BASED_SEED ULONG_MAX
    static Matrix<ElemType> RandomUniform(const size_t rows, const size_t cols, DEVICEID_TYPE deviceId, const ElemType low, const ElemType high, unsigned long seed = USE_TIME_BASED_SEED);
    static Matrix<ElemType> RandomGaussian(const size_t rows, const size_t cols, DEVICEID_TYPE deviceId, const ElemType mean, const ElemType sigma, unsigned long seed = USE_TIME_BASED_SEED);

    static void SetDevice(DEVICEID_TYPE deviceId); // TODO: unify with PrepareDevice()

    void ReleaseMemory();
    ~Matrix();

    // workaround to bugs in BOTH implementation: force to collapse to home location
    void CollapseDataLocation() const
    {
        SetDataLocation(GetDeviceId() < 0 ? CurrentDataLocation::CPU : CurrentDataLocation::GPU, GetMatrixType());
    }

private:
    Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, const MatrixFormat matrixFormat, DEVICEID_TYPE deviceID); // only used internally to initialize a blank matrix
    Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, DEVICEID_TYPE deviceID);                                  // only used internally to initialize a blank matrix
    Matrix(const MatrixFlags matrixFlags, DEVICEID_TYPE deviceID);                                                               // only used internally to initialize a blank matrix
    void Init(DEVICEID_TYPE deviceID);
    void SetDataLocation(CurrentDataLocation location, MatrixType type = UNDETERMINED) const;


public:
    void ShallowCopyFrom(const Matrix<ElemType>& other);
    // Convenience down-cast: converts a shared_ptr to BaseMatrix<ElemType> into a shared_ptr<T>
    // by means of dynamic_pointer_cast. If the cast yields a null pointer, LogicError() is called.
    template <class T>
    static shared_ptr<T> DownCast(shared_ptr<BaseMatrix<ElemType>> inode)
    {
        shared_ptr<T> typed = dynamic_pointer_cast<T>(inode);
        if (typed)
            return typed;
        LogicError("A Matrix of mismatching type was passed.");
        return typed; // only reached if LogicError() returns
    }

    MatrixType GetMatrixType() const override;
    MatrixFormat GetFormat() const override;
    // Forwards to OwnBuffer() of the underlying base matrix (m_baseMatrix).
    bool OwnBuffer() const
    {
        const bool ownsBuffer = m_baseMatrix->OwnBuffer();
        return ownsBuffer;
    }
    int GetDeviceId() const; // -1 if CPU, otherwise GPU CUDA device id
    // Returns the stored preferred device id: -1 if CPU, otherwise GPU CUDA device id.
    DEVICEID_TYPE GetPreferredDeviceId() const
    {
        return this->m_preferredDeviceId;
    }
    // Stores the preferred device id. This setter only records the value.
    void SetPreferredDeviceId(DEVICEID_TYPE preferredDeviceId)
    {
        this->m_preferredDeviceId = preferredDeviceId;
    }
    // Moves matrix from device id_from to device with id_to.
    // If emptyTransfer=true, then no data is ever moved, just corresponding GPU/CPU matrices are deleted and then created using empty constructor
    void TransferFromDeviceToDevice(int id_from, int id_to, bool isBeingMoved = false, /*if false then keep source and set location to BOTH*/ bool emptyTransfer = false, bool updatePreferredDevice = true) const;
    // Same as TransferFromDeviceToDevice() but moves only if it is currently not on the target device
    void TransferToDeviceIfNotThere(int id_to, bool isBeingMoved = false, bool emptyTransfer = false, bool updatePreferredDevice = true) const;
    // Returns the recorded data location, i.e. which matrix is current (NONE, CPU, GPU or BOTH).
    CurrentDataLocation GetCurrentMatrixLocation() const
    {
        return this->m_currentDataLocation;
    }
    void SwitchToMatrixType(MatrixType newMatrixType, MatrixFormat newMatrixFormat, bool keepValues); // sets matrix type between dense and sparse
    size_t GetNumRows() const;
    size_t GetNumCols() const;
    size_t GetDiagSize() const;
    size_t GetNumElements() const;
    // True if GetNumElements() reports zero elements.
    bool HasNoElements() const
    {
        const size_t elementCount = GetNumElements();
        return elementCount == 0;
    }
    bool IsEmpty() const;
    size_t BufferSize() const;
    ElemType* Data() const;
    bool IsView() const;

    ElemType* CopyToArray() const;                                              // allocated by the callee but need to be deleted by the caller
    size_t CopyToArray(ElemType*& arrayCopyTo, size_t& currentArraySize) const; // allocated by the callee but need to be deleted by the caller
    // colStride specifies leading dimension of dst.
    // REVIEW alexeyk: GPU version copies from device to host only, implement all versions (device <-> host).
    void CopySection(size_t numRows, size_t numCols, ElemType* dst, size_t colStride) const;

    Matrix<ElemType> ColumnSlice(size_t startColumn, size_t numCols) const; // note: 'const' is misleading here, as the returned matrix is a mutable reference

    // difference between AssignColumnSlice and SetColumnSlice
    // AssignColumnSlice :      this(:, startColumn:startColumn+numCols-1) = fromMatrix(:, startColumn: startColumn+numCols-1)
    // SetColumnSlice    :      this(:, startColumn:startColumn+numCols-1) = fromMatrix(:, 0: startColumn+numCols-1)
    // AssignColumnSlice does not transfer data; it uses external data
    // SetColumnSlice    copies data

    Matrix<ElemType>& AssignColumnSlice(const Matrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols);
    Matrix<ElemType>& SetColumnSlice(const Matrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols);

    void CopyColumnsStrided(const Matrix<ElemType>& fromMatrix, size_t numCols, size_t srcNumColsStride, size_t destNumColsStride);

    Matrix<ElemType> Diagonal() const;
    void AssignDiagonalValuesTo(Matrix<ElemType>& diag) const;

    void SGDUpdate(Matrix<ElemType>& gradients, ElemType learnRatePerSample);
    void MomentumSGDUpdate(Matrix<ElemType>& gradients, Matrix<ElemType>& smoothedGradients, ElemType learnRatePerSample, ElemType momentum, ElemType unitGainFactor);
    void NesterovAcceleratedMomentumSGDUpdate(Matrix<ElemType>& gradients, Matrix<ElemType>& smoothedGradients, ElemType learnRatePerSample, ElemType momentum, ElemType unitGainFactor);

    ElemType Adagrad(Matrix<ElemType>& gradients, const bool needAveMultiplier);
    void FSAdagradUpdate(Matrix<ElemType>& gradients, Matrix<ElemType>& functionValues, const double targetAdagradAvDenom_x_sqrtAdagradSqrFrames,
                         const double learnRatePerSample, const double meanMomentum, const double varMomentum, ElemType unitGainFactor);

    void AdamUpdate(Matrix<ElemType>& gradients, Matrix<ElemType>& functionValues, const double smoothedCount,
        const double learnRatePerSample, const double meanMomentum, const double varMomentum, const double epsilon, ElemType unitGainFactor, bool adamax = false);

    ElemType RmsProp(Matrix<ElemType>& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier, const bool initialized);

    template<typename GradType>
    void AdaDeltaUpdate(Matrix<GradType>& gradients, Matrix<ElemType>& functionvalues, ElemType learningRatePerSample, ElemType rho, ElemType epsilon, int* timestamps, int currentTimestamp);

    void AdaDeltaFlushState(size_t stride, ElemType rho, int* timestamps, int currentTimestamp);

    void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve = 10000, bool growOnly = true, bool keepValue = false); // by default we only reallocate if need to grow
    void Resize(const Matrix<ElemType>& other) // TODO: Should this carry over numNZElemToReserve for sparse matrices?
    {
        Resize(other.GetNumRows(), other.GetNumCols());
    }
    // Shorthand for the full Resize() overload that lets the caller choose keepValue only;
    // numNZElemToReserve and growOnly are passed with the same values as that overload's defaults.
    void Resize(const size_t numRows, const size_t numCols, bool keepValue)
    {
        const size_t numNZElemToReserve = 10000;
        const bool growOnly = true;
        Resize(numRows, numCols, numNZElemToReserve, growOnly, keepValue);
    }

    void VerifySize(size_t rows, size_t cols)
    {
        m_baseMatrix->VerifySize(rows, cols);
    }

    // Gets a reference to this matrix (e.g. this is not resizable but can be reshaped).
    // It is obtained as the column slice that starts at column 0 and spans GetNumCols() columns.
    // TODO: Call this ShallowClone instead?
    Matrix<ElemType> AsReference() const
    {
        const size_t firstColumn = 0;
        const size_t allColumns = GetNumCols();
        return ColumnSlice(firstColumn, allColumns);
    }
    void Reshape(const size_t numRows, const size_t numCols);                   // note: reshapes in place. To get a reshaped reference, use Reshaped()
    // Gets a reshaped reference: takes AsReference() of this matrix and calls Reshape() on
    // that reference, which is then returned.
    Matrix<ElemType> Reshaped(const size_t numRows, const size_t numCols) const
    {
        auto reshapedRef = AsReference();
        reshapedRef.Reshape(numRows, numCols);
        return reshapedRef;
    }

    // Updates the number of columns: calls Resize() with the current row count and numCols.
    // TODO: a future version may want to enforce retaining the content, to allow dynamically growing layouts column by column (when size is not known upfront)
    void ResizeColumns(const size_t numCols)
    {
        const size_t currentRows = GetNumRows();
        Resize(currentRows, numCols);
    }

    // similar to the repmat operation in MATLAB or Octave
    static Matrix<ElemType> RepMat(const Matrix<ElemType>& frmMat, const size_t rows, const size_t cols);
    size_t GetAllocatedSize() const;
    void Reset(); // reset for sparse matrix

    const ElemType operator()(const size_t row, const size_t col) const;
    ElemType& operator()(const size_t row, const size_t col);
    // Reads element (row, col) by value through the const operator().
    // Use this for reading on non-const objects to avoid inefficiency.
    ElemType GetValue(const size_t row, const size_t col) const
    {
        return (*this)(row, col);
    }
    ElemType Get00Element() const;

    void SetValue(const ElemType v);
    void SetValue(const DeviceBoundNumber<ElemType>& db_number);
    //void SetValue       (const Matrix<ElemType>& deepCopyFrom, const MatrixFormat format = matrixFormatSparseCSR); // BUGBUG: default for 'format' is unexpected
    // SetValue respects the source matrix's information. It moves the target's location (if necessary), and then copies the sources values.
    void SetValue      (const Matrix<ElemType>& deepCopyFrom);
    // AssignValuesOf respects the target matrix's information. It copies the values from the source (deepCopyFrom) into the memory of the target (this matrix).
    void AssignValuesOf(const Matrix<ElemType>& deepCopyFrom);
    void SetValue(const size_t numRows, const size_t numCols, int deviceId, ElemType* pArray, const size_t matrixFlags = matrixFlagNormal, DataTransferer* transferer = nullptr);
    void SetValue(const size_t rIdx, const size_t cIdx, ElemType val); // set matrix sparsely
    void SetValue(const size_t numRows, const size_t numCols, std::initializer_list<ElemType> l) // SetValue(2,3, {1,2,3,  4,5,6});
    {
        std::vector<ElemType> vals(l);
        assert(vals.size() == numRows * numCols);
        SetValue(numRows, numCols, GetDeviceId(), vals.data(), matrixFormatRowMajor);
    }
    void CastAssignValuesOf(const MatrixBase& other) override; // allows for mixed assignment with conversion
    static ElemType MakeNan(size_t payload);
    void Invalidate() // sets the whole matrix, via SetValue(), to the value returned by MakeNan()
    {
        SetValue(MakeNan(__LINE__)); // the payload handed to MakeNan() is this source line number
    }
    void SetMatrixFromCSCFormat(const CPUSPARSE_INDEX_TYPE* h_CSCCol, const CPUSPARSE_INDEX_TYPE* h_Row, const ElemType* h_Val,
        const size_t nz, const size_t numRows, const size_t numCols, DataTransferer* transferer = nullptr);

    void MaskColumnsValue(const Matrix<char>& columnsMask, ElemType val, size_t numColsPerMaskEntry);

    void SetColumn(const ElemType* colPointer, size_t colInd);
    void SetColumn(const ElemType val, size_t colInd);
    void SetColumn(const Matrix<ElemType>& valMat, size_t colInd);

    void AdjustSparseBlockColumn(const GPUSPARSE_INDEX_TYPE* cpuCol2BlockId, size_t numBlocks, bool useBlockId2Col);

    void SetDiagonalValue(const ElemType v);
    void SetDiagonalValue(const Matrix<ElemType>& vector);
    void SetUniformRandomValue(const ElemType low, const ElemType high, unsigned long seed = USE_TIME_BASED_SEED);
    void SetUniformRandomValue(RNGHandle& rngHandle, const ElemType low, const ElemType high);
    void SetGaussianRandomValue(RNGHandle& rngHandle, const ElemType mean, const ElemType stdev);
    void SetGumbelRandomValue(RNGHandle& rngHandle, const ElemType loc, const ElemType scale);
    void SetGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed = USE_TIME_BASED_SEED);
    void SetTruncatedNormalRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed = USE_TIME_BASED_SEED);
    void SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, RNGHandle& rngHandle);
    void AddGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed = USE_TIME_BASED_SEED);
    Matrix<ElemType>& AssignNoiseContrastiveEstimation(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, const Matrix<ElemType>& bias, Matrix<ElemType>& tmp);

    Matrix<ElemType>& AssignNCEDerivative(const Matrix<ElemType>& tmp, const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, size_t inputIndex);
    Matrix<ElemType>& AssignSoftmaxSum(const Matrix<ElemType>& a, const Matrix<ElemType>& softmax);
    Matrix<ElemType>& AssignNceUnnormalizedEval(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, const Matrix<ElemType>& bias);

    Matrix<ElemType>& AssignOneHot(const Matrix<ElemType>& a, vector<size_t>& shape, size_t axis, bool is_sparse);
    Matrix<ElemType>& GatherFromTarget(const Matrix<ElemType>& indices, const Matrix<ElemType>& target, size_t row_elements);
    Matrix<ElemType>& ScatterToIndices(const Matrix<ElemType>& values, const Matrix<ElemType>& indices, size_t row_elements, const Matrix<char>* mask = nullptr);

    Matrix<ElemType> Transpose(); // This method doesn't change state of Matrix. It should be a const function
    Matrix<ElemType>& AssignTransposeOf(const Matrix<ElemType>& a);

    Matrix<ElemType>& DoGatherColumnsOf (ElemType beta, const Matrix<ElemType>& idx, const Matrix<ElemType>& a, ElemType alpha);
    Matrix<ElemType>& DoScatterColumnsOf(ElemType beta, const Matrix<ElemType>& idx, const Matrix<ElemType>& a, ElemType alpha, bool idxHaveDups);

    Matrix<ElemType>& operator+=(const ElemType alpha);
    Matrix<ElemType>  operator+(const ElemType alpha) const;
    Matrix<ElemType>& AssignSumOf(const ElemType alpha, const Matrix<ElemType>& a);

    Matrix<ElemType>& operator+=(const Matrix<ElemType>& a);
    Matrix<ElemType>  operator+(const Matrix<ElemType>& a) const;
    Matrix<ElemType>& AssignSumOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b);

    Matrix<ElemType>& operator-=(const ElemType alpha);
    Matrix<ElemType>  operator-(const ElemType alpha) const;
    Matrix<ElemType>& AssignDifferenceOf(const ElemType alpha, const Matrix<ElemType>& a);
    Matrix<ElemType>& AssignDifferenceOf(const Matrix<ElemType>& a, const ElemType alpha);

    Matrix<ElemType>& operator-=(const Matrix<ElemType>& a);
    Matrix<ElemType>  operator-(const Matrix<ElemType>& a) const;
    Matrix<ElemType>& AssignDifferenceOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b);

    Matrix<ElemType>& operator*=(const ElemType alpha);
    Matrix<ElemType>  operator*(const ElemType alpha) const;
    Matrix<ElemType>& AssignProductOf(const ElemType alpha, const Matrix<ElemType>& a);

    Matrix<ElemType>  operator*(const Matrix<ElemType>& a) const;
    Matrix<ElemType>& AssignProductOf(const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB); // this = a * b
    Matrix<ElemType>& Assign1x1ProductOf(const Matrix<ElemType>& a1x1, const Matrix<ElemType>& b);                                         // this = a * b, where a is 1x1

    Matrix<ElemType>& operator/=(ElemType alpha);
    Matrix<ElemType>  operator/(ElemType alpha) const;

    Matrix<ElemType>& operator^=(ElemType alpha);     // element-wise power
    Matrix<ElemType>  operator^(ElemType alpha) const; // element-wise power
    Matrix<ElemType>& AssignElementPowerOf(const Matrix<ElemType>& a, const ElemType power);

    // TODO: There are several functions below that perform an in-place operation
    // We should prepend the names of these functions with InPlace for clearly indicating
    // the semantics for callers.
    Matrix<ElemType>& ElementMultiplyWith(const Matrix<ElemType>& a);
    Matrix<ElemType>& AssignElementProductOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b);
    Matrix<ElemType>& AddElementProductOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b);

    Matrix<ElemType>& AssignElementDivisionOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b);
    Matrix<ElemType>& ElementDivideBy(const Matrix<ElemType>& a);

    Matrix<ElemType>& ColumnElementMultiplyWith(const Matrix<ElemType>& a);
    Matrix<ElemType>& RowElementMultiplyWith(const Matrix<ElemType>& a);

    Matrix<ElemType>& ColumnElementDivideBy(const Matrix<ElemType>& a);
    Matrix<ElemType>& RowElementDivideBy(const Matrix<ElemType>& a);

    Matrix<ElemType>& ElementInverse();
    Matrix<ElemType>& AssignElementInverseOf(const Matrix<ElemType>& a);

    Matrix<ElemType>& InplaceLinearRectifierDerivative();
    Matrix<ElemType>& AssignLinearRectifierDerivativeOf(const Matrix<ElemType>& a);

    Matrix<ElemType>& InplaceSigmoidDerivative();
    Matrix<ElemType>& AssignSigmoidDerivativeOf(const Matrix<ElemType>& a);

    Matrix<ElemType>& InplaceSigmoid();
    Matrix<ElemType>& AssignSigmoidOf(const Matrix<ElemType>& a);

    Matrix<ElemType>& InplaceTanh();
    Matrix<ElemType>& AssignTanhOf(const Matrix<ElemType>& a);

    Matrix<ElemType>& InplaceAtanh();
    Matrix<ElemType>& AssignAtanhOf(const Matrix<ElemType>& a);

    Matrix<ElemType>& InplaceLogSoftmax(const bool isColWise);
    Matrix<ElemType>& AssignLogSoftmaxOf(const Matrix<ElemType>& a, const bool isColWise);

    Matrix<ElemType>& InplaceHardmax(const bool isColWise);
    Matrix<ElemType>& AssignHardmaxOf(const Matrix<ElemType>& a, const bool isColWise);

    // sequence training
    Matrix<ElemType>& DropFrame(const Matrix<ElemType>& label, const Matrix<ElemType>& gamma, const ElemType& threshhold);
    Matrix<ElemType>& AssignSequenceError(const ElemType hsmoothingWeight, const Matrix<ElemType>& label, const Matrix<ElemType>& dnnoutput, const Matrix<ElemType>& gamma, ElemType alpha);

    Matrix<ElemType>& AssignCTCScore(const Matrix<ElemType>& prob, Matrix<ElemType>& alpha, Matrix<ElemType>& beta, const Matrix<ElemType>& phoneSeq, const Matrix<ElemType>& phoneBound, Matrix<ElemType>& totalScore,
        const vector<size_t> & extraUttMap, const vector<size_t> & uttBeginFrame, const vector<size_t> & uttFrameNum, const vector<size_t> & uttPhoneNum, const size_t samplesInRecurrentStep,
        const size_t mbSize, const size_t blankTokenId, const int delayConstraint, const bool isColWise);
    Matrix<ElemType>& AssignUserOp1(Matrix<ElemType>& in1, Matrix<ElemType>& in2, const vector<size_t>& uttFrameToChanInd, const vector<size_t>& uttPhoneToChanInd,
        const vector<size_t>& uttFrameBeginIdx, const vector<size_t>& uttPhoneBeginIdx, const vector<size_t>& uttBeginForOutputditribution, const vector<size_t>& uttFrameNum,
        const vector<size_t>& uttPhoneNum, const size_t totalcol, const size_t numParallelSequences, const size_t numPhoneParallelSequences);
    Matrix<ElemType>& AssignUserOp2(Matrix<ElemType>& in1, const vector<size_t>& uttFrameToChanInd, const vector<size_t>& uttPhoneToChanInd,
        const vector<size_t>& uttFrameBeginIdx, const vector<size_t>& uttPhoneBeginIdx, const vector<size_t>& uttBeginForOutputditribution, const vector<size_t>& uttFrameNum,
        const vector<size_t>& uttPhoneNum, const size_t numParallelSequences, const size_t numPhoneParallelSequences, const size_t maxFrameNum, const size_t maxPhoneNum, const size_t Idx);
    Matrix<ElemType>& AssignRNNTScore(const Matrix<ElemType>& prob, Matrix<ElemType>& alpha, Matrix<ElemType>& beta, const Matrix<ElemType>& phoneSeq,
        const Matrix<ElemType>& phoneBoundary, const vector<size_t>& uttFrameToChanInd, const vector<size_t> & uttFrameBeginIdx, const vector<size_t> & uttBeginForOutputditribution,
        const vector<size_t>& uttPhoneToChanInd, const vector<size_t> & uttPhoneBeginIdx, 
        const vector<size_t> & uttFrameNum, const vector<size_t> & uttPhoneNum, const size_t numParallelSequences, const size_t numPhoneParallelSequences, const size_t maxPhoneNum, const size_t maxFrameNum,
        Matrix<ElemType>& totalScore, const size_t blankTokenId, const int delayConstraint, const bool isColWise,
        std::vector<float>& vt_probs, const std::vector<float>& vt_wer, const std::vector<size_t>& vt_labseqlen, bool lengthNorm, bool wordPathPosteriorFromDecodeMBR, bool doMBR);

    Matrix<ElemType>& InplaceSqrt();
    Matrix<ElemType>& AssignSqrtOf(const Matrix<ElemType>& a);

    Matrix<ElemType>& InplaceExp();
    Matrix<ElemType>& AssignExpOf(const Matrix<ElemType>& a);

    Matrix<ElemType>& InplaceLog();
    Matrix<ElemType>& AssignLogOf(const Matrix<ElemType>& a);

    Matrix<ElemType>& InplaceCosine();
    Matrix<ElemType>& AssignCosineOf(const Matrix<ElemType>& a);

    Matrix<ElemType>& InplaceNegativeSine();
    Matrix<ElemType>& AssignNegativeSineOf(const Matrix<ElemType>& a);

    Matrix<ElemType>& InplaceTan();
    Matrix<ElemType>& AssignTanOf(const Matrix<ElemType>& a);

    Matrix<ElemType>& InplaceAcos();
    Matrix<ElemType>& AssignAcosOf(const Matrix<ElemType>& a);

    Matrix<ElemType>& InplaceAsin();
    Matrix<ElemType>& AssignAsinOf(const Matrix<ElemType>& a);

    Matrix<ElemType>& InplaceAtan();
    Matrix<ElemType>& AssignAtanOf(const Matrix<ElemType>& a);

    Matrix<ElemType>& InplaceCosh();
    Matrix<ElemType>& AssignCoshOf(const Matrix<ElemType>& a);

    Matrix<ElemType>& InplaceSinh();
    Matrix<ElemType>& AssignSinhOf(const Matrix<ElemType>& a);

    Matrix<ElemType>& InplaceAsinh();
    Matrix<ElemType>& AssignAsinhOf(const Matrix<ElemType>& a);

    Matrix<ElemType>& InplaceLog10();
    Matrix<ElemType>& AssignLog10Of(const Matrix<ElemType>& a);

    Matrix<ElemType>& InplaceAbs();
    Matrix<ElemType>& AssignAbsOf(const Matrix<ElemType>& a);

    // TODO: rename these to InPlaceFloor() and -Ceil() (I never know what it means to truncate a bottom)
    //       And also document and implement that sparse matrices can only truncate towards 0.
    Matrix<ElemType>& InplaceTruncateBottom(const ElemType threshold);
    Matrix<ElemType>& AssignTruncateBottomOf(const Matrix<ElemType>& a, const ElemType threshold);
    Matrix<ElemType>& InplaceTruncateTop(const ElemType threshold);
    Matrix<ElemType>& AssignTruncateTopOf(const Matrix<ElemType>& a, const ElemType threshold);
    Matrix<ElemType>& InplaceTruncate(const ElemType threshold);
    Matrix<ElemType>& InplaceSoftThreshold(const ElemType threshold);
    void InplaceTranspose();

    Matrix<ElemType>& SetToZeroIfAbsLessThan(const ElemType threshold);
    Matrix<ElemType>& SetToZeroIfLessThan(const ElemType threshold);


    DeviceBoundNumber<ElemType> Sum_AsDeviceBoundNum() const;
    ElemType SumOfAbsElements() const; // sum of all abs(elements)
    ElemType SumOfElements() const;    // sum of all elements
    Matrix<ElemType>& AssignSumOfElements(const Matrix<ElemType>& a);

    ElemType LogSumOfElements() const;

    Matrix<ElemType>& AssignToRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows);
    Matrix<ElemType>& AssignRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows);
    Matrix<ElemType>& AddToRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows);
    Matrix<ElemType>& AddWithRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows);
    // Matrix<ElemType>&  AssignRowStackValuesOf(const std::vector<const Matrix<ElemType>*>& inputMatrices, const size_t sliceStartCol, const size_t sliceNumCols);

    Matrix<ElemType>& AssignRepeatOf(const Matrix<ElemType>& a, const size_t numRowRepeats, const size_t numColRepeats);
    Matrix<ElemType>& AddToRowRepeatValuesOf(const Matrix<ElemType>& a, const size_t numRepeats);

    Matrix<ElemType>& AssignPositiveAndShiftedNegSample(const Matrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);
    Matrix<ElemType>& AddFoldedPositiveAndShiftedNegSample(const Matrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber);

    bool IsValid() const;
    bool IsEqualTo(const Matrix<ElemType>& a, const ElemType threshold = 1e-8) const;

    static void VectorSum(const Matrix<ElemType>& a, Matrix<ElemType>& c, const bool isColWise);

    void VectorNorm1(Matrix<ElemType>& c, const bool isColWise) const;
    Matrix<ElemType>& AssignVectorNorm1Of(Matrix<ElemType>& a, const bool isColWise); // TODO: arg should be const

    void VectorNorm2(Matrix<ElemType>& c, const bool isColWise) const;
    Matrix<ElemType>& AssignVectorNorm2Of(Matrix<ElemType>& a, const bool isColWise); // TODO: arg should be const

    void VectorNormInf(Matrix<ElemType>& c, const bool isColWise) const;
    Matrix<ElemType>& AssignVectorNormInfOf(Matrix<ElemType>& a, const bool isColWise);

    Matrix<ElemType>& AssignInnerProductOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const bool isColWise);
    Matrix<ElemType>& AssignKhatriRaoProductOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b);
    Matrix<ElemType>& AddColumnReshapeProductOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const bool transposeAColumn);

    Matrix<ElemType>& AddWithScaleOf(ElemType alpha, const Matrix<ElemType>& a); // this += alpha * a

    ElemType FrobeniusNorm() const;
    Matrix<ElemType>& AssignFrobeniusNormOf(const Matrix<ElemType>& a);

    ElemType MatrixNormInf() const;
    ElemType MatrixNorm1() const;
    ElemType MatrixNorm0() const; // number of non-zero elements
    Matrix<ElemType>& AssignSignOf(const Matrix<ElemType>& a);
    Matrix<ElemType>& AddSignOf(const Matrix<ElemType>& a);
    // Max/min along one direction of this matrix (direction selected by 'isColWise').
    // Both the indexes and the values are returned through non-const Matrix<ElemType> out-parameters,
    // i.e. the indexes are stored as ElemType as well.
    // The second VectorMax overload additionally takes 'topK' (presumably the number of top entries to return -- confirm).
    void VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise) const;
    void VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise, int topK) const;
    void VectorMin(Matrix<ElemType>& minIndexes, Matrix<ElemType>& minValues, const bool isColWise) const;

    // Presumably assigns the number of positions in which 'a' and 'b' differ -- confirm in Matrix.cpp.
    // 'searchInCol' defaults to false; its exact effect is defined by the implementation.
    Matrix<ElemType>& AssignNumOfDiff(const Matrix<ElemType>& a, const Matrix<ElemType>& b, bool searchInCol = false);

    Matrix<ElemType>& AssignInnerProductOfMatrices(const Matrix<ElemType>& a, const Matrix<ElemType>& b); // this method will resize(1,1) first

    // Diagnostics. HasNan() takes a 'name' (presumably used to label any diagnostic output -- confirm);
    // CountNanInf() returns a count as size_t.
    bool HasNan(const char* name) const;
    size_t CountNanInf() const;

    // Debug printing. The first overload is restricted to the row/column range given by the four ptrdiff_t arguments
    // (signed; NOTE(review): whether negative values have a special meaning is up to the implementation -- confirm).
    void Print(const char* matrixName, ptrdiff_t rowFirst, ptrdiff_t rowLast, ptrdiff_t colFirst, ptrdiff_t colLast) const;
    void Print(const char* matrixName = nullptr) const; // print whole matrix. can be expensive

    // Packs/unpacks a convolution input sub-batch. All geometry (input, output and kernel sizes, subsampling steps)
    // is passed explicitly as size_t; 'zeroPadding' defaults to false.
    // AssignPackedConvolutionInput() reads the const 'inputSubBatch' and returns a Matrix reference (presumably *this -- confirm).
    Matrix<ElemType>& AssignPackedConvolutionInput(const Matrix<ElemType>& inputSubBatch,
                                                   const size_t inputWidth, const size_t inputHeight, const size_t inputChannels,
                                                   const size_t outputWidth, const size_t outputHeight, const size_t outputChannels,
                                                   const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample,
                                                   const bool zeroPadding = false);
    // UnpackConvolutionInput() is a const member and takes 'inputSubBatch' by non-const reference, so here the
    // argument is the destination (presumably it is also the matrix that is returned -- confirm in Matrix.cpp).
    Matrix<ElemType>& UnpackConvolutionInput(Matrix<ElemType>& inputSubBatch,
                                             const size_t inputWidth, const size_t inputHeight, const size_t inputChannels,
                                             const size_t outputWidth, const size_t outputHeight, const size_t outputChannels,
                                             const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample,
                                             const bool zeroPadding = false) const;
    // Max/average pooling with explicitly passed geometry (channels, input/output sizes, window size, subsampling steps).
    // The Assign*Result() members read the const 'inputBatch'; the Add*Gradient() members read the const
    // 'outputGradientBatch' (the max-pooling gradient additionally needs the forward 'inputBatch' and 'outputBatch').
    // All four return a Matrix reference (presumably *this -- confirm in Matrix.cpp).
    Matrix<ElemType>& AssignMaxPoolingResult(const Matrix<ElemType>& inputBatch, const size_t channels,
                                             const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
                                             const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
                                             const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
    Matrix<ElemType>& AddMaxPoolingGradient(const Matrix<ElemType>& outputGradientBatch, const Matrix<ElemType>& inputBatch, const Matrix<ElemType>& outputBatch,
                                            const size_t channels,
                                            const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
                                            const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
                                            const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
    Matrix<ElemType>& AssignAveragePoolingResult(const Matrix<ElemType>& inputBatch, const size_t channels,
                                                 const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
                                                 const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
                                                 const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
    Matrix<ElemType>& AddAveragePoolingGradient(const Matrix<ElemType>& outputGradientBatch,
                                                const size_t channels,
                                                const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
                                                const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
                                                const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);

    // Index-table driven convolution. All members are const, take their index tables as Matrix<int>
    // (mpRowCol, mpRowIwht, mpRowRun, runs) and write their result into the trailing non-const out-parameter.
    // NOTE(review): the layout of the index tables is defined by the caller/implementation and is not visible in this header.
    void ConvolutionForward(const Matrix<ElemType>& kernel, const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIwht,
                            const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const;
    void ConvolutionBackwardData(const Matrix<ElemType>& kernel, const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIwht,
                                 const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& grad) const;
    void ConvolutionBackwardKernel(const Matrix<ElemType>& in, const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIwht,
                                   const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& kernelGrad) const;

    // Unrolling helpers; these use the same Matrix<int> index tables (without mpRowIwht) and write into 'output'.
    void UnrollConvolutionInput(size_t unrollCols, size_t mapOutSize, const Matrix<int>& mpRowCol,
                                const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const;
    void UnrollConvolutionOutput(size_t unrollCols, size_t mapInCount, size_t mapOutCount, const Matrix<int>& mpRowCol,
                                 const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const;
    void UnrollConvolutionInputForKernelBackprop(size_t mapOutSize, const Matrix<int>& mpRowCol,
                                                 const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const;

    // Index-table driven pooling. All members are const; index tables are passed as Matrix<int>
    // and results are written into non-const Matrix<ElemType> out-parameters.
    void MaxPoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const;
    // 'accumulateGradient' presumably selects between adding to and overwriting 'grad' -- confirm in Matrix.cpp.
    void MaxPoolingBackward(const Matrix<ElemType>& out, const Matrix<ElemType>& in,
                            const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices,
                            Matrix<ElemType>& grad, bool accumulateGradient) const;

    // ROI max pooling. Note that 'argmax' is a Matrix<ElemType> (not Matrix<int>) and is non-const in both directions;
    // 'spatialScale' is always a double, independent of ElemType.
    void MaxROIPoolingForward(const size_t numRois, const size_t numImg, const size_t channels, const size_t width, const size_t height,
                              const size_t pooledWidth, const size_t pooledHeight, const Matrix<ElemType>& roiData, Matrix<ElemType>& output, Matrix<ElemType>& argmax, double spatialScale) const;

    void MaxROIPoolingBackward(const size_t numRois, const size_t numImg, const size_t channels, const size_t width, const size_t height,
                               const size_t pooledWidth, const size_t pooledHeight, const Matrix<ElemType>& roiData, Matrix<ElemType>& grad, Matrix<ElemType>& argmax, double spatialScale) const;

    // Reads the const 'poolInput' and writes into the non-const 'input'.
    void MaxUnpooling(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, const Matrix<ElemType>& poolInput, Matrix<ElemType>& input) const;

    // 'poolIncludePad' presumably controls whether padded cells count towards the average -- TODO confirm.
    void AveragePoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output, const bool poolIncludePad) const;
    void AveragePoolingBackward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& grad, const bool poolIncludePad, bool accumulateGradient) const;

    // Batch normalization. 'StatType' is a separate template parameter used for the statistics matrices
    // (scale, bias, running mean/variance, saved mean/inverse std-dev), so it may differ from the data type ElemType.
    // Forward: the non-const matrices (runMean, runVariance, out, saveMean, saveInvStdDev) are the ones that can be written.
    // expAvgFactor, blendFactor and epsilon are always passed as double.
    template<class StatType>
    void BatchNormalizationForward(const Matrix<StatType>& scale, const Matrix<StatType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor,
                                   Matrix<StatType>& runMean, Matrix<StatType>& runVariance, Matrix<ElemType>& out, double epsilon,
                                   Matrix<StatType>& saveMean, Matrix<StatType>& saveInvStdDev) const;

    // Backward: takes the saved mean/inverse std-dev as const inputs; 'grad', 'scaleGrad' and 'biasGrad' are non-const outputs.
    template<class StatType>
    void BatchNormalizationBackward(const Matrix<ElemType>& in, Matrix<ElemType>& grad, const Matrix<StatType>& scale, double blendFactor, const Matrix<StatType>& saveMean, const Matrix<StatType>& saveInvStdDev,
                                    Matrix<StatType>& scaleGrad, Matrix<StatType>& biasGrad) const;

    // Recurrent-network primitives. Unlike most operations above these are non-const members.
    // 'reserve' and 'workspace' are non-const scratch matrices passed through all three calls
    // (presumably they must be the same objects across forward and backward -- confirm in Matrix.cpp).
    // RnnAttributes is referred to via an elaborated type specifier; its definition is not visible in this part of the file.
    void RNNForward(const Matrix<ElemType>& inputX, const Matrix<ElemType>& paramW, size_t xDim, size_t yDim, const vector<size_t>& numSequencesForFrame, const struct RnnAttributes& rnnAttributes, Matrix<ElemType>& reserve, Matrix<ElemType>& workspace);
    void RNNBackwardData(const Matrix<ElemType>& outputDY, const Matrix<ElemType>& paramW, Matrix<ElemType>& outputDX, const struct RnnAttributes& rnnAttributes, Matrix<ElemType>& reserve, Matrix<ElemType>& workspace);
    void RNNBackwardWeights(const Matrix<ElemType>& inputX, const Matrix<ElemType>& outputY, Matrix<ElemType>& dw, const struct RnnAttributes& rnnAttributes, Matrix<ElemType>& reserve, Matrix<ElemType>& workspace);

public:
    // Scalar helpers on plain ElemType values; none of them takes a matrix argument.
    // They are declared as non-static, non-const members, so callers need a (mutable) Matrix instance to use them.
    // TODO: why are these not static? And why are they here?
    ElemType Exp10(ElemType num);
    ElemType Mod(ElemType x, ElemType y);
    ElemType LogAdd(ElemType x, ElemType y);

public:
    // static BLAS functions
    // Convention in this group: const references are inputs, the trailing non-const 'c' is the result.

    // singular value decomposition of A as A = U*SIGMA*VT
    // NOTE(review): the role of 'W' is not documented here (presumably workspace) -- confirm in Matrix.cpp.
    static void SVD(const Matrix<ElemType>& A, Matrix<ElemType>& SIGMA, Matrix<ElemType>& U, Matrix<ElemType>& VT, Matrix<ElemType>& W);

    // GEMM-style product (presumably c = alpha * op(a) * op(b) + beta * c, with op() controlled by the transpose flags -- confirm).
    // 'pQuantizedMultiplier' is optional and defaults to nullptr.
    static void MultiplyAndWeightedAdd(ElemType alpha, const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB, ElemType beta, Matrix<ElemType>& c, shared_ptr<QuantizedMultiplier<ElemType>> pQuantizedMultiplier=nullptr); // SGEMM
    static void MultiplyAndAdd(const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB, Matrix<ElemType>& c);
    static void Multiply(const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB, Matrix<ElemType>& c);
    static void Multiply(const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c);
    static void Multiply1x1AndWeightedAdd(ElemType alpha, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType beta, Matrix<ElemType>& c);
    static void ConvolveAndWeightedAdd(ElemType alpha, const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB, ElemType beta, Matrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise);

    // Scaled-add helpers; in all of them 'c' is the non-const result matrix and 'a' (and 'b'/'v') are const inputs.
    static void ColumnwiseScaleAndWeightedAdd(ElemType alpha, const Matrix<ElemType>& a, const Matrix<ElemType>& v, ElemType beta, Matrix<ElemType>& c);

    // presumably c += alpha * a, and c = alpha * a + beta * c for the overload with 'beta' -- confirm in Matrix.cpp
    static void ScaleAndAdd(ElemType alpha, const Matrix<ElemType>& a, Matrix<ElemType>& c);
    static void ScaleAndAdd(ElemType alpha, const Matrix<ElemType>& a, ElemType beta, Matrix<ElemType>& c);
    // Scaled difference of 'a' and 'b'; 'alpha' is either a scalar or passed as a Matrix.
    // The Assign* variants presumably overwrite 'c' instead of accumulating into it -- confirm.
    static void AddScaledDifference(const ElemType alpha, const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c);
    static void AssignScaledDifference(const ElemType alpha, const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c);
    static void AddScaledDifference(const Matrix<ElemType>& alpha, const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c); // c += alpha * (a - b)
    static void AssignScaledDifference(const Matrix<ElemType>& alpha, const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c);

    // Single-element operations: element (ai, aj) of the const matrix 'a' is combined into element (ci, cj) of 'c'
    // (added or assigned, per the function name).
    static void AddElementToElement(const Matrix<ElemType>& a, const size_t ai, const size_t aj, Matrix<ElemType>& c, const size_t ci, const size_t cj);
    // static void AddLogElementToElement(const Matrix<ElemType>& a, const size_t ai, const size_t aj, Matrix<ElemType>& c, const size_t ci, const size_t cj);
    static void AssignElementToElement(const Matrix<ElemType>& a, const size_t ai, const size_t aj, Matrix<ElemType>& c, const size_t ci, const size_t cj);
    // 'position' is a single size_t index (presumably a linear element index into 'c' -- confirm in Matrix.cpp).
    static void MinusOneAt(Matrix<ElemType>& c, const size_t position);

    // Scaling. The two-argument overloads take 'a' by non-const reference (scaled in place, presumably);
    // the three-argument overload reads the const 'a' and writes into 'c'.
    static void Scale(ElemType alpha, Matrix<ElemType>& a);
    static void Scale(const Matrix<ElemType>& alpha, Matrix<ElemType>& a); // In this case Matrix alpha must be 1x1
    static void Scale(ElemType alpha, const Matrix<ElemType>& a, Matrix<ElemType>& c);
    // InnerProduct() writes into the matrix 'c'; InnerProductOfMatrices() returns a single scalar instead.
    static void InnerProduct(const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c, const bool isColWise);
    static ElemType InnerProductOfMatrices(const Matrix<ElemType>& a, const Matrix<ElemType>& b);
    // presumably c = a raised elementwise to the power 'alpha' -- confirm in Matrix.cpp
    static void ElementWisePower(ElemType alpha, const Matrix<ElemType>& a, Matrix<ElemType>& c);
    // NOTE(review): the meaning of the int arguments 'm' and 'n' (one per operand) is defined by the implementation -- confirm.
    static void BatchMatMul(ElemType beta, const Matrix<ElemType>& a, const bool transposeA, const int m, const Matrix<ElemType>& b, const bool transposeB, const int n, Matrix<ElemType>& c, const bool isColWise);

    // Static predicates. 'threshold' defaults to 1e-8 and 'value' defaults to 0.0 (both converted to ElemType).
    static bool AreEqual(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const ElemType threshold = 1e-8);
    static bool HasElement(const Matrix<ElemType>& a, const ElemType value = 0.0);

    // NOTE(review): the meaning of the dimension arguments D, S, M, K, T is defined by the implementation -- confirm in Matrix.cpp.
    static void TensorShuffleScaleAndAdd(ElemType keepWeight, const Matrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const Matrix<ElemType>& b, Matrix<ElemType>& c);

    // Tensor operations. These are non-static, non-const members; the input matrices are passed as const references.
    // The size N of the std::array arguments (offsets, regularStrides, reducingStrides) is always
    // (number of input matrices + 1): N=2 for one input, N=3 for two, N=4 for three
    // (presumably the extra slot describes this matrix as the output -- confirm).
    // 'op' is the elementwise operator and 'reductionOp' the operator used for reduction (both ElementWiseOperator).
    void TensorOp(ElemType beta, const Matrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                  const std::array<size_t, 2>& offsets,
                  const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& regularStrides,
                  const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
    // Same signature as the one-input TensorOp() above.
    void TensorOpDebug(ElemType beta, const Matrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                  const std::array<size_t, 2>& offsets,
                  const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& regularStrides,
                  const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& reducingStrides);

    void TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                  const std::array<size_t, 3>& offsets,
                  const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 3>& regularStrides,
                  const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
    void TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
                  const std::array<size_t, 4>& offsets,
                  const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 4>& regularStrides,
                  const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 4>& reducingStrides);

    // Reduction-only variant: takes no alpha/beta and no elementwise 'op', just the 'reductionOp'.
    void TensorArgOp(const Matrix<ElemType>& a, ElementWiseOperator reductionOp,
                     const std::array<size_t, 2>& offsets,
                     const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& regularStrides,
                     const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& reducingStrides);

public:
    // Serialization through a File stream; these are what the global operator>> / operator<< overloads
    // defined after this class forward to.
    void Read(File& stream);
    void Write(File& stream) const;

    // 'shift' is a signed int here, whereas the *WithShift* helpers below take an unsigned size_t shift.
    Matrix<ElemType>& Shift(const Matrix<ElemType>& a, int shift);

    // Shifted / negative-sample variants of the element and inner products.
    // NOTE(review): the exact role of 'shift', 'negnumber' and 'bFirstmatrixfixed' is defined in Matrix.cpp, not visible here.
    Matrix<ElemType>& AssignElementProductOfWithShiftNeg(const Matrix<ElemType>& a, const Matrix<ElemType>& b, size_t shift, size_t negnumber);
    Matrix<ElemType>& AssignInnerProductOfWithShiftNeg(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const bool isColWise, size_t shift, size_t negnumber);
    static void InnerProductWithShiftNeg(const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c, const bool isColWise, size_t shift, size_t negnumber);
    Matrix<ElemType>& GetARowByIndex(const Matrix<ElemType>& a, size_t index);
    static void ConductRowElementMultiplyWithShift(const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c, size_t shift, bool bFirstmatrixfixed);
    Matrix<ElemType>& AssignElementProductOfWithShift(const Matrix<ElemType>& a, const Matrix<ElemType>& b, size_t shift);

public:
    // Static helpers named RCRF* (presumably for a recurrent CRF criterion -- confirm against the callers).
    // Backward pass: 'beta' and 'functionValues' are the non-const (writable) matrices; everything else is a const input.
    static void RCRFBackwardCompute(const Matrix<ElemType>& alpha, Matrix<ElemType>& beta,
                                    Matrix<ElemType>& functionValues, const Matrix<ElemType>& lbls,
                                    const Matrix<ElemType>& pos_scores, const Matrix<ElemType>& pair_scores, const int shift);

    // Gradient computation: 'grd' is the only non-const matrix, i.e. the output.
    static void RCRFTransGrdCompute(const Matrix<ElemType>& lbls,
                                    const Matrix<ElemType>& alpha,
                                    const Matrix<ElemType>& beta,
                                    const Matrix<ElemType>& pair_scores,
                                    Matrix<ElemType>& grd,
                                    const int startLbl, // the time 0 start symbol in the output layer
                                    const int shift);

    // Friend declarations: every instantiation of the templates below gets access to the non-public members of Matrix.
    template <typename T>
    friend class MatrixQuantizer;

    template <typename T>
    friend class QuantizedMatrix;

    // Lets Matrix instantiations with a different element type access each other's non-public members.
    template <typename T>
    friend class Matrix;
};

// overload I/O operators
// Stream extraction for matrices: delegates to Matrix<ElemType>::Read() and
// hands the stream back so that several reads can be chained.
template <typename ElemType>
File& operator>>(File& stream, Matrix<ElemType>& M)
{
    M.Read(stream); // the matrix knows its own on-disk format
    return stream;
}
// Stream insertion for matrices: delegates to Matrix<ElemType>::Write() and
// hands the stream back so that several writes can be chained.
template <typename ElemType>
File& operator<<(File& stream, const Matrix<ElemType>& M)
{
    M.Write(stream); // the matrix knows its own on-disk format
    return stream;
}

// Convenience aliases for the commonly used element types.
using SingleMatrix = Matrix<float>;
using DoubleMatrix = Matrix<double>;
using HalfMatrix   = Matrix<half>;

}}}
// Source repository: https://github.com/Microsoft/CNTK