https://github.com/Microsoft/CNTK
Tip revision: 99647833fddfd80aa10e1d261ff309088ed71b12 authored by Alexey Kamenev on 28 March 2016, 18:14:08 UTC
Changes to test perf.
Changes to test perf.
Tip revision: 9964783
Matrix.cpp
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// Matrix.cpp -- main CPP file that contains all Matrix functions exported by the CNTKMath.dll
//
#include "stdafx.h"
#include "Basics.h"
#include "Matrix.h"
#include "CPUMatrix.h"
#include "CPUSparseMatrix.h"
#include "GPUMatrix.h"
#include "GPUSparseMatrix.h"
#include "File.h"
#include <assert.h>
#include <math.h>
#include "GPUWatcher.h" // bring in this class as well so that it gets exported from this DLL
#ifndef CPUONLY
#pragma comment(lib, "MathCUDA.lib") // built by CNTKMathCUDA project
#endif
#pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
#pragma warning(disable : 4239) // nonstandard extension; triggered by this pattern: "auto& second = transposeB ? b.m_GPUMatrix->Transpose() : *b.m_GPUMatrix;"
#pragma warning(disable : 4702) // unreachable code; triggered for unknown reasons
#ifndef min
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif
// Helper to dispath matrix calls to the 4 underlying matrix libraries (CPU,GPU) x (DENSE,SPARSE)
// 'MatrixPointerToCheck' determines where the operation takes place.
// 'MatrixPointerToSetFlag' is the output. If not null and its location is BOTH, we collapse it to one.
// Dispatches to exactly one of the four statement arguments, chosen by the current
// data location and matrix type of 'MatrixPointerToCheck'.
// NOTE: a data location of BOTH is handled by the GPU branch here; if
// 'MatrixPointerToSetFlag' is non-null, SetDataLocation() then collapses BOTH onto GPU.
// Raises RuntimeError if the matrix has data on neither device.
#define DISPATCH_MATRIX_ON_FLAG(MatrixPointerToCheck, MatrixPointerToSetFlag, CPUDense, GPUDense, CPUSparse, GPUSparse) \
    { \
        CurrentDataLocation curLocation = (MatrixPointerToCheck)->GetCurrentMatrixLocation(); \
        if (curLocation == CurrentDataLocation::GPU || curLocation == CurrentDataLocation::BOTH) \
        { \
            if ((MatrixPointerToCheck)->GetMatrixType() != MatrixType::SPARSE) \
            { \
                GPUDense; \
                if (MatrixPointerToSetFlag != nullptr) \
                    ((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::GPU, MatrixType::DENSE); \
            } \
            else \
            { \
                GPUSparse; \
                if (MatrixPointerToSetFlag != nullptr) \
                    ((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::GPU, MatrixType::SPARSE); \
            } \
        } \
        else if (curLocation == CurrentDataLocation::CPU) \
        { \
            if ((MatrixPointerToCheck)->GetMatrixType() != MatrixType::SPARSE) \
            { \
                CPUDense; \
                if (MatrixPointerToSetFlag != nullptr) \
                    ((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::CPU, MatrixType::DENSE); \
            } \
            else \
            { \
                CPUSparse; \
                if (MatrixPointerToSetFlag != nullptr) \
                    ((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::CPU, MatrixType::SPARSE); \
            } \
        } \
        else \
        { \
            RuntimeError("Matrices do not exist in either CPU or GPU."); \
        } \
    }
// version of dispatch macro that prefers the CPU if the 'MatrixPointerToCheck' location is BOTH
// Same dispatch as DISPATCH_MATRIX_ON_FLAG, except that a data location of BOTH is
// handled by the CPU branch (and, via SetDataLocation(), collapsed onto CPU when
// 'MatrixPointerToSetFlag' is non-null).
#define DISPATCH_MATRIX_ON_FLAG_USECPU_4BOTH(MatrixPointerToCheck, MatrixPointerToSetFlag, CPUDense, GPUDense, CPUSparse, GPUSparse) \
    { \
        CurrentDataLocation curLocation = (MatrixPointerToCheck)->GetCurrentMatrixLocation(); \
        if (curLocation == CurrentDataLocation::GPU) \
        { \
            if ((MatrixPointerToCheck)->GetMatrixType() != MatrixType::SPARSE) \
            { \
                GPUDense; \
                if (MatrixPointerToSetFlag != nullptr) \
                    ((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::GPU, MatrixType::DENSE); \
            } \
            else \
            { \
                GPUSparse; \
                if (MatrixPointerToSetFlag != nullptr) \
                    ((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::GPU, MatrixType::SPARSE); \
            } \
        } \
        else if (curLocation == CurrentDataLocation::CPU || curLocation == CurrentDataLocation::BOTH) \
        { \
            if ((MatrixPointerToCheck)->GetMatrixType() != MatrixType::SPARSE) \
            { \
                CPUDense; \
                if (MatrixPointerToSetFlag != nullptr) \
                    ((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::CPU, MatrixType::DENSE); \
            } \
            else \
            { \
                CPUSparse; \
                if (MatrixPointerToSetFlag != nullptr) \
                    ((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::CPU, MatrixType::SPARSE); \
            } \
        } \
        else \
        { \
            RuntimeError("Matrices do not exist in either CPU or GPU."); \
        } \
    }
// version of helper macro that executes both CPU and GPU macros if 'MatrixPointerToCheck' location is BOTH
// Same dispatch as DISPATCH_MATRIX_ON_FLAG, except that when the data location is
// BOTH, the CPU statement AND the GPU statement are both executed (keeping the two
// copies in sync), and the location written back through 'MatrixPointerToSetFlag'
// stays BOTH.
#define DISPATCH_MATRIX_ON_FLAG_USEBOTH_4BOTH(MatrixPointerToCheck, MatrixPointerToSetFlag, CPUDense, GPUDense, CPUSparse, GPUSparse) \
    { \
        CurrentDataLocation curLocation = (MatrixPointerToCheck)->GetCurrentMatrixLocation(); \
        if (curLocation == CurrentDataLocation::BOTH) \
        { \
            if ((MatrixPointerToCheck)->GetMatrixType() != MatrixType::SPARSE) \
            { \
                CPUDense; \
                GPUDense; \
                if (MatrixPointerToSetFlag != nullptr) \
                    ((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::BOTH, MatrixType::DENSE); \
            } \
            else \
            { \
                CPUSparse; \
                GPUSparse; \
                if (MatrixPointerToSetFlag != nullptr) \
                    ((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::BOTH, MatrixType::SPARSE); \
            } \
        } \
        else if (curLocation == CurrentDataLocation::GPU) \
        { \
            if ((MatrixPointerToCheck)->GetMatrixType() != MatrixType::SPARSE) \
            { \
                GPUDense; \
                if (MatrixPointerToSetFlag != nullptr) \
                    ((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::GPU, MatrixType::DENSE); \
            } \
            else \
            { \
                GPUSparse; \
                if (MatrixPointerToSetFlag != nullptr) \
                    ((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::GPU, MatrixType::SPARSE); \
            } \
        } \
        else if (curLocation == CurrentDataLocation::CPU) \
        { \
            if ((MatrixPointerToCheck)->GetMatrixType() != MatrixType::SPARSE) \
            { \
                CPUDense; \
                if (MatrixPointerToSetFlag != nullptr) \
                    ((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::CPU, MatrixType::DENSE); \
            } \
            else \
            { \
                CPUSparse; \
                if (MatrixPointerToSetFlag != nullptr) \
                    ((Matrix*) MatrixPointerToSetFlag)->SetDataLocation(CurrentDataLocation::CPU, MatrixType::SPARSE); \
            } \
        } \
        else \
        { \
            RuntimeError("Matrices do not exist in either CPU or GPU."); \
        } \
    }
namespace Microsoft { namespace MSR { namespace CNTK {
MatrixBase::~MatrixBase() { }
#pragma region Constructors, destructors and other static matrix builders
// Initialize all members over virgin memory.
//This function will only initialize a default blank matrix. The actual matrices need to be allocated
//after calling this function, and flags need to be set correctly by calling SetDataLocation.
// This clears out the entire object and brings it into destructable state.
// Note: Keep this in sync with member definition and ShallowCopyFrom().
template <class ElemType>
void Matrix<ElemType>::Init(DEVICEID_TYPE deviceId)
{
    // Reset every member to the 'blank matrix' default. Nothing is allocated here;
    // callers create the actual CPU/GPU objects afterwards and record them via
    // SetDataLocation(). Keep the member list in sync with ShallowCopyFrom().
    m_baseMatrix = nullptr;
    m_GPUMatrix = nullptr;
    m_CPUMatrix = nullptr;
    m_GPUSparseMatrix = nullptr;
    m_CPUSparseMatrix = nullptr;
    m_matrixType = MatrixType::UNDETERMINED;
    m_currentDataLocation = CurrentDataLocation::NONE;
    m_preferredDeviceId = deviceId;
    // statistics counters used for diagnostics/warnings
    m_numTimesDeviceChanged = 0;
    m_numTimesMatrixTypeChanged = 0;
    m_devicesTransferedTo[1] = m_devicesTransferedTo[0] = CPUDEVICE - 1; // (some value that is different from any valid value)
}
// shallow-copy all members
template <class ElemType>
void Matrix<ElemType>::ShallowCopyFrom(const Matrix<ElemType>& other)
{
    // Copy every member (pointers and flags) verbatim; no allocation and no
    // ownership transfer happens here. Keep the member list in sync with Init().
    m_baseMatrix = other.m_baseMatrix;
    m_GPUMatrix = other.m_GPUMatrix;
    m_CPUMatrix = other.m_CPUMatrix;
    m_GPUSparseMatrix = other.m_GPUSparseMatrix;
    m_CPUSparseMatrix = other.m_CPUSparseMatrix;
    m_matrixType = other.m_matrixType;
    m_currentDataLocation = other.m_currentDataLocation;
    m_preferredDeviceId = other.m_preferredDeviceId;
    m_numTimesDeviceChanged = other.m_numTimesDeviceChanged;
    m_numTimesMatrixTypeChanged = other.m_numTimesMatrixTypeChanged;
    m_devicesTransferedTo[0] = other.m_devicesTransferedTo[0]; // TODO: spelling
    m_devicesTransferedTo[1] = other.m_devicesTransferedTo[1];
}
// Call this function after an update operation has created/set/updated the respective pointers.
// What gets updated:
// - m_currentDataLocation: from function argument
// - m_matrixType: from function argument unless UNDETERMINED in which case m_matrixType remains unmodified
// - m_baseMatrix: to one of current values of m_[GC]PU{Sparse,}Matrix
template <class ElemType>
void Matrix<ElemType>::SetDataLocation(CurrentDataLocation location, MatrixType type) const
{
    // NOTE: declared const but updates bookkeeping members
    // (m_currentDataLocation, m_matrixType, m_baseMatrix) -- presumably declared
    // mutable in Matrix.h; confirm there.
    // if the object used to live on BOTH, this will collapse it to 'location' (unless we actually wrote into BOTH)
    // In that case, we do a sanity check here that the object is an owning Matrix,
    // since otherwise the collapsing would go unnoticed by the original owner.
    // The cases to cover:
    //  - original owner is BOTH, and this is called on the original owner
    //    -> The result was written to 'location' so we should collapse it to there.
    //  - original owning matrix is in BOTH state
    //    and a view inherits this
    //    -> FORBIDDEN to write into CPU or GPU since we cannot ensure we wrote into the one that will be read next
    //  - original owning matrix is CPU or GPU
    //    and a view onto it is put into BOTH state
    //    -> inefficient to read, since this is likely happening over again; so put the owner into BOTH state
    //    -> FORBIDDEN to write into CPU or GPU since we don't know the owner's true location and hence cannot ensure we wrote to the correct place
    if (m_currentDataLocation == CurrentDataLocation::BOTH && location != CurrentDataLocation::BOTH)
    {
        // we get here if we wrote into this object that was BOTH but is no longer
        if (!OwnBuffer()) // this means we should not have written into it in the first place, so fail now (better late than never)
            LogicError("SetDataLocation: A non-owning object cannot be written to in BOTH state.");
    }
    m_currentDataLocation = location;
    // set the matrix type if passed in
    if (type != MatrixType::UNDETERMINED)
        m_matrixType = type;
    // Note: m_currentDataLocation may also be CurrentDataLocation::BOTH, in which case the base matrix will be GPU.
    if (m_matrixType == MatrixType::DENSE)
        m_baseMatrix = ((m_currentDataLocation == CurrentDataLocation::CPU) ? (BaseMatrix<ElemType>*) m_CPUMatrix : (BaseMatrix<ElemType>*) m_GPUMatrix);
    else if (m_matrixType == MatrixType::SPARSE)
        m_baseMatrix = ((m_currentDataLocation == CurrentDataLocation::CPU) ? (BaseMatrix<ElemType>*) m_CPUSparseMatrix : (BaseMatrix<ElemType>*) m_GPUSparseMatrix);
    // sanity check
    if (!m_baseMatrix && m_matrixType != MatrixType::UNDETERMINED)
        LogicError("SetDataLocation: new m_baseMatrix must not be NULL.");
}
//this is a private constructor only used internally to initialize a blank matrix
// Private constructor: build a blank matrix of the requested type and format.
template <class ElemType>
Matrix<ElemType>::Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, const MatrixFormat matrixFormat, DEVICEID_TYPE deviceID)
{
    Init(deviceID);
    // A matrix that will not own its buffer stays blank; storage is hooked up later.
    if (matrixFlags & matrixFlagDontOwnBuffer)
        return;
    SwitchToMatrixType(matrixType, matrixFormat, false);
}
//this is a private constructor only used internally to initialize a blank matrix
// Private constructor: build a blank matrix of the requested type, picking the
// default format for that type (dense, or CSC for sparse).
template <class ElemType>
Matrix<ElemType>::Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, DEVICEID_TYPE deviceID)
{
    Init(deviceID);
    if (matrixFlags & matrixFlagDontOwnBuffer)
        return;
    const MatrixFormat defaultFormat = (matrixType == MatrixType::DENSE) ? MatrixFormat::matrixFormatDense : MatrixFormat::matrixFormatSparseCSC;
    SwitchToMatrixType(matrixType, defaultFormat, false);
}
//this is a private constructor only used internally to initialize a blank matrix
// Private constructor: build a blank dense matrix unless the caller asked for a
// non-owning shell.
template <class ElemType>
Matrix<ElemType>::Matrix(const MatrixFlags matrixFlags, DEVICEID_TYPE deviceID)
{
    Init(deviceID);
    if (matrixFlags & matrixFlagDontOwnBuffer)
        return;
    SwitchToMatrixType(MatrixType::DENSE, MatrixFormat::matrixFormatDense, false);
}
template <class ElemType>
Matrix<ElemType>::Matrix(DEVICEID_TYPE deviceID)
{
    // Default-style constructor: an empty, owning, dense matrix on 'deviceID'.
    Init(deviceID);
    SwitchToMatrixType(MatrixType::DENSE, MatrixFormat::matrixFormatDense, false);
}
// constructor for Matrix class to wrap an externally managed BaseMatrix
// baseMatrix - base matrix for this element
// pArray - pointer to current data array, will replace existing pointer in baseMatrix if != NULL
// deviceId - deviceId where the pArray exists
// Wrap an externally managed BaseMatrix in a Matrix object.
// baseMatrix - the underlying matrix; stored by pointer, not copied
// pArray     - data array to install into baseMatrix via SetArray()
// deviceId   - device on which pArray lives (becomes the preferred device)
template <class ElemType>
Matrix<ElemType>::Matrix(BaseMatrix<ElemType>* baseMatrix, ElemType* pArray, DEVICEID_TYPE deviceId)
{
    Init(deviceId);
    const bool isSparse = (baseMatrix->GetFormat() & matrixFormatSparse) != 0;
    // Record the wrapped object in the slot matching its device and format.
    if (m_preferredDeviceId == CPUDEVICE)
    {
        if (isSparse)
        {
            m_CPUSparseMatrix = (CPUSparseMatrix<ElemType>*) baseMatrix;
            SetDataLocation(CPU, SPARSE);
        }
        else
        {
            m_CPUMatrix = (CPUMatrix<ElemType>*) baseMatrix;
            SetDataLocation(CPU, DENSE);
        }
    }
    else
    {
        if (isSparse)
        {
            m_GPUSparseMatrix = (GPUSparseMatrix<ElemType>*) baseMatrix;
            SetDataLocation(GPU, SPARSE);
        }
        else
        {
            m_GPUMatrix = (GPUMatrix<ElemType>*) baseMatrix;
            SetDataLocation(GPU, DENSE);
        }
    }
    m_baseMatrix = baseMatrix;
    m_baseMatrix->SetArray(pArray);
}
// Allocate a numRows x numCols matrix of the given type/format on 'deviceId'.
// Dense matrices are zero-initialized; only the dense format is supported for
// MatrixType::DENSE.
template <class ElemType>
Matrix<ElemType>::Matrix(const size_t numRows, const size_t numCols, DEVICEID_TYPE deviceId, const MatrixType matrixType, const MatrixFormat matrixFormat)
{
    Init(deviceId);
    if (matrixType == MatrixType::SPARSE)
    {
        if (m_preferredDeviceId == CPUDEVICE)
        {
            m_CPUSparseMatrix = new CPUSparseMatrix<ElemType>(matrixFormat, numRows, numCols, 0);
            SetDataLocation(CPU, SPARSE);
        }
        else
        {
            m_GPUSparseMatrix = new GPUSparseMatrix<ElemType>(numRows, numCols, 0, m_preferredDeviceId, matrixFormat);
            SetDataLocation(GPU, SPARSE);
        }
        return;
    }
    // dense path
    if (matrixFormat != matrixFormatDense)
    {
        NOT_IMPLEMENTED;
    }
    if (m_preferredDeviceId == CPUDEVICE)
    {
        m_CPUMatrix = new CPUMatrix<ElemType>(numRows, numCols);
        SetDataLocation(CPU, DENSE);
    }
    else
    {
        m_GPUMatrix = new GPUMatrix<ElemType>(numRows, numCols, m_preferredDeviceId);
        SetDataLocation(GPU, DENSE);
    }
    SetValue(0); // dense matrices start out zeroed
}
// Build a matrix from an existing data array.
// pArray is handed to the dense constructors; for sparse, a fresh buffer with
// room for 'nnz' non-zeros is allocated instead (external arrays unsupported).
// If matrixFlagDontOwnBuffer is set, the underlying matrix will not free pArray.
template <class ElemType>
Matrix<ElemType>::Matrix(const size_t numRows, const size_t numCols, ElemType* pArray, DEVICEID_TYPE deviceId, const size_t matrixFlags, const size_t nnz)
{
    Init(deviceId);
    const bool wantSparse = (matrixFlags & matrixFormatSparse) != 0;
    if (wantSparse)
    {
        if (m_preferredDeviceId == CPUDEVICE)
        {
            // WARNING: matrixFlag is not passed in and externally managed array cannot be passed in
            m_CPUSparseMatrix = new CPUSparseMatrix<ElemType>(matrixFormatSparseCSC, numRows, numCols, nnz);
            SetDataLocation(CPU, SPARSE);
        }
        else
        {
            m_GPUSparseMatrix = new GPUSparseMatrix<ElemType>(m_preferredDeviceId, MatrixFormat(matrixFlags & MatrixFormat::matrixFormatMask));
            m_GPUSparseMatrix->Resize(numRows, numCols, nnz, true, false);
            SetDataLocation(GPU, SPARSE);
        }
    }
    else
    {
        if (m_preferredDeviceId == CPUDEVICE)
        {
            m_CPUMatrix = new CPUMatrix<ElemType>(numRows, numCols, pArray, matrixFlags);
            SetDataLocation(CPU, DENSE);
        }
        else
        {
            m_GPUMatrix = new GPUMatrix<ElemType>(numRows, numCols, m_preferredDeviceId, pArray, matrixFlags);
            SetDataLocation(GPU, DENSE);
        }
    }
    if (matrixFlags & matrixFlagDontOwnBuffer)
        m_baseMatrix->SetOwnBuffer(false);
}
//copy constructor, deep copy
template <class ElemType>
Matrix<ElemType> Matrix<ElemType>::DeepClone() const
{
    // Return a full (deep) copy of this matrix on its current device, delegating
    // to the deep-copying constructor below.
    return Matrix<ElemType>(*this, GetDeviceId());
}
template <class ElemType>
Matrix<ElemType>::Matrix(const Matrix<ElemType>& deepCopyFrom, DEVICEID_TYPE deviceId)
{
    // Deep-copying constructor: clone 'deepCopyFrom' onto device 'deviceId'.
    // The source is moved to the target device for the copy and moved back
    // afterwards (the 'true' argument presumably preserves the source's data --
    // confirm against _transferToDevice()).
    int origCopyFromDeviceId = deepCopyFrom.GetDeviceId();
    Init(deviceId); // will set m_preferredDeviceId
    deepCopyFrom._transferToDevice(m_preferredDeviceId, true);
    DISPATCH_MATRIX_ON_FLAG(&deepCopyFrom,
                            this,
                            m_CPUMatrix = new CPUMatrix<ElemType>(*(deepCopyFrom.m_CPUMatrix)),
                            m_GPUMatrix = new GPUMatrix<ElemType>(*(deepCopyFrom.m_GPUMatrix)),
                            m_CPUSparseMatrix = new CPUSparseMatrix<ElemType>(*(deepCopyFrom.m_CPUSparseMatrix)),
                            m_GPUSparseMatrix = new GPUSparseMatrix<ElemType>(*(deepCopyFrom.m_GPUSparseMatrix)));
    // should we move back?
    deepCopyFrom._transferToDevice(origCopyFromDeviceId, true);
    // keep the source's preferred device, not necessarily the copy target
    m_preferredDeviceId = deepCopyFrom.m_preferredDeviceId;
}
//move constructor, shallow copy
template <class ElemType>
Matrix<ElemType>::Matrix(Matrix<ElemType>&& moveFrom)
{
    // Move constructor: steal all state from 'moveFrom' (shallow copy of all
    // members), leaving 'moveFrom' re-initialized as a blank matrix.
    Init((DEVICEID_TYPE) moveFrom.GetDeviceId());
#if 1
    // delegate to the move-assignment operator
    operator=(move(moveFrom));
#else
    // old implementation, kept for reference (dead code)
    DISPATCH_MATRIX_ON_FLAG(&moveFrom,
                            this,
                            m_CPUMatrix = new CPUMatrix<ElemType>(static_cast<CPUMatrix<ElemType>&&>(*(moveFrom.m_CPUMatrix))),
                            m_GPUMatrix = new GPUMatrix<ElemType>(static_cast<GPUMatrix<ElemType>&&>(*(moveFrom.m_GPUMatrix))),
                            m_CPUSparseMatrix = new CPUSparseMatrix<ElemType>(static_cast<CPUSparseMatrix<ElemType>&&>(*(moveFrom.m_CPUSparseMatrix))),
                            m_GPUSparseMatrix = new GPUSparseMatrix<ElemType>(static_cast<GPUSparseMatrix<ElemType>&&>(*(moveFrom.m_GPUSparseMatrix))));
    m_preferredDeviceId = moveFrom.m_preferredDeviceId;
#endif
}
//move assignment operator, shallow copy
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::operator=(Matrix<ElemType>&& moveFrom)
{
    // Move assignment: release our own storage, then shallow-copy every member
    // from 'moveFrom' and reset 'moveFrom' to a blank CPU matrix.
    if (this == &moveFrom)
        LogicError("Matrix: Move assignment into itself is forbidden.");
    ReleaseMemory(); // free held memory if any
#if 1
    // shallow-copy all members
    ShallowCopyFrom(moveFrom);
    // virgin-init the source
    moveFrom.Init(CPUDEVICE);
#else
    // old implementation, kept for reference (dead code)
    m_preferredDeviceId = moveFrom.m_preferredDeviceId;
    DISPATCH_MATRIX_ON_FLAG(&moveFrom,
                            this,
                            if (m_CPUMatrix != nullptr) m_CPUMatrix->operator=(static_cast<CPUMatrix<ElemType>&&>(*(moveFrom.m_CPUMatrix)));
                            else m_CPUMatrix = new CPUMatrix<ElemType>(static_cast<CPUMatrix<ElemType>&&>(*(moveFrom.m_CPUMatrix))),
                            if (m_GPUMatrix != nullptr) m_GPUMatrix->operator=(static_cast<GPUMatrix<ElemType>&&>(*(moveFrom.m_GPUMatrix)));
                            else m_GPUMatrix = new GPUMatrix<ElemType>(static_cast<GPUMatrix<ElemType>&&>(*(moveFrom.m_GPUMatrix))),
                            if (m_CPUSparseMatrix != nullptr) m_CPUSparseMatrix->operator=(static_cast<CPUSparseMatrix<ElemType>&&>(*(moveFrom.m_CPUSparseMatrix)));
                            else m_CPUSparseMatrix = new CPUSparseMatrix<ElemType>(static_cast<CPUSparseMatrix<ElemType>&&>(*(moveFrom.m_CPUSparseMatrix))),
                            if (m_GPUSparseMatrix != nullptr) m_GPUSparseMatrix->operator=(static_cast<GPUSparseMatrix<ElemType>&&>(*(moveFrom.m_GPUSparseMatrix)));
                            else m_GPUSparseMatrix = new GPUSparseMatrix<ElemType>(static_cast<GPUSparseMatrix<ElemType>&&>(*(moveFrom.m_GPUSparseMatrix))));
#endif
    return *this;
}
template <class ElemType>
void Matrix<ElemType>::ReleaseMemory()
{
    // Free all four underlying representations (deleting nullptr is a no-op) and
    // return the object to the blank UNDETERMINED/NONE state.
    delete m_CPUMatrix;
    delete m_GPUMatrix;
    delete m_GPUSparseMatrix;
    delete m_CPUSparseMatrix;
    m_CPUMatrix = nullptr;
    m_GPUMatrix = nullptr;
    m_GPUSparseMatrix = nullptr;
    m_CPUSparseMatrix = nullptr;
    // BUGFIX: m_baseMatrix aliases one of the objects deleted above; it must be
    // cleared too, or it is left dangling until the next SetDataLocation().
    m_baseMatrix = nullptr;
    m_matrixType = MatrixType::UNDETERMINED;
    m_currentDataLocation = CurrentDataLocation::NONE;
}
template <class ElemType>
Matrix<ElemType>::~Matrix(void)
{
    // Release all underlying matrix objects.
    ReleaseMemory();
}
// Factory: a rows x cols matrix on 'deviceId' with every element set to 1.
template <class ElemType>
Matrix<ElemType> Matrix<ElemType>::Ones(const size_t rows, const size_t cols, DEVICEID_TYPE deviceId)
{
    Matrix<ElemType> result(rows, cols, deviceId); // constructor zero-fills
    result.SetValue(1);
    return result;
}
// Factory: a rows x cols matrix on 'deviceId' with every element set to 0.
template <class ElemType>
Matrix<ElemType> Matrix<ElemType>::Zeros(const size_t rows, const size_t cols, DEVICEID_TYPE deviceId)
{
    Matrix<ElemType> result(rows, cols, deviceId); // constructor zero-fills already
    result.SetValue(0);
    return result;
}
// Factory: a rows x rows identity matrix on 'deviceId'.
template <class ElemType>
Matrix<ElemType> Matrix<ElemType>::Eye(const size_t rows, DEVICEID_TYPE deviceId)
{
    Matrix<ElemType> result(rows, rows, deviceId); // zero-filled square matrix
    result.SetDiagonalValue(1);
    return result;
}
// Factory: a rows x cols matrix filled with uniform random values in [low, high),
// using the given seed.
template <class ElemType>
Matrix<ElemType> Matrix<ElemType>::RandomUniform(const size_t rows, const size_t cols, DEVICEID_TYPE deviceId, const ElemType low, const ElemType high, unsigned long seed)
{
    Matrix<ElemType> result(rows, cols, deviceId);
    result.SetUniformRandomValue(low, high, seed);
    return result;
}
// Factory: a rows x cols matrix filled with Gaussian random values of the given
// mean and standard deviation, using the given seed.
template <class ElemType>
Matrix<ElemType> Matrix<ElemType>::RandomGaussian(const size_t rows, const size_t cols, DEVICEID_TYPE deviceId, const ElemType mean, const ElemType sigma, unsigned long seed)
{
    Matrix<ElemType> result(rows, cols, deviceId);
    result.SetGaussianRandomValue(mean, sigma, seed);
    return result;
}
// Select the active GPU. Negative device ids (CPU / automatic) are ignored.
template <class ElemType>
void Matrix<ElemType>::SetDevice(DEVICEID_TYPE deviceId)
{
    if (deviceId < 0)
        return;
    GPUMatrix<ElemType>::SetDevice(deviceId);
}
template <class ElemType>
void Matrix<ElemType>::Read(File& stream)
{
    // Deserialize a matrix written by Write(): a one-character type tag
    // ('d' = dense, 's' = sparse) followed by the payload of the underlying
    // matrix. Data is read into the representation matching the current device.
    Matrix<ElemType>& M = *this;
    char type;
    stream >> type;
    if (type == 'd') // dense
    {
        if (M.GetDeviceId() < 0) // CPU
        {
            if (M.m_CPUMatrix == nullptr)
                M.m_CPUMatrix = new CPUMatrix<ElemType>();
            stream >> (*M.m_CPUMatrix);
            M.SetDataLocation(CPU, DENSE);
        }
        else // GPU
        {
            if (M.m_GPUMatrix == nullptr)
                M.m_GPUMatrix = new GPUMatrix<ElemType>(M.GetDeviceId());
            stream >> (*M.m_GPUMatrix);
            M.SetDataLocation(GPU, DENSE);
        }
    }
    else if (type == 's') // sparse
    {
        if (M.GetDeviceId() < 0)
        {
            NOT_IMPLEMENTED; // You might want to transfer your matrix to GPU
        }
        else
        {
            if (M.m_GPUSparseMatrix == nullptr)
                M.m_GPUSparseMatrix = new GPUSparseMatrix<ElemType>(M.GetDeviceId());
            stream >> (*M.m_GPUSparseMatrix);
            M.SetDataLocation(GPU, SPARSE);
        }
    }
    else
        // BUGFIX: the message used to claim the valid tags were 'f' or 'd' (they are
        // 'd' and 's') and printed the tag with a decimal conversion behind an "0x"
        // prefix; print both the character and its hex code instead.
        LogicError("Read: Input file corrupt (invalid matrix type field '%c' (0x%02x), should be 'd' or 's').", type, (unsigned int) (unsigned char) type);
}
// Serialize this matrix: a one-character type tag ('d' = dense, 's' = sparse)
// followed by the payload of whichever underlying matrix holds the data.
// Sparse CPU matrices cannot be written yet.
template <class ElemType>
void Matrix<ElemType>::Write(File& stream) const
{
    const Matrix<ElemType>& us = *this;
    if (us.GetMatrixType() == MatrixType::DENSE)
    {
        stream << 'd';
        if (us.GetDeviceId() < 0)
            stream << (*us.m_CPUMatrix);
        else
            stream << (*us.m_GPUMatrix);
    }
    else
    {
        stream << 's';
        if (us.GetDeviceId() < 0)
            NOT_IMPLEMENTED
        else
            stream << (*us.m_GPUSparseMatrix);
    }
}
#pragma endregion Constructors, destructors and other static matrix builders
#pragma region Basic Operators
template <class ElemType>
size_t Matrix<ElemType>::BufferSize() const
{
    // Size in bytes of the underlying allocation: allocated element count times
    // sizeof(ElemType) for dense; sparse representations report their own sizes.
    DISPATCH_MATRIX_ON_FLAG(this,
                            nullptr,
                            return m_baseMatrix->GetSizeAllocated() * sizeof(ElemType),
                            return m_baseMatrix->GetSizeAllocated() * sizeof(ElemType),
                            return m_CPUSparseMatrix->BufferSize(),
                            return m_GPUSparseMatrix->BufferSizeAllocated());
}
template <class ElemType>
ElemType* Matrix<ElemType>::BufferPointer() const
{
    // Raw pointer to the active representation's underlying buffer. For a GPU
    // sparse matrix this points to device memory (note the cast to ElemType*).
    DISPATCH_MATRIX_ON_FLAG(this,
                            nullptr,
                            return m_baseMatrix->GetArray(),
                            return m_baseMatrix->GetArray(),
                            return m_CPUSparseMatrix->BufferPointer(),
                            return (ElemType*) m_GPUSparseMatrix->BufferPointer());
}
template <class ElemType>
size_t Matrix<ElemType>::NzCount() const
{
    // Number of stored (non-zero) elements, as reported by the active underlying matrix.
    return m_baseMatrix->NzCount();
}
template <class ElemType>
ElemType* Matrix<ElemType>::CopyToArray() const
{
    // Allocate and return a flat copy of all elements; the caller takes ownership
    // of the returned array. Dense matrices only.
    DISPATCH_MATRIX_ON_FLAG(this,
                            nullptr,
                            return m_CPUMatrix->CopyToArray(),
                            return m_GPUMatrix->CopyToArray(),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
//memory will be allocated by the callee if not enough but need to be deleted by the caller after it's done
//return number of elements copied
template <class ElemType>
size_t Matrix<ElemType>::CopyToArray(ElemType*& arrayCopyTo, size_t& currentArraySize) const
{
    // Copy all elements into 'arrayCopyTo'; the callee may reallocate (and update
    // currentArraySize) if the buffer is too small -- see the note above this
    // function. Returns the number of elements copied. Dense matrices only.
    DISPATCH_MATRIX_ON_FLAG(this,
                            nullptr,
                            return m_CPUMatrix->CopyToArray(arrayCopyTo, currentArraySize),
                            return m_GPUMatrix->CopyToArray(arrayCopyTo, currentArraySize),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::CopySection(size_t numRows, size_t numCols, ElemType* dst, size_t colStride) const
{
    // Copy a numRows x numCols block into caller-provided memory 'dst', writing
    // columns 'colStride' elements apart. Dense matrices only.
    DISPATCH_MATRIX_ON_FLAG(this,
                            nullptr,
                            m_CPUMatrix->CopySection(numRows, numCols, dst, colStride),
                            m_GPUMatrix->CopySection(numRows, numCols, dst, colStride),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
// BUGBUG: Some code checks before calling here whether one of the dimensions is 0.
// This function must handle that case properly, that is, preserving the non-zero dimension.
template <class ElemType>
Matrix<ElemType> Matrix<ElemType>::ColumnSlice(size_t startColumn, size_t numCols) const
{
    // Return a view of columns [startColumn, startColumn + numCols) with the same
    // location and matrix type as this matrix. The returned object is constructed
    // with matrixFlagDontOwnBuffer, i.e. it does not own the underlying storage.
    int devId = GetDeviceId();
    Matrix<ElemType> slice(matrixFlagDontOwnBuffer, (DEVICEID_TYPE) devId); // this already creates pointers
    slice.m_preferredDeviceId = m_preferredDeviceId;
    // create slices for the underlying object
    // Note: In case of data location == BOTH, this creates two objects just like in the source.
    if (GetMatrixType() == MatrixType::DENSE)
    {
        if (GetCurrentMatrixLocation() == CPU || GetCurrentMatrixLocation() == BOTH)
        {
            // move-assign into an existing object, or move-construct a new one
            if (slice.m_CPUMatrix != nullptr)
                slice.m_CPUMatrix->operator=(static_cast<CPUMatrix<ElemType>&&>(m_CPUMatrix->ColumnSlice(startColumn, numCols)));
            else
                slice.m_CPUMatrix = new CPUMatrix<ElemType>(static_cast<CPUMatrix<ElemType>&&>(m_CPUMatrix->ColumnSlice(startColumn, numCols)));
        }
        if (GetCurrentMatrixLocation() == GPU || GetCurrentMatrixLocation() == BOTH)
        {
            if (slice.m_GPUMatrix != nullptr)
                slice.m_GPUMatrix->operator=(static_cast<GPUMatrix<ElemType>&&>(m_GPUMatrix->ColumnSlice(startColumn, numCols)));
            else
                slice.m_GPUMatrix = new GPUMatrix<ElemType>(static_cast<GPUMatrix<ElemType>&&>(m_GPUMatrix->ColumnSlice(startColumn, numCols)));
        }
    }
    else if (GetMatrixType() == MatrixType::SPARSE)
    {
        if (GetCurrentMatrixLocation() == CPU || GetCurrentMatrixLocation() == BOTH)
        {
            if (slice.m_CPUSparseMatrix != nullptr)
                slice.m_CPUSparseMatrix->operator=(static_cast<CPUSparseMatrix<ElemType>&&>(m_CPUSparseMatrix->ColumnSlice(startColumn, numCols)));
            else
                slice.m_CPUSparseMatrix = new CPUSparseMatrix<ElemType>(static_cast<CPUSparseMatrix<ElemType>&&>(m_CPUSparseMatrix->ColumnSlice(startColumn, numCols)));
        }
        if (GetCurrentMatrixLocation() == GPU || GetCurrentMatrixLocation() == BOTH)
        {
            if (slice.m_GPUSparseMatrix != nullptr)
                slice.m_GPUSparseMatrix->operator=(static_cast<GPUSparseMatrix<ElemType>&&>(m_GPUSparseMatrix->ColumnSlice(startColumn, numCols)));
            else
                slice.m_GPUSparseMatrix = new GPUSparseMatrix<ElemType>(static_cast<GPUSparseMatrix<ElemType>&&>(m_GPUSparseMatrix->ColumnSlice(startColumn, numCols)));
        }
    }
    else
        LogicError("Undetermined matrix type");
    // update the slice's m_currentDataLocation, m_matrixType, and m_baseMatrix
    // This will work for CPU, GPU, and BOTH.
    slice.SetDataLocation(GetCurrentMatrixLocation(), GetMatrixType());
    return slice;
}
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignColumnSlice(const Matrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols)
{
    // Turn this matrix into a slice of columns [startColumn, startColumn + numCols)
    // of 'fromMatrix'. Previously held storage is released first; dense CPU/GPU
    // matrices only. Returns *this.
    ReleaseMemory();
    m_preferredDeviceId = fromMatrix.m_preferredDeviceId;
    DISPATCH_MATRIX_ON_FLAG(&fromMatrix,
                            this,
                            if (m_CPUMatrix != nullptr) m_CPUMatrix->AssignColumnSlice(*fromMatrix.m_CPUMatrix, startColumn, numCols);
                            else m_CPUMatrix = new CPUMatrix<ElemType>(static_cast<CPUMatrix<ElemType>&&>(fromMatrix.m_CPUMatrix->ColumnSlice(startColumn, numCols))),
                            if (m_GPUMatrix != nullptr) m_GPUMatrix->AssignColumnSlice(*fromMatrix.m_GPUMatrix, startColumn, numCols);
                            else m_GPUMatrix = new GPUMatrix<ElemType>(static_cast<GPUMatrix<ElemType>&&>(fromMatrix.m_GPUMatrix->ColumnSlice(startColumn, numCols))),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
    return *this;
}
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::SetColumnSlice(const Matrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols)
{
    // Overwrite columns [startColumn, startColumn + numCols) of this matrix with
    // the contents of 'fromMatrix'. Dense matrices only; this matrix must already
    // be allocated. Returns *this.
    assert(m_CPUMatrix != nullptr || m_GPUMatrix != nullptr);
    // must already been allocated
    DISPATCH_MATRIX_ON_FLAG(&fromMatrix,
                            this,
                            m_CPUMatrix->SetColumnSlice(*fromMatrix.m_CPUMatrix, startColumn, numCols),
                            m_GPUMatrix->SetColumnSlice(*fromMatrix.m_GPUMatrix, startColumn, numCols),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
    return *this;
}
template <class ElemType>
void Matrix<ElemType>::CopyColumnsStrided(const Matrix<ElemType>& fromMatrix, size_t numCols, size_t srcNumColsStride, size_t destNumColsStride)
{
    // Copy 'numCols' columns from 'fromMatrix' into this matrix at the given
    // source/destination column strides (exact stride semantics are defined by
    // CPUMatrix/GPUMatrix::CopyColumnsStrided). Dense matrices only.
    assert(m_CPUMatrix != nullptr || m_GPUMatrix != nullptr);
    DISPATCH_MATRIX_ON_FLAG(&fromMatrix,
                            this,
                            m_CPUMatrix->CopyColumnsStrided(*fromMatrix.m_CPUMatrix, numCols, srcNumColsStride, destNumColsStride),
                            m_GPUMatrix->CopyColumnsStrided(*fromMatrix.m_GPUMatrix, numCols, srcNumColsStride, destNumColsStride),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
// Return this matrix's main diagonal as a new matrix on the same device,
// by delegating to AssignDiagonalValuesTo().
template <class ElemType>
Matrix<ElemType> Matrix<ElemType>::Diagonal() const
{
    Matrix<ElemType> result(matrixFlagDontOwnBuffer, (DEVICEID_TYPE) GetDeviceId());
    result.m_preferredDeviceId = m_preferredDeviceId;
    AssignDiagonalValuesTo(result);
    return result;
}
template <class ElemType>
void Matrix<ElemType>::AssignDiagonalValuesTo(Matrix<ElemType>& diag) const
{
    // Write this matrix's main diagonal into 'diag' as a DENSE matrix (for both
    // dense and sparse sources), after co-locating both objects on one device.
    int devId = GetDeviceId();
    DecideAndMoveToRightDevice(*this, diag);
    if (GetMatrixType() == MatrixType::DENSE)
    {
        if (devId == CPUDEVICE)
        {
            // move-assign into an existing target object, or move-construct a new one
            if (diag.m_CPUMatrix != nullptr)
                diag.m_CPUMatrix->operator=(static_cast<CPUMatrix<ElemType>&&>(m_CPUMatrix->Diagonal()));
            else
                diag.m_CPUMatrix = new CPUMatrix<ElemType>(static_cast<CPUMatrix<ElemType>&&>(m_CPUMatrix->Diagonal()));
            diag.SetDataLocation(CPU, DENSE);
        }
        else
        {
            if (diag.m_GPUMatrix != nullptr)
                diag.m_GPUMatrix->operator=(static_cast<GPUMatrix<ElemType>&&>(m_GPUMatrix->Diagonal()));
            else
                diag.m_GPUMatrix = new GPUMatrix<ElemType>(static_cast<GPUMatrix<ElemType>&&>(m_GPUMatrix->Diagonal()));
            diag.SetDataLocation(GPU, DENSE);
        }
    }
    else if (GetMatrixType() == MatrixType::SPARSE)
    {
        // TODO: Implement optimized diagonal functions for sparse matrices. For now use the DiagonalToDense instead.
        if (devId == CPUDEVICE)
        {
            if (diag.m_CPUMatrix != nullptr)
                diag.m_CPUMatrix->operator=(static_cast<CPUMatrix<ElemType>&&>(m_CPUSparseMatrix->DiagonalToDense()));
            else
                diag.m_CPUMatrix = new CPUMatrix<ElemType>(static_cast<CPUMatrix<ElemType>&&>(m_CPUSparseMatrix->DiagonalToDense()));
            diag.SetDataLocation(CPU, DENSE);
        }
        else
        {
            if (diag.m_GPUMatrix != nullptr)
                diag.m_GPUMatrix->operator=(static_cast<GPUMatrix<ElemType>&&>(m_GPUSparseMatrix->DiagonalToDense()));
            else
                diag.m_GPUMatrix = new GPUMatrix<ElemType>(static_cast<GPUMatrix<ElemType>&&>(m_GPUSparseMatrix->DiagonalToDense()));
            diag.SetDataLocation(GPU, DENSE);
        }
    }
    else
        LogicError("Undetermined matrix type");
}
//this function will change the matrix type between DENSE and SPARSE.
//WARNING: The correct implementation is to copy the matrix between DENSE and SPARSE
// However, the conversion functions are not implemented yet and so it will always create
// a new blank matrix and destroy all info in the original matrix if different matrix type is asked.
// In case of !keepValues, the matrix content will be undefined.
template <class ElemType>
void Matrix<ElemType>::SwitchToMatrixType(MatrixType newMatrixType, MatrixFormat newMatrixFormat, bool keepValues)
{
    // Convert this matrix between DENSE and SPARSE on its current device.
    // If keepValues is false (or the matrix is still blank), the contents after
    // the switch are undefined; see the warning above this function.
    // This check should be uncommented but unfortunately there are still places
    // this function is being called with incorrect "default" format value
    /*if (m_matrixType == newMatrixType && GetFormat() != newMatrixFormat)
        NOT_IMPLEMENTED;*/
    if (m_matrixType == newMatrixType)
        return;
    if (m_baseMatrix == nullptr)
        keepValues = false;

#define NUM_MATRIXTYPE_CHANGED_WARN 20
    m_numTimesMatrixTypeChanged++;
    // BUGFIX: this warning used to talk about device transfers, but the counter
    // tracks dense<->sparse matrix-type changes.
    if (m_numTimesMatrixTypeChanged == NUM_MATRIXTYPE_CHANGED_WARN)
        fprintf(stderr, "WARNING: The same matrix with dim [%lu, %lu] has changed its matrix type %d times.\n", (unsigned long) GetNumRows(), (unsigned long) GetNumCols(), NUM_MATRIXTYPE_CHANGED_WARN);
    if (GetDeviceId() < 0) // CPU
    {
        if (newMatrixType == MatrixType::SPARSE)
        {
            if (m_baseMatrix == nullptr)
                m_CPUSparseMatrix = new CPUSparseMatrix<ElemType>(newMatrixFormat);
            else
                m_CPUSparseMatrix = new CPUSparseMatrix<ElemType>(newMatrixFormat, GetNumRows(), GetNumCols(), 1);
            if (keepValues)
                CopyElementsFromDenseToSparse(*m_CPUMatrix, *m_CPUSparseMatrix);
            delete m_CPUMatrix;
            m_CPUMatrix = nullptr;
            SetDataLocation(CPU, SPARSE);
        }
        else if (newMatrixType == MatrixType::DENSE)
        {
            if (m_baseMatrix == nullptr)
                m_CPUMatrix = new CPUMatrix<ElemType>();
            else
                m_CPUMatrix = new CPUMatrix<ElemType>(GetNumRows(), GetNumCols());
            if (keepValues)
                m_CPUMatrix->SetValue(m_CPUSparseMatrix->CopyColumnSliceToDense(0, GetNumCols()));
            delete m_CPUSparseMatrix;
            m_CPUSparseMatrix = nullptr;
            SetDataLocation(CPU, DENSE);
        }
        else
            LogicError("SwitchToMatrixType: Unexpected/invalid new matrix type");
    }
    else // GPU
    {
        if (newMatrixType == MatrixType::SPARSE)
        {
            if (m_baseMatrix == nullptr)
                m_GPUSparseMatrix = new GPUSparseMatrix<ElemType>(GetDeviceId(), newMatrixFormat);
            else
                m_GPUSparseMatrix = new GPUSparseMatrix<ElemType>(GetNumRows(), GetNumCols(), 0, GetDeviceId(), newMatrixFormat);
            if (keepValues)
                m_GPUSparseMatrix->SetValue(*m_GPUMatrix);
            delete m_GPUMatrix;
            m_GPUMatrix = nullptr;
            SetDataLocation(GPU, SPARSE);
        }
        else if (newMatrixType == MatrixType::DENSE)
        {
            if (m_baseMatrix == nullptr)
                m_GPUMatrix = new GPUMatrix<ElemType>(GetDeviceId());
            else
                m_GPUMatrix = new GPUMatrix<ElemType>(GetNumRows(), GetNumCols(), GetDeviceId());
            if (keepValues)
                m_GPUSparseMatrix->CopyToDenseMatrix(*m_GPUMatrix);
            delete m_GPUSparseMatrix;
            m_GPUSparseMatrix = nullptr;
            SetDataLocation(GPU, DENSE);
        }
        else
            LogicError("SwitchToMatrixType: Unexpected/invalid new matrix type");
    }
}
// Copies every element of a dense CPU matrix into a sparse CPU matrix.
// Elements are visited in column-major order (columns outer, rows inner),
// the same traversal order the original foreach_coord macro produced.
template <class ElemType>
void Matrix<ElemType>::CopyElementsFromDenseToSparse(CPUMatrix<ElemType>& from, CPUSparseMatrix<ElemType>& dest)
{
    const size_t numRows = from.GetNumRows();
    const size_t numCols = from.GetNumCols();
    for (size_t j = 0; j < numCols; j++)
        for (size_t i = 0; i < numRows; i++)
            dest.SetValue(i, j, from(i, j));
}
// Returns the element at position (0,0).
// Only implemented for dense (CPU or GPU) matrices.
template <class ElemType>
ElemType Matrix<ElemType>::Get00Element() const
{
    DISPATCH_MATRIX_ON_FLAG(this,
                            nullptr,
                            return m_CPUMatrix->Get00Element(),
                            return m_GPUMatrix->Get00Element(),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
// Read-only element access.
// WARNING: slow for GPU-resident matrices: the data is first transferred
// to the CPU before the element can be read. Dense only.
template <class ElemType>
const ElemType Matrix<ElemType>::operator()(const size_t row, const size_t col) const
{
    DISPATCH_MATRIX_ON_FLAG_USECPU_4BOTH(this,
                                         nullptr,
                                         return m_CPUMatrix->operator()(row, col),
                                         _transferFromDeviceToDevice(GetDeviceId(), CPUDEVICE, false); return m_CPUMatrix->operator()(row, col),
                                         NOT_IMPLEMENTED,
                                         NOT_IMPLEMENTED);
}
//WARNING: This function is very slow for GPUs since it requires copying values between CPUs and GPUs.
//In addition, if ColumnSlice is used after this function but before the values are copied back to GPU
//the operation will fail since the memory is not managed by the slice.
// If you don't need to modify the values, call the const version above, or GetValue(row,col) which does that for you unambiguously.
// TODO: Can we remove this, and have users use SetValue() instead? To avoid this potential error?
// Writable element access. After the device-to-CPU transfer the location is
// marked (CPU, DENSE) so the modified CPU copy becomes the authoritative one.
template <class ElemType>
ElemType& Matrix<ElemType>::operator()(const size_t row, const size_t col)
{
    DISPATCH_MATRIX_ON_FLAG_USECPU_4BOTH(this,
                                         nullptr,
                                         return m_CPUMatrix->operator()(row, col),
                                         _transferFromDeviceToDevice(GetDeviceId(), CPUDEVICE, false); SetDataLocation(CPU, DENSE); return m_CPUMatrix->operator()(row, col),
                                         NOT_IMPLEMENTED,
                                         NOT_IMPLEMENTED);
}
// Returns a new matrix that is the transpose of [this], created on the same
// device with the same matrix type and format. [this] is not modified.
template <class ElemType>
Matrix<ElemType> Matrix<ElemType>::Transpose()
{
    if (IsEmpty())
        LogicError("Transpose: Matrix is empty.");
    Matrix<ElemType> result(GetNumCols(), GetNumRows(), (DEVICEID_TYPE) GetDeviceId(), GetMatrixType(), GetFormat());
    result.AssignTransposeOf(*this);
    return result;
}
// [this] = transpose of a. Matrices are first moved to a common device and
// [this] is switched to a's type/format (without keeping old values).
// Not implemented for CPU sparse input.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignTransposeOf(const Matrix<ElemType>& a)
{
    DecideAndMoveToRightDevice(a, *this);
    SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
    DISPATCH_MATRIX_ON_FLAG(&a,
                            this,
                            m_CPUMatrix->AssignTransposeOf(*a.m_CPUMatrix),
                            m_GPUMatrix->AssignTransposeOf(*a.m_GPUMatrix),
                            NOT_IMPLEMENTED,
                            m_GPUSparseMatrix->AssignTransposeOf(*a.m_GPUSparseMatrix));
    return *this;
}
// *this[:,j] = a[:,m[j]] * alpha + *this[:,j] * beta
// m has width of 'this' and contains values w.r.t. 'a'
// Invalid entries (gap columns) are denoted by m(0,j) == -1.
// Dense only; all three matrices are moved to a common device first.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::DoGatherColumnsOf(ElemType beta, const Matrix<ElemType>& m, const Matrix<ElemType>& a, ElemType alpha)
{
    DecideAndMoveToRightDevice(*this, m, a); // TODO: only move target if beta != 0
    DISPATCH_MATRIX_ON_FLAG(&a,
                            this,
                            m_CPUMatrix->DoGatherColumnsOf(beta, *m.m_CPUMatrix, *a.m_CPUMatrix, alpha),
                            m_GPUMatrix->DoGatherColumnsOf(beta, *m.m_GPUMatrix, *a.m_GPUMatrix, alpha),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
    return *this;
}
// *this[:,m[j]] = a[:,j] * alpha + *this[:,m[j]] * beta
// m has width of 'a' and contains values w.r.t. 'this'
// Unlike gather, for scatter, 'this' must have been sized already.
// Invalid entries (gap columns) are denoted by m(0,j) == -1.
// Dense only; all three matrices are moved to a common device first.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::DoScatterColumnsOf(ElemType beta, const Matrix<ElemType>& m, const Matrix<ElemType>& a, ElemType alpha)
{
    DecideAndMoveToRightDevice(*this, m, a); // TODO: only move target if beta != 0
    DISPATCH_MATRIX_ON_FLAG(&a,
                            this,
                            m_CPUMatrix->DoScatterColumnsOf(beta, *m.m_CPUMatrix, *a.m_CPUMatrix, alpha),
                            m_GPUMatrix->DoScatterColumnsOf(beta, *m.m_GPUMatrix, *a.m_GPUMatrix, alpha),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
    return *this;
}
// set all elements of a matrix to a scalar value
// For sparse matrices, the only allowed value is 0 (implemented as Reset(),
// which clears the stored non-zero elements).
template <class ElemType>
void Matrix<ElemType>::SetValue(const ElemType v)
{
    if (IsEmpty()) // if empty then we are done
        return;
    if (v == 0 && GetMatrixType() == MatrixType::SPARSE) // if sparse, setting it to 0 is special
    {
        Reset();
        return;
    }
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->SetValue(v),
                            m_GPUMatrix->SetValue(v),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
// Sets all elements to a scalar that is bound to a specific device.
// For the GPU path the number must live on the same device as the matrix.
template <class ElemType>
void Matrix<ElemType>::SetValue(const DeviceBoundNumber<ElemType>& db_number)
{
    if (IsEmpty()) // if empty then we are done
        return;
    // LogicError("SetValue: Matrix is empty.");
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->SetValue(*db_number.ExposePointer2Value()),
                            {
                                if (GetDeviceId() != db_number.GetDeviceId())
                                    RuntimeError("Matrix and device bound number must be on the same device");
                                m_GPUMatrix->SetValue(db_number.ExposePointer2Value());
                            },
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
// Produces a quiet NaN of type float; the payload argument is ignored.
template <>
/*static*/ float Matrix<float>::MakeNan(size_t /*payload*/)
{
    const float quietNaN = nanf("");
    return quietNaN;
}
// Produces a quiet NaN of type double; the payload argument is ignored.
template <>
/*static*/ double Matrix<double>::MakeNan(size_t /*payload*/)
{
    const double quietNaN = nan("");
    return quietNaN;
}
// char has no NaN representation; return 0 so generic code still compiles.
// (needed for completeness)
template <>
/*static*/ char Matrix<char>::MakeNan(size_t)
{
    const char placeholder = 0;
    return placeholder;
}
// Masks columns of [this] according to columnsMask; masked columns are set
// to 'val' (presumably where the mask entry is 0 — confirm in
// CPUMatrix/GPUMatrix::MaskColumnsValue). Matrix and mask must agree in
// column count and, unless [this] is on the CPU or the mask location is
// BOTH, reside on the same device. Dense only.
template <class ElemType>
void Matrix<ElemType>::MaskColumnsValue(const Matrix<char>& columnsMask, ElemType val)
{
    if (GetNumCols() != columnsMask.GetNumCols())
        RuntimeError("MaskColumnsValue: Matrix and column mask must have equal number of columns.");
    if (GetCurrentMatrixLocation() == CPU && (columnsMask.GetCurrentMatrixLocation() == CPU || columnsMask.GetCurrentMatrixLocation() == BOTH))
        ; // OK
    else if (GetDeviceId() != columnsMask.GetDeviceId() && columnsMask.GetCurrentMatrixLocation() != BOTH)
        RuntimeError("MaskColumnsValue: Matrix and column mask must be on the same device.");
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->MaskColumnsValue(*columnsMask.m_CPUMatrix, val),
                            m_GPUMatrix->MaskColumnsValue(*columnsMask.m_GPUMatrix, val),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
// Copies a raw element array into column colInd. The array must hold at
// least GetNumRows() elements. Dense only.
template <class ElemType>
void Matrix<ElemType>::SetColumn(const ElemType* colPointer, size_t colInd)
{
    if (colPointer == nullptr)
        InvalidArgument("SetColumn: colPointer is null.");
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->SetColumn(colPointer, colInd),
                            m_GPUMatrix->SetColumn(colPointer, colInd),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
// Sets every element of column colInd to the scalar val. CPU dense only.
template <class ElemType>
void Matrix<ElemType>::SetColumn(const ElemType val, size_t colInd)
{
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->SetColumn(val, colInd),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
// Copies the column matrix colMat into column colInd of [this].
// Both matrices are moved to a common device first. Dense only.
template <class ElemType>
void Matrix<ElemType>::SetColumn(const Matrix<ElemType>& colMat, size_t colInd)
{
    DecideAndMoveToRightDevice(*this, colMat);
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->SetColumn(*colMat.m_CPUMatrix, colInd),
                            m_GPUMatrix->SetColumn(*colMat.m_GPUMatrix, colInd),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
// Deep-copies deepCopyFrom into [this]. Adopts the source's preferred
// device and matrix type; 'format' selects the sparse format when the
// source is sparse. Self-assignment is a no-op.
template <class ElemType>
void Matrix<ElemType>::SetValue(const Matrix<ElemType>& deepCopyFrom, const MatrixFormat format /*= matrixFormatSparseCSR*/)
{
    if (this == &deepCopyFrom)
        return;
    m_preferredDeviceId = deepCopyFrom.m_preferredDeviceId;
    DecideAndMoveToRightDevice(deepCopyFrom, *this);
    SwitchToMatrixType(deepCopyFrom.GetMatrixType(), format, false);
    DISPATCH_MATRIX_ON_FLAG(&deepCopyFrom,
                            this,
                            m_CPUMatrix->SetValue(*deepCopyFrom.m_CPUMatrix),
                            m_GPUMatrix->SetValue(*deepCopyFrom.m_GPUMatrix),
                            m_CPUSparseMatrix->SetValue(*deepCopyFrom.m_CPUSparseMatrix),
                            m_GPUSparseMatrix->SetValue(*deepCopyFrom.m_GPUSparseMatrix));
}
// Loads the matrix from a raw element array of numRows x numCols values.
// 'matrixFlags' controls interpretation of pArray (e.g. layout/ownership —
// see CPUMatrix/GPUMatrix::SetValue). Dense only.
template <class ElemType>
void Matrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, int deviceId, ElemType* pArray, const size_t matrixFlags)
{
    if (((numRows * numCols) > 0) && (pArray == nullptr))
        InvalidArgument("Invalid pArray.");
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->SetValue(numRows, numCols, pArray, matrixFlags),
                            m_GPUMatrix->SetValue(numRows, numCols, deviceId, pArray, matrixFlags),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
// Sets a single element (rIdx, cIdx) to val. Runs on the CPU copy when the
// data location is BOTH. Supported for CPU dense and CPU sparse only.
template <class ElemType>
void Matrix<ElemType>::SetValue(const size_t rIdx, const size_t cIdx, ElemType val)
{
    DISPATCH_MATRIX_ON_FLAG_USECPU_4BOTH(this,
                                         this,
                                         (*m_CPUMatrix)(rIdx, cIdx) = val,
                                         NOT_IMPLEMENTED,
                                         m_CPUSparseMatrix->SetValue(rIdx, cIdx, val),
                                         NOT_IMPLEMENTED);
}
// read features
// Loads a sparse matrix from host-side CSC arrays (column offsets, row
// indices, values, nz = number of non-zeros). Sparse only.
template <class ElemType>
void Matrix<ElemType>::SetMatrixFromCSCFormat(const CPUSPARSE_INDEX_TYPE* h_CSCCol, const CPUSPARSE_INDEX_TYPE* h_Row, const ElemType* h_Val,
                                              const size_t nz, const size_t numRows, const size_t numCols)
{
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED,
                            m_CPUSparseMatrix->SetMatrixFromCSCFormat(h_CSCCol, h_Row, h_Val, nz, numRows, numCols),
                            m_GPUSparseMatrix->SetMatrixFromCSCFormat(h_CSCCol, h_Row, h_Val, nz, numRows, numCols));
}
// Sets the main diagonal to the scalar v. The matrix must be square and
// non-empty. Dense only.
template <class ElemType>
void Matrix<ElemType>::SetDiagonalValue(const ElemType v)
{
    if (IsEmpty())
        LogicError("SetDiagonalValue: Matrix is empty.");
    if (GetNumRows() != GetNumCols())
        LogicError("SetDiagonalValue: NumRows and NumCols do not agree.");
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->SetDiagonalValue(v),
                            m_GPUMatrix->SetDiagonalValue(v),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
// Sets the main diagonal of the (square) matrix from a row or column
// vector whose element count equals GetNumRows(). A 1x1 vector reduces to
// the scalar overload.
template <class ElemType>
void Matrix<ElemType>::SetDiagonalValue(const Matrix<ElemType>& vector)
{
    if (GetNumRows() != GetNumCols())
        LogicError("SetDiagonalValue: NumRows and NumCols do not agree.");
    if (vector.GetNumRows() != 1 && vector.GetNumCols() != 1)
        LogicError("SetDiagonalValue: Input vector must be a vector.");
    if (vector.GetNumRows() * vector.GetNumCols() != GetNumRows())
        LogicError("SetDiagonalValue: Input vector must match matrix dimension.");
    if (IsEmpty())
        return;
    DecideAndMoveToRightDevice(*this, vector);
    if (vector.GetNumElements() == 1) // reduce to simple form
    {
        DISPATCH_MATRIX_ON_FLAG(&vector,
                                nullptr,
                                SetDiagonalValue(vector(0, 0)),
                                SetDiagonalValue(vector.m_GPUMatrix->Get00Element()), // BUGBUG: efficiency
                                SetDiagonalValue(vector(0, 0)),
                                SetDiagonalValue(vector.m_GPUMatrix->Get00Element()) // BUGBUG: efficiency
                                );
    }
    else if (vector.GetNumRows() != GetNumRows())
        LogicError("SetDiagonalValue: input vector's dimension does not agree with [this].");
    else
    {
        // WARNING: we use this pointer to decide which function to call. However, vector may be stored in a different matrix type (DENSE, SPARSE)
        DISPATCH_MATRIX_ON_FLAG(this,
                                this,
                                assert(vector.m_CPUMatrix != nullptr);
                                m_CPUMatrix->SetDiagonalValue(*vector.m_CPUMatrix),
                                assert(vector.m_GPUMatrix != nullptr);
                                m_GPUMatrix->SetDiagonalValue(*vector.m_GPUMatrix),
                                NOT_IMPLEMENTED,
                                NOT_IMPLEMENTED);
    }
}
// Fills the matrix with values drawn uniformly from [low, high] using the
// given RNG seed. No-op on an empty matrix. Dense only.
template <class ElemType>
void Matrix<ElemType>::SetUniformRandomValue(const ElemType low, const ElemType high, unsigned long seed)
{
    if (IsEmpty())
        return;
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->SetUniformRandomValue(low, high, seed),
                            m_GPUMatrix->SetUniformRandomValue(low, high, seed),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
// Fills the matrix with samples from a Gaussian distribution with the given
// mean and standard deviation 'sigma' (must be > 0), seeded by 'seed'.
// No-op on an empty matrix. Dense only.
template <class ElemType>
void Matrix<ElemType>::SetGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed)
{
    if (sigma <= 0)
        // Fixed: the error message previously named SetUniformRandomValue.
        InvalidArgument("SetGaussianRandomValue: sigma must be a positive value.");
    if (IsEmpty())
        return;
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->SetGaussianRandomValue(mean, sigma, seed),
                            m_GPUMatrix->SetGaussianRandomValue(mean, sigma, seed),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
// Adds Gaussian noise (given mean and standard deviation 'sigma' > 0) to
// every element, seeded by 'seed'. No-op on an empty matrix. CPU dense only.
template <class ElemType>
void Matrix<ElemType>::AddGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed)
{
    if (sigma <= 0)
        // Fixed: the error message previously named SetUniformRandomValue.
        InvalidArgument("AddGaussianRandomValue: sigma must be a positive value.");
    if (IsEmpty())
        return;
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->AddGaussianRandomValue(mean, sigma, seed),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
//maskRate: percentage of values masked out (similar to dropout rate)
//scaleValue: which scale value to set to the left ones (unmasked items).
// No-op on an empty matrix. Dense only.
template <class ElemType>
void Matrix<ElemType>::SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, unsigned long seed)
{
    if (IsEmpty())
        return;
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->SetUniformRandomMask(maskRate, scaleValue, seed),
                            m_GPUMatrix->SetUniformRandomMask(maskRate, scaleValue, seed),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
// Momentum SGD update. [this] is the smoothed-gradient (velocity) buffer;
// 'gradients' is the raw gradient; 'functionValues' holds the model
// parameters being updated. Dispatches on the gradient's storage type.
// With useNesterovMomentum, applies the Nesterov-accelerated variant.
// NOTE(review): both [this] and 'gradients' may be modified (sparse paths
// call NormalGrad on the gradient itself).
template <class ElemType>
void Matrix<ElemType>::NormalGrad(Matrix<ElemType>& gradients,
                                  Matrix<ElemType>& functionValues,
                                  const ElemType learnRatePerSample,
                                  const ElemType momentum,
                                  const bool useNesterovMomentum)
{
    DecideAndMoveToRightDevice(*this, gradients, functionValues);
    if (!useNesterovMomentum)
    {
        DISPATCH_MATRIX_ON_FLAG(&gradients,
                                nullptr,
                                ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this);
                                functionValues -= *this,
                                ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this);
                                functionValues -= *this,
                                if (momentum != 0) gradients.m_CPUSparseMatrix->NormalGrad(*m_CPUMatrix, momentum);
                                ScaleAndAdd(-learnRatePerSample, gradients, functionValues),
                                if (momentum != 0) gradients.m_GPUSparseMatrix->NormalGrad(*m_GPUMatrix, momentum);
                                ScaleAndAdd(-learnRatePerSample, gradients, functionValues));
    }
    else
    {
        DISPATCH_MATRIX_ON_FLAG(&gradients,
                                nullptr,
                                { /* CPU dense */
                                  ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this);
                                  ScaleAndAdd(-momentum, *this, functionValues);
                                  ScaleAndAdd(-(1 - momentum) * learnRatePerSample, gradients, functionValues);
                                  // w_t = w_{t-1} - momentum * v_{t-1} - (1 - momentum) * learnRatePerSample * gradient
                                },
                                { /* GPU dense */
                                  ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this);
                                  ScaleAndAdd(-momentum, *this, functionValues);
                                  ScaleAndAdd(-(1 - momentum) * learnRatePerSample, gradients, functionValues);
                                },
                                { /* CPU sparse */
                                  if (momentum != 0)
                                  {
                                      // Cache the raw gradient because NormalGrad modifies it in place.
                                      Matrix<ElemType> gradientCache(gradients.GetDeviceId());
                                      gradientCache.SetValue(gradients);
                                      gradients.m_CPUSparseMatrix->NormalGrad(*m_CPUMatrix, momentum);
                                      ScaleAndAdd(-momentum, *this, functionValues);
                                      ScaleAndAdd(-(1 - momentum) * learnRatePerSample, gradientCache, functionValues);
                                  }
                                },
                                { /* GPU sparse */
                                  if (momentum != 0)
                                  {
                                      Matrix<ElemType> gradientCache(gradients.GetDeviceId());
                                      gradientCache.SetValue(gradients);
                                      gradients.m_GPUSparseMatrix->NormalGrad(*m_GPUMatrix, momentum);
                                      ScaleAndAdd(-momentum, *this, functionValues);
                                      ScaleAndAdd(-(1 - momentum) * learnRatePerSample, gradientCache, functionValues);
                                  }
                                });
    }
}
//both this and gradients will be changed
// [this] accumulates the squared-gradient statistics; returns the value
// computed by the underlying Adagrad implementation (average multiplier
// when needAveMultiplier — confirm in CPUMatrix/GPUMatrix::Adagrad).
// NOTE(review): the SetDataLocation calls after each 'return' are
// unreachable (cf. the disabled C4702 warning at the top of this file).
template <class ElemType>
ElemType Matrix<ElemType>::Adagrad(Matrix<ElemType>& gradients, const bool needAveMultiplier)
{
    DecideAndMoveToRightDevice(*this, gradients);
    DISPATCH_MATRIX_ON_FLAG(&gradients,
                            &gradients,
                            return m_CPUMatrix->Adagrad(*gradients.m_CPUMatrix, needAveMultiplier);
                            SetDataLocation(CPU),
                            return m_GPUMatrix->Adagrad(*gradients.m_GPUMatrix, needAveMultiplier);
                            SetDataLocation(GPU),
                            return gradients.m_CPUSparseMatrix->Adagrad(*m_CPUMatrix, needAveMultiplier);
                            SetDataLocation(CPU),
                            return gradients.m_GPUSparseMatrix->Adagrad(*m_GPUMatrix, needAveMultiplier);
                            SetDataLocation(GPU));
}
// FSAdagrad update (DBN-style Adagrad with momentum). [this] holds the
// smoothed statistics; 'functionValues' are the parameters being updated.
// Dense only.
// NOTE(review): 'aggadagradsqrframes' is a function-local static shared by
// all matrices of this element type and is not thread-safe — confirm that
// callers never run this concurrently.
template <class ElemType>
void Matrix<ElemType>::FSAdagrad(size_t mbSize, Matrix<ElemType>& gradients, Matrix<ElemType>& functionValues, const ElemType learnRatePerSample, const ElemType momentum)
{
    // TODO: The values of 'adagradT' and 'targetadagradavdenom' are currently hardcoded constants taken from DBN (empirically determined).
    // These should be made configurable if needed
    const size_t adagradT = 2 * 3600 * 100;
    const ElemType targetadagradavdenom = 0.0025; // 1/400 magic constant
    const ElemType adagradkeepweight = static_cast<ElemType>(exp(-1.0 * mbSize / adagradT));
    static ElemType aggadagradsqrframes = 0;
    aggadagradsqrframes = adagradkeepweight * aggadagradsqrframes + (1.0f - adagradkeepweight) * mbSize;
    const ElemType targetadagradavdenom_x_sqrtadagradsqrframes = static_cast<ElemType>(targetadagradavdenom * sqrt(aggadagradsqrframes));
    DISPATCH_MATRIX_ON_FLAG(&gradients,
                            &gradients,
                            m_CPUMatrix->FSAdagrad(*gradients.m_CPUMatrix, *functionValues.m_CPUMatrix, learnRatePerSample, momentum, adagradkeepweight, targetadagradavdenom_x_sqrtadagradsqrframes);
                            SetDataLocation(CPU),
                            m_GPUMatrix->FSAdagrad(*gradients.m_GPUMatrix, *functionValues.m_GPUMatrix, learnRatePerSample, momentum, adagradkeepweight, targetadagradavdenom_x_sqrtadagradsqrframes);
                            SetDataLocation(GPU),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
// RmsProp update. [this] holds the smoothed statistics; the RMS_* constants
// parameterize the per-weight learning-rate adaptation. Dense only.
// NOTE(review): the SetDataLocation calls after each 'return' are
// unreachable (cf. the disabled C4702 warning at the top of this file).
template <class ElemType>
ElemType Matrix<ElemType>::RmsProp(Matrix<ElemType>& gradients,
                                   ElemType RMS_GAMMA,
                                   ElemType RMS_WGT_INC,
                                   ElemType RMS_WGT_MAX,
                                   ElemType RMS_WGT_DEC,
                                   ElemType RMS_WGT_MIN,
                                   const bool needAveMultiplier)
{
    DecideAndMoveToRightDevice(*this, gradients);
    DISPATCH_MATRIX_ON_FLAG(this,
                            &gradients,
                            return m_CPUMatrix->RmsProp(*gradients.m_CPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier);
                            SetDataLocation(CPU),
                            return m_GPUMatrix->RmsProp(*gradients.m_GPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier);
                            SetDataLocation(GPU),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
// Reinterprets the matrix with a new (numRows, numCols) shape; no-op when
// the shape already matches. Not implemented for CPU sparse.
template <class ElemType>
void Matrix<ElemType>::Reshape(const size_t numRows, const size_t numCols)
{
    if (numRows != GetNumRows() || numCols != GetNumCols())
    {
        DISPATCH_MATRIX_ON_FLAG(this,
                                this,
                                m_CPUMatrix->Reshape(numRows, numCols),
                                m_GPUMatrix->Reshape(numRows, numCols),
                                NOT_IMPLEMENTED,
                                m_GPUSparseMatrix->Reshape(numRows, numCols));
    }
}
// Note: Resize() will leave the matrix content undefined.
// numNZElemToReserve is used only by the sparse paths; growOnly avoids
// shrinking an already-large allocation.
template <class ElemType>
void Matrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve /*=0*/, bool growOnly /*=true*/)
{
    // TODO: should this function test whether the size is changing, and skip if it isn't? We have at least one explicit test for this code calling this (recurrent node)
    DISPATCH_MATRIX_ON_FLAG_USEBOTH_4BOTH(this,
                                          this,
                                          m_CPUMatrix->Resize(numRows, numCols, growOnly),
                                          m_GPUMatrix->Resize(numRows, numCols, growOnly),
                                          m_CPUSparseMatrix->Resize(numRows, numCols, numNZElemToReserve, growOnly, false),
                                          m_GPUSparseMatrix->Resize(numRows, numCols, numNZElemToReserve, growOnly, false));
#ifdef _DEBUG
    if (GetMatrixType() != MatrixType::SPARSE)
        Invalidate(); // Fill the matrix with NaNs to detect using the content which is undefined. Unfortunately this won't work for sparse matrices.
#endif
}
// Tiles 'frmMat' colRatio times horizontally and returns the result on the
// same device. rowRatio must be 1 — tiling along rows is not implemented.
template <class ElemType>
Matrix<ElemType> Matrix<ElemType>::RepMat(const Matrix<ElemType>& frmMat, const size_t rowRatio, const size_t colRatio)
{
    size_t nCols = frmMat.GetNumCols();
    size_t nRows = frmMat.GetNumRows();
    if (rowRatio > 1)
        RuntimeError("RepMat not yet supporting row ratio larger than 1"); // fixed typo: was "raw ratio"
    size_t newCols = colRatio * nCols;
    Matrix<ElemType> c(nRows, newCols, frmMat.GetDeviceId());
    // Copy the source into each horizontal tile.
    for (size_t i = 0; i < colRatio; i++)
    {
        c.ColumnSlice(i * nCols, nCols).SetValue(frmMat);
    }
    return c;
}
// Returns the allocated element capacity of the underlying storage
// (which may exceed rows*cols after a grow-only Resize).
template <class ElemType>
size_t Matrix<ElemType>::GetAllocatedSize() const
{
    return m_baseMatrix->GetSizeAllocated();
}
// reset for sparse matrix. Semantically the same as setting all values to 0.
// Only valid for sparse matrices; dense matrices are NOT_IMPLEMENTED here
// (use SetValue(0) instead, which routes zeroing appropriately).
template <class ElemType>
void Matrix<ElemType>::Reset()
{
    DISPATCH_MATRIX_ON_FLAG_USEBOTH_4BOTH(this,
                                          this,
                                          NOT_IMPLEMENTED,
                                          NOT_IMPLEMENTED,
                                          m_CPUSparseMatrix->Reset(),
                                          m_GPUSparseMatrix->Reset());
}
// Returns the number of rows, delegating to the active underlying matrix.
template <class ElemType>
size_t Matrix<ElemType>::GetNumRows() const
{
    return m_baseMatrix->GetNumRows();
}
// Returns the number of columns, delegating to the active underlying matrix.
template <class ElemType>
size_t Matrix<ElemType>::GetNumCols() const
{
    return m_baseMatrix->GetNumCols();
}
// Returns the total element count (rows * cols).
template <class ElemType>
size_t Matrix<ElemType>::GetNumElements() const
{
    return GetNumRows() * GetNumCols();
}
// True when the matrix holds no elements.
template <class ElemType>
bool Matrix<ElemType>::IsEmpty() const
{
    return m_baseMatrix->IsEmpty();
}
#pragma endregion Basic Operators
#pragma region Member BLAS Functions
// In-place element-wise addition of a scalar: [this] += alpha.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::operator+=(ElemType alpha)
{
    return AssignSumOf(alpha, *this);
}
// Returns [this] + alpha element-wise; [this] is left unchanged.
template <class ElemType>
Matrix<ElemType> Matrix<ElemType>::operator+(ElemType alpha) const
{
    Matrix<ElemType> sum(GetNumRows(), GetNumCols(), GetDeviceId());
    sum.AssignSumOf(alpha, *this);
    return sum;
}
// [this] = alpha + a, element-wise. Adopts a's type/format. Dense only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignSumOf(const ElemType alpha, const Matrix<ElemType>& a)
{
    if (a.IsEmpty())
        LogicError("AssignSumOf: Matrix a is empty.");
    DecideAndMoveToRightDevice(a, *this);
    SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
    DISPATCH_MATRIX_ON_FLAG(&a,
                            this,
                            m_CPUMatrix->AssignSumOf(alpha, *a.m_CPUMatrix),
                            m_GPUMatrix->AssignSumOf(alpha, *a.m_GPUMatrix),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
    return *this;
}
//if [this] and a have same dimension then [this]=[this]+a
//if a is a column vector, add to all columns of [this]
//if a is a row vector, add to all rows of [this]
//if a is a scalar, add it to all elements.
// Both operands must already have the same matrix type. Dense only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::operator+=(const Matrix<ElemType>& a)
{
    DecideAndMoveToRightDevice(*this, a);
    if (!(GetMatrixType() == a.GetMatrixType()))
        NOT_IMPLEMENTED;
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->operator+=(*a.m_CPUMatrix),
                            m_GPUMatrix->operator+=(*a.m_GPUMatrix),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
    return *this;
}
//if [this] and a have same dimension then OUTPUT=[this]+a
//if a is a column vector, add to all columns of [this]
//if a is a row vector, add to all rows of [this]
// A 1x1 operand is broadcast-added as a scalar to a copy of the other side.
template <class ElemType>
Matrix<ElemType> Matrix<ElemType>::operator+(const Matrix<ElemType>& a) const
{
    if (GetNumElements() == 1)
    {
        Matrix<ElemType> c(a.DeepClone());
        DISPATCH_MATRIX_ON_FLAG(this,
                                &c,
                                c += (*this)(0, 0),
                                c += (m_GPUMatrix->Get00Element()), // BUGBUG: efficiency
                                c += (*this)(0, 0),
                                NOT_IMPLEMENTED);
        return c;
    }
    else if (a.GetNumElements() == 1)
    {
        Matrix<ElemType> c(this->DeepClone());
        DISPATCH_MATRIX_ON_FLAG(&a,
                                &c,
                                c += a(0, 0),
                                c += (a.m_GPUMatrix->Get00Element()), // BUGBUG: efficiency
                                c += a(0, 0),
                                NOT_IMPLEMENTED);
        return c;
    }
    else
    {
        Matrix<ElemType> c(this->DeepClone()); // this implementation will introduce a copy overhead. but make reuse of the code
        c += a;
        return c;
    }
}
// [this] = a + b. If 'a' is a 1x1 scalar, copy 'b' first and broadcast-add
// 'a'; otherwise copy 'a' and add 'b' (operator+= handles broadcasting).
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignSumOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b)
{
    const bool aIsScalar = (a.GetNumElements() == 1);
    const Matrix<ElemType>& base = aIsScalar ? b : a;
    const Matrix<ElemType>& addend = aIsScalar ? a : b;
    SetValue(base);
    (*this) += addend;
    return *this;
}
// In-place element-wise subtraction of a scalar: [this] -= alpha.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::operator-=(ElemType alpha)
{
    return AssignDifferenceOf(*this, alpha);
}
// Returns [this] - alpha element-wise; [this] is left unchanged.
template <class ElemType>
Matrix<ElemType> Matrix<ElemType>::operator-(ElemType alpha) const
{
    Matrix<ElemType> difference(GetNumRows(), GetNumCols(), GetDeviceId());
    difference.AssignDifferenceOf(*this, alpha);
    return difference;
}
//for each column of a, we assign numRows starting from startIndex to this
// Adopts a's type/format. Dense only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows)
{
    DecideAndMoveToRightDevice(a, *this);
    SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->AssignRowSliceValuesOf(*a.m_CPUMatrix, startIndex, numRows),
                            m_GPUMatrix->AssignRowSliceValuesOf(*a.m_GPUMatrix, startIndex, numRows),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
    return *this;
}
//for each column of a, we assign all rows of a to this starting from startIndex
// Both matrices must already have the same matrix type. Dense only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignToRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows)
{
    DecideAndMoveToRightDevice(*this, a);
    // WARNING: a and this must have same type
    if (!(GetMatrixType() == a.GetMatrixType()))
        NOT_IMPLEMENTED;
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->AssignToRowSliceValuesOf(*a.m_CPUMatrix, startIndex, numRows),
                            m_GPUMatrix->AssignToRowSliceValuesOf(*a.m_GPUMatrix, startIndex, numRows),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
    return *this;
}
//for the row slice of this starting from startIndex we add a to it.
// Both matrices must already have the same matrix type. Dense only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AddToRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows)
{
    DecideAndMoveToRightDevice(*this, a);
    // WARNING: a and this must have same type
    if (!(GetMatrixType() == a.GetMatrixType()))
        NOT_IMPLEMENTED;
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->AddToRowSliceValuesOf(*a.m_CPUMatrix, startIndex, numRows),
                            m_GPUMatrix->AddToRowSliceValuesOf(*a.m_GPUMatrix, startIndex, numRows),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
    return *this;
}
//for each column of this, we add row slice of a starting from startIndex
// Both matrices must already have the same matrix type. Dense only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AddWithRowSliceValuesOf(const Matrix<ElemType>& a, const size_t startIndex, const size_t numRows)
{
    DecideAndMoveToRightDevice(*this, a);
    // WARNING: a and this must have same type
    if (!(GetMatrixType() == a.GetMatrixType()))
        NOT_IMPLEMENTED;
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->AddWithRowSliceValuesOf(*a.m_CPUMatrix, startIndex, numRows),
                            m_GPUMatrix->AddWithRowSliceValuesOf(*a.m_GPUMatrix, startIndex, numRows),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
    return *this;
}
// [this] = a tiled numRowRepeats times vertically and numColRepeats times
// horizontally. Both matrices must have the same matrix type. Dense only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignRepeatOf(const Matrix<ElemType>& a, const size_t numRowRepeats, const size_t numColRepeats)
{
    DecideAndMoveToRightDevice(*this, a);
    // WARNING: a and this must have same type
    if (!(GetMatrixType() == a.GetMatrixType()))
        NOT_IMPLEMENTED;
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->AssignRepeatOf(*a.m_CPUMatrix, numRowRepeats, numColRepeats),
                            m_GPUMatrix->AssignRepeatOf(*a.m_GPUMatrix, numRowRepeats, numColRepeats),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
    return *this;
}
// Adds a (whose rows are [this]'s rows repeated numRepeats times) back into
// [this] — the reduction counterpart of AssignRepeatOf along rows.
// Both matrices must have the same matrix type. Dense only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AddToRowRepeatValuesOf(const Matrix<ElemType>& a, const size_t numRepeats)
{
    DecideAndMoveToRightDevice(*this, a);
    // WARNING: a and this must have same type
    if (!(GetMatrixType() == a.GetMatrixType()))
        NOT_IMPLEMENTED;
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->AddToRowRepeatValuesOf(*a.m_CPUMatrix, numRepeats),
                            m_GPUMatrix->AddToRowRepeatValuesOf(*a.m_GPUMatrix, numRepeats),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
    return *this;
}
//used in the DSSM model. The resulted *this is a [a.GetRows()*(negNumber+1), a.GetCols()] matrix
//each column contains posNumber of positive samples (original) and negNumber negative samples generated by copying
//sample shifted by shiftNumber columns
// Both matrices must have the same matrix type. Dense only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignPositiveAndShiftedNegSample(const Matrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber)
{
    DecideAndMoveToRightDevice(*this, a);
    // WARNING: a and this must have same type
    if (!(GetMatrixType() == a.GetMatrixType()))
        NOT_IMPLEMENTED;
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->AssignPositiveAndShiftedNegSample(*a.m_CPUMatrix, posNumber, negNumber, shiftNumber),
                            m_GPUMatrix->AssignPositiveAndShiftedNegSample(*a.m_GPUMatrix, posNumber, negNumber, shiftNumber),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
    return *this;
}
//used in the DSSM model. *this = *this + positive and negative samples folded back to the right place
//each column of a contains posNumber of positive samples (original) and negNumber negative samples generated by copying
//sample shifted by shiftNumber columns
// Both matrices must have the same matrix type. Dense only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AddFoldedPositiveAndShiftedNegSample(const Matrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber)
{
    DecideAndMoveToRightDevice(*this, a);
    // WARNING: a and this must have same type
    if (!(GetMatrixType() == a.GetMatrixType()))
        NOT_IMPLEMENTED;
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->AddFoldedPositiveAndShiftedNegSample(*a.m_CPUMatrix, posNumber, negNumber, shiftNumber),
                            m_GPUMatrix->AddFoldedPositiveAndShiftedNegSample(*a.m_GPUMatrix, posNumber, negNumber, shiftNumber),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
    return *this;
}
// [this] = alpha - a, element-wise. Adopts a's type/format. Dense only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignDifferenceOf(const ElemType alpha, const Matrix<ElemType>& a)
{
    if (a.IsEmpty())
        LogicError("AssignDifferenceOf: Matrix a is empty.");
    DecideAndMoveToRightDevice(a, *this);
    SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->AssignDifferenceOf(alpha, *a.m_CPUMatrix),
                            m_GPUMatrix->AssignDifferenceOf(alpha, *a.m_GPUMatrix),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
    return *this;
}
// [this] = a - alpha, element-wise. Adopts a's type/format. Dense only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignDifferenceOf(const Matrix<ElemType>& a, const ElemType alpha)
{
    if (a.IsEmpty())
        LogicError("AssignDifferenceOf: Matrix a is empty.");
    DecideAndMoveToRightDevice(a, *this);
    SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->AssignDifferenceOf(*a.m_CPUMatrix, alpha),
                            m_GPUMatrix->AssignDifferenceOf(*a.m_GPUMatrix, alpha),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
    return *this;
}
//if [this] and a have same dimension then [this]=[this]-a
//if a is a column vector, minus it from all columns of [this]
//if a is a row vector, minus it from all rows of [this]
// Dense only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::operator-=(const Matrix<ElemType>& a)
{
    if (a.IsEmpty())
        LogicError("Minus Operation: Matrix a is empty.");
    DecideAndMoveToRightDevice(*this, a);
    DISPATCH_MATRIX_ON_FLAG(this,
                            this, * m_CPUMatrix -= *a.m_CPUMatrix, * m_GPUMatrix -= *a.m_GPUMatrix,
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
    return *this;
}
//if [this] and a have same dimension then output=[this]-a
//if a is a column vector, minus it from all columns of [this]
//if a is a row vector, minus it from all rows of [this]
// Implemented as a deep copy followed by c = c + (-1) * a; the copy adds
// overhead but reuses the ScaleAndAdd code path.
template <class ElemType>
Matrix<ElemType> Matrix<ElemType>::operator-(const Matrix<ElemType>& a) const
{
    Matrix<ElemType> difference(DeepClone());
    ScaleAndAdd(-1, a, difference);
    return difference;
}
// [this] = a - b.
// If 'a' broadcasts (is strictly smaller than 'b' in some dimension), we
// compute b - a and negate instead, because operator-=() only broadcasts
// its right-hand side.
// TODO: We need ternary ops where the output storage is separate.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignDifferenceOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b)
{
    const bool aBroadcasts = a.GetNumRows() < b.GetNumRows() || a.GetNumCols() < b.GetNumCols();
    if (aBroadcasts)
    {
        // Mixed larger/smaller dimensions cannot be broadcast.
        if (a.GetNumRows() > b.GetNumRows() || a.GetNumCols() > b.GetNumCols())
            LogicError("AssignDifferenceOf: Invalid dimensions.");
        AssignDifferenceOf(b, a); // [this] = b - a
        *this *= -1;              // flip sign to get a - b
        return *this;
    }
    if (this != &a) // skip the copy on self-assignment
        SetValue(a);
    (*this) -= b;
    return *this;
}
// In-place scaling: [this] *= alpha.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::operator*=(ElemType alpha)
{
    Scale(alpha, *this);
    return *this;
}
// Returns alpha * [this] without modifying [this]; the result is created on
// the preferred device.
template <class ElemType>
Matrix<ElemType> Matrix<ElemType>::operator*(ElemType alpha) const
{
    Matrix<ElemType> product(GetNumRows(), GetNumCols(), (DEVICEID_TYPE) m_preferredDeviceId);
    Scale(alpha, *this, product);
    return product;
}
// [this] = alpha * a (scalar scaling).
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignProductOf(const ElemType alpha, const Matrix<ElemType>& a)
{
    Scale(alpha, a, *this);
    return *this;
}
// [this]=a*b
// Matrix product with optional transposes. A 1x1 operand is treated as a
// scalar: the other operand is copied (transposed if requested) and scaled.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignProductOf(const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB)
{
    if (a.GetNumElements() == 1)
    {
        if (transposeB)
            AssignTransposeOf(b);
        else
            this->SetValue(b);
        DISPATCH_MATRIX_ON_FLAG(this,
                                nullptr,
                                (*this) *= a(0, 0),
                                (*this) *= a.m_GPUMatrix->Get00Element(),
                                (*this) *= a(0, 0),
                                NOT_IMPLEMENTED);
    }
    else if (b.GetNumElements() == 1)
    {
        if (transposeA)
            AssignTransposeOf(a);
        else
            this->SetValue(a);
        DISPATCH_MATRIX_ON_FLAG(this,
                                nullptr,
                                (*this) *= b(0, 0),
                                (*this) *= b.m_GPUMatrix->Get00Element(),
                                (*this) *= b(0, 0),
                                NOT_IMPLEMENTED);
    }
    else
        Multiply(a, transposeA, b, transposeB, *this);
    return *this;
}
// Matrix product [this] * a. A 1x1 operand on either side is treated as a
// scalar multiplication of the other operand.
template <class ElemType>
Matrix<ElemType> Matrix<ElemType>::operator*(const Matrix<ElemType>& a) const
{
    if (GetNumElements() == 1)
    {
        Matrix<ElemType> c((DEVICEID_TYPE) a.GetPreferredDeviceId());
        DISPATCH_MATRIX_ON_FLAG(this,
                                nullptr,
                                c.AssignProductOf((*this)(0, 0), a),
                                c.AssignProductOf(m_GPUMatrix->Get00Element(), a), // BUGBUG: efficiency
                                c.AssignProductOf((*this)(0, 0), a),
                                NOT_IMPLEMENTED);
        return c;
    }
    else if (a.GetNumElements() == 1)
    {
        Matrix<ElemType> c((DEVICEID_TYPE) GetPreferredDeviceId());
        DISPATCH_MATRIX_ON_FLAG(&a,
                                nullptr,
                                c.AssignProductOf(a(0, 0), (*this)),
                                c.AssignProductOf(a.m_GPUMatrix->Get00Element(), (*this)), // BUGBUG: efficiency
                                c.AssignProductOf(a(0, 0), (*this)),
                                NOT_IMPLEMENTED);
        return c;
    }
    else
    {
        Matrix<ElemType> c(GetNumRows(), a.GetNumCols(), (DEVICEID_TYPE) GetPreferredDeviceId());
        Multiply(*this, a, c);
        return c;
    }
}
// [this] = a * b where a is a 1x1 scalar; previous contents of [this] are overwritten (beta = 0).
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::Assign1x1ProductOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b)
{
auto& us = *this;
Multiply1x1AndWeightedAdd(+1, a, b, 0.0f, us);
return us;
}
// In-place division by a scalar, implemented as multiplication by the reciprocal.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::operator/=(ElemType alpha)
{
const ElemType invAlpha = 1 / alpha;
(*this) *= invAlpha;
return (*this);
}
// Returns [this] / alpha as a new matrix, implemented as multiplication by the reciprocal.
template <class ElemType>
Matrix<ElemType> Matrix<ElemType>::operator/(ElemType alpha) const
{
const ElemType invAlpha = 1 / alpha;
return (*this) * invAlpha;
}
// Element-wise power in place: [this] = [this] .^ alpha
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::operator^=(ElemType alpha)
{
ElementWisePower(alpha, *this, *this);
return *this;
}
// Element-wise power: returns [this] .^ alpha as a new matrix on this matrix's device.
template <class ElemType>
Matrix<ElemType> Matrix<ElemType>::operator^(ElemType alpha) const
{
Matrix<ElemType> result(GetNumRows(), GetNumCols(), (DEVICEID_TYPE) GetDeviceId());
ElementWisePower(alpha, *this, result);
return result;
}
// [this] = a .^ power (element-wise)
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignElementPowerOf(const Matrix<ElemType>& a, const ElemType power)
{
auto& us = *this;
ElementWisePower(power, a, us);
return us;
}
//[this]=[this] .* a (we cannot override operator .* in c++)
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::ElementMultiplyWith(const Matrix<ElemType>& a)
{
auto& us = *this;
return us.AssignElementProductOf(us, a);
}
// [this] = [this] ./ a (element-wise division)
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::ElementDivideBy(const Matrix<ElemType>& a)
{
auto& us = *this;
return us.AssignElementDivisionOf(us, a);
}
//[this]=a .* b
// Requires a and b to have identical dimensions and the same matrix type;
// [this] adopts a's type/format and the common device. Dense CPU/GPU only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignElementProductOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b)
{
if (a.IsEmpty() || b.IsEmpty())
LogicError("AssignElementProductOf: Matrix is empty.");
assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols());
if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()))
InvalidArgument("The input matrix dimensions do not match.");
DecideAndMoveToRightDevice(a, b, *this);
if (!(a.GetMatrixType() == b.GetMatrixType()))
NOT_IMPLEMENTED;
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->AssignElementProductOf(*a.m_CPUMatrix, *b.m_CPUMatrix),
m_GPUMatrix->AssignElementProductOf(*a.m_GPUMatrix, *b.m_GPUMatrix),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
// [this] += a .* b (element-wise product accumulated into [this]).
// All three matrices must have matching dimensions and the same matrix type.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AddElementProductOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b)
{
if (a.IsEmpty() || b.IsEmpty())
LogicError("AddElementProductOf: Matrix is empty.");
assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols());
if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()))
InvalidArgument("The input matrix dimensions do not match.");
if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == GetNumCols()))
InvalidArgument("The input matrix dimensions do not match [this].");
DecideAndMoveToRightDevice(*this, a, b);
if (!(a.GetMatrixType() == b.GetMatrixType() && GetMatrixType() == b.GetMatrixType()))
NOT_IMPLEMENTED;
DISPATCH_MATRIX_ON_FLAG(this,
nullptr,
m_CPUMatrix->AddElementProductOf(*a.m_CPUMatrix, *b.m_CPUMatrix),
m_GPUMatrix->AddElementProductOf(*a.m_GPUMatrix, *b.m_GPUMatrix),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
//[this]=a ./ b
// Requires a and b to have identical dimensions and the same matrix type;
// [this] adopts a's type/format and the common device. Dense CPU/GPU only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignElementDivisionOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b)
{
if (a.IsEmpty() || b.IsEmpty())
LogicError("AssignElementDivisionOf: Matrix is empty.");
assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols());
if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()))
InvalidArgument("The input matrix dimensions do not match.");
DecideAndMoveToRightDevice(a, b, *this);
// WARNING: a and b must have same type
if (!(a.GetMatrixType() == b.GetMatrixType()))
NOT_IMPLEMENTED;
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->AssignElementDivisionOf(*a.m_CPUMatrix, *b.m_CPUMatrix),
m_GPUMatrix->AssignElementDivisionOf(*a.m_GPUMatrix, *b.m_GPUMatrix),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
// [this](i,j) *= a(i,0): scale each row of [this] by the corresponding entry of
// column vector a. a must be (rows(this) x 1). Dense CPU/GPU only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::ColumnElementMultiplyWith(const Matrix<ElemType>& a)
{
if (a.IsEmpty() || IsEmpty())
LogicError("ColumnElementMultiplyWith: Matrix is empty.");
if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == 1))
InvalidArgument("ColumnElementMultiplyWith: The input matrix should be a col vector and match [this]'s rows.");
DecideAndMoveToRightDevice(*this, a);
// WARNING: a and this must have same type
if (!(GetMatrixType() == a.GetMatrixType()))
NOT_IMPLEMENTED;
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->ColumnElementMultiplyWith(*a.m_CPUMatrix),
m_GPUMatrix->ColumnElementMultiplyWith(*a.m_GPUMatrix),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
// [this](i,j) *= a(0,j): scale each column of [this] by the corresponding entry of
// row vector a. a must be (1 x cols(this)). Dense CPU/GPU only.
// NOTE(review): unlike ColumnElementMultiplyWith there is no DecideAndMoveToRightDevice
// call here — confirm callers guarantee a and [this] are already on the same device.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::RowElementMultiplyWith(const Matrix<ElemType>& a)
{
if (a.IsEmpty() || IsEmpty())
LogicError("RowElementMultiplyWith: Matrix is empty.");
if (!(a.GetNumCols() == GetNumCols() && a.GetNumRows() == 1))
InvalidArgument("RowElementMultiplyWith: The input matrix should be a row vector and match [this]'s columns.");
// WARNING: a and this must have same type
if (!(GetMatrixType() == a.GetMatrixType()))
NOT_IMPLEMENTED;
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->RowElementMultiplyWith(*a.m_CPUMatrix),
m_GPUMatrix->RowElementMultiplyWith(*a.m_GPUMatrix),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
// [this](i,j) /= a(0,j): divide each column of [this] by the corresponding entry of
// row vector a. a must be (1 x cols(this)). Dense CPU/GPU only.
// NOTE(review): no DecideAndMoveToRightDevice call here (cf. ColumnElementDivideBy) —
// confirm callers guarantee a and [this] are already on the same device.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::RowElementDivideBy(const Matrix<ElemType>& a)
{
if (a.IsEmpty() || IsEmpty())
LogicError("RowElementDivideBy: Matrix is empty.");
if (!(a.GetNumCols() == GetNumCols() && a.GetNumRows() == 1))
InvalidArgument("RowElementDivideBy: The input matrix should be a row vector and match [this]'s columns.");
// WARNING: a and this must have same type
if (!(GetMatrixType() == a.GetMatrixType()))
NOT_IMPLEMENTED;
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->RowElementDivideBy(*a.m_CPUMatrix),
m_GPUMatrix->RowElementDivideBy(*a.m_GPUMatrix),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
// [this](i,j) /= a(i,0): divide each row of [this] by the corresponding entry of
// column vector a. a must be (rows(this) x 1). Dense CPU/GPU only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::ColumnElementDivideBy(const Matrix<ElemType>& a)
{
if (a.IsEmpty() || IsEmpty())
LogicError("ColumnElementDivideBy: Matrix is empty.");
if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == 1))
InvalidArgument("ColumnElementDivideBy: The input matrix should be a col vector and match [this]'s rows.");
DecideAndMoveToRightDevice(*this, a);
// WARNING: a and this must have same type
if (!(GetMatrixType() == a.GetMatrixType()))
NOT_IMPLEMENTED;
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->ColumnElementDivideBy(*a.m_CPUMatrix),
m_GPUMatrix->ColumnElementDivideBy(*a.m_GPUMatrix),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
//[this]=1 ./ a
// In-place element-wise reciprocal; supported for dense CPU/GPU and sparse GPU.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::ElementInverse()
{
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->ElementInverse(),
m_GPUMatrix->ElementInverse(),
NOT_IMPLEMENTED,
m_GPUSparseMatrix->ElementInverse());
return (*this);
}
// [this] = 1 ./ a (element-wise reciprocal of a).
// [this] adopts a's type/format and device; dense CPU/GPU and sparse GPU supported.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignElementInverseOf(const Matrix<ElemType>& a)
{
if (a.IsEmpty())
LogicError("AssignElementInverseOf: Matrix a is empty.");
DecideAndMoveToRightDevice(a, *this);
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->AssignElementInverseOf(*a.m_CPUMatrix),
m_GPUMatrix->AssignElementInverseOf(*a.m_GPUMatrix),
NOT_IMPLEMENTED,
m_GPUSparseMatrix->AssignElementInverseOf(*a.m_GPUSparseMatrix));
return *this;
}
// [this] = sigmoid([this]) element-wise, in place.
// Supported for dense CPU/GPU and sparse GPU.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::InplaceSigmoid()
{
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->InplaceSigmoid(),
m_GPUMatrix->InplaceSigmoid(),
NOT_IMPLEMENTED,
m_GPUSparseMatrix->InplaceSigmoid());
return (*this);
}
// [this] = sigmoid(a) element-wise.
// [this] adopts a's type/format and device; dense CPU/GPU and sparse GPU supported.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignSigmoidOf(const Matrix<ElemType>& a)
{
DecideAndMoveToRightDevice(a, *this);
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->AssignSigmoidOf(*a.m_CPUMatrix),
m_GPUMatrix->AssignSigmoidOf(*a.m_GPUMatrix),
NOT_IMPLEMENTED,
m_GPUSparseMatrix->AssignSigmoidOf(*a.m_GPUSparseMatrix));
return *this;
}
//[this]=LinearRectifierDerivative([this]) element wise
// In-place derivative of the linear rectifier (ReLU) applied element-wise.
// Supported for dense CPU/GPU and sparse GPU.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::InplaceLinearRectifierDerivative()
{
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->InplaceLinearRectifierDerivative(),
m_GPUMatrix->InplaceLinearRectifierDerivative(),
NOT_IMPLEMENTED,
m_GPUSparseMatrix->InplaceLinearRectifierDerivative());
return (*this);
}
// [this] = LinearRectifierDerivative(a) element-wise.
// [this] adopts a's type/format and device; dense CPU/GPU and sparse GPU supported.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignLinearRectifierDerivativeOf(const Matrix<ElemType>& a)
{
DecideAndMoveToRightDevice(a, *this);
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->AssignLinearRectifierDerivativeOf(*a.m_CPUMatrix),
m_GPUMatrix->AssignLinearRectifierDerivativeOf(*a.m_GPUMatrix),
NOT_IMPLEMENTED,
m_GPUSparseMatrix->AssignLinearRectifierDerivativeOf(*a.m_GPUSparseMatrix));
return *this;
}
//[this]=SigmoidDerivative([this]) element wise
// In-place sigmoid derivative applied element-wise. Dense CPU/GPU only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::InplaceSigmoidDerivative()
{
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->InplaceSigmoidDerivative(),
m_GPUMatrix->InplaceSigmoidDerivative(),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return (*this);
}
// [this] = SigmoidDerivative(a) element-wise. Dense CPU/GPU only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignSigmoidDerivativeOf(const Matrix<ElemType>& a)
{
DecideAndMoveToRightDevice(a, *this);
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->AssignSigmoidDerivativeOf(*a.m_CPUMatrix),
m_GPUMatrix->AssignSigmoidDerivativeOf(*a.m_GPUMatrix),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
// [this] = number of differing entries between a and b
// (searchInCol controls the column-wise comparison mode — semantics are defined by
// the CPU/GPU implementations). a and b must have the same matrix type.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignNumOfDiff(const Matrix<ElemType>& a, const Matrix<ElemType>& b, bool searchInCol)
{
DecideAndMoveToRightDevice(a, b, *this);
// WARNING: a and b must have same type
if (!(a.GetMatrixType() == b.GetMatrixType()))
NOT_IMPLEMENTED;
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->AssignNumOfDiff(*a.m_CPUMatrix, *b.m_CPUMatrix, searchInCol),
m_GPUMatrix->AssignNumOfDiff(*a.m_GPUMatrix, *b.m_GPUMatrix, searchInCol),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
//[this]=tanh([this]) element wise
// Supported for dense CPU/GPU and sparse GPU.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::InplaceTanh()
{
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->InplaceTanh(),
m_GPUMatrix->InplaceTanh(),
NOT_IMPLEMENTED,
m_GPUSparseMatrix->InplaceTanh());
return (*this);
}
// [this] = tanh(a) element-wise.
// [this] adopts a's type/format and device; dense CPU/GPU and sparse GPU supported.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignTanhOf(const Matrix<ElemType>& a)
{
DecideAndMoveToRightDevice(a, *this);
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->AssignTanhOf(*a.m_CPUMatrix),
m_GPUMatrix->AssignTanhOf(*a.m_GPUMatrix),
NOT_IMPLEMENTED,
m_GPUSparseMatrix->AssignTanhOf(*a.m_GPUSparseMatrix));
return *this;
}
//[this]=log(softmax([this])) element wise
// In-place log-softmax, computed per column (isColWise) or per row. Dense CPU/GPU only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::InplaceLogSoftmax(const bool isColWise)
{
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->InplaceLogSoftmax(isColWise),
m_GPUMatrix->InplaceLogSoftmax(isColWise),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
// [this] = log(softmax(a)), per column (isColWise) or per row. Dense CPU/GPU only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignLogSoftmaxOf(const Matrix<ElemType>& a, const bool isColWise)
{
if (a.IsEmpty())
LogicError("AssignLogSoftmaxOf: Matrix a is empty.");
DecideAndMoveToRightDevice(a, *this);
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->AssignLogSoftmaxOf(*a.m_CPUMatrix, isColWise),
m_GPUMatrix->AssignLogSoftmaxOf(*a.m_GPUMatrix, isColWise),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
//[this]=hardmax([this]) element wise
// In-place hardmax, computed per column (isColWise) or per row. Dense CPU/GPU only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::InplaceHardmax(const bool isColWise)
{
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->InplaceHardmax(isColWise),
m_GPUMatrix->InplaceHardmax(isColWise),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
// [this] = hardmax(a), per column (isColWise) or per row. Dense CPU/GPU only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignHardmaxOf(const Matrix<ElemType>& a, const bool isColWise)
{
if (a.IsEmpty())
LogicError("AssignHardmaxOf: Matrix a is empty.");
DecideAndMoveToRightDevice(a, *this);
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->AssignHardmaxOf(*a.m_CPUMatrix, isColWise),
m_GPUMatrix->AssignHardmaxOf(*a.m_GPUMatrix, isColWise),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
// [this] = sqrt([this]) element-wise, in place.
// Supported for dense CPU/GPU and sparse GPU.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::InplaceSqrt()
{
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->InplaceSqrt(),
m_GPUMatrix->InplaceSqrt(),
NOT_IMPLEMENTED,
m_GPUSparseMatrix->InplaceSqrt());
return *this;
}
// [this] = sqrt(a) element-wise.
// [this] adopts a's type/format and device; dense CPU/GPU and sparse GPU supported.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignSqrtOf(const Matrix<ElemType>& a)
{
if (a.IsEmpty())
LogicError("AssignSqrtOf: Matrix a is empty.");
DecideAndMoveToRightDevice(a, *this);
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->AssignSqrtOf(*a.m_CPUMatrix),
m_GPUMatrix->AssignSqrtOf(*a.m_GPUMatrix),
NOT_IMPLEMENTED,
m_GPUSparseMatrix->AssignSqrtOf(*a.m_GPUSparseMatrix));
return *this;
}
//[this]=exp([this]) element wise
// [this] = exp([this]) element-wise, in place.
// Supported for dense CPU/GPU and sparse GPU.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::InplaceExp()
{
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->InplaceExp(),
m_GPUMatrix->InplaceExp(),
NOT_IMPLEMENTED,
m_GPUSparseMatrix->InplaceExp());
return *this;
}
// [this] = exp(a) element-wise.
// [this] adopts a's type/format and device; dense CPU/GPU and sparse GPU supported.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignExpOf(const Matrix<ElemType>& a)
{
if (a.IsEmpty())
LogicError("AssignExpOf: Matrix a is empty.");
DecideAndMoveToRightDevice(a, *this);
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->AssignExpOf(*a.m_CPUMatrix),
m_GPUMatrix->AssignExpOf(*a.m_GPUMatrix),
NOT_IMPLEMENTED,
m_GPUSparseMatrix->AssignExpOf(*a.m_GPUSparseMatrix));
return *this;
}
//[this]=abs([this]) element wise
// [this] = abs([this]) element-wise, in place.
// Supported for dense CPU/GPU and sparse GPU.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::InplaceAbs()
{
// Pass 'this' (was nullptr) as the location-collapse argument so that a matrix whose
// data lives in BOTH locations is collapsed to one after the in-place update, consistent
// with the other Inplace* methods (InplaceSqrt, InplaceExp, InplaceLog, ...).
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->InplaceAbs(),
m_GPUMatrix->InplaceAbs(),
NOT_IMPLEMENTED,
m_GPUSparseMatrix->InplaceAbs());
return *this;
}
// [this] = abs(a) element-wise.
// [this] adopts a's type/format and device; dense CPU/GPU and sparse GPU supported.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignAbsOf(const Matrix<ElemType>& a)
{
if (a.IsEmpty())
LogicError("AssignAbsOf: Matrix a is empty.");
DecideAndMoveToRightDevice(a, *this);
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->AssignAbsOf(*a.m_CPUMatrix),
m_GPUMatrix->AssignAbsOf(*a.m_GPUMatrix),
NOT_IMPLEMENTED,
m_GPUSparseMatrix->AssignAbsOf(*a.m_GPUSparseMatrix));
return *this;
}
//[this]=log([this]) element wise
// [this] = log([this]) element-wise (natural logarithm), in place.
// Supported for dense CPU/GPU and sparse GPU.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::InplaceLog()
{
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->InplaceLog(),
m_GPUMatrix->InplaceLog(),
NOT_IMPLEMENTED,
m_GPUSparseMatrix->InplaceLog());
return *this;
}
//[this]=log([this]) element wise
// [this] = log10([this]) element-wise, in place. CPU dense only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::InplaceLog10()
{
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->InplaceLog10(),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
// [this] = log(a) element-wise (natural logarithm).
// [this] adopts a's type/format and device; dense CPU/GPU and sparse GPU supported.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignLogOf(const Matrix<ElemType>& a)
{
if (a.IsEmpty())
LogicError("AssignLogOf: Matrix a is empty.");
DecideAndMoveToRightDevice(a, *this);
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->AssignLogOf(*a.m_CPUMatrix),
m_GPUMatrix->AssignLogOf(*a.m_GPUMatrix),
NOT_IMPLEMENTED,
m_GPUSparseMatrix->AssignLogOf(*a.m_GPUSparseMatrix));
return *this;
}
// [this] = log10(a) element-wise. CPU dense only (GPU dense not implemented).
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignLog10Of(const Matrix<ElemType>& a)
{
if (a.IsEmpty())
LogicError("AssignLog10Of: Matrix a is empty."); // fixed: message previously said "AssignLogOf"
DecideAndMoveToRightDevice(a, *this);
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->AssignLog10Of(*a.m_CPUMatrix),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED,
m_GPUSparseMatrix->AssignLogOf(*a.m_GPUSparseMatrix)); // NOTE(review): sparse path computes natural log, not log10 — presumably GPUSparseMatrix lacks AssignLog10Of; confirm before relying on this branch
return *this;
}
//[this]=cos([this]) element wise
// [this] = cos([this]) element-wise, in place. Dense CPU/GPU only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::InplaceCosine()
{
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->InplaceCosine(),
m_GPUMatrix->InplaceCosine(),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
// [this] = cos(a) element-wise. Dense CPU/GPU only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignCosineOf(const Matrix<ElemType>& a)
{
if (a.IsEmpty())
LogicError("AssignCosineOf: Matrix a is empty.");
DecideAndMoveToRightDevice(a, *this);
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->AssignCosineOf(*a.m_CPUMatrix),
m_GPUMatrix->AssignCosineOf(*a.m_GPUMatrix),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
//[this]= -sin([this]) element wise
// [this] = -sin([this]) element-wise, in place (derivative of cos). Dense CPU/GPU only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::InplaceNegativeSine()
{
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->InplaceNegativeSine(),
m_GPUMatrix->InplaceNegativeSine(),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
// [this] = -sin(a) element-wise. Dense CPU/GPU only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignNegativeSineOf(const Matrix<ElemType>& a)
{
if (a.IsEmpty())
LogicError("AssignNegativeSineOf: Matrix a is empty.");
DecideAndMoveToRightDevice(a, *this);
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->AssignNegativeSineOf(*a.m_CPUMatrix),
m_GPUMatrix->AssignNegativeSineOf(*a.m_GPUMatrix),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
// Symmetric truncation in place; semantics defined by the per-backend implementations.
// A non-finite threshold is a no-op by design (nothing can exceed +/-inf, and NaN
// comparisons would poison the result). Supported for all four backends.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::InplaceTruncate(const ElemType threshold)
{
if (IsEmpty())
LogicError("InplaceTruncate: Matrix is empty.");
// Cast so that the float overload of isfinite is used when ElemType is float
// (avoids an unnecessary float->double promotion); the compile-time branch is
// resolved per instantiation (see the disabled C4127 warning at the top of the file).
if (sizeof(ElemType) == sizeof(float))
{
if (!isfinite((float) threshold))
return *this;
}
else
{
if (!isfinite(threshold))
return *this;
}
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->InplaceTruncate(threshold),
m_GPUMatrix->InplaceTruncate(threshold),
m_CPUSparseMatrix->InplaceTruncate(threshold),
m_GPUSparseMatrix->InplaceTruncate(threshold));
return *this;
}
// In-place transpose. Only implemented for the GPU sparse representation;
// all other backends raise NOT_IMPLEMENTED. Empty matrices are a no-op.
template <class ElemType>
void Matrix<ElemType>::InplaceTranspose()
{
if (IsEmpty())
return;
DISPATCH_MATRIX_ON_FLAG(this,
this,
NOT_IMPLEMENTED,
NOT_IMPLEMENTED,
NOT_IMPLEMENTED,
m_GPUSparseMatrix->InplaceTranspose());
}
// In-place soft-thresholding (shrinkage) with a non-negative threshold;
// threshold == 0 is the identity, so it short-circuits. Supported for all four backends.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::InplaceSoftThreshold(const ElemType threshold)
{
assert(threshold >= 0);
if (IsEmpty())
LogicError("InplaceSoftThreshold: Matrix is empty.");
if (threshold == 0)
return *this;
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->InplaceSoftThreshold(threshold),
m_GPUMatrix->InplaceSoftThreshold(threshold),
m_CPUSparseMatrix->InplaceSoftThreshold(threshold),
m_GPUSparseMatrix->InplaceSoftThreshold(threshold));
return *this;
}
//Threshold truncating: this[i] = max( this[i], threshold )
// A non-finite threshold is deliberately a no-op (see InplaceTruncate).
// Supported for all four backends.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::InplaceTruncateBottom(const ElemType threshold)
{
if (IsEmpty())
LogicError("InplaceTruncateBottom: Matrix is empty.");
// Compile-time dispatch to the float overload of isfinite when ElemType is float.
if (sizeof(ElemType) == sizeof(float))
{
if (!isfinite((float) threshold))
return *this;
}
else
{
if (!isfinite(threshold))
return *this;
}
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->InplaceTruncateBottom(threshold),
m_GPUMatrix->InplaceTruncateBottom(threshold),
m_CPUSparseMatrix->InplaceTruncateBottom(threshold),
m_GPUSparseMatrix->InplaceTruncateBottom(threshold));
return *this;
}
//Threshold truncating: this[i] = max( a[i], threshold )
// With a non-finite threshold the result is simply a copy of a.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignTruncateBottomOf(const Matrix<ElemType>& a, const ElemType threshold)
{
if (a.IsEmpty())
LogicError("AssignTruncateBottomOf: Matrix a is empty.");
// Compile-time dispatch to the float overload of isfinite when ElemType is float.
if (sizeof(ElemType) == sizeof(float))
{
if (!isfinite((float) threshold))
{
this->SetValue(a);
return *this;
}
}
else
{
if (!isfinite(threshold))
{
this->SetValue(a);
return *this;
}
}
DecideAndMoveToRightDevice(a, *this);
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->AssignTruncateBottomOf(*a.m_CPUMatrix, threshold),
m_GPUMatrix->AssignTruncateBottomOf(*a.m_GPUMatrix, threshold),
NOT_IMPLEMENTED,
m_GPUSparseMatrix->AssignTruncateBottomOf(*a.m_GPUSparseMatrix, threshold));
return *this;
}
//Threshold truncating: this[i] = min( this[i], threshold )
// A non-finite threshold is deliberately a no-op (see InplaceTruncate).
// Supported for all four backends.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::InplaceTruncateTop(const ElemType threshold)
{
if (IsEmpty())
LogicError("InplaceTruncateTop: Matrix is empty.");
// Compile-time dispatch to the float overload of isfinite when ElemType is float.
if (sizeof(ElemType) == sizeof(float))
{
if (!isfinite((float) threshold))
return *this;
}
else
{
if (!isfinite(threshold))
return *this;
}
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->InplaceTruncateTop(threshold),
m_GPUMatrix->InplaceTruncateTop(threshold),
m_CPUSparseMatrix->InplaceTruncateTop(threshold),
m_GPUSparseMatrix->InplaceTruncateTop(threshold));
return *this;
}
//Threshold truncating: this[i] = min( a[i], threshold )
// With a non-finite threshold the result is simply a copy of a.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignTruncateTopOf(const Matrix<ElemType>& a, const ElemType threshold)
{
if (a.IsEmpty())
LogicError("AssignTruncateTopOf: Matrix a is empty.");
// Compile-time dispatch to the float overload of isfinite when ElemType is float.
if (sizeof(ElemType) == sizeof(float))
{
if (!isfinite((float) threshold))
{
this->SetValue(a);
return *this;
}
}
else
{
if (!isfinite(threshold))
{
this->SetValue(a);
return *this;
}
}
DecideAndMoveToRightDevice(a, *this);
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->AssignTruncateTopOf(*a.m_CPUMatrix, threshold),
m_GPUMatrix->AssignTruncateTopOf(*a.m_GPUMatrix, threshold),
NOT_IMPLEMENTED,
m_GPUSparseMatrix->AssignTruncateTopOf(*a.m_GPUSparseMatrix, threshold));
return *this;
}
//Threshold truncating: this[i] = 0 if abs(this[i]) < threshold.
// Zero out entries whose magnitude is below threshold, in place.
// Supported for dense CPU/GPU and sparse GPU.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::SetToZeroIfAbsLessThan(const ElemType threshold)
{
if (IsEmpty())
LogicError("SetToZeroIfAbsLessThan: Matrix is empty.");
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->SetToZeroIfAbsLessThan(threshold),
m_GPUMatrix->SetToZeroIfAbsLessThan(threshold),
NOT_IMPLEMENTED,
m_GPUSparseMatrix->SetToZeroIfAbsLessThan(threshold));
return *this;
}
//sum of all elements
// Returns the scalar sum; supported for all four backends.
template <class ElemType>
ElemType Matrix<ElemType>::SumOfElements() const
{
if (IsEmpty())
LogicError("SumOfElements: Matrix is empty.");
DISPATCH_MATRIX_ON_FLAG(this,
nullptr,
return m_CPUMatrix->SumOfElements(),
return m_GPUMatrix->SumOfElements(),
return m_CPUSparseMatrix->SumOfElements(),
return m_GPUSparseMatrix->SumOfElements());
}
// [this] = sum of all elements of a (stored as a matrix). Dense CPU/GPU only.
// NOTE(review): unlike most Assign* methods there is no DecideAndMoveToRightDevice
// call here — confirm callers guarantee a and [this] are already on the same device.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignSumOfElements(const Matrix<ElemType>& a)
{
if (a.IsEmpty())
LogicError("AssignSumOfElements: Matrix a is empty.");
// WARNING: a and this must have same type
if (!(GetMatrixType() == a.GetMatrixType()))
NOT_IMPLEMENTED;
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->AssignSumOfElements(*a.m_CPUMatrix),
m_GPUMatrix->AssignSumOfElements(*a.m_GPUMatrix),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
// Sum of all elements, returned as a DeviceBoundNumber so a GPU result can stay on
// the device. Dense CPU/GPU only.
template <class ElemType>
DeviceBoundNumber<ElemType> Matrix<ElemType>::Sum_AsDeviceBoundNum() const
{
DeviceBoundNumber<ElemType> result;
DISPATCH_MATRIX_ON_FLAG(this,
nullptr,
// CPU: heap-allocate the scalar and hand it to the result with device id -1 (CPU).
// NOTE(review): presumably ShallowCopyFrom transfers ownership of 'val' to
// 'result' — verify against DeviceBoundNumber to rule out a leak.
ElemType* val = new ElemType;
* val = m_CPUMatrix->SumOfElements(); result.ShallowCopyFrom(val, -1); return result,
return m_GPUMatrix->Sum_AsDeviceBoundNum(),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
//sum of the absolute values of all elements
// Returns sum_i |this[i]|. Supported for dense CPU/GPU and sparse GPU.
template <class ElemType>
ElemType Matrix<ElemType>::SumOfAbsElements() const
{
if (IsEmpty())
LogicError("SumOfAbsElements: Matrix is empty.");
DISPATCH_MATRIX_ON_FLAG(this,
nullptr,
return m_CPUMatrix->SumOfAbsElements(),
return m_GPUMatrix->SumOfAbsElements(),
NOT_IMPLEMENTED,
return m_GPUSparseMatrix->SumOfAbsElements());
}
//log-add (log of the sum of exponentials) of all elements
// Returns the log-add (logsumexp-style) reduction over all elements. Dense CPU/GPU only.
template <class ElemType>
ElemType Matrix<ElemType>::LogAddSumOfElements() const
{
if (IsEmpty())
LogicError("LogAddSumOfElements: Matrix is empty.");
DISPATCH_MATRIX_ON_FLAG(this,
nullptr,
return m_CPUMatrix->LogAddSumOfElements(),
return m_GPUMatrix->LogAddSumOfElements(),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
// Internal-consistency check. Only implemented for the GPU sparse representation;
// any other backend raises NOT_IMPLEMENTED (so the trailing 'return false' is unreachable).
template <class ElemType>
bool Matrix<ElemType>::IsValid() const
{
if (m_currentDataLocation == CurrentDataLocation::GPU && GetMatrixType() == MatrixType::SPARSE)
{
return this->m_GPUSparseMatrix->IsValid();
}
else
{
NOT_IMPLEMENTED;
}
return false;
}
// Element-wise equality of [this] and a within the given tolerance.
template <class ElemType>
bool Matrix<ElemType>::IsEqualTo(const Matrix<ElemType>& a, const ElemType threshold /*= 1e-8*/) const
{
const bool equal = AreEqual(*this, a, threshold);
return equal;
}
// c = column-wise (or row-wise) sums of a. Static; a and c must have the same
// matrix type. Dense CPU/GPU only.
template <class ElemType>
void Matrix<ElemType>::VectorSum(const Matrix<ElemType>& a, Matrix<ElemType>& c, const bool isColWise)
{
DecideAndMoveToRightDevice(c, a);
if (!(a.GetMatrixType() == c.GetMatrixType()))
NOT_IMPLEMENTED;
DISPATCH_MATRIX_ON_FLAG(&c,
&c,
CPUMatrix<ElemType>::VectorSum(*a.m_CPUMatrix, *c.m_CPUMatrix, isColWise),
GPUMatrix<ElemType>::VectorSum(*a.m_GPUMatrix, *c.m_GPUMatrix, isColWise),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
// c = column-wise (or row-wise) L1 norms of [this]. Dense CPU/GPU only.
template <class ElemType>
void Matrix<ElemType>::VectorNorm1(Matrix<ElemType>& c, const bool isColWise) const
{
if (IsEmpty())
LogicError("VectorNorm1: Matrix is empty."); // fixed: message previously said "VectorNormInf"
DecideAndMoveToRightDevice(*this, c);
c.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(this,
&c,
m_CPUMatrix->VectorNorm1(*c.m_CPUMatrix, isColWise),
m_GPUMatrix->VectorNorm1(*c.m_GPUMatrix, isColWise),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
// [this] = column-wise (or row-wise) L1 norms of a.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignVectorNorm1Of(Matrix<ElemType>& a, const bool isColWise)
{
auto& us = *this;
a.VectorNorm1(us, isColWise);
return us;
}
// c = column-wise (or row-wise) L2 norms of [this]. Dense CPU/GPU only.
template <class ElemType>
void Matrix<ElemType>::VectorNorm2(Matrix<ElemType>& c, const bool isColWise) const
{
if (IsEmpty())
LogicError("VectorNorm2: Matrix is empty.");
DecideAndMoveToRightDevice(*this, c);
c.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(this,
&c,
m_CPUMatrix->VectorNorm2(*c.m_CPUMatrix, isColWise),
m_GPUMatrix->VectorNorm2(*c.m_GPUMatrix, isColWise),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
// [this] = column-wise (or row-wise) L2 norms of a.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignVectorNorm2Of(Matrix<ElemType>& a, const bool isColWise)
{
auto& us = *this;
a.VectorNorm2(us, isColWise);
return us;
}
// c = column-wise (or row-wise) infinity norms of [this]. Dense CPU/GPU only.
template <class ElemType>
void Matrix<ElemType>::VectorNormInf(Matrix<ElemType>& c, const bool isColWise) const
{
if (IsEmpty())
LogicError("VectorNormInf: Matrix is empty.");
DecideAndMoveToRightDevice(*this, c);
c.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(this,
&c,
m_CPUMatrix->VectorNormInf(*c.m_CPUMatrix, isColWise),
m_GPUMatrix->VectorNormInf(*c.m_GPUMatrix, isColWise),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
// [this] = column-wise (or row-wise) infinity norms of a.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignVectorNormInfOf(Matrix<ElemType>& a, const bool isColWise)
{
auto& us = *this;
a.VectorNormInf(us, isColWise);
return us;
}
// [this] = column-wise (or row-wise) inner products of a and b.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignInnerProductOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const bool isColWise)
{
auto& us = *this;
InnerProduct(a, b, us, isColWise);
return us;
}
//column-wise crossproduct
// [this] = Khatri-Rao (column-wise Kronecker) product of a and b; a and b must have
// the same number of columns and the same matrix type. Dense CPU/GPU only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignKhatriRaoProductOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b)
{
if (a.IsEmpty() || b.IsEmpty())
LogicError("AssignKhatriRaoProductOf: Matrix is empty.");
assert(a.GetNumCols() == b.GetNumCols());
if (!(a.GetNumCols() == b.GetNumCols()))
InvalidArgument("AssignKhatriRaoProductOf: The input matrix dimensions do not match.");
DecideAndMoveToRightDevice(a, b, *this);
// WARNING: a and b must have same type
if (!(a.GetMatrixType() == b.GetMatrixType()))
NOT_IMPLEMENTED;
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->AssignKhatriRaoProductOf(*a.m_CPUMatrix, *b.m_CPUMatrix),
m_GPUMatrix->AssignKhatriRaoProductOf(*a.m_GPUMatrix, *b.m_GPUMatrix),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
//column-wise reshaped product. Used to compute KhatriRaoProduct Gradient
// this = reshape each column of a from (K1xK2,1) to (K1, K2)
// if each column of a is not transposed, each (K1, K2) times each column of b (K2, frames).
// the output is a (K1, frames) matrix
// if each column of a is tranposed, each (K1, K2)^T times each column of b(K1, frames) and output is (K2, frames)
//column-wise crossproduct
// Accumulates the column-reshaped product described in the comment block above into
// [this]. a and b must have the same column count and all three matrices the same
// matrix type. Dense CPU/GPU only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AddColumnReshapeProductOf(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const bool transposeAColumn)
{
if (a.IsEmpty() || b.IsEmpty())
LogicError("AddColumnReshapeProductOf: Matrix is empty.");
assert(a.GetNumCols() == b.GetNumCols());
if (!(a.GetNumCols() == b.GetNumCols()))
InvalidArgument("AddColumnReshapeProductOf: The input matrix dimensions do not match.");
DecideAndMoveToRightDevice(*this, a, b);
// WARNING: a and b must have same type
if (!(a.GetMatrixType() == b.GetMatrixType() && GetMatrixType() == b.GetMatrixType()))
NOT_IMPLEMENTED;
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->AddColumnReshapeProductOf(*a.m_CPUMatrix, *b.m_CPUMatrix, transposeAColumn),
m_GPUMatrix->AddColumnReshapeProductOf(*a.m_GPUMatrix, *b.m_GPUMatrix, transposeAColumn),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
// [this] += alpha * a
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AddWithScaleOf(ElemType alpha, const Matrix<ElemType>& a)
{
auto& us = *this;
ScaleAndAdd(alpha, a, us);
return us;
}
// Returns the Frobenius norm (sqrt of sum of squared elements).
// Supported for all four backends.
template <class ElemType>
ElemType Matrix<ElemType>::FrobeniusNorm() const
{
if (IsEmpty())
LogicError("FrobeniusNorm: Matrix is empty.");
DISPATCH_MATRIX_ON_FLAG(this,
nullptr,
return m_CPUMatrix->FrobeniusNorm(),
return m_GPUMatrix->FrobeniusNorm(),
return m_CPUSparseMatrix->FrobeniusNorm(),
return m_GPUSparseMatrix->FrobeniusNorm());
}
// [this] = Frobenius norm of a, stored as a 1x1 matrix. Dense CPU/GPU only.
// NOTE(review): no DecideAndMoveToRightDevice call here — confirm callers guarantee
// a and [this] are already on the same device.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignFrobeniusNormOf(const Matrix<ElemType>& a)
{
if (a.IsEmpty())
LogicError("AssignFrobeniusNormOf: Matrix a is empty.");
Resize(1, 1);
// WARNING: a and this must have same type
if (!(GetMatrixType() == a.GetMatrixType()))
NOT_IMPLEMENTED;
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->AssignFrobeniusNormOf(*a.m_CPUMatrix),
m_GPUMatrix->AssignFrobeniusNormOf(*a.m_GPUMatrix),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
// Returns the infinity norm (maximum absolute element value).
// Supported for dense CPU/GPU and sparse GPU.
template <class ElemType>
ElemType Matrix<ElemType>::MatrixNormInf() const
{
if (IsEmpty())
LogicError("MatrixNormInf: Matrix is empty.");
DISPATCH_MATRIX_ON_FLAG(this,
nullptr,
return m_CPUMatrix->MatrixNormInf(),
return m_GPUMatrix->MatrixNormInf(),
NOT_IMPLEMENTED,
return m_GPUSparseMatrix->MatrixNormInf());
}
// Returns the L1 matrix norm as defined by the per-backend implementations.
// Supported for dense CPU/GPU and sparse GPU.
template <class ElemType>
ElemType Matrix<ElemType>::MatrixNorm1() const
{
if (IsEmpty())
LogicError("MatrixNorm1: Matrix is empty.");
DISPATCH_MATRIX_ON_FLAG(this,
nullptr,
return m_CPUMatrix->MatrixNorm1(),
return m_GPUMatrix->MatrixNorm1(),
NOT_IMPLEMENTED,
return m_GPUSparseMatrix->MatrixNorm1());
}
// Returns the L0 "norm" (count of non-zero elements).
// Supported for dense CPU/GPU and sparse GPU.
template <class ElemType>
ElemType Matrix<ElemType>::MatrixNorm0() const
{
if (IsEmpty())
LogicError("MatrixNorm0: Matrix is empty.");
DISPATCH_MATRIX_ON_FLAG(this,
nullptr,
return m_CPUMatrix->MatrixNorm0(),
return m_GPUMatrix->MatrixNorm0(),
NOT_IMPLEMENTED,
return m_GPUSparseMatrix->MatrixNorm0());
}
// [this] = sign(a) element-wise. Dense CPU/GPU only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignSignOf(const Matrix<ElemType>& a)
{
if (a.IsEmpty())
LogicError("AssignSignOf: Matrix a is empty.");
DecideAndMoveToRightDevice(a, *this);
// WARNING: a and this must have same type
if (!(GetMatrixType() == a.GetMatrixType()))
NOT_IMPLEMENTED;
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->AssignSignOf(*a.m_CPUMatrix),
m_GPUMatrix->AssignSignOf(*a.m_GPUMatrix),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
// *this += elementwise sign of 'a' (as implemented by the underlying dense matrix classes).
// Dense matrices only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AddSignOf(const Matrix<ElemType>& a)
{
    if (a.IsEmpty())
        LogicError("AddSignOf: Matrix a is empty.");

    DecideAndMoveToRightDevice(a, *this);

    // both operands must already have the same matrix type
    if (!(GetMatrixType() == a.GetMatrixType()))
        NOT_IMPLEMENTED;

    DISPATCH_MATRIX_ON_FLAG(&a,
                            this,
                            m_CPUMatrix->AddSignOf(*a.m_CPUMatrix),
                            m_GPUMatrix->AddSignOf(*a.m_GPUMatrix),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);

    return *this;
}
// I decided to use Matrix<ElemType>& maxIndexes instead of an integer vector because the result may be used in additional calculations.
// Computes per-column (or per-row, if !isColWise) maxima and their indices. Dense matrices only.
template <class ElemType>
void Matrix<ElemType>::VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise) const
{
    if (IsEmpty())
        LogicError("VectorMax: Matrix is empty.");

    // colocate all three matrices and give the outputs our type/format
    DecideAndMoveToRightDevice(*this, maxIndexes, maxValues);
    maxIndexes.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
    maxValues.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);

    // the macro updates maxValues' data location; maxIndexes is updated explicitly
    DISPATCH_MATRIX_ON_FLAG(this,
                            &maxValues,
                            m_CPUMatrix->VectorMax(*maxIndexes.m_CPUMatrix, *maxValues.m_CPUMatrix, isColWise);
                            maxIndexes.SetDataLocation(CPU, DENSE),
                            m_GPUMatrix->VectorMax(*maxIndexes.m_GPUMatrix, *maxValues.m_GPUMatrix, isColWise);
                            maxIndexes.SetDataLocation(GPU, DENSE),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
// Top-K variant of VectorMax: returns the topK maxima (and their indices) per column
// (or per row, if !isColWise). Dense matrices only.
template <class ElemType>
void Matrix<ElemType>::VectorMax(Matrix<ElemType>& maxIndexes, Matrix<ElemType>& maxValues, const bool isColWise, int topK) const
{
    if (IsEmpty())
        LogicError("VectorMax: Matrix is empty.");

    // colocate all three matrices and give the outputs our type/format
    DecideAndMoveToRightDevice(*this, maxIndexes, maxValues);
    maxIndexes.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
    maxValues.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);

    // the macro updates maxValues' data location; maxIndexes is updated explicitly
    DISPATCH_MATRIX_ON_FLAG(this,
                            &maxValues,
                            m_CPUMatrix->VectorMax(*maxIndexes.m_CPUMatrix, *maxValues.m_CPUMatrix, isColWise, topK);
                            maxIndexes.SetDataLocation(CPU, DENSE),
                            m_GPUMatrix->VectorMax(*maxIndexes.m_GPUMatrix, *maxValues.m_GPUMatrix, isColWise, topK);
                            maxIndexes.SetDataLocation(GPU, DENSE),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
// Computes per-column (or per-row, if !isColWise) minima and their indices. Dense matrices only.
template <class ElemType>
void Matrix<ElemType>::VectorMin(Matrix<ElemType>& minIndexes, Matrix<ElemType>& minValues, const bool isColWise) const
{
    if (IsEmpty())
        LogicError("VectorMin: Matrix is empty.");

    // colocate all three matrices and give the outputs our type/format
    DecideAndMoveToRightDevice(*this, minIndexes, minValues);
    minIndexes.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);
    minValues.SwitchToMatrixType(GetMatrixType(), GetFormat(), false);

    // the macro updates minValues' data location; minIndexes is updated explicitly
    DISPATCH_MATRIX_ON_FLAG(this,
                            &minValues,
                            m_CPUMatrix->VectorMin(*minIndexes.m_CPUMatrix, *minValues.m_CPUMatrix, isColWise);
                            minIndexes.SetDataLocation(CPU, DENSE),
                            m_GPUMatrix->VectorMin(*minIndexes.m_GPUMatrix, *minValues.m_GPUMatrix, isColWise);
                            minIndexes.SetDataLocation(GPU, DENSE),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
#pragma endregion Member BLAS Functions
#pragma region Other helper Functions
// Returns the device the data currently lives on: CPUDEVICE for CPU-resident data,
// the GPU's compute device id otherwise. If nothing has been materialized yet
// (location NONE), returns the preferred device instead.
template <class ElemType>
int Matrix<ElemType>::GetDeviceId() const
{
    if (m_currentDataLocation == CurrentDataLocation::NONE)
        return m_preferredDeviceId;

    DISPATCH_MATRIX_ON_FLAG(this,
                            nullptr,
                            return CPUDEVICE,
                            return m_GPUMatrix->GetComputeDeviceId(),
                            return CPUDEVICE,
                            return m_GPUSparseMatrix->GetComputeDeviceId());
}
// TODO: Move the shared core functions to the front of this source file.
// BUGBUG: This performs a copy operation even for the output matrix that gets overwritten right away.
// We should (1) define which is the output and (2) whether it will be completely overwritten (so we won't actually copy it).
// Bring two matrices onto the same device:
//  - if they already share a device, do nothing;
//  - if they share a preferred device, move both there;
//  - otherwise a GPU takes precedence over the CPU, with a's GPU winning over b's.
// TODO: This is called somewhat inconsistently, sometimes with a=*this, sometimes with b=*this.
template <class ElemType>
template <class ElemType2>
void Matrix<ElemType>::DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType2>& b)
{
    const int devA = a.GetDeviceId();
    const int devB = b.GetDeviceId();
    if (devA == devB) // already colocated: nothing to do
        return;

    const int prefA = a.GetPreferredDeviceId();
    const int prefB = b.GetPreferredDeviceId();
    if (prefA == prefB) // unanimous preference: move both there
    {
        a._transferToDevice(prefA);
        b._transferToDevice(prefA);
        return;
    }

    if (devA != CPUDEVICE) // a lives on a GPU: bring b along
        b._transferToDevice(devA);
    else // otherwise b's device wins
        a._transferToDevice(devB);
}
// Same as above, but for 3 matrices.
// If b and c are both on the same GPU then a will be forced to go there; otherwise a's GPU
// takes precedence, then b's, then c's.
template <class ElemType>
void Matrix<ElemType>::DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c)
{
    const int devA = a.GetDeviceId();
    const int devB = b.GetDeviceId();
    const int devC = c.GetDeviceId();
    if (devA == devB && devA == devC) // already colocated
        return;

    const int prefA = a.GetPreferredDeviceId();
    const int prefB = b.GetPreferredDeviceId();
    const int prefC = c.GetPreferredDeviceId();
    if (prefA == prefB && prefA == prefC) // unanimous preference: move everybody there
    {
        a._transferToDevice(prefA);
        b._transferToDevice(prefA);
        c._transferToDevice(prefA);
    }
    else if (devB == devC && devB != CPUDEVICE) // TODO: why not the other two combinations?
    {
        a._transferToDevice(devB); // 'a' is outvoted
    }
    else if (devA != CPUDEVICE) // a's GPU takes precedence
    {
        b._transferToDevice(devA);
        c._transferToDevice(devA);
    }
    else if (devB != CPUDEVICE) // then b's
    {
        a._transferToDevice(devB);
        c._transferToDevice(devB);
    }
    else // only c can still be on a GPU here
    {
        a._transferToDevice(devC);
        b._transferToDevice(devC);
    }
}
// same but for 4 matrices
template <class ElemType>
void Matrix<ElemType>::DecideAndMoveToRightDevice(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, const Matrix<ElemType>& d)
{
    // this function is only called for one operator, so for now we keep it simple:
    // colocate a,b,c first, then drag d along to wherever a ended up
    DecideAndMoveToRightDevice(a, b, c);
    d._transferToDevice(a.GetDeviceId()); // BUGBUG: Is this correct in case a,b,c share the same preferredDevice?
}
// Moves this matrix to device 'to_id' if it is not already there.
// Only matrices that own their buffer may be moved; views raise a RuntimeError.
template <class ElemType>
void Matrix<ElemType>::_transferToDevice(int to_id, bool isBeingMoved /*= true*/, bool emptyTransfer /* = false*/) const
{
    const int from_id = GetDeviceId();
    if (from_id == to_id) // already in the right place
        return;

    if (!OwnBuffer())
        RuntimeError("Cannot move externally owned matrices to the preferred device.");

    _transferFromDeviceToDevice(from_id, to_id, isBeingMoved, emptyTransfer);
}
// this function performs data transfer and updates data location, but not the device that is stored with it
template <class ElemType>
void Matrix<ElemType>::_transferFromDeviceToDevice(int from_id, int to_id, bool isBeingMoved /*= true*/, bool emptyTransfer /* = false*/) const
{
    // normalize: any negative device id means CPU
    if (from_id < 0)
        from_id = CPUDEVICE;
    if (to_id < 0)
        to_id = CPUDEVICE;

    if (from_id == to_id)
    {
        if (from_id != GetDeviceId())
            RuntimeError("Trying to transfer matrix from device to the same device while the matrix does not live in the from device.");

        return;
    }

#define NUM_DEVICE_CHANGED_WARN 20
    // Bookkeeping for the ping-pong warning below: count transfers and remember up to two
    // distinct destination devices. Empty transfers only count for GPU->GPU moves.
    if (m_numTimesDeviceChanged <= NUM_DEVICE_CHANGED_WARN &&
        (!emptyTransfer || (from_id >= 0 && to_id >= 0)))
    {
        m_numTimesDeviceChanged++;
        if (m_devicesTransferedTo[0] < CPUDEVICE) // first destination not recorded yet
        {
            m_devicesTransferedTo[0] = to_id;
        }
        else if (m_devicesTransferedTo[0] != to_id)
        {
            m_devicesTransferedTo[1] = to_id;
        }
    }
    // warn once if this matrix keeps bouncing between two different devices
    if (m_numTimesDeviceChanged == NUM_DEVICE_CHANGED_WARN && m_devicesTransferedTo[1] >= CPUDEVICE)
    {
        fprintf(stderr, "WARNING: The same matrix with dim [%lu, %lu] has been transferred between different devices for %d times.\n", (unsigned long) GetNumRows(), (unsigned long) GetNumCols(), NUM_DEVICE_CHANGED_WARN);
    }

    if (m_matrixType == MatrixType::SPARSE)
    {
        if (from_id == CPUDEVICE) // from CPU to GPU
        {
            if (m_CPUSparseMatrix == NULL)
                LogicError("Can't move from CPU because I'm not there!");

            // create or relocate the GPU-side object, then copy the data over (unless empty)
            if (m_GPUSparseMatrix == NULL)
                m_GPUSparseMatrix = new GPUSparseMatrix<ElemType>(to_id, m_CPUSparseMatrix->GetFormat());
            else
                m_GPUSparseMatrix->ChangeDeviceTo(to_id);

            if (m_CPUSparseMatrix->GetNumElements() != 0 && !emptyTransfer)
            {
                m_GPUSparseMatrix->SetValue(*m_CPUSparseMatrix);
            }

            if (isBeingMoved) // move: drop the CPU copy
            {
                delete m_CPUSparseMatrix;
                m_CPUSparseMatrix = NULL;
                SetDataLocation(GPU, SPARSE);
            }
            else // copy: keep both sides alive
            {
                SetDataLocation(BOTH, SPARSE);
            }
        }
        else // from GPU
        {
            if (m_GPUSparseMatrix == NULL || m_GPUSparseMatrix->GetComputeDeviceId() != from_id)
                LogicError("This matrix isn't on this (or any?) GPU");

            if (to_id < 0) // to CPU
            {
                if (m_CPUSparseMatrix == NULL)
                    m_CPUSparseMatrix = new CPUSparseMatrix<ElemType>(m_GPUSparseMatrix->GetFormat());

                if (m_GPUSparseMatrix->GetNumElements() != 0 && !emptyTransfer)
                {
                    m_GPUSparseMatrix->CopyToCPUSparseMatrix(*m_CPUSparseMatrix);
                }

                if (isBeingMoved) // move: drop the GPU copy
                {
                    delete m_GPUSparseMatrix;
                    m_GPUSparseMatrix = NULL;
                    SetDataLocation(CPU, SPARSE);
                }
                else // copy: keep both sides alive
                {
                    SetDataLocation(BOTH, SPARSE);
                }
            }
            else // to another GPU
            {
                m_GPUSparseMatrix->ChangeDeviceTo(to_id);
            }
        }
    }
    // NOTE(review): only the dense path is serialized with an OpenMP critical section;
    // confirm why the sparse path above does not need the same protection.
    else
#pragma omp critical
    {
        if (from_id == CPUDEVICE) // from CPU to GPU
        {
            if (m_CPUMatrix == NULL)
                LogicError("Can't move from CPU because I'm not there!");
            // any stale GPU copy is discarded and rebuilt from the CPU data
            if (m_GPUMatrix != NULL)
                delete m_GPUMatrix;
            if (m_CPUMatrix->GetNumElements() != 0 && !emptyTransfer)
            {
                m_GPUMatrix = new GPUMatrix<ElemType>(m_CPUMatrix->GetNumRows(), m_CPUMatrix->GetNumCols(), to_id, m_CPUMatrix->GetArray(), matrixFlagNormal);
            }
            else
            {
                m_GPUMatrix = new GPUMatrix<ElemType>(to_id);
            }
            if (isBeingMoved) // move: drop the CPU copy
            {
                delete m_CPUMatrix;
                m_CPUMatrix = NULL;
                SetDataLocation(GPU, DENSE);
            }
            else // copy: keep both sides alive
            {
                SetDataLocation(BOTH, DENSE);
            }
        }
        else // from GPU
        {
            if (m_GPUMatrix == NULL || m_GPUMatrix->GetComputeDeviceId() != from_id)
                LogicError("This matrix isn't on this (or any?) GPU");

            if (to_id < 0) // to CPU
            {
                if (m_CPUMatrix != NULL)
                    delete m_CPUMatrix;

                if (m_GPUMatrix->GetNumElements() != 0 && !emptyTransfer)
                {
                    ElemType* arr = m_GPUMatrix->CopyToArray(); // TODO: unnecessary allocation/copy; why not make this a vector that we move over as an rvalue ref?
                    m_CPUMatrix = new CPUMatrix<ElemType>(m_GPUMatrix->GetNumRows(), m_GPUMatrix->GetNumCols(), arr, matrixFlagNormal);
                    delete[] arr;
                }
                else
                {
                    m_CPUMatrix = new CPUMatrix<ElemType>();
                }

                if (isBeingMoved) // move: drop the GPU copy
                {
                    delete m_GPUMatrix;
                    m_GPUMatrix = NULL;
                    SetDataLocation(CPU, DENSE);
                }
                else // copy: keep both sides alive
                {
                    SetDataLocation(BOTH, DENSE);
                }
            }
            else // to another GPU
            {
                m_GPUMatrix->ChangeDeviceTo(to_id);
            }
        }
    } // end of omp critical section
}
// Public transfer wrapper: performs the transfer and, unless suppressed, records the
// destination as the new preferred device.
template <class ElemType>
void Matrix<ElemType>::TransferFromDeviceToDevice(int from_id, int to_id, bool isBeingMoved, bool emptyTransfer/* = false*/, bool updatePreferredDevice/* = true*/) const
{
    _transferFromDeviceToDevice(from_id, to_id, isBeingMoved, emptyTransfer);
    if (updatePreferredDevice)
        m_preferredDeviceId = GetDeviceId();
}
// Transfers the matrix to 'to_id' only when necessary. If the data is mirrored on both
// CPU and GPU (BOTH state) and the caller does not insist on a real move, a CPU<->GPU
// request is already satisfied; only a GPU-to-GPU change still forces a transfer.
template <class ElemType>
void Matrix<ElemType>::TransferToDeviceIfNotThere(int to_id, bool isBeingMoved/*false: may leave in BOTH state*/, bool emptyTransfer/* = false*/, bool updatePreferredDevice/* = true*/) const
{
    const int from_id = GetDeviceId();
    if (to_id == from_id) // already at the right place
        return;

    const bool mirrored    = (GetCurrentMatrixLocation() == BOTH);
    const bool changingGpu = (from_id >= 0 && to_id >= 0); // GPU-to-GPU move
    if (mirrored && !isBeingMoved && !changingGpu)
        return; // BOTH state already covers the request

    TransferFromDeviceToDevice(from_id, to_id, isBeingMoved, emptyTransfer, updatePreferredDevice);
}
// Prints the sub-range [rowStart..rowEnd] x [colStart..colEnd] using the CPU implementation.
// GPU-resident matrices are copied to the CPU (non-destructively) and transferred back afterwards.
// Note: the sparse Print overloads ignore the row/column range.
template <class ElemType>
void Matrix<ElemType>::Print(const char* matrixName, ptrdiff_t rowStart, ptrdiff_t rowEnd, ptrdiff_t colStart, ptrdiff_t colEnd) const
{
    DEVICEID_TYPE orgdevice = GetDeviceId(); // remember where to transfer back to

    DISPATCH_MATRIX_ON_FLAG(this,
                            nullptr,
                            // CPU:
                            m_CPUMatrix->Print(matrixName, rowStart, rowEnd, colStart, colEnd),
                            // GPU;
                            {
                                _transferToDevice(CPUDEVICE, false, false);
                                m_CPUMatrix->Print(matrixName, rowStart, rowEnd, colStart, colEnd);
                                _transferToDevice(orgdevice, false, false);
                            },
                            // CPU, sparse:
                            m_CPUSparseMatrix->Print(matrixName),
                            // GPU, sparse:
                            {
                                _transferToDevice(CPUDEVICE, false, false);
                                m_CPUSparseMatrix->Print(matrixName);
                                _transferToDevice(orgdevice, false, false);
                            });
}
// Prints the entire matrix.
template <class ElemType>
void Matrix<ElemType>::Print(const char* matrixName /*=nullptr*/) const
{
    const ptrdiff_t lastRow = GetNumRows() - 1;
    const ptrdiff_t lastCol = GetNumCols() - 1;
    Print(matrixName, 0, lastRow, 0, lastCol);
}
// helper function used for convolutional neural networks
// Packs inputSubBatch into the layout expected by the convolution matrix product and assigns
// the result to *this. Dense matrices only; *this adopts inputSubBatch's device/type/format.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignPackedConvolutionInput(const Matrix<ElemType>& inputSubBatch,
                                                                 const size_t inputWidth, const size_t inputHeight, const size_t inputChannels,
                                                                 const size_t outputWidth, const size_t outputHeight, const size_t outputChannels,
                                                                 const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample,
                                                                 const bool zeroPadding)
{
    DecideAndMoveToRightDevice(inputSubBatch, *this);
    SwitchToMatrixType(inputSubBatch.GetMatrixType(), inputSubBatch.GetFormat(), false);

    DISPATCH_MATRIX_ON_FLAG(&inputSubBatch,
                            this,
                            m_CPUMatrix->AssignPackedConvolutionInput(*(inputSubBatch.m_CPUMatrix),
                                                                      inputWidth, inputHeight, inputChannels,
                                                                      outputWidth, outputHeight, outputChannels,
                                                                      kernelWidth, kernelHeight, horizontalSubsample, verticalSubsample,
                                                                      zeroPadding),
                            m_GPUMatrix->AssignPackedConvolutionInput(*(inputSubBatch.m_GPUMatrix),
                                                                      inputWidth, inputHeight, inputChannels,
                                                                      outputWidth, outputHeight, outputChannels,
                                                                      kernelWidth, kernelHeight, horizontalSubsample, verticalSubsample,
                                                                      zeroPadding),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);

    return *this;
}
// helper function used for convolutional neural networks
// Inverse of AssignPackedConvolutionInput: unpacks *this back into inputSubBatch
// (which is also the return value). Dense matrices only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::UnpackConvolutionInput(Matrix<ElemType>& inputSubBatch,
                                                           const size_t inputWidth, const size_t inputHeight, const size_t inputChannels,
                                                           const size_t outputWidth, const size_t outputHeight, const size_t outputChannels,
                                                           const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample,
                                                           const bool zeroPadding) const
{
    DecideAndMoveToRightDevice(*this, inputSubBatch);
    inputSubBatch.SwitchToMatrixType(GetMatrixType(), inputSubBatch.GetFormat(), false);

    DISPATCH_MATRIX_ON_FLAG(this,
                            &inputSubBatch,
                            m_CPUMatrix->UnpackConvolutionInput(*(inputSubBatch.m_CPUMatrix),
                                                                inputWidth, inputHeight, inputChannels,
                                                                outputWidth, outputHeight, outputChannels,
                                                                kernelWidth, kernelHeight, horizontalSubsample, verticalSubsample,
                                                                zeroPadding),
                            m_GPUMatrix->UnpackConvolutionInput(*(inputSubBatch.m_GPUMatrix),
                                                                inputWidth, inputHeight, inputChannels,
                                                                outputWidth, outputHeight, outputChannels,
                                                                kernelWidth, kernelHeight, horizontalSubsample, verticalSubsample,
                                                                zeroPadding),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);

    return inputSubBatch;
}
// Computes max pooling of inputBatch into *this. Dense matrices only;
// *this adopts inputBatch's device/type/format first.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignMaxPoolingResult(const Matrix<ElemType>& inputBatch, const size_t channels,
                                                           const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
                                                           const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
                                                           const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample)
{
    DecideAndMoveToRightDevice(inputBatch, *this);
    SwitchToMatrixType(inputBatch.GetMatrixType(), inputBatch.GetFormat(), false);

    DISPATCH_MATRIX_ON_FLAG(&inputBatch,
                            this,
                            m_CPUMatrix->AssignMaxPoolingResult(*(inputBatch.m_CPUMatrix), channels,
                                                                inputWidth, inputHeight, inputSizePerSample,
                                                                outputWidth, outputHeight, outputSizePerSample,
                                                                windowWidth, windowHeight, horizontalSubsample, verticalSubsample),
                            m_GPUMatrix->AssignMaxPoolingResult(*(inputBatch.m_GPUMatrix), channels,
                                                                inputWidth, inputHeight, inputSizePerSample,
                                                                outputWidth, outputHeight, outputSizePerSample,
                                                                windowWidth, windowHeight, horizontalSubsample, verticalSubsample),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);

    return *this;
}
// Accumulates the max-pooling gradient into *this. All four matrices are colocated and
// must share the same matrix type. Dense matrices only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AddMaxPoolingGradient(const Matrix<ElemType>& outputGradientBatch, const Matrix<ElemType>& inputBatch, const Matrix<ElemType>& outputBatch,
                                                          const size_t channels,
                                                          const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
                                                          const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
                                                          const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample)
{
    DecideAndMoveToRightDevice(*this, outputGradientBatch, inputBatch);
    outputBatch._transferToDevice(GetDeviceId());

    if (!(GetMatrixType() == outputGradientBatch.GetMatrixType() && GetMatrixType() == inputBatch.GetMatrixType() && GetMatrixType() == outputBatch.GetMatrixType()))
        NOT_IMPLEMENTED;

    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->AddMaxPoolingGradient(*(outputGradientBatch.m_CPUMatrix), *(inputBatch.m_CPUMatrix), *(outputBatch.m_CPUMatrix), channels,
                                                               inputWidth, inputHeight, inputSizePerSample,
                                                               outputWidth, outputHeight, outputSizePerSample,
                                                               windowWidth, windowHeight, horizontalSubsample, verticalSubsample),
                            m_GPUMatrix->AddMaxPoolingGradient(*(outputGradientBatch.m_GPUMatrix), *(inputBatch.m_GPUMatrix), *(outputBatch.m_GPUMatrix), channels,
                                                               inputWidth, inputHeight, inputSizePerSample,
                                                               outputWidth, outputHeight, outputSizePerSample,
                                                               windowWidth, windowHeight, horizontalSubsample, verticalSubsample);
                            ,
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);

    return *this;
}
// Computes average pooling of inputBatch into *this. Dense matrices only;
// *this adopts inputBatch's device/type/format first.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignAveragePoolingResult(const Matrix<ElemType>& inputBatch, const size_t channels,
                                                               const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
                                                               const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
                                                               const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample)
{
    DecideAndMoveToRightDevice(inputBatch, *this);
    SwitchToMatrixType(inputBatch.GetMatrixType(), inputBatch.GetFormat(), false);

    DISPATCH_MATRIX_ON_FLAG(&inputBatch,
                            this,
                            m_CPUMatrix->AssignAveragePoolingResult(*(inputBatch.m_CPUMatrix), channels,
                                                                    inputWidth, inputHeight, inputSizePerSample,
                                                                    outputWidth, outputHeight, outputSizePerSample,
                                                                    windowWidth, windowHeight, horizontalSubsample, verticalSubsample),
                            m_GPUMatrix->AssignAveragePoolingResult(*(inputBatch.m_GPUMatrix), channels,
                                                                    inputWidth, inputHeight, inputSizePerSample,
                                                                    outputWidth, outputHeight, outputSizePerSample,
                                                                    windowWidth, windowHeight, horizontalSubsample, verticalSubsample),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);

    return *this;
}
// Stores into *this (resized to a 1x1 scalar) the softmax sum of 'a' and 'softmax',
// as computed by the underlying CPU/GPU implementation on this matrix's current device.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignSoftmaxSum(const Matrix<ElemType>& a, const Matrix<ElemType>& softmax)
{
    Resize(1, 1); // scalar result

    const bool onCpu = GetDeviceId() < 0;
    if (onCpu)
        a.m_CPUMatrix->AssignSoftmaxSum(*softmax.m_CPUMatrix, *m_CPUMatrix);
    else
        a.m_GPUMatrix->AssignSoftmaxSum(*softmax.m_GPUMatrix, *m_GPUMatrix);

    return *this;
}
// Stores into *this (a 1x1 scalar) the unnormalized NCE evaluation of (a, b, c, bias),
// dispatched on this matrix's current device.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignNceUnnormalizedEval(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, const Matrix<ElemType>& bias)
{
    // if (a.GetMatrixType() != MatrixType::SPARSE)
    //     NOT_IMPLEMENTED;

    Resize(1, 1);
    if (GetDeviceId() < 0)
        a.m_CPUMatrix->AssignNCEUnnormalizedEval(*b.m_CPUMatrix, *c.m_CPUMatrix, *bias.m_CPUMatrix, *m_CPUMatrix);
    else
        // NOTE(review): unlike the CPU path, the GPU call does not pass 'bias' --
        // confirm against GPUMatrix<ElemType>::AssignNCEUnnormalizedEval whether this is intentional.
        a.m_GPUMatrix->AssignNCEUnnormalizedEval(*b.m_GPUMatrix, *c.m_GPUMatrix, *m_GPUMatrix);
    return *this;
}
// Computes the noise-contrastive estimation objective into *this (a 1x1 scalar).
// 'tmp' is a scratch matrix resized here to (a.rows/2) x sampleCount and filled with
// intermediate values that AssignNCEDerivative below consumes.
// All inputs must already live on the same device as *this.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignNoiseContrastiveEstimation(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, const Matrix<ElemType>& bias, Matrix<ElemType>& tmp)
{
    if (a.IsEmpty() || b.IsEmpty() || c.IsEmpty())
        LogicError("AssignNoiseContrastiveEstimation: one of the input matrices is empty.");

    if (a.GetDeviceId() != b.GetDeviceId() || b.GetDeviceId() != c.GetDeviceId() || c.GetDeviceId() != GetDeviceId())
        NOT_IMPLEMENTED;

    Resize(1, 1);

    if (GetDeviceId() < 0)
    {
        // sampleCount = number of columns (elements / rows)
        size_t sampleCount = a.m_CPUMatrix->GetNumElements() / a.m_CPUMatrix->GetNumRows();
        tmp.Resize(a.GetNumRows() / 2, sampleCount);
        a.m_CPUMatrix->AssignNoiseContrastiveEstimation(*b.m_CPUMatrix, *c.m_CPUMatrix,
                                                        *bias.m_CPUMatrix, *tmp.m_CPUMatrix, *m_CPUMatrix);
    }
    else
    {
        size_t sampleCount = a.m_GPUMatrix->GetNumElements() / a.m_GPUMatrix->GetNumRows();
        tmp.Resize(a.GetNumRows() / 2, sampleCount);
        a.m_GPUMatrix->AssignNoiseContrastiveEstimation(*b.m_GPUMatrix, *c.m_GPUMatrix,
                                                        *bias.m_GPUMatrix, sampleCount, *tmp.m_GPUMatrix, *m_GPUMatrix);
    }
    return *this;
}
// Computes the NCE derivative with respect to the input selected by 'inputIndex' and
// assigns it into *this. 'tmp' holds the intermediate values produced by
// AssignNoiseContrastiveEstimation (expected shape: a.rows/2 rows; see assert below).
// All inputs must already live on the same device as *this.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignNCEDerivative(const Matrix<ElemType>& tmp, const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, size_t inputIndex)
{
    if (a.IsEmpty() || b.IsEmpty() || c.IsEmpty())
        LogicError("AssignNCEDerivative: one of the input matrices is empty."); // fixed: message previously named AssignNoiseContrastiveEstimation

    if (a.GetDeviceId() != b.GetDeviceId() || b.GetDeviceId() != c.GetDeviceId() || c.GetDeviceId() != GetDeviceId())
        NOT_IMPLEMENTED;

    assert(tmp.GetNumRows() == a.GetNumRows() / 2);

    if (GetDeviceId() < 0)
    {
        // samples                         gradient           hidden           embedding         embedding/hidden
        a.m_CPUMatrix->AssignNCEDerivative(*tmp.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix, inputIndex, *m_CPUMatrix);
    }
    else
    {
        a.m_GPUMatrix->AssignNCEDerivative(*tmp.m_GPUMatrix, *b.m_GPUMatrix, *c.m_GPUMatrix, inputIndex, *m_GPUMatrix);
    }
    return *this;
}
// Accumulates the average-pooling gradient into *this. Both matrices are colocated and
// must share the same matrix type. Dense matrices only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AddAveragePoolingGradient(const Matrix<ElemType>& outputGradientBatch,
                                                              const size_t channels,
                                                              const size_t inputWidth, const size_t inputHeight, const size_t inputSizePerSample,
                                                              const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
                                                              const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample)
{
    DecideAndMoveToRightDevice(*this, outputGradientBatch);

    if (!(GetMatrixType() == outputGradientBatch.GetMatrixType()))
        NOT_IMPLEMENTED;

    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->AddAveragePoolingGradient(*(outputGradientBatch.m_CPUMatrix), channels,
                                                                   inputWidth, inputHeight, inputSizePerSample,
                                                                   outputWidth, outputHeight, outputSizePerSample,
                                                                   windowWidth, windowHeight, horizontalSubsample, verticalSubsample),
                            m_GPUMatrix->AddAveragePoolingGradient(*(outputGradientBatch.m_GPUMatrix), channels,
                                                                   inputWidth, inputHeight, inputSizePerSample,
                                                                   outputWidth, outputHeight, outputSizePerSample,
                                                                   windowWidth, windowHeight, horizontalSubsample, verticalSubsample),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);

    return *this;
}
#pragma endregion Other Helper Functions
#pragma region Static BLAS Functions
// Singular value decomposition of A into SIGMA, U, VT and workspace W.
// Implemented for CPU dense matrices only; all outputs are first moved to A's device
// and switched to A's matrix type/format.
template <class ElemType>
void Matrix<ElemType>::SVD(const Matrix<ElemType>& A, Matrix<ElemType>& SIGMA, Matrix<ElemType>& U, Matrix<ElemType>& VT, Matrix<ElemType>& W)
{
    if (A.IsEmpty())
        LogicError("SVD: the input matrix is empty.");

    DecideAndMoveToRightDevice(A, SIGMA, U);
    VT._transferToDevice(A.GetDeviceId());
    W._transferToDevice(A.GetDeviceId());

    SIGMA.SwitchToMatrixType(A.GetMatrixType(), A.GetFormat(), false);
    U.SwitchToMatrixType(A.GetMatrixType(), A.GetFormat(), false);
    VT.SwitchToMatrixType(A.GetMatrixType(), A.GetFormat(), false);
    W.SwitchToMatrixType(A.GetMatrixType(), A.GetFormat(), false);

    DISPATCH_MATRIX_ON_FLAG(&A,
                            nullptr,
                            // work on a deep copy of A (presumably the CPU SVD overwrites its input -- hence the clone)
                            Matrix<ElemType> tA = A.DeepClone();
                            CPUMatrix<ElemType>::SVD(*tA.m_CPUMatrix, *SIGMA.m_CPUMatrix, *U.m_CPUMatrix, *VT.m_CPUMatrix, *W.m_CPUMatrix);
                            SIGMA.SetDataLocation(CPU);
                            U.SetDataLocation(CPU);
                            VT.SetDataLocation(CPU);
                            W.SetDataLocation(CPU),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
/// <summary>Matrix-matrix multiply with col-major matrices (a and b may be transposed): c = alpha * op(a) * op(b) + beta*c</summary>
/// <param name="alpha">Scalar</param>
/// <param name="a">Input matrix</param>
/// <param name="transposeA">Whether matrix a is transposed</param>
/// <param name="b">Input matrix</param>
/// <param name="transposeB">Whether matrix b is transposed</param>
/// <param name="beta">Scalar</param>
/// <param name="c">Resulting matrix, user is responsible for allocating this</param>
// Dispatches on the device and on the dense/sparse combination of (a, b, c).
template <class ElemType>
void Matrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB,
                                              ElemType beta, Matrix<ElemType>& c)
{
    DecideAndMoveToRightDevice(a, b, c);

    if (c.GetDeviceId() < 0) // CPU
    {
        // CPU: sparse 'a' is unsupported; dispatch on b's and c's types
        if (a.GetMatrixType() == MatrixType::SPARSE)
            NOT_IMPLEMENTED;
        if (b.GetMatrixType() == MatrixType::SPARSE)
        {
            if (c.GetMatrixType() == MatrixType::DENSE)
            {
                CPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(alpha, *a.m_CPUMatrix, transposeA, *b.m_CPUSparseMatrix, transposeB, beta, *c.m_CPUMatrix);
                c.SetDataLocation(CPU, DENSE);
            }
            else if (c.GetMatrixType() == MatrixType::SPARSE)
            {
                // NOTE(review): this variant takes no beta -- presumably accumulate-only; confirm against CPUSparseMatrix::MultiplyAndAdd.
                CPUSparseMatrix<ElemType>::MultiplyAndAdd(alpha, *a.m_CPUMatrix, transposeA, *b.m_CPUSparseMatrix, transposeB, *c.m_CPUSparseMatrix);
                c.SetDataLocation(CPU, SPARSE);
            }
            else
                NOT_IMPLEMENTED;
        }
        else
        {
            c.SwitchToMatrixType(MatrixType::DENSE, matrixFormatDense, false);
            CPUMatrix<ElemType>::MultiplyAndWeightedAdd(alpha, *a.m_CPUMatrix, transposeA, *b.m_CPUMatrix, transposeB, beta, *c.m_CPUMatrix);
            c.SetDataLocation(CPU, DENSE);
        }
    }
    else // GPU operations
    {
        if (a.m_matrixType == b.m_matrixType && b.m_matrixType == c.m_matrixType && a.m_matrixType == MatrixType::DENSE) // All dense
        {
            GPUMatrix<ElemType>::MultiplyAndWeightedAdd(alpha, *a.m_GPUMatrix, transposeA, *b.m_GPUMatrix, transposeB, beta, *c.m_GPUMatrix);
            c.SetDataLocation(GPU, DENSE);
        }
        else if (a.m_matrixType == MatrixType::SPARSE && b.m_matrixType == c.m_matrixType && b.m_matrixType == MatrixType::DENSE) // Sparse*Dense+Dense
        {
            // b's transpose is materialized so the sparse kernel can be called with transposeB == false
            GPUMatrix<ElemType> second = transposeB ? b.m_GPUMatrix->Transpose() : *b.m_GPUMatrix;
            GPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(alpha, *a.m_GPUSparseMatrix, transposeA, second, false, beta, *c.m_GPUMatrix);
            c.SetDataLocation(GPU, DENSE);
        }
        else if (a.m_matrixType == MatrixType::DENSE && b.m_matrixType == MatrixType::SPARSE && c.m_matrixType == MatrixType::DENSE) // Dense*Sparse + Dense
        {
            // if (b.m_GPUSparseMatrix->GetFormat() == MatrixFormat::matrixFormatSparseCSR)
            // {
            GPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(alpha, *a.m_GPUMatrix, transposeA, *b.m_GPUSparseMatrix, transposeB, beta, *c.m_GPUMatrix);
            // }
            // else
            // {
            //    GPUMatrix<ElemType> firstDummy = transposeA ? a.m_GPUMatrix->Transpose()*alpha : (*a.m_GPUMatrix)*alpha;
            //    GPUMatrix<ElemType> & first= firstDummy; // GCC does not support mixing refs and non-refs
            //    GPUSparseMatrix<ElemType> secondDummy = transposeB ? b.m_GPUSparseMatrix->Transpose() : *b.m_GPUSparseMatrix;
            //    GPUSparseMatrix<ElemType> & second = secondDummy;
            //    if (beta==0)
            //    {
            //        GPUSparseMatrix<ElemType>::Multiply(first,second,*c.m_GPUMatrix);
            //    }
            //    else
            //    {
            //        Matrix<ElemType> tmp(c.GetNumRows(),c.GetNumCols(),(DEVICEID_TYPE)c.GetDeviceId());
            //        GPUSparseMatrix<ElemType>::Multiply(first,second,*tmp.m_GPUMatrix);
            //        c=tmp+c*beta;
            //    }
            // }
            c.SetDataLocation(GPU, DENSE);
        }
        else if (a.m_matrixType == MatrixType::DENSE && b.m_matrixType == MatrixType::SPARSE && c.m_matrixType == MatrixType::SPARSE) // h -> u0
        {
            // new GPU sparse matrix code
            GPUSparseMatrix<ElemType>::MultiplyAndAdd(alpha, *a.m_GPUMatrix, transposeA, *b.m_GPUSparseMatrix, transposeB, *c.m_GPUSparseMatrix);
            c.SetDataLocation(GPU, SPARSE);
        }
        else if (a.m_matrixType == b.m_matrixType && b.m_matrixType == c.m_matrixType && a.m_matrixType == MatrixType::SPARSE) // All sparse
        {
            // fold alpha into the first operand
            GPUSparseMatrix<ElemType> firstDummy = alpha == 1 ? *a.m_GPUSparseMatrix : (*a.m_GPUSparseMatrix) * alpha;
            GPUSparseMatrix<ElemType>& first = firstDummy; // By Malcolm.. gcc doesn't support auto
            if (beta == 0)
            {
                GPUSparseMatrix<ElemType>::Multiply(first, transposeA, *b.m_GPUSparseMatrix, transposeB, *c.m_GPUSparseMatrix);
                c.SetDataLocation(GPU, SPARSE);
            }
            else
            {
                // compute the product into a temporary, then blend with beta * c
                GPUSparseMatrix<ElemType> tmp(b.m_GPUSparseMatrix->GetComputeDeviceId());
                GPUSparseMatrix<ElemType>::Multiply(first, transposeA, *b.m_GPUSparseMatrix, transposeB, tmp);
                *c.m_GPUSparseMatrix = tmp + (*c.m_GPUSparseMatrix) * beta;
                c.SetDataLocation(GPU, SPARSE);
            }
        }
        else if (a.m_matrixType == b.m_matrixType && a.m_matrixType == MatrixType::DENSE && c.m_matrixType == MatrixType::SPARSE) // Dense*Dense -> Sparse
        {
            // compute densely, then convert the result to sparse and blend with beta * c
            GPUMatrix<ElemType> tmp(a.m_GPUMatrix->GetComputeDeviceId());
            GPUSparseMatrix<ElemType> tmpSparse(a.m_GPUMatrix->GetComputeDeviceId());
            GPUMatrix<ElemType>::MultiplyAndWeightedAdd(alpha, *a.m_GPUMatrix, transposeA, *b.m_GPUMatrix, transposeB, beta, tmp);
            tmpSparse.SetValue(tmp);
            *c.m_GPUSparseMatrix = tmpSparse + (*c.m_GPUSparseMatrix) * beta;
            c.SetDataLocation(GPU, SPARSE);
        }
        else
            NOT_IMPLEMENTED;
    }
}
// c = alpha * a(0,0) * b + beta * c, where 'a' must be a 1x1 (scalar) matrix.
// Dispatched on c's location; dense matrices only.
template <class ElemType>
/*static*/ void Matrix<ElemType>::Multiply1x1AndWeightedAdd(ElemType alpha, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType beta, Matrix<ElemType>& c)
{
    // special case: a is a 1x1 matrix
    // The only alternative is to Get00Elements(), which makes things inefficient.
    if (a.GetNumElements() != 1)
        InvalidArgument("Multiply1x1AndWeightedAdd: first arg must be a scalar.");

    DISPATCH_MATRIX_ON_FLAG(&c,
                            nullptr,
                            CPUMatrix<ElemType>::Multiply1x1AndWeightedAdd(alpha, *a.m_CPUMatrix, *b.m_CPUMatrix, beta, *c.m_CPUMatrix),
                            GPUMatrix<ElemType>::Multiply1x1AndWeightedAdd(alpha, *a.m_GPUMatrix, *b.m_GPUMatrix, beta, *c.m_GPUMatrix),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);
}
/// <summary>Matrix-matrix multiply with col-major matrices (a and b may be transposed): c = op(a) * op(b) + c</summary>
/// <param name="a">Input matrix</param>
/// <param name="transposeA">Whether matrix a is transposed</param>
/// <param name="b">Input matrix</param>
/// <param name="transposeB">Whether matrix b is transposed</param>
/// <param name="c">Resulting matrix, user is responsible for allocating this</param>
template <class ElemType>
void Matrix<ElemType>::MultiplyAndAdd(const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB,
                                      Matrix<ElemType>& c)
{
    // accumulating product: alpha = 1, beta = 1
    MultiplyAndWeightedAdd(1.0, a, transposeA, b, transposeB, 1.0, c);
}
/// <summary>Matrix-matrix multiply with col-major matrices (a and b may be transposed): c = op(a) * op(b)</summary>
/// <param name="a">Input matrix</param>
/// <param name="transposeA">Whether matrix a is transposed</param>
/// <param name="b">Input matrix</param>
/// <param name="transposeB">Whether matrix b is transposed</param>
/// <param name="c">Resulting matrix, user is responsible for allocating this</param>
template <class ElemType>
void Matrix<ElemType>::Multiply(const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB,
                                Matrix<ElemType>& c)
{
    // overwrite c: alpha = 1, beta = 0
    MultiplyAndWeightedAdd(1.0, a, transposeA, b, transposeB, 0.0, c);
}
/// <summary>Matrix-matrix multiply with col-major matrices (a and b are not transposed): c = a * b</summary>
/// <param name="a">Input matrix</param>
/// <param name="b">Input matrix</param>
/// <param name="c">Resulting matrix, user is responsible for allocating this</param>
template <class ElemType>
void Matrix<ElemType>::Multiply(const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c)
{
    // plain product without transposes: alpha = 1, beta = 0
    MultiplyAndWeightedAdd(1.0, a, false, b, false, 0.0, c);
}
/// <summary>1-D Convolution with col-major matrices (a and b may be transposed): c = alpha * op(a) * op(b) + beta*c. MultiplyAndWeightedAdd is just a special case of this.</summary>
/// <param name="alpha">Scalar</param>
/// <param name="a">Input matrix</param>
/// <param name="transposeA">Whether matrix a is transposed</param>
/// <param name="b">Input matrix</param>
/// <param name="transposeB">Whether matrix b is transposed</param>
/// <param name="beta">Scalar</param>
/// <param name="c">Resulting matrix, user is responsible for allocating this</param>
template <class ElemType>
void Matrix<ElemType>::ConvolveAndWeightedAdd(ElemType alpha, const Matrix<ElemType>& a, const bool transposeA, const Matrix<ElemType>& b, const bool transposeB,
ElemType beta, Matrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise)
{
// Ensure all three operands end up on the same device before dispatching.
DecideAndMoveToRightDevice(a, b, c);
// Only one backend combination is implemented: GPU device with dense 'a',
// sparse 'b', and dense 'c'. Everything else throws NOT_IMPLEMENTED.
if (c.GetDeviceId() >= 0 /*GPU*/ && a.GetMatrixType() == MatrixType::DENSE && b.GetMatrixType() == MatrixType::SPARSE && c.GetMatrixType() == MatrixType::DENSE)
{
GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd(alpha, *a.m_GPUMatrix, transposeA, *b.m_GPUSparseMatrix, transposeB, beta, *c.m_GPUMatrix, numChannels, horizontalSubsample, padding, channelwise);
}
else
{
NOT_IMPLEMENTED;
}
}
/// <summary>Matrix-scalar multiply with col-major matrices: c = alpha * a + c</summary>
/// if a is a column vector, add to all columns of c
/// if a is a row vector, add to all rows of c
/// <param name="alpha">Scalar</param>
/// <param name="a">Input matrix</param>
/// <param name="c">Resulting matrix, user is responsible for allocating this</param>
// c += alpha * a. If a is a column (row) vector it is added to every column (row) of c.
// Handles both same-type operands and mixed sparse/dense combinations (GPU only for the latter).
template <class ElemType>
/*static*/ void Matrix<ElemType>::ScaleAndAdd(ElemType alpha, const Matrix<ElemType>& a, Matrix<ElemType>& c)
{
if (a.IsEmpty() || c.IsEmpty())
LogicError("ScaleAndAdd: one of the input matrices is empty.");
DecideAndMoveToRightDevice(c, a);
if (a.GetMatrixType() == c.GetMatrixType())
{
// Same storage type: dispatch straight to the matching backend.
// GPU-sparse path: move c's old contents into a temporary 'b' first, since the
// backend computes into its last argument (c) while reading the old value.
DISPATCH_MATRIX_ON_FLAG(&c,
&c,
CPUMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_CPUMatrix, *c.m_CPUMatrix),
GPUMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUMatrix, *c.m_GPUMatrix),
NOT_IMPLEMENTED,
GPUSparseMatrix<ElemType> b = move(*c.m_GPUSparseMatrix);
GPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUSparseMatrix, 1, b, *c.m_GPUSparseMatrix));
}
else
{
// Mixed types: 'a' and 'c' differ (sparse-into-dense or dense-into-sparse).
// The dense-into-sparse GPU case converts c to dense first, then frees the sparse side.
DISPATCH_MATRIX_ON_FLAG(&c,
nullptr,
CPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_CPUSparseMatrix, *c.m_CPUMatrix);
c.SetDataLocation(CPU),
if (a.m_GPUSparseMatrix->GetFormat() == MatrixFormat::matrixFormatSparseCSC)
{
GPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUSparseMatrix, 1, *c.m_GPUMatrix, *c.m_GPUMatrix);
} else // new GPU sparse matrix code
{
GPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUSparseMatrix, *c.m_GPUMatrix);
} c.SetDataLocation(GPU),
NOT_IMPLEMENTED,
{
c.m_GPUMatrix = new GPUMatrix<ElemType>(c.m_GPUSparseMatrix->CopyToDenseMatrix());
GPUSparseMatrix<ElemType>::ScaleAndAdd(alpha, *a.m_GPUMatrix, 1, *c.m_GPUSparseMatrix, *c.m_GPUMatrix);
delete c.m_GPUSparseMatrix;
c.m_GPUSparseMatrix = NULL;
c.SetDataLocation(GPU, DENSE);
});
}
}
/// <summary>Matrix-scalar multiply with col-major matrices: c = alpha * a + beta * c</summary>
/// if a is a column vector, add to all columns of c
/// if a is a row vector, add to all rows of c
/// <param name="alpha">Scalar</param>
/// <param name="a">Input matrix</param>
/// <param name="beta">Scalar</param>
/// <param name="c">Resulting matrix, caller is responsible for allocating this</param>
template <class ElemType>
/*static*/ void Matrix<ElemType>::ScaleAndAdd(ElemType alpha, const Matrix<ElemType>& a, ElemType beta, Matrix<ElemType>& c)
{
    // c <- alpha * a + beta * c, built from the two simpler primitives.
    if (beta == 0)
    {
        Scale(alpha, a, c); // c is overwritten; its old contents are irrelevant
    }
    else if (beta == 1)
    {
        ScaleAndAdd(alpha, a, c); // plain accumulate: c <- alpha * a + c
    }
    else
    {
        // General beta: first c' = (alpha / beta) * a + c, then scale the whole
        // thing by beta, giving c = alpha * a + beta * c.
        ScaleAndAdd(alpha / beta, a, c);
        Scale(beta, c);
    }
}
// tensor swapping and addition: c <- keepWeight * b + scaleFactor * swap_dimensions(a, S, K)
// where
// - a is interpreted as a tensor of dimension (D x S x M x K x T) // column-major, as usual
// - b and c as a tensor of dimension (D x K x M x S x T) // note: K and S swapped
// The main point of this function is to reshuffle a tensor w.r.t. two dimensions that get swapped in memory,
// but for gradients, we will need to add, hence the keepWeight.
// Notes:
// - c and b may be the same (in-place operation is expressly allowed).
// - D, M, and/or T may be 1. For example, D == M == T == 1 implements a 2D matrix transpose from (S x K) to (K x S).
// - If keepWeight == 0, then b will just get overwritten (straight assignment, b may be uninitialized or contain NaNs).
// - The original matrix dimensions are ignored except that sizes must match (rows x cols == D x S x M x K x T).
// For diagnostics purposes, this function also enforces the rows % D == 0 and cols % T == 0, but this is not a functional requirement and can be removed if that helps.
// - Dense matrices only.
// TODO: Handle these cases:
// - no swapping happening --just do a block copy
// - swapping can be implemented by cuDNN --do so
template <class ElemType>
/*static*/ void Matrix<ElemType>::TensorShuffleScaleAndAdd(ElemType keepWeight, const Matrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const Matrix<ElemType>& b, Matrix<ElemType>& c)
{
// Element counts must agree; actual row/col dimensions are reinterpreted as (D x S x M x K x T).
if (a.GetNumElements() != c.GetNumElements() || b.GetNumElements() != c.GetNumElements()) // allocations must match (but not dimensions, since we reinterpret the dimensions anyway)
InvalidArgument("TensorShuffleScaleAndAdd: a, b, and c must have same number of elements.");
if (c.IsEmpty()) // operating on empty minibatch slices is perfectly cromulent
return;
// sanity checks for current use cases--these are not strictly necessary and can be deleted
if (a.GetNumRows() % D != 0 || b.GetNumRows() % D != 0 || c.GetNumRows() % D != 0)
InvalidArgument("TensorShuffleScaleAndAdd: a, b, and c are meant to have a row dimension that is a multiple of D.");
if (a.GetNumCols() % T != 0 || b.GetNumCols() % T != 0 || c.GetNumCols() % T != 0)
InvalidArgument("TensorShuffleScaleAndAdd: a, b, and c are meant to have a column dimension that is a multiple of T.");
DecideAndMoveToRightDevice(a, b, c);
// Dispatch on c's location/type; note a GPU-sparse variant exists, unlike most ops here.
DISPATCH_MATRIX_ON_FLAG(&c,
nullptr,
CPUMatrix<ElemType>::TensorShuffleScaleAndAdd(keepWeight, *a.m_CPUMatrix, D, S, M, K, T, scaleFactor, *b.m_CPUMatrix, *c.m_CPUMatrix),
GPUMatrix<ElemType>::TensorShuffleScaleAndAdd(keepWeight, *a.m_GPUMatrix, D, S, M, K, T, scaleFactor, *b.m_GPUMatrix, *c.m_GPUMatrix),
NOT_IMPLEMENTED,
GPUSparseMatrix<ElemType>::TensorShuffleScaleAndAdd(keepWeight, *a.m_GPUSparseMatrix, D, S, M, K, T, scaleFactor, *b.m_GPUSparseMatrix, *c.m_GPUSparseMatrix));
}
/// <summary>c += alpha * (a-b)</summary>
/// if a, b, c must have same dim
/// <param name="alpha">Scalar</param>
/// <param name="a">Input matrix</param>
/// <param name="b">Input matrix</param>
/// <param name="c">Resulting matrix, user is responsible for allocating this</param>
// c += alpha * (a - b). All three matrices must share the same matrix type; dense only.
template <class ElemType>
void Matrix<ElemType>::AddScaledDifference(const ElemType alpha, const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c)
{
DecideAndMoveToRightDevice(c, a, b);
if (!(a.GetMatrixType() == b.GetMatrixType() && a.GetMatrixType() == c.GetMatrixType()))
NOT_IMPLEMENTED;
DISPATCH_MATRIX_ON_FLAG(&c,
&c,
CPUMatrix<ElemType>::AddScaledDifference(alpha, *a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix),
GPUMatrix<ElemType>::AddScaledDifference(alpha, *a.m_GPUMatrix, *b.m_GPUMatrix, *c.m_GPUMatrix),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
/// <summary> c = alpha * (a-b)</summary>
/// if a, b, c must have same dim
/// <param name="alpha">Scalar</param>
/// <param name="a">Input matrix</param>
/// <param name="b">Input matrix</param>
/// <param name="c">Resulting matrix, user is responsible for allocating this</param>
// c = alpha * (a - b). c is switched to a's matrix type/format first; dense only.
template <class ElemType>
void Matrix<ElemType>::AssignScaledDifference(const ElemType alpha, const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c)
{
DecideAndMoveToRightDevice(a, b, c);
if (!(a.GetMatrixType() == b.GetMatrixType()))
NOT_IMPLEMENTED;
// c is pure output: adopt a's type/format without preserving c's old values.
c.SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&c,
&c,
CPUMatrix<ElemType>::AssignScaledDifference(alpha, *a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix),
GPUMatrix<ElemType>::AssignScaledDifference(alpha, *a.m_GPUMatrix, *b.m_GPUMatrix, *c.m_GPUMatrix),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
/// <summary>c += alpha * (a-b)</summary>
/// if a, b, c must have same dim
/// <param name="alpha">Scalar</param>
/// <param name="a">Input matrix</param>
/// <param name="b">Input matrix</param>
/// <param name="c">Resulting matrix, user is responsible for allocating this</param>
// c += alpha[0,0] * (a - b), where alpha is a 1x1 matrix living on the same device as c.
template <class ElemType>
void Matrix<ElemType>::AddScaledDifference(const Matrix<ElemType>& alpha, const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c)
{
DecideAndMoveToRightDevice(c, a, b);
// alpha is not part of the device negotiation above; move it explicitly to c's device.
alpha._transferToDevice(c.GetDeviceId());
if (!(a.GetMatrixType() == b.GetMatrixType() && a.GetMatrixType() == c.GetMatrixType() && a.GetMatrixType() == alpha.GetMatrixType()))
NOT_IMPLEMENTED;
DISPATCH_MATRIX_ON_FLAG(&c,
&c,
CPUMatrix<ElemType>::AddScaledDifference(*alpha.m_CPUMatrix, *a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix),
GPUMatrix<ElemType>::AddScaledDifference(*alpha.m_GPUMatrix, *a.m_GPUMatrix, *b.m_GPUMatrix, *c.m_GPUMatrix),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
/// <summary> c = alpha * (a-b)</summary>
/// if a, b, c must have same dim
/// <param name="alpha">Scalar</param>
/// <param name="a">Input matrix</param>
/// <param name="b">Input matrix</param>
/// <param name="c">Resulting matrix, user is responsible for allocating this</param>
// c = alpha[0,0] * (a - b), where alpha is a 1x1 matrix; c adopts a's type/format.
template <class ElemType>
void Matrix<ElemType>::AssignScaledDifference(const Matrix<ElemType>& alpha, const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c)
{
DecideAndMoveToRightDevice(a, b, alpha);
// c is pure output: just follow a's device rather than participating in the negotiation.
c._transferToDevice(a.GetDeviceId());
if (!(a.GetMatrixType() == b.GetMatrixType() && a.GetMatrixType() == alpha.GetMatrixType()))
NOT_IMPLEMENTED;
c.SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&c,
nullptr,
CPUMatrix<ElemType>::AssignScaledDifference(*alpha.m_CPUMatrix, *a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix),
GPUMatrix<ElemType>::AssignScaledDifference(*alpha.m_GPUMatrix, *a.m_GPUMatrix, *b.m_GPUMatrix, *c.m_GPUMatrix),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
//c[ci,cj] += a[ai,aj]
//c[ci,cj] += a[ai,aj]  -- single-element accumulate across two matrices (dense only)
template <class ElemType>
void Matrix<ElemType>::AddElementToElement(const Matrix<ElemType>& a, const size_t ai, const size_t aj, Matrix<ElemType>& c, const size_t ci, const size_t cj)
{
DecideAndMoveToRightDevice(c, a);
if (c.GetMatrixType() != a.GetMatrixType())
NOT_IMPLEMENTED;
DISPATCH_MATRIX_ON_FLAG(&c,
&c,
CPUMatrix<ElemType>::AddElementToElement(*a.m_CPUMatrix, ai, aj, *c.m_CPUMatrix, ci, cj),
GPUMatrix<ElemType>::AddElementToElement(*a.m_GPUMatrix, ai, aj, *c.m_GPUMatrix, ci, cj),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
//c[ci,cj] = a[ai,aj]
//c[ci,cj] = a[ai,aj]  -- single-element copy; note only the CPU-dense path is implemented
template <class ElemType>
void Matrix<ElemType>::AssignElementToElement(const Matrix<ElemType>& a, const size_t ai, const size_t aj, Matrix<ElemType>& c, const size_t ci, const size_t cj)
{
DecideAndMoveToRightDevice(c, a);
if (c.GetMatrixType() != a.GetMatrixType())
NOT_IMPLEMENTED;
DISPATCH_MATRIX_ON_FLAG(&c,
&c,
CPUMatrix<ElemType>::AssignElementToElement(*a.m_CPUMatrix, ai, aj, *c.m_CPUMatrix, ci, cj),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
// In-place update of 'a' at linear index 'position' (dispatches to the backend MinusOneAt;
// the name suggests a(position) -= 1 -- confirm against CPUMatrix/GPUMatrix implementations).
template <class ElemType>
void Matrix<ElemType>::MinusOneAt(Matrix<ElemType>& a, const size_t position)
{
// Dense CPU/GPU only; sparse variants are not implemented.
DISPATCH_MATRIX_ON_FLAG(&a,
&a,
CPUMatrix<ElemType>::MinusOneAt(*a.m_CPUMatrix, position),
GPUMatrix<ElemType>::MinusOneAt(*a.m_GPUMatrix, position),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
/// <summary>Matrix-scalar multiply with col-major matrices: c = alpha * a</summary>
/// <param name="alpha">Scalar</param>
/// <param name="a">Input matrix</param>
/// <param name="c">Resulting matrix, user is responsible for allocating this</param>
template <class ElemType>
void Matrix<ElemType>::Scale(ElemType alpha, const Matrix<ElemType>& a, Matrix<ElemType>& c)
{
DecideAndMoveToRightDevice(c, a);
// c is pure output: adopt a's type/format without preserving old contents.
c.SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
if (alpha == 0)
{
c.Resize(a);
c.SetValue(0); // this is a little faster, and also does not propagate NaNs, which we'd expect from 'beta' parameters
return;
}
else
// GPU-sparse path uses operator* and assigns the product back into c.
DISPATCH_MATRIX_ON_FLAG(&c,
&c,
CPUMatrix<ElemType>::Scale(alpha, *a.m_CPUMatrix, *c.m_CPUMatrix),
GPUMatrix<ElemType>::Scale(alpha, *a.m_GPUMatrix, *c.m_GPUMatrix),
NOT_IMPLEMENTED, * c.m_GPUSparseMatrix = (*a.m_GPUSparseMatrix) * alpha);
}
/// <summary>Matrix-scalar multiply with col-major matrices: a = alpha * a</summary>
/// <param name="alpha">Scalar</param>
/// <param name="a">Input matrix</param>
template <class ElemType>
void Matrix<ElemType>::Scale(ElemType alpha, Matrix<ElemType>& a)
{
// Note ordering: the alpha == 0 shortcut runs even for an empty matrix (SetValue on
// an empty matrix is the backend's concern); only nonzero alpha early-outs on empty.
if (alpha == 0)
a.SetValue(0); // this is a little faster, and also does not propagate NaNs, which we'd expect from 'beta' parameters
else if (a.IsEmpty())
return;
else
DISPATCH_MATRIX_ON_FLAG(&a,
&a,
CPUMatrix<ElemType>::Scale(alpha, *a.m_CPUMatrix),
GPUMatrix<ElemType>::Scale(alpha, *a.m_GPUMatrix),
NOT_IMPLEMENTED,
GPUSparseMatrix<ElemType>::Scale(alpha, *a.m_GPUSparseMatrix));
}
/// <summary>Matrix scalar matrix multiply with col-major matrices: a = alpha[0,0] * a</summary>
/// <param name="alpha">1x1 matrix</param>
/// <param name="a">Input matrix</param>
// a *= alpha[0,0], where alpha is a 1x1 matrix (kept on-device to avoid a sync readback).
template <class ElemType>
void Matrix<ElemType>::Scale(const Matrix<ElemType>& alpha, Matrix<ElemType>& a)
{
if (a.IsEmpty())
return;
DecideAndMoveToRightDevice(a, alpha);
if (a.GetMatrixType() != alpha.GetMatrixType())
NOT_IMPLEMENTED;
DISPATCH_MATRIX_ON_FLAG(&a,
nullptr,
CPUMatrix<ElemType>::Scale(*alpha.m_CPUMatrix, *a.m_CPUMatrix),
GPUMatrix<ElemType>::Scale(*alpha.m_GPUMatrix, *a.m_GPUMatrix),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
// Column-wise (or row-wise) inner products of a and b written into c; dense only.
template <class ElemType>
void Matrix<ElemType>::InnerProduct(const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c, const bool isColWise)
{
if (a.IsEmpty() || b.IsEmpty())
LogicError("InnerProduct: one of the input matrix is empty.");
DecideAndMoveToRightDevice(a, b, c);
if (a.GetMatrixType() != b.GetMatrixType())
NOT_IMPLEMENTED;
c.SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&c,
&c,
CPUMatrix<ElemType>::InnerProduct(*a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix, isColWise),
GPUMatrix<ElemType>::InnerProduct(*a.m_GPUMatrix, *b.m_GPUMatrix, *c.m_GPUMatrix, isColWise),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
// Full (Frobenius-style) inner product of two whole matrices, returned as a scalar.
// Mixed dense/sparse pairs are only supported on GPU.
template <class ElemType>
ElemType Matrix<ElemType>::InnerProductOfMatrices(const Matrix<ElemType>& a, const Matrix<ElemType>& b)
{
if (a.IsEmpty() || b.IsEmpty())
LogicError("InnerProductOfMatrices: one of the input matrices is empty.");
DecideAndMoveToRightDevice(a, b);
if (a.GetMatrixType() == b.GetMatrixType())
{
DISPATCH_MATRIX_ON_FLAG(&a,
nullptr,
return CPUMatrix<ElemType>::InnerProductOfMatrices(*a.m_CPUMatrix, *b.m_CPUMatrix),
return GPUMatrix<ElemType>::InnerProductOfMatrices(*a.m_GPUMatrix, *b.m_GPUMatrix),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
else
{
// Mixed types: dense-a/sparse-b when a is dense, sparse-a/dense-b when a is sparse.
DISPATCH_MATRIX_ON_FLAG(&a,
nullptr,
NOT_IMPLEMENTED,
return GPUSparseMatrix<ElemType>::InnerProductOfMatrices(*a.m_GPUMatrix, *b.m_GPUSparseMatrix),
NOT_IMPLEMENTED,
return GPUSparseMatrix<ElemType>::InnerProductOfMatrices(*a.m_GPUSparseMatrix, *b.m_GPUMatrix));
}
}
// *this = [InnerProductOfMatrices(a, b)] as a 1x1 matrix.
// On GPU this stays on-device (no host readback); on CPU the scalar is computed then stored.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignInnerProductOfMatrices(const Matrix<ElemType>& a, const Matrix<ElemType>& b)
{
if (a.IsEmpty() || b.IsEmpty())
LogicError("InnerProductOfMatrices: one of the input matrices is empty.");
Resize(1, 1);
DecideAndMoveToRightDevice(a, b, *this);
if (a.GetMatrixType() == b.GetMatrixType())
{
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&a,
this,
m_CPUMatrix->SetValue(CPUMatrix<ElemType>::InnerProductOfMatrices(*a.m_CPUMatrix, *b.m_CPUMatrix)),
m_GPUMatrix->AssignInnerProductOfMatrices(*a.m_GPUMatrix, *b.m_GPUMatrix),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
else
{
NOT_IMPLEMENTED;
}
return *this;
}
// c = a .^ alpha (element-wise power). Supports dense CPU/GPU and GPU-sparse.
template <class ElemType>
void Matrix<ElemType>::ElementWisePower(ElemType alpha, const Matrix<ElemType>& a, Matrix<ElemType>& c)
{
if (a.IsEmpty())
return;
DecideAndMoveToRightDevice(a, c);
c.SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&c,
nullptr,
CPUMatrix<ElemType>::ElementWisePower(alpha, *a.m_CPUMatrix, *c.m_CPUMatrix),
GPUMatrix<ElemType>::ElementWisePower(alpha, *a.m_GPUMatrix, *c.m_GPUMatrix),
NOT_IMPLEMENTED,
GPUSparseMatrix<ElemType>::ElementWisePower(alpha, *a.m_GPUSparseMatrix, *c.m_GPUSparseMatrix));
}
// Element-wise equality check within 'threshold'. Returns false immediately on a
// dimension mismatch. Mixed dense/sparse comparisons are GPU-only.
template <class ElemType>
bool Matrix<ElemType>::AreEqual(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const ElemType threshold /*= 1e-8*/)
{
if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
return false;
DecideAndMoveToRightDevice(a, b);
if (a.GetMatrixType() == b.GetMatrixType())
{
DISPATCH_MATRIX_ON_FLAG(&a,
nullptr,
return CPUMatrix<ElemType>::AreEqual(*a.m_CPUMatrix, *b.m_CPUMatrix, threshold),
return GPUMatrix<ElemType>::AreEqual(*a.m_GPUMatrix, *b.m_GPUMatrix, threshold),
return CPUSparseMatrix<ElemType>::AreEqual(*a.m_CPUSparseMatrix, *b.m_CPUSparseMatrix, threshold),
return GPUSparseMatrix<ElemType>::AreEqual(*a.m_GPUSparseMatrix, *b.m_GPUSparseMatrix, threshold));
}
else
{
// Mixed types: the 'return false' after NOT_IMPLEMENTED is unreachable but keeps
// all dispatch branches returning a value.
DISPATCH_MATRIX_ON_FLAG(&a,
nullptr,
NOT_IMPLEMENTED;
return false,
return GPUSparseMatrix<ElemType>::AreEqual(*a.m_GPUMatrix, *b.m_GPUSparseMatrix, threshold),
NOT_IMPLEMENTED;
return false,
return GPUSparseMatrix<ElemType>::AreEqual(*a.m_GPUSparseMatrix, *b.m_GPUMatrix, threshold));
}
}
// Returns true if any element of 'a' equals 'value' exactly; dense only.
template <class ElemType>
bool Matrix<ElemType>::HasElement(const Matrix<ElemType>& a, const ElemType value)
{
if (a.IsEmpty())
return false;
DISPATCH_MATRIX_ON_FLAG(&a,
&a,
return CPUMatrix<ElemType>::HasElement(*a.m_CPUMatrix, value),
return GPUMatrix<ElemType>::HasElement(*a.m_GPUMatrix, value),
NOT_IMPLEMENTED;
return false,
NOT_IMPLEMENTED;
return false);
}
// diagnostics helper to check if matrix has a NaN
// This is very slow.
template <class ElemType>
bool Matrix<ElemType>::HasNan(const char* name) const
{
    // Not implemented for sparse matrices.
    // Return false as a workaround to at
    // least evaluate the dense matrices.
    if (m_matrixType == MatrixType::SPARSE)
        return false;

    if (IsEmpty())
        return false;

    // if GPU then first detect NaN there, will be faster
    if (GetDeviceId() != CPUDEVICE)
    {
        Matrix<ElemType> sum(GetDeviceId());
        sum.AssignSumOfElements(*this);
        auto x = sum.Get00Element();
        if (!std::isnan(x))
            return false;
        // fall through: scan element-wise on the CPU to report the first offending coordinate
    }

    const Matrix<ElemType>& us = *this;
    foreach_coord (i, j, us)
        if (std::isnan(us(i, j)))
        {
            // Fix: the coordinates are size_t; the previous "%ld" format paired with size_t is
            // undefined on LLP64 targets (64-bit Windows, where 'long' is 32-bit). Cast to
            // unsigned long long and print with %llu, which is portable across MSVC and gcc.
            fprintf(stderr, "HasNan: NaN detected at %s (%llu,%llu) in (%d,%d) matrix\n",
                    name, (unsigned long long) i, (unsigned long long) j, (int) GetNumRows(), (int) GetNumCols());
            return true;
        }
    return false;
}
#define CheckNan(m) m.HasNan(#m)
// another diagnostics helper to check if matrix has a NaN
// This is used at load and save time. This test is slow.
// Counts elements that are NaN or +/-INF. Used at load/save time as a diagnostic; slow
// (element-by-element host-side scan).
template <class ElemType>
size_t Matrix<ElemType>::CountNanInf() const
{
    const auto& us = *this;
    size_t n = 0; // number of NaNs/INF found
    foreach_coord (i, j, us)
    {
        auto val = us(i, j);
        // !isfinite() is already true for NaN as well as for +/-INF, so the previous
        // additional isnan() test was redundant and has been dropped.
        if (!std::isfinite(val))
            n++;
    }
    return n;
}
// TODO: these are scalar operations--why are they in Matrix?
// Scalar helper: 10^num, computed as e^(num * ln 10).
template <class ElemType>
ElemType Matrix<ElemType>::Exp10(ElemType num)
{
    // Keep the historical truncated ln(10) constant so results match bit-for-bit.
    const double naturalLogOfTen = 2.302585093;
    return static_cast<ElemType>(exp(num * naturalLogOfTen));
}
// Scalar helper: mathematical modulus x mod y for y > 0 (result in [0, y) even for negative x,
// since floor() rounds toward negative infinity).
template <class ElemType>
ElemType Matrix<ElemType>::Mod(ElemType x, ElemType y)
{
    assert(y > 0);
    if (y <= 0)
        // Fix: the old message "y is smaller than zero" was wrong for y == 0,
        // which this branch also rejects.
        LogicError("Mod: y must be positive.");
    return x - y * floor(x / y);
}
// TODO: use static LogAdd() as defined in TensorOps.h
// Not doing this currently because that one uses ElemType for all ops, while this one uses double inside. Must compare before making this change.
// Numerically stable log(e^x + e^y), classic HTK-style implementation.
// MINLOGEXP / LSMALL / LZERO are clamping constants defined elsewhere in the project.
template <class ElemType>
ElemType Matrix<ElemType>::LogAdd(ElemType x, ElemType y)
{
ElemType temp, diff, z;
// Ensure x >= y so that diff = y - x <= 0 and exp(diff) cannot overflow.
if (x < y)
{
temp = x;
x = y;
y = temp; // TODO: ::swap(x,y)?
}
diff = y - x;
if (diff < MINLOGEXP)
{
// y is negligibly small relative to x; also clamp a too-small x to LZERO ("log zero").
return (ElemType)((x < LSMALL) ? LZERO : x);
}
else
{
z = exp(diff);
return (ElemType)(x + log(1.0 + z));
}
}
//Matrix<ElemType>& Matrix<ElemType>::Shift(const Matrix<ElemType>& a, size_t shift)
//[this]= (a right shift by n), padded with zeros
// shift left, shift needs to be negative value
// shift right, shift needs to be positive value
// BUGBUG: Leaves uninitialized values in the opened-up columns.
// NOTE: this function is intentionally disabled -- both branches of the first 'if'
// throw, so the shifting code below is currently unreachable (see BUGBUG above:
// it would leave the opened-up columns uninitialized).
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::Shift(const Matrix<ElemType>& a, int shift)
{
if (a.IsEmpty())
LogicError("Shift: Matrix is empty.");
else
LogicError("Shift: BUGBUG This function currently leaves uninitialized values. Fix the code or contact fseide@microsoft.com.");
auto& us = *this;
if (this != &a)
{
Resize(a.GetNumRows(), a.GetNumCols());
}
long n = (long) GetNumCols();
// Positive shift moves columns right, negative moves them left; out-of-range shifts are no-ops.
if (shift >= 0 && shift < n)
us.ColumnSlice(shift, n - shift).SetValue(a.ColumnSlice(0, n - shift));
if (shift < 0 && shift > -n)
us.ColumnSlice(0, n + shift).SetValue(a.ColumnSlice(-shift, n + shift));
return *this;
}
// *this = element product of row vectors a and b with shift/negative-sample handling
// (used by similarity-style criteria); a and b must be 1 x N row vectors of equal size.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignElementProductOfWithShiftNeg(const Matrix<ElemType>& a, const Matrix<ElemType>& b, size_t shift, size_t negnumber)
{
if (a.IsEmpty() || b.IsEmpty())
LogicError("AssignElementProductOfWithShiftNeg: Matrix is empty.");
assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols());
if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()))
InvalidArgument("The input matrix dimensions do not match.");
if (a.GetNumRows() != 1)
InvalidArgument("AssignElementProductOfWithShiftNeg: The input matrix must be a row vector.");
DecideAndMoveToRightDevice(a, b, *this);
if (!(a.GetMatrixType() == b.GetMatrixType()))
NOT_IMPLEMENTED;
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->AssignElementProductOfWithShiftNeg(*a.m_CPUMatrix, *b.m_CPUMatrix, shift, negnumber),
m_GPUMatrix->AssignElementProductOfWithShiftNeg(*a.m_GPUMatrix, *b.m_GPUMatrix, shift, negnumber),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignInnerProductOfWithShiftNeg(const Matrix<ElemType>& a, const Matrix<ElemType>& b, const bool isColWise, size_t shift, size_t negnumber)
{
    // Thin member wrapper: compute into *this via the static helper, then return
    // self to allow call chaining.
    Matrix<ElemType>::InnerProductWithShiftNeg(a, b, *this, isColWise, shift, negnumber);
    return *this;
}
// Inner product with shift and negative-sample handling, written into c; dense only.
template <class ElemType>
void Matrix<ElemType>::InnerProductWithShiftNeg(const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c, const bool isColWise, size_t shift, size_t negnumber)
{
if (a.IsEmpty() || b.IsEmpty())
LogicError("InnerProduct: one of the input matrix is empty.");
DecideAndMoveToRightDevice(a, b, c);
if (a.GetMatrixType() != b.GetMatrixType())
NOT_IMPLEMENTED;
c.SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
// NOTE(review): the GPU call does not receive 'isColWise' while the CPU call does --
// presumably the GPU kernel only supports one orientation; verify against GPUMatrix.
DISPATCH_MATRIX_ON_FLAG(&c,
&c,
CPUMatrix<ElemType>::InnerProductWithShiftNeg(*a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix, isColWise, shift, negnumber),
GPUMatrix<ElemType>::InnerProductWithShiftNeg(*a.m_GPUMatrix, *b.m_GPUMatrix, *c.m_GPUMatrix, shift, negnumber),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
// *this = row 'index' of matrix a (extracted via the backend GetARowByIndex); dense only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::GetARowByIndex(const Matrix<ElemType>& a, size_t index)
{
if (a.IsEmpty())
LogicError("GetARowByIndex: Matrix is empty.");
// WARNING: a and this must have same type
if (!(GetMatrixType() == a.GetMatrixType()))
NOT_IMPLEMENTED;
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->GetARowByIndex(*a.m_CPUMatrix, index),
m_GPUMatrix->GetARowByIndex(*a.m_GPUMatrix, index),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
// Row-wise element multiply of a and b with a column shift, written into c; dense only.
// 'bFirstmatrixfixed' selects which operand stays fixed while the other is shifted.
template <class ElemType>
void Matrix<ElemType>::ConductRowElementMultiplyWithShift(const Matrix<ElemType>& a, const Matrix<ElemType>& b, Matrix<ElemType>& c, size_t shift, bool bFirstmatrixfixed)
{
if (a.IsEmpty() || b.IsEmpty())
LogicError("InnerProduct: one of the input matrix is empty.");
DecideAndMoveToRightDevice(a, b, c);
if (a.GetMatrixType() != b.GetMatrixType())
NOT_IMPLEMENTED;
c.SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&c,
&c,
CPUMatrix<ElemType>::ConductRowElementMultiplyWithShift(*a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix, shift, bFirstmatrixfixed),
GPUMatrix<ElemType>::ConductRowElementMultiplyWithShift(*a.m_GPUMatrix, *b.m_GPUMatrix, *c.m_GPUMatrix, shift, bFirstmatrixfixed),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
// *this = element product of row vectors a and b with a column shift; a and b must be
// 1 x N row vectors with matching dimensions. Dense only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignElementProductOfWithShift(const Matrix<ElemType>& a, const Matrix<ElemType>& b, size_t shift)
{
if (a.IsEmpty() || b.IsEmpty())
LogicError("AssignElementProductOfWithShift: Matrix is empty.");
assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols());
if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()))
InvalidArgument("The input matrix dimensions do not match.");
if (a.GetNumRows() != 1)
InvalidArgument("AssignElementProductOfWithShiftNeg: The input matrix must be a row vector.");
DecideAndMoveToRightDevice(a, b, *this);
if (!(a.GetMatrixType() == b.GetMatrixType()))
NOT_IMPLEMENTED;
SwitchToMatrixType(a.GetMatrixType(), a.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->AssignElementProductOfWithShift(*a.m_CPUMatrix, *b.m_CPUMatrix, shift),
m_GPUMatrix->AssignElementProductOfWithShift(*a.m_GPUMatrix, *b.m_GPUMatrix, shift),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
// Backward pass of the recurrent CRF: computes beta from alpha and the score matrices.
template <class ElemType>
void Matrix<ElemType>::RCRFBackwardCompute(const Matrix<ElemType>& alpha, Matrix<ElemType>& beta,
Matrix<ElemType>& functionValues, const Matrix<ElemType>& lbls,
const Matrix<ElemType>& pos_scores, const Matrix<ElemType>& pair_scores, const int shift)
{
DecideAndMoveToRightDevice(alpha, beta);
functionValues._transferToDevice(alpha.GetDeviceId());
beta._transferToDevice(alpha.GetDeviceId());
// NOTE(review): the CPU path takes neither pos_scores nor shift while the GPU path
// takes both -- presumably the CPU implementation derives them differently; verify
// against CPUMatrix::RCRFBackwardCompute before relying on identical semantics.
DISPATCH_MATRIX_ON_FLAG(&alpha,
&beta,
CPUMatrix<ElemType>::RCRFBackwardCompute(
*alpha.m_CPUMatrix,
*beta.m_CPUMatrix,
*lbls.m_CPUMatrix,
*pair_scores.m_CPUMatrix),
GPUMatrix<ElemType>::RCRFBackwardCompute(
*alpha.m_GPUMatrix,
*beta.m_GPUMatrix,
*lbls.m_GPUMatrix,
*pos_scores.m_GPUMatrix,
*pair_scores.m_GPUMatrix, shift),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
// Gradient of the recurrent-CRF transition scores, accumulated into 'grd'.
// NOTE(review): the CPU path does not take startLbl/shift while the GPU path does --
// verify against the backend implementations.
template <class ElemType>
void Matrix<ElemType>::RCRFTransGrdCompute(const Matrix<ElemType>& lbls,
const Matrix<ElemType>& alpha,
const Matrix<ElemType>& beta,
const Matrix<ElemType>& pair_scores,
Matrix<ElemType>& grd,
const int startLbl,
const int shift)
{
DecideAndMoveToRightDevice(alpha, grd);
grd._transferToDevice(alpha.GetDeviceId());
DISPATCH_MATRIX_ON_FLAG(&alpha,
&grd,
CPUMatrix<ElemType>::RCRFTransGrdCompute(
*lbls.m_CPUMatrix,
*alpha.m_CPUMatrix,
*beta.m_CPUMatrix,
*pair_scores.m_CPUMatrix,
*grd.m_CPUMatrix),
GPUMatrix<ElemType>::RCRFTransGrdCompute(
*lbls.m_GPUMatrix,
*alpha.m_GPUMatrix,
*beta.m_GPUMatrix,
*pair_scores.m_GPUMatrix,
*grd.m_GPUMatrix,
startLbl,
shift),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
// Drops (masks) frames of *this based on the label/gamma matrices and the given
// threshold (dispatches to the backend DropFrame). 'label' and 'gamma' must have
// identical dimensions. Dense only.
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::DropFrame(const Matrix<ElemType>& label, const Matrix<ElemType>& gamma, const ElemType& threshhold)
{
    DecideAndMoveToRightDevice(*this, label, gamma);

    if (label.GetNumCols() != gamma.GetNumCols() || label.GetNumRows() != gamma.GetNumRows())
        // Fix: corrected the typo "gamm matrix" in the error message.
        LogicError("DropFrame: label matrix is not in the same size as gamma matrix.");
    SwitchToMatrixType(label.GetMatrixType(), label.GetFormat(), false);

    DISPATCH_MATRIX_ON_FLAG(this,
                            this,
                            m_CPUMatrix->DropFrame(*label.m_CPUMatrix, *gamma.m_CPUMatrix, threshhold),
                            m_GPUMatrix->DropFrame(*label.m_GPUMatrix, *gamma.m_GPUMatrix, threshhold),
                            NOT_IMPLEMENTED,
                            NOT_IMPLEMENTED);

    return *this;
}
/// <summary>Computes the sequence-training error signal of *this from the label, network-output
/// and gamma matrices (the previous comment, "c = alpha * (a-b)", was copied from
/// AssignScaledDifference and did not describe this function).</summary>
/// <param name="hsmoothingWeight">Interpolation weight applied by the backend AssignSequenceError</param>
/// <param name="label">Label matrix</param>
/// <param name="dnnoutput">Network output matrix</param>
/// <param name="gamma">Gamma matrix</param>
/// <param name="alpha">Scalar passed through to the backend</param>
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignSequenceError(const ElemType hsmoothingWeight, const Matrix<ElemType>& label,
const Matrix<ElemType>& dnnoutput, const Matrix<ElemType>& gamma, ElemType alpha)
{
DecideAndMoveToRightDevice(label, dnnoutput, gamma);
if (!(label.GetMatrixType() == gamma.GetMatrixType()))
NOT_IMPLEMENTED;
// *this adopts the label matrix's type/format; old contents are not preserved.
SwitchToMatrixType(label.GetMatrixType(), label.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->AssignSequenceError(hsmoothingWeight, *label.m_CPUMatrix, *dnnoutput.m_CPUMatrix, *gamma.m_CPUMatrix, alpha),
m_GPUMatrix->AssignSequenceError(hsmoothingWeight, *label.m_GPUMatrix, *dnnoutput.m_GPUMatrix, *gamma.m_GPUMatrix, alpha),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
return *this;
}
#pragma endregion Static BLAS Functions
// TensorView currently does not interface with sparse matrices. For now, we just catch this and throw.
template <class ElemType>
static bool VerifyIsDense(const Matrix<ElemType>& a)
{
    // Guard for the TensorOp overloads below: tensor ops are dense-only, so fail
    // loudly on any sparse operand. Returns true so callers can chain with &&.
    const bool isDense = (a.GetMatrixType() == DENSE);
    if (!isDense)
        RuntimeError("TensorOp: Tensor operations are currently not supported for sparse matrices.");
    return true;
}
// Unary tensor operation: *this = beta * *this + alpha * op(a), with strided/reduced
// dimensions as described by the offset/dim/stride arguments. Dense only.
template <class ElemType>
void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, ElemType alpha, ElementWiseOperator op,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
// VerifyIsDense() throws on sparse inputs; the && chain just sequences the checks.
VerifyIsDense(*this) && VerifyIsDense(a);
DecideAndMoveToRightDevice(*this, a);
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
m_GPUMatrix->TensorOp(beta, *a.m_GPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
// Binary tensor operation: *this = beta * *this + alpha * op(a, b). Dense only.
template <class ElemType>
void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType alpha, ElementWiseOperator op,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides)
{
// VerifyIsDense() throws on sparse inputs; the && chain just sequences the checks.
VerifyIsDense(*this) && VerifyIsDense(a) && VerifyIsDense(b);
DecideAndMoveToRightDevice(*this, a, b);
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, *b.m_CPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
m_GPUMatrix->TensorOp(beta, *a.m_GPUMatrix, *b.m_GPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
// Ternary tensor operation: *this = beta * *this + alpha * op(a, b, c). Dense only.
template <class ElemType>
void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, const Matrix<ElemType>& c, ElemType alpha, ElementWiseOperator op,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides)
{
// VerifyIsDense() throws on sparse inputs; the && chain just sequences the checks.
VerifyIsDense(*this) && VerifyIsDense(a) && VerifyIsDense(b) && VerifyIsDense(c);
DecideAndMoveToRightDevice(*this, a, b, c);
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, *b.m_CPUMatrix, *c.m_CPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
m_GPUMatrix->TensorOp(beta, *a.m_GPUMatrix, *b.m_GPUMatrix, *c.m_GPUMatrix, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
// Explicit instantiation of the full Matrix template for the two numeric element types.
template class Matrix<float>;
template class Matrix<double>;
// We use Matrix<char> as the backing store for QuantizedMatrix, and also as a flag matrix.
// Let's explicitly instantiate the methods we need for that purpose
// -- construction / destruction / assignment --
template Matrix<char>::Matrix(DEVICEID_TYPE);
template Matrix<char>::Matrix(Matrix<char>&&);
template Matrix<char>::Matrix(const size_t numRows, const size_t numCols, DEVICEID_TYPE deviceId, const MatrixType matrixType, const MatrixFormat matrixFormat);
template Matrix<char>::Matrix(const size_t numRows, const size_t numCols, char* pArray, DEVICEID_TYPE deviceId, const size_t matrixFlags, const size_t nnz);
template Matrix<char>::~Matrix();
template Matrix<char>& Matrix<char>::operator=(Matrix<char>&& moveFrom);
// -- accessors and device transfer --
template char* Matrix<char>::BufferPointer() const;
template int Matrix<char>::GetDeviceId() const;
template size_t Matrix<char>::GetNumElements() const;
template Matrix<char> Matrix<char>::ColumnSlice(size_t startColumn, size_t numCols) const;
template void Matrix<char>::_transferToDevice(int id_to, bool isBeingMoved, bool emptyTransfer) const;
template void Matrix<char>::TransferToDeviceIfNotThere(int id_to, bool isBeingMoved, bool emptyTransfer, bool updatePreferredDevice) const;
template size_t Matrix<char>::GetNumRows() const;
template size_t Matrix<char>::GetNumCols() const;
// -- mutation --
template void Matrix<char>::SetValue(const char);
template void Matrix<char>::SetValue(size_t numRows, const size_t numCols, int deviceId, char* pArray, size_t matrixFlags);
template void Matrix<char>::SetValue(const Matrix<char>&, MatrixFormat);
template bool Matrix<char>::IsEmpty() const;
template void Matrix<char>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, bool growOnly);
}}}