//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// CPUMatrix.cpp : full implementation of all matrix functions on the CPU side
//

#include "stdafx.h"
#include "Basics.h"
#include "File.h"
#include "CPUMatrix.h"
#include "TensorOps.h"
#include <assert.h>
#include <stdexcept>
#include <omp.h>
#include <math.h>
#include <random>
#include <chrono>
#include <exception>
#include <thread>
#include <iostream>
#include <algorithm>
#ifdef _WIN32
#define NOMINMAX
#include "Windows.h"
#else
#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif
#include <cfloat>
#endif

#ifdef LEAKDETECT
#include <vld.h>
#endif

#pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
#pragma warning(disable : 4702) // unreachable code; triggered for unknown reasons

#ifndef USE_MKL // use ACML as default.
// Download ACML 5.3.1 (e.g., acml5.3.1-ifort64.exe) or above
// from http://developer.amd.com/tools/cpu-development/amd-core-math-library-acml/acml-downloads-resources/
// Install the ifort64_mp variant (compiled with the Intel compiler) of the library.
// Set the environment variable ACML_PATH to C:\AMD\acml5.3.1\ifort64_mp (or the folder you installed ACML to)
// so that it points to the include file and link library.
#include <acml.h> // requires ACML 5.3.1 and above
#else
// requires MKL 10.0 and above
#include <mkl.h>
#endif

#ifndef USE_MKL // MKL has one additional parameter for different matrix order
#define BLAS_COLMAJOR
#else
#define BLAS_COLMAJOR (int) MatrixOrder::ColMajor,
#endif

#define SWAP(a, b)  \
    {               \
        (a) ^= (b); \
        (b) ^= (a); \
        (a) ^= (b); \
    }
#define IDX2C(i, j, ld) (((j) * (ld)) + (i)) // 0-based indexing

namespace Microsoft { namespace MSR { namespace CNTK {

int MATH_API TracingGPUMemoryAllocator::m_traceLevel = 0;

void TracingGPUMemoryAllocator::SetTraceLevel(int traceLevel)
{
    m_traceLevel = traceLevel;
}

bool TracingGPUMemoryAllocator::IsTraceEnabled()
{
    return (m_traceLevel > 0);
}

#pragma region Helpful Enum Definitions

enum class MatrixOrder
{
    RowMajor = 101, // row-major arrays
    ColMajor = 102  // column-major arrays
};

enum class MatrixTranspose : char
{
    NoTrans = 'N',  // trans='N'
    Trans = 'T',    // trans='T'
    ConjTrans = 'C' // trans='C'
};

enum class SymMatrixType : char
{
    Up = 'U',          // symmetric matrix is stored in the upper part
    Low = 'L',         // symmetric matrix is stored in the lower part
    Full = 'F',        // fully populated
    NotSymmetric = 'N' // not a symmetric matrix
};

enum class MatrixOpSide : char
{
    Left = 'L',  // left multiply
    Right = 'R', // right multiply
};

#pragma endregion Helpful Enum Definitions

#pragma region Constructors and Destructor

// should only be used by constructors.
template <class ElemType>
void CPUMatrix<ElemType>::ZeroInit()
{
    m_computeDevice = CPUDEVICE;
    m_pArray = nullptr;
    m_numRows = 0;
    m_numCols = 0;
    m_elemSizeAllocated = 0;
    m_matrixName = NULL;
    m_format = matrixFormatDense;
    m_externalBuffer = false;
}

template <class ElemType>
CPUMatrix<ElemType>::CPUMatrix()
{
    ZeroInit();
}

// matrixName is used to verify that the correct matrix is read.
template <class ElemType>
CPUMatrix<ElemType>::CPUMatrix(FILE* f, const char* matrixName)
{
    ZeroInit();
    ReadFromFile(f, matrixName);
}
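// Note on storage: CPUMatrix keeps its elements in column-major order, matching the BLAS
// convention and the IDX2C macro above. A minimal sketch of the indexing arithmetic
// (illustrative only, not part of the build; the array 'A' below is hypothetical):
#if 0
void ColumnMajorIndexingSketch()
{
    float A[6];                 // a 2x3 matrix stored column-major: leading dimension ld = 2
    for (int k = 0; k < 6; k++) // fill with 0..5 in storage order
        A[k] = (float) k;
    // element (i=1, j=2) lives at offset j*ld + i = 2*2 + 1 = 5
    float v = A[IDX2C(1, 2, 2)]; // v == 5.0f
}
#endif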
// helper to allocate an array of ElemType
// Use this instead of new[] to get NaN initialization for debugging.
template <class ElemType>
static ElemType* NewArray(size_t n)
{
    ElemType* p = new ElemType[n]();
#if 0 // _DEBUG
    ElemType nan = Matrix<ElemType>::MakeNan(__LINE__);
    for (size_t i = 0; i < n; i++)
        p[i] = nan;
#endif
    return p;
}

template <class ElemType>
CPUMatrix<ElemType>::CPUMatrix(const size_t numRows, const size_t numCols)
{
    ZeroInit();

    m_numRows = numRows;
    m_numCols = numCols;
    m_elemSizeAllocated = GetNumElements();

    if (m_elemSizeAllocated != 0)
        m_pArray = NewArray<ElemType>(m_elemSizeAllocated);
}

template <class ElemType>
CPUMatrix<ElemType>::CPUMatrix(const size_t numRows, const size_t numCols, ElemType* pArray, const size_t matrixFlags)
{
    ZeroInit();
    SetValue(numRows, numCols, pArray, matrixFlags);
}

// copy constructor, deep copy
template <class ElemType>
CPUMatrix<ElemType>::CPUMatrix(const CPUMatrix<ElemType>& deepCopyFrom)
{
    ZeroInit();
    if (!deepCopyFrom.IsEmpty())
        SetValue(deepCopyFrom);
    SetMatrixName(deepCopyFrom.m_matrixName);
}

// assignment operator, deep copy
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::operator=(const CPUMatrix<ElemType>& deepCopyFrom)
{
    Clear();
    if (!deepCopyFrom.IsEmpty())
        SetValue(deepCopyFrom);
    SetMatrixName(deepCopyFrom.m_matrixName);
    return *this;
}

// move constructor, shallow copy
template <class ElemType>
CPUMatrix<ElemType>::CPUMatrix(CPUMatrix<ElemType>&& moveFrom)
{
    m_computeDevice = moveFrom.m_computeDevice;
    m_numRows = moveFrom.m_numRows;
    m_numCols = moveFrom.m_numCols;
    m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
    m_pArray = moveFrom.m_pArray; // shallow copy the pointer
    m_matrixName = moveFrom.m_matrixName;
    m_format = moveFrom.m_format;
    m_externalBuffer = moveFrom.m_externalBuffer;
    // release the pointer from the source object so that the destructor won't release it twice
    moveFrom.ZeroInit();
}

// move assignment operator, shallow copy
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::operator=(CPUMatrix<ElemType>&& moveFrom)
{
    if (this != &moveFrom)
    {
        if (OwnBuffer() && m_pArray != nullptr)
            delete[] m_pArray; // always delete the data pointer since we will use the pointer from moveFrom

        m_computeDevice = moveFrom.m_computeDevice;
        m_numRows = moveFrom.m_numRows;
        m_numCols = moveFrom.m_numCols;
        m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
        m_pArray = moveFrom.m_pArray;
        m_format = moveFrom.m_format;
        m_externalBuffer = moveFrom.m_externalBuffer;

        // release the pointer from the source object so that the destructor won't release it twice
        moveFrom.ZeroInit();
    }
    return *this;
}

template <class ElemType>
CPUMatrix<ElemType>::~CPUMatrix()
{
    Clear();
}

template <class ElemType>
void CPUMatrix<ElemType>::Clear()
{
    if (m_pArray != nullptr && OwnBuffer())
    {
        delete[] m_pArray;
        m_pArray = nullptr;
        m_elemSizeAllocated = 0;
    }
    BaseMatrix<ElemType>::Clear();

    ZeroInit();
}

#pragma endregion Constructors and Destructor

#pragma region Basic Operators

template <class ElemType>
CPUMatrix<ElemType> CPUMatrix<ElemType>::ColumnSlice(size_t startColumn, size_t numCols) const
{
    // if (numCols == 0)
    //    LogicError("The slice cannot have 0 columns.");

    if (startColumn + numCols > m_numCols)
        InvalidArgument("The slice (%d+%d) is out of range of the source matrix (%d).", (int) startColumn, (int) numCols, (int) m_numCols);

    CPUMatrix<ElemType> slice;

    slice.m_externalBuffer = true; // memory of a slice is managed externally.
    slice.m_numRows = m_numRows;
    slice.m_numCols = numCols;
    slice.m_elemSizeAllocated = slice.GetNumElements();
    slice.m_pArray = m_pArray + startColumn * m_numRows;
    slice.m_format = m_format;

    return slice;
}
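// ColumnSlice returns a view, not a copy: the slice aliases the parent's buffer
// (m_externalBuffer == true), so writes through the slice are visible in the parent,
// and the parent must outlive the slice. A minimal sketch (illustrative only, not part
// of the build):
#if 0
void ColumnSliceSketch()
{
    CPUMatrix<float> A(4, 10);
    A.SetValue(1.0f);
    CPUMatrix<float> view = A.ColumnSlice(2, 3); // columns 2..4 of A, zero-copy
    view.SetValue(0.0f);                         // also zeroes A(:, 2:4)
}
#endif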
// set this(:, 0:numCols-1) = fromMatrix(:, startColumn : startColumn+numCols-1)
// TODO: why not say *this = ColumnSlice()?
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignColumnSlice(const CPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols)
{
    // if (numCols == 0)
    //    LogicError("The slice cannot have 0 columns.");
    if (startColumn + numCols > fromMatrix.m_numCols)
        InvalidArgument("The slice (%d+%d) is out of range of the source matrix (%d).", (int) startColumn, (int) numCols, (int) fromMatrix.m_numCols);

    Clear();

    SetOwnBuffer(false); // memory of a slice is managed externally.
    m_numRows = fromMatrix.m_numRows;
    m_numCols = numCols;
    m_elemSizeAllocated = GetNumElements();
    m_pArray = fromMatrix.m_pArray + startColumn * m_numRows;

    return *this;
}

// set this(:, startColumn:startColumn+numCols-1) = fromMatrix;
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::SetColumnSlice(const CPUMatrix<ElemType>& fromMatrix, size_t startColumn, size_t numCols)
{
    // if (numCols == 0)
    //    LogicError("The slice cannot have 0 columns.");
    if (startColumn + numCols > m_numCols)
        LogicError("The slice is out of range of the destination matrix.");
    if (numCols > fromMatrix.GetNumCols())
        InvalidArgument("The slice (%d) is out of range of the source matrix (%d).", (int) numCols, (int) fromMatrix.GetNumCols());
    if (m_numRows != fromMatrix.m_numRows)
        LogicError("The number of rows in source and destination matrices do not match");

    // SetOwnBuffer(false);
    memcpy(m_pArray + startColumn * m_numRows, fromMatrix.m_pArray, numCols * m_numRows * sizeof(ElemType));

    return *this;
}

template <class ElemType>
void CPUMatrix<ElemType>::CopyColumnsStrided(const CPUMatrix<ElemType>& fromMatrix, size_t numCols, size_t srcNumColsStride, size_t destNumColsStride)
{
    if ((((numCols - 1) * srcNumColsStride) + 1) > fromMatrix.m_numCols)
        LogicError("The numCols to copy and srcNumColsStride specified are out of range of the source matrix.");
    if ((((numCols - 1) * destNumColsStride) + 1) > m_numCols)
        LogicError("The numCols to copy and destNumColsStride specified are out of range of the destination matrix.");
    if (m_numRows != fromMatrix.m_numRows)
        LogicError("The number of rows in source and destination matrices do not match");

    long n = (long) numCols, m = (long) m_numRows;

    auto& us = *this;

#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        // four-way unrolling
        for (size_t i = 0; i < (m & ~3); i += 4)
        {
            us(i, j * destNumColsStride) = fromMatrix(i, j * srcNumColsStride);
            us(i + 1, j * destNumColsStride) = fromMatrix(i + 1, j * srcNumColsStride);
            us(i + 2, j * destNumColsStride) = fromMatrix(i + 2, j * srcNumColsStride);
            us(i + 3, j * destNumColsStride) = fromMatrix(i + 3, j * srcNumColsStride);
        }
        // handle the remaining elements
        for (size_t i = m & ~3; i < m; i++)
        {
            us(i, j * destNumColsStride) = fromMatrix(i, j * srcNumColsStride);
        }
    }
}
// for each column of a, we assign all rows of a to this starting from startIndex
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignToRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows)
{
    if (a.GetNumRows() != numRows)
        LogicError("AssignToRowSliceValuesOf: a.GetNumRows() != numRows.");

    if (startIndex + numRows > GetNumRows())
        LogicError("AssignToRowSliceValuesOf: startIndex + numRows exceeds GetNumRows().");

    if (a.GetNumCols() != GetNumCols())
        LogicError("AssignToRowSliceValuesOf: number of columns does not match.");

    long n = (long) a.GetNumCols(), m = (long) numRows;

    auto& us = *this;

#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        // four-way unrolling
        for (size_t i = 0, startRow = startIndex; i < (m & ~3); i += 4, startRow += 4)
        {
            us(startRow, j) = a(i, j);
            us(startRow + 1, j) = a(i + 1, j);
            us(startRow + 2, j) = a(i + 2, j);
            us(startRow + 3, j) = a(i + 3, j);
        }
        // handle the remaining elements
        for (size_t i = m & ~3, startRow = startIndex + (m & ~3); i < m; i++, startRow++)
        {
            us(startRow, j) = a(i, j);
        }
    }

    return *this;
}

// for each column of a, we assign numRows starting from startIndex to this
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows)
{
    if (startIndex + numRows > a.GetNumRows())
        LogicError("AssignRowSliceValuesOf: startIndex + numRows exceeds a.GetNumRows().");

    Resize(numRows, a.GetNumCols());

    long n = (long) a.GetNumCols(); // note: OpenMP requires loop indices to be long, not size_t
    long k = (long) a.GetNumRows();

#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        // memory copy might be faster?
        memcpy(m_pArray + j * numRows, a.m_pArray + j * k + startIndex, sizeof(ElemType) * numRows);

        // //four-way unrolling
        // for (long i=0, startRow = startIndex; i<(m & ~3); i+=4, startRow+=4)
        // {
        //    us(i,j) = a(startRow,j);
        //    us(i+1,j) = a(startRow+1,j);
        //    us(i+2,j) = a(startRow+2,j);
        //    us(i+3,j) = a(startRow+3,j);
        // }
        // //handle the remaining elements
        // for (long i=m & ~3, startRow = startIndex+(m & ~3); i<m; i++, startRow++)
        // {
        //    us(i,j) = a(startRow,j);
        // }
    }

    return *this;
}

// for each column of this, we add all rows of a to this starting from startIndex
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AddToRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows)
{
    if (a.IsEmpty())
        LogicError("AddToRowSliceValuesOf: input matrix a is empty.");

    if (a.GetNumRows() != numRows)
        LogicError("AddToRowSliceValuesOf: a.GetNumRows() != numRows.");

    if (startIndex + numRows > GetNumRows())
        LogicError("AddToRowSliceValuesOf: startIndex + numRows exceeds GetNumRows().");

    if (a.GetNumCols() != GetNumCols())
        LogicError("AddToRowSliceValuesOf: number of columns does not match.");

    long n = (long) a.GetNumCols(), m = (long) numRows;

    auto& us = *this;

#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        // four-way unrolling
        for (long i = 0, startRow = (long) startIndex; i < (m & ~3); i += 4, startRow += 4)
        {
            us(startRow, j) += a(i, j);
            us(startRow + 1, j) += a(i + 1, j);
            us(startRow + 2, j) += a(i + 2, j);
            us(startRow + 3, j) += a(i + 3, j);
        }
        // handle the remaining elements
        for (long i = m & ~3, startRow = (long) startIndex + (m & ~3); i < m; i++, startRow++)
        {
            us(startRow, j) += a(i, j);
        }
    }

    return *this;
}

// for each column of this, we add the row slice of a starting from startIndex
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AddWithRowSliceValuesOf(const CPUMatrix<ElemType>& a, const size_t startIndex, const size_t numRows)
{
    if (a.IsEmpty())
        LogicError("AddWithRowSliceValuesOf: input matrix a is empty.");

    if (GetNumRows() != numRows)
        LogicError("AddWithRowSliceValuesOf: GetNumRows() != numRows.");

    if (startIndex + numRows > a.GetNumRows())
        LogicError("AddWithRowSliceValuesOf: startIndex + numRows exceeds a.GetNumRows().");

    if (a.GetNumCols() != GetNumCols())
        LogicError("AddWithRowSliceValuesOf: number of columns does not match.");

    long n = (long) a.GetNumCols(), m = (long) numRows;

    auto& us = *this;

#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        // four-way unrolling
        for (long i = 0, startRow = (long) startIndex; i < (m & ~3); i += 4, startRow += 4)
        {
            us(i, j) += a(startRow, j);
            us(i + 1, j) += a(startRow + 1, j);
            us(i + 2, j) += a(startRow + 2, j);
            us(i + 3, j) += a(startRow + 3, j);
        }
        // handle the remaining elements
        for (long i = m & ~3, startRow = (long) startIndex + (m & ~3); i < m; i++, startRow++)
        {
            us(i, j) += a(startRow, j);
        }
    }

    return *this;
}
template <class ElemType>
CPUMatrix<ElemType> CPUMatrix<ElemType>::Diagonal() const
{
    if (m_numRows != m_numCols)
        LogicError("Diagonal can be called only for square matrix. (rows=%d, cols=%d)", (int) m_numRows, (int) m_numCols);

    CPUMatrix<ElemType> diag(1, m_numCols);

    auto& us = *this;

#pragma omp parallel for
    for (long i = 0; i < m_numRows; i++)
    {
        diag(0, (size_t) i) = us(i, i);
    }

    return diag;
}

template <class ElemType>
void CPUMatrix<ElemType>::MinusOneAt(CPUMatrix<ElemType>& c, const size_t position)
{
    if (position < c.GetNumElements())
        c.m_pArray[position] -= 1.0;
    else
        RuntimeError("MinusOneAt: position is out of CPU matrix size");
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignRepeatOf(const CPUMatrix<ElemType>& a, const size_t numRowRepeats, const size_t numColRepeats)
{
    if (this == &a)
        LogicError("AssignRepeatOf: a is the same as [this]. Does not support inplace repeat.");

    if (a.IsEmpty())
        LogicError("AssignRepeatOf: Matrix a is empty.");

    Resize(a.GetNumRows() * numRowRepeats, a.GetNumCols() * numColRepeats);

    long n = (long) a.GetNumCols(), m = (long) a.GetNumRows();

    auto& us = *this;

#pragma omp parallel for
    for (long q = 0; q < numColRepeats; q++)
    {
        for (long p = 0; p < numRowRepeats; p++)
        {
            long colOffset = q * n;
            for (long j = 0; j < n; j++, colOffset++)
            {
                long rowOffset = p * m;
                // four-way unrolling
                for (long i = 0; i < (m & ~3); i += 4, rowOffset += 4)
                {
                    us(rowOffset, colOffset) = a(i, j);
                    us(rowOffset + 1, colOffset) = a(i + 1, j);
                    us(rowOffset + 2, colOffset) = a(i + 2, j);
                    us(rowOffset + 3, colOffset) = a(i + 3, j);
                }
                // handle the remaining elements
                for (long i = m & ~3; i < m; i++, rowOffset++)
                {
                    us(rowOffset, colOffset) = a(i, j);
                }
            }
        }
    }

    return *this;
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AddToRowRepeatValuesOf(const CPUMatrix<ElemType>& a, const size_t numRepeats)
{
    if (a.IsEmpty())
        LogicError("AddToRowRepeatValuesOf: input matrix a is empty.");

    if (a.GetNumRows() != GetNumRows() * numRepeats)
        LogicError("AddToRowRepeatValuesOf: a.GetNumRows() != GetNumRows() * numRepeats.");

    long n = (long) a.GetNumCols(), m = (long) GetNumRows();

    auto& us = *this;

#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        // four-way unrolling
        for (long i = 0; i < (m & ~3); i += 4)
        {
            for (long k = 0; k < numRepeats; k++)
            {
                us(i, j) += a(k * m + i, j);
                us(i + 1, j) += a(k * m + i + 1, j);
                us(i + 2, j) += a(k * m + i + 2, j);
                us(i + 3, j) += a(k * m + i + 3, j);
            }
        }
        // handle the remaining elements
        for (long i = m & ~3; i < m; i++)
        {
            for (long k = 0; k < numRepeats; k++)
            {
                us(i, j) += a(k * m + i, j);
            }
        }
    }

    return *this;
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignPositiveAndShiftedNegSample(const CPUMatrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber)
{
    a; posNumber; negNumber; shiftNumber;
    NOT_IMPLEMENTED;
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AddFoldedPositiveAndShiftedNegSample(const CPUMatrix<ElemType>& a, const size_t posNumber, const size_t negNumber, const size_t shiftNumber)
{
    a; posNumber; negNumber; shiftNumber;
    NOT_IMPLEMENTED;
}

template <class ElemType>
CPUMatrix<ElemType> CPUMatrix<ElemType>::Transpose()
{
    if (IsEmpty())
        LogicError("Transpose: Matrix is empty.");

    CPUMatrix<ElemType> c;
    c.AssignTransposeOf(*this);
    return c;
}
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignTransposeOf(const CPUMatrix<ElemType>& a)
{
    if (this == &a)
        LogicError("AssignTransposeOf: a is the same as [this]. Does not support inplace transpose.");

    if (a.IsEmpty())
        LogicError("AssignTransposeOf: Matrix a is empty.");

    Resize(a.GetNumCols(), a.GetNumRows());

    long n = (long) a.GetNumCols(), m = (long) a.GetNumRows();

    auto& us = *this;

#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        // four-way unrolling
        for (long i = 0; i < (m & ~3); i += 4)
        {
            us(j, i) = a(i, j);
            us(j, i + 1) = a(i + 1, j);
            us(j, i + 2) = a(i + 2, j);
            us(j, i + 3) = a(i + 3, j);
        }
        // handle the remaining elements
        for (long i = m & ~3; i < m; i++)
        {
            us(j, i) = a(i, j);
        }
    }

    return *this;
}

template <class ElemType>
void CPUMatrix<ElemType>::SetValue(const ElemType v)
{
    if (IsEmpty())
        LogicError("SetValue: Matrix is empty.");
    bool isFinite = std::numeric_limits<ElemType>::is_integer || std::isfinite((double) v);
    if (isFinite && v == 0)
    {
        memset(m_pArray, 0, sizeof(ElemType) * GetNumElements());
    }
    else
    {
        long m = (long) GetNumElements();
        // 2-way thread parallelism is sufficient for the memory-bound
        // operation of just setting the values of an array.
        const unsigned SETVALUE_NUM_THREADS = 2;
#pragma omp parallel for num_threads(SETVALUE_NUM_THREADS)
        // four-way unrolling
        for (long i = 0; i < (m & ~3); i += 4)
        {
            m_pArray[i] = v;
            m_pArray[i + 1] = v;
            m_pArray[i + 2] = v;
            m_pArray[i + 3] = v;
        }
        // handle the remaining elements
        for (long i = m & ~3; i < m; i++)
        {
            m_pArray[i] = v;
        }
    }
}

template <class ElemType>
void CPUMatrix<ElemType>::MaskColumnsValue(const CPUMatrix<char>& columnsMask, ElemType val)
{
    if (GetNumCols() != columnsMask.GetNumCols())
        RuntimeError("Matrix and column mask must have equal number of columns");

    auto& us = *this;
    long n = (long) GetNumCols(), m = (long) GetNumRows();
#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        if (columnsMask(0, j) == 1)
            continue;

        // four-way unrolling
        for (size_t i = 0; i < (m & ~3); i += 4)
        {
            us(i, j) = val;
            us(i + 1, j) = val;
            us(i + 2, j) = val;
            us(i + 3, j) = val;
        }
        // handle the remaining elements
        for (size_t i = m & ~3; i < m; i++)
        {
            us(i, j) = val;
        }
    }
}

template <class ElemType>
void CPUMatrix<ElemType>::SetColumn(const ElemType* colPointer, size_t j)
{
    if (IsEmpty())
        LogicError("SetColumn: Matrix is empty.");
    if (colPointer == NULL)
        return;

    auto& us = *this;
    long m = (long) GetNumRows();
#pragma omp parallel for
    // four-way unrolling
    for (long i = 0; i < (m & ~3); i += 4)
    {
        us(i, j) = colPointer[i];
        us(i + 1, j) = colPointer[i + 1];
        us(i + 2, j) = colPointer[i + 2];
        us(i + 3, j) = colPointer[i + 3];
    }
    // handle the remaining elements
    for (long i = m & ~3; i < m; i++)
    {
        us(i, j) = colPointer[i];
    }
}

template <class ElemType>
void CPUMatrix<ElemType>::SetColumn(const ElemType val, size_t j)
{
    if (IsEmpty())
        LogicError("SetColumn: Matrix is empty.");

    auto& us = *this;
    long m = (long) GetNumRows();
#pragma omp parallel for
    // four-way unrolling
    for (long i = 0; i < (m & ~3); i += 4)
    {
        us(i, j) = val;
        us(i + 1, j) = val;
        us(i + 2, j) = val;
        us(i + 3, j) = val;
    }
    // handle the remaining elements
    for (long i = m & ~3; i < m; i++)
    {
        us(i, j) = val;
    }
}

template <class ElemType>
void CPUMatrix<ElemType>::SetColumn(const CPUMatrix<ElemType>& valMat, size_t j)
{
    if (IsEmpty())
        LogicError("SetColumn: Matrix is empty.");
    assert(valMat.GetNumRows() == GetNumRows() && valMat.GetNumCols() == 1);

    auto& us = *this;
    long m = (long) GetNumRows();
#pragma omp parallel for
    // four-way unrolling
    for (long i = 0; i < (m & ~3); i += 4)
    {
        us(i, j) = valMat(i, 0);
        us(i + 1, j) = valMat(i + 1, 0);
        us(i + 2, j) = valMat(i + 2, 0);
        us(i + 3, j) = valMat(i + 3, 0);
    }
    // handle the remaining elements
    for (long i = m & ~3; i < m; i++)
    {
        us(i, j) = valMat(i, 0);
    }
}
template <class ElemType>
void CPUMatrix<ElemType>::SetValue(const CPUMatrix<ElemType>& deepCopyFrom)
{
    if (this == &deepCopyFrom)
        return;

    Resize(deepCopyFrom.GetNumRows(), deepCopyFrom.GetNumCols());
    memcpy(m_pArray, deepCopyFrom.m_pArray, deepCopyFrom.GetNumElements() * sizeof(ElemType));
}

template <class ElemType>
void CPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, ElemType* pArray, const size_t matrixFlags)
{
    if (pArray == nullptr)
        InvalidArgument("Invalid pArray.");

    m_format = matrixFormatDense;
    m_computeDevice = CPUDEVICE;

    // if it's externally managed, then populate the structure
    if (matrixFlags & matrixFlagDontOwnBuffer)
    {
        // free previous array allocation if any before overwriting
        if (m_pArray != nullptr)
            delete[] m_pArray;

        m_pArray = pArray;
        m_numRows = numRows;
        m_numCols = numCols;
        m_elemSizeAllocated = GetNumElements();
        m_externalBuffer = true;
    }
    else
    {
        Resize(numRows, numCols);

        if (IsEmpty())
        {
            InvalidArgument("NumRows or NumCols is 0. Nothing to copy");
        }
        else
        {
            if (!(matrixFlags & matrixFormatRowMajor)) // compatible with the internal structure
            {
                memcpy(m_pArray, pArray, GetNumElements() * sizeof(ElemType));
            }
            else // need to transpose
            {
                auto& us = *this;
                if (sizeof(ElemType) == sizeof(double))
                {
#pragma omp parallel for
                    foreach_column (j, us)
                    {
#ifndef USE_MKL
                        dcopy((int) numRows, reinterpret_cast<double*>(pArray + j), (int) numCols, reinterpret_cast<double*>(m_pArray + LocateColumn(j)), 1);
#else
                        cblas_dcopy((int) numRows, reinterpret_cast<double*>(pArray + j), (int) numCols, reinterpret_cast<double*>(m_pArray + LocateColumn(j)), 1);
#endif
                    }
                }
                else
                {
#pragma omp parallel for
                    foreach_column (j, us)
                    {
                        {
#pragma warning(suppress : 4244)
#ifndef USE_MKL
                            scopy((int) numRows, reinterpret_cast<float*>(pArray + j), (int) numCols, reinterpret_cast<float*>(m_pArray + LocateColumn(j)), 1);
#else
                            cblas_scopy((int) numRows, reinterpret_cast<float*>(pArray + j), (int) numCols, reinterpret_cast<float*>(m_pArray + LocateColumn(j)), 1);
#endif
                        }
                    }
                }
            }
        }
    }
}
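// The row-major branch above relies on strided BLAS copies: for destination column j, the
// source elements are pArray[j], pArray[j + numCols], pArray[j + 2*numCols], ..., i.e. one
// ?copy call per column with source increment numCols and destination increment 1. A plain
// C++ sketch of the same reshuffle (illustrative only, not part of the build):
#if 0
void RowMajorToColMajorSketch(const float* src, float* dst, size_t numRows, size_t numCols)
{
    for (size_t j = 0; j < numCols; j++)     // one pass per destination column
        for (size_t i = 0; i < numRows; i++) // gather from the source with stride numCols
            dst[j * numRows + i] = src[i * numCols + j];
}
#endif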
template <class ElemType>
void CPUMatrix<ElemType>::SetDiagonalValue(const ElemType v)
{
    if (IsEmpty())
        LogicError("SetDiagonalValue: Matrix is empty.");

    if (GetNumRows() != GetNumCols())
        LogicError("SetDiagonalValue: NumRows and NumCols do not agree.");

    auto& us = *this;
    long m = (long) GetNumRows();
#pragma omp parallel for
    // four-way unrolling
    for (long i = 0; i < (m & ~3); i += 4)
    {
        us(i, i) = v;
        us(i + 1, i + 1) = v;
        us(i + 2, i + 2) = v;
        us(i + 3, i + 3) = v;
    }
    // handle the remaining elements
    for (long i = m & ~3; i < m; i++)
    {
        us(i, i) = v;
    }
}

template <class ElemType>
void CPUMatrix<ElemType>::SetDiagonalValue(const CPUMatrix<ElemType>& vector)
{
    if (IsEmpty() || vector.IsEmpty())
        LogicError("SetDiagonalValue: Matrix is empty.");

    if (GetNumRows() != GetNumCols())
        LogicError("SetDiagonalValue: NumRows and NumCols do not agree.");

    if (vector.GetNumRows() != 1 && vector.GetNumCols() != 1)
        LogicError("SetDiagonalValue: input vector must be a vector.");

    if (vector.GetNumElements() == 1) // reduce to simple form
        SetDiagonalValue(vector(0, 0));
    else if (vector.GetNumRows() != GetNumRows())
        LogicError("SetDiagonalValue: input vector's dimension does not agree with [this].");
    else
    {
        auto& us = *this;

        long m = (long) GetNumRows();
        if (vector.GetNumRows() == 1) // row vector
        {
#pragma omp parallel for
            // four-way unrolling
            for (long i = 0; i < (m & ~3); i += 4)
            {
                us(i, i) = vector(0, i);
                us(i + 1, i + 1) = vector(0, i + 1);
                us(i + 2, i + 2) = vector(0, i + 2);
                us(i + 3, i + 3) = vector(0, i + 3);
            }
            // handle the remaining elements
            for (long i = m & ~3; i < m; i++)
            {
                us(i, i) = vector(0, i);
            }
        }
        else
        {
#pragma omp parallel for
            // four-way unrolling
            for (long i = 0; i < (m & ~3); i += 4)
            {
                us(i, i) = vector(i, 0);
                us(i + 1, i + 1) = vector(i + 1, 0);
                us(i + 2, i + 2) = vector(i + 2, 0);
                us(i + 3, i + 3) = vector(i + 3, 0);
            }
            // handle the remaining elements
            for (long i = m & ~3; i < m; i++)
            {
                us(i, i) = vector(i, 0);
            }
        }
    }
}

template <class ElemType>
void CPUMatrix<ElemType>::SetUniformRandomValue(const ElemType low, const ElemType high, unsigned long seed)
{
    if (IsEmpty())
        LogicError("SetUniformRandomValue: Matrix is empty.");

#ifdef _MSC_VER // TODO: check if available under GCC/Linux
    std::ranlux64_base_01 generator;
    generator.seed(seed == USE_TIME_BASED_SEED ? (unsigned long) time(NULL) : seed);
#else
    std::default_random_engine generator(seed);
#endif
    std::uniform_real_distribution<ElemType> r(low, high);

    long m = (long) GetNumElements();
    // four-way unrolling
    for (long i = 0; i < (m & ~3); i += 4)
    {
        m_pArray[i] = r(generator);
        m_pArray[i + 1] = r(generator);
        m_pArray[i + 2] = r(generator);
        m_pArray[i + 3] = r(generator);
    }
    // handle the remaining elements
    for (long i = m & ~3; i < m; i++)
    {
        m_pArray[i] = r(generator);
    }
}

template <class ElemType>
void CPUMatrix<ElemType>::SetGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed)
{
    if (sigma <= 0)
        InvalidArgument("SetGaussianRandomValue: sigma must be a positive value.");

    if (IsEmpty())
        LogicError("SetGaussianRandomValue: Matrix is empty.");

    auto& us = *this;
#ifdef _MSC_VER // TODO: check if available under GCC/Linux
    std::ranlux64_base_01 generator;
    generator.seed(seed == USE_TIME_BASED_SEED ? (unsigned long) time(NULL) : seed);
#else
    std::default_random_engine generator(seed);
#endif
    std::normal_distribution<ElemType> r(mean, sigma);
    // #pragma omp parallel for   // is it thread safe?
    foreach_coord (i, j, us)
    {
        us(i, j) = r(generator);
    }
}

template <class ElemType>
void CPUMatrix<ElemType>::AddGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed)
{
    if (sigma <= 0)
        InvalidArgument("AddGaussianRandomValue: sigma must be a positive value.");

    if (IsEmpty())
        LogicError("AddGaussianRandomValue: Matrix is empty.");

    auto& us = *this;
#ifdef _MSC_VER // TODO: check if available under GCC/Linux
    std::ranlux64_base_01 generator;
    generator.seed(seed == USE_TIME_BASED_SEED ? (unsigned long) time(NULL) : seed);
#else
    std::default_random_engine generator(seed);
#endif
    std::normal_distribution<ElemType> r(mean, sigma);

    long m = (long) GetNumRows(), n = (long) GetNumCols();
    for (long j = 0; j < n; j++)
    {
        // four-way unrolling
        for (long i = 0; i < (m & ~3); i += 4)
        {
            us(i, j) = r(generator);
            us(i + 1, j) = r(generator);
            us(i + 2, j) = r(generator);
            us(i + 3, j) = r(generator);
        }
        // handle the remaining elements
        for (long i = m & ~3; i < m; i++)
        {
            us(i, j) = r(generator);
        }
    }
}

// maskRate: percentage of values masked out (similar to dropout rate)
// scaleValue: scale value to apply to the ones that are left (unmasked items).
template <class ElemType>
void CPUMatrix<ElemType>::SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, unsigned long seed)
{
    if (IsEmpty())
        LogicError("SetUniformRandomMask: Matrix is empty.");

    auto& us = *this;
#ifdef _MSC_VER // TODO: check if available under GCC/Linux
    std::ranlux64_base_01 generator;
    generator.seed(seed == USE_TIME_BASED_SEED ? (unsigned long) time(NULL) : seed);
#else
    std::default_random_engine generator(seed == USE_TIME_BASED_SEED ? (unsigned long) time(NULL) : seed);
#endif
    std::uniform_real_distribution<ElemType> r(0, 1);

    long m = (long) GetNumRows(), n = (long) GetNumCols();
    ElemType v;
    for (long j = 0; j < n; j++)
    {
        // four-way unrolling
        for (long i = 0; i < (m & ~3); i += 4)
        {
            v = r(generator);
            us(i, j) = v <= maskRate ? 0 : scaleValue;
            v = r(generator);
            us(i + 1, j) = v <= maskRate ? 0 : scaleValue;
            v = r(generator);
            us(i + 2, j) = v <= maskRate ? 0 : scaleValue;
            v = r(generator);
            us(i + 3, j) = v <= maskRate ? 0 : scaleValue;
        }
        // handle the remaining elements
        for (long i = m & ~3; i < m; i++)
        {
            v = r(generator);
            us(i, j) = v <= maskRate ? 0 : scaleValue;
        }
    }
}
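// Note on SetUniformRandomMask: each entry is zeroed with probability maskRate and set to
// scaleValue otherwise. For inverted dropout one would typically pass
// scaleValue = 1 / (1 - maskRate) so that the masked matrix keeps the same expectation;
// the choice of scaleValue is up to the caller. A minimal sketch (illustrative only):
#if 0
void DropoutMaskSketch()
{
    CPUMatrix<float> mask(128, 64);
    const float maskRate = 0.5f;
    mask.SetUniformRandomMask(maskRate, 1.0f / (1.0f - maskRate), /*seed=*/42);
    // E[mask(i, j)] == 1, so multiplying activations by 'mask' preserves their scale.
}
#endif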
template <class ElemType>
ElemType CPUMatrix<ElemType>::Adagrad(CPUMatrix<ElemType>& gradients, const bool needAveMultiplier)
{
    ElemType aveMultiplier = 0;

    if (IsEmpty() || gradients.GetNumCols() != GetNumCols() || gradients.GetNumRows() != GetNumRows())
    {
        Resize(gradients.GetNumRows(), gradients.GetNumCols());
        SetValue(0.0);
    }

    assert(GetNumRows() == gradients.GetNumRows() && GetNumCols() == gradients.GetNumCols());

    ElemType *a = m_pArray, *d_v = gradients.m_pArray;
    size_t n = GetNumElements();

    const ElemType floor = 1e-16f;
    ElemType a0, a1, a2, a3;

    // disable omp here because aveMultiplier needs to be accumulated atomically; however, the
    // result appears to be incorrect even when omp atomic or omp critical is used.
    // #pragma omp parallel for
    for (long i = 0; i < (n & ~3); i += 4) // four-way unrolling
    {
        a[i] += d_v[i] * d_v[i];
        a[i + 1] += d_v[i + 1] * d_v[i + 1];
        a[i + 2] += d_v[i + 2] * d_v[i + 2];
        a[i + 3] += d_v[i + 3] * d_v[i + 3];

        a0 = sqrt(a[i] + floor);
        a1 = sqrt(a[i + 1] + floor);
        a2 = sqrt(a[i + 2] + floor);
        a3 = sqrt(a[i + 3] + floor);

        d_v[i] /= a0;
        d_v[i + 1] /= a1;
        d_v[i + 2] /= a2;
        d_v[i + 3] /= a3;

        if (needAveMultiplier)
        {
            aveMultiplier += 1 / a0 + 1 / a1 + 1 / a2 + 1 / a3;
        }
    }

    // get the last few elements if any
    for (long i = n & ~3; i < n; i++)
    {
        a[i] += d_v[i] * d_v[i];
        a0 = sqrt(a[i] + floor);
        d_v[i] /= a0;

        if (needAveMultiplier)
        {
            aveMultiplier += 1 / a0;
        }
    }

    if (needAveMultiplier && n > 0)
        return aveMultiplier / n;
    else
        return 1;
}

template <class ElemType>
void CPUMatrix<ElemType>::FSAdagrad(CPUMatrix<ElemType>& gradients, CPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample, ElemType momentum, ElemType adaWeight, ElemType adaMul)
{
    size_t numColsNeeded = 2 * gradients.GetNumCols();

    if (IsEmpty() || (GetNumCols() < numColsNeeded))
    {
        Resize(gradients.GetNumRows(), numColsNeeded);
        SetValue(0.0);
    }

    assert((GetNumRows() == gradients.GetNumRows()) && (GetNumCols() == numColsNeeded));

    size_t n = gradients.GetNumElements();
    ElemType* grad = gradients.m_pArray;
    ElemType* smoothAda = m_pArray;
    ElemType* smoothMom = m_pArray + n;
    ElemType* val = functionValues.m_pArray;
#pragma omp parallel for
    // TODO: Unroll 4-times for better performance leveraging vectorization
    for (long i = 0; i < n; i++)
    {
        ElemType g = grad[i];
        ElemType adaSqr = adaWeight * smoothAda[i] + (1.0f - adaWeight) * g * g;
        smoothAda[i] = adaSqr;
        if (adaSqr != 0.0f)
        {
            ElemType ada = sqrt(adaSqr);
            ElemType w = adaMul * ((ElemType) 1.0 / ada);

            if (w > 10.0f)
                w = 10.0f;
            g *= w;
        }

        if (momentum > 0.0f)
        {
            g = momentum * smoothMom[i] + (1.0f - momentum) * g;
            smoothMom[i] = g;
        }

        g *= learnRatePerSample;
        val[i] -= g;
    }
}
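// Restating the FSAdagrad loop above in formula form (per element, g = gradient):
//   adaSqr    = adaWeight * adaSqr + (1 - adaWeight) * g^2      // smoothed squared gradient
//   g        <- g * min(10, adaMul / sqrt(adaSqr))              // adaptive scaling, clipped at 10
//   g        <- momentum * smoothMom + (1 - momentum) * g       // momentum smoothing (if momentum > 0)
//   val      <- val - learnRatePerSample * g
// The state matrix holds [smoothAda | smoothMom] side by side, hence numColsNeeded = 2 * cols.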
template <class ElemType>
ElemType CPUMatrix<ElemType>::RmsProp(CPUMatrix<ElemType>& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier)
{
    const ElemType floor = 1e-6f;

    size_t n = gradients.GetNumElements();
    ElemType* curr_grad = gradients.m_pArray;

    if (IsEmpty() || GetNumCols() < gradients.GetNumCols() * 3)
    {
        Resize(gradients.GetNumRows(), gradients.GetNumCols() * 3);
        SetValue(0.0);

        ElemType* avars = m_pArray;         // accumulated variances for RMS scaling
        ElemType* steps = m_pArray + 2 * n; // current step size

        // initialize moving average of gradient-squared
        for (long i = 0; i < n; i++)
            avars[i] = curr_grad[i] * curr_grad[i];

        // initialize starting step size
        for (long i = 0; i < n; i++)
            steps[i] = ElemType(0.02);
    }

    ElemType* avars = m_pArray;         // accumulated variances for RMS scaling
    ElemType* signs = m_pArray + n;     // sign of previous gradient
    ElemType* steps = m_pArray + 2 * n; // current step size

    assert(GetNumRows() == gradients.GetNumRows() && GetNumCols() == gradients.GetNumCols() * 3);

    ElemType ONE_MINUS_GAMMA = ElemType(1.0) - RMS_GAMMA;
    // int upd[] = {
    //    2,2,0,
    //    2,2,0,
    //    1,1,1,
    //    2,2,0,
    //    1,2,1,
    //    0,2,2,
    //    1,1,1,
    //    0,2,2,
    //    0,2,2,
    // };

    // for (long i=0; i<n; i++)
    // {
    //    avars[i] = RMS_GAMMA * avars[i] + ONE_MINUS_GAMMA * (curr_grad[i] * curr_grad[i]);
    //    // grad sign base 3: 0->neg, 1->zero, 2->pos
    //    const int grad_sign = 1 + (ElemType(0) < curr_grad[i]) - (curr_grad[i] < ElemType(0));

    //    // signs[i] contains three consecutive grad_sign
    //    signs[i] = 3*(int(signs[i]) % 9) + grad_sign;

    //    switch(upd[int(signs[i])])
    //    {
    //    case 0:
    //        steps[i] = max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);
    //        break;
    //    case 2:
    //        steps[i] = min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX);
    //        break;
    //    }
    //    curr_grad[i] *= steps[i] / sqrt(avars[i] + floor);
    // }

    ElemType aveMultiplier = 0, a;
    for (long i = 0; i < n; i++)
    {
        avars[i] = RMS_GAMMA * avars[i] + ONE_MINUS_GAMMA * (curr_grad[i] * curr_grad[i]);
        const int grad_sign = (ElemType(0) < curr_grad[i]) - (curr_grad[i] < ElemType(0));

        if (signs[i] * grad_sign > 0)
            steps[i] = min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX);
        else
            steps[i] = max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);

        a = steps[i] / sqrt(avars[i] + floor);
        curr_grad[i] *= a;
        signs[i] = (ElemType) grad_sign;

        if (needAveMultiplier)
            aveMultiplier += a;
    }

    if (needAveMultiplier)
        return aveMultiplier / n;
    else
        return 1;
}

template <class ElemType>
void CPUMatrix<ElemType>::Reshape(const size_t numRows, const size_t numCols)
{
    assert(numRows * numCols == GetNumElements());
    if (numRows * numCols != GetNumElements())
        InvalidArgument("Reshape: Total number of elements does not match.");

    m_numRows = numRows;
    m_numCols = numCols;
}
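// Reshape reinterprets the existing column-major buffer under new dimensions without touching
// memory, whereas Resize (below) may reallocate and does not preserve the current content.
// A minimal sketch (illustrative only, not part of the build):
#if 0
void ReshapeSketch()
{
    CPUMatrix<float> A(2, 6);
    A.SetValue(1.0f);
    A.Reshape(3, 4); // same 12 elements, same storage order; element (i, j) is m_pArray[j * 3 + i]
    A.Resize(5, 5);  // may reallocate; old values are NOT carried over
}
#endif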
// Resize() -- change matrix size
// This function is cheap if the matrix size does not change.
// Current content is not preserved.
// BUGBUG: There is code that relies on zero initialization (without it, we get subtle variations of output). That is wrong--we should initialize to QNaN and see where it fails.
// If growOnly is true, resize will not reallocate memory if the current memory is large enough (i.e., will not shrink).
// If this object does not own its memory then new memory cannot be allocated (one can still shrink and/or reshape).
template <class ElemType>
void CPUMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, bool growOnly /*=true*/)
{
    if (m_numRows == numRows && m_numCols == numCols)
        return;

    size_t numElements = numRows * numCols;
    if (numElements > m_elemSizeAllocated ||                 // grow allocation
        (!growOnly && (numElements != m_elemSizeAllocated))) // shrink allocation (not if 'growOnly')
    {
        // reallocate buffer
        ElemType* pArray = nullptr;
        if (numElements > 0)
        {
            if (!OwnBuffer())
                LogicError("Resize: Resizing a matrix you don't own is not supported.");
            pArray = NewArray<ElemType>(numElements);
        }
        // success: update the object
        if (OwnBuffer())
            delete[] m_pArray;
        else
            assert(pArray == nullptr); // (if !OwnBuffer, we can still resize to 0)
        m_pArray = pArray;
        m_elemSizeAllocated = numElements;
    }

    // success
    m_numRows = numRows;
    m_numCols = numCols;
}

// allocated by the callee but should be deleted by the caller
// TODO: change to use an STL vector instead
template <class ElemType>
ElemType* CPUMatrix<ElemType>::CopyToArray() const
{
    size_t numElements = GetNumElements();
    if (numElements != 0)
    {
        ElemType* arrayCopyTo = NewArray<ElemType>(numElements);
        memcpy(arrayCopyTo, m_pArray, sizeof(ElemType) * numElements);
        return arrayCopyTo;
    }
    else
    {
        return nullptr;
    }
}

// memory will be allocated by the callee if not enough, but needs to be deleted by the caller after it's done
// returns the number of elements copied
template <class ElemType>
size_t CPUMatrix<ElemType>::CopyToArray(ElemType*& arrayCopyTo, size_t& currentArraySize) const
{
    size_t numElements = GetNumElements();

    if (numElements > currentArraySize)
    {
        delete[] arrayCopyTo;
        arrayCopyTo = NewArray<ElemType>(numElements);
        currentArraySize = numElements;
    }

    if (numElements != 0)
    {
        memcpy(arrayCopyTo, m_pArray, sizeof(ElemType) * numElements);
    }

    return numElements;
}

template <class ElemType>
void CPUMatrix<ElemType>::CopySection(size_t /*numRows*/, size_t /*numCols*/, ElemType* /*dst*/, size_t /*colStride*/) const
{
    // REVIEW alexeyk: currently not used by CPU, but implement when possible.
    RuntimeError("Not implemented.");
}

template <class ElemType>
inline size_t CPUMatrix<ElemType>::LocateColumn(const size_t col) const
{
    assert(col < m_numCols);
    return col * m_numRows; // matrix in column-wise storage
}

template <class ElemType>
inline size_t CPUMatrix<ElemType>::LocateElement(const size_t row, const size_t col) const
{
    assert(row < m_numRows);
    return LocateColumn(col) + row; // matrix in column-wise storage
}

#pragma endregion Basic Operators

#pragma region Member BLAS Functions

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::operator+=(ElemType alpha)
{
    return AssignSumOf(alpha, *this);
}

template <class ElemType>
CPUMatrix<ElemType> CPUMatrix<ElemType>::operator+(ElemType alpha) const
{
    CPUMatrix<ElemType> c(GetNumRows(), GetNumCols());
    c.AssignSumOf(alpha, *this);
    return c;
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignSumOf(const ElemType alpha, const CPUMatrix<ElemType>& a)
{
    if (a.IsEmpty())
        LogicError("AssignSumOf: Matrix a is empty.");

    auto& us = *this;
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());

    long m = (long) GetNumRows(), n = (long) GetNumCols();
#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        // four-way unrolling
        for (long i = 0; i < (m & ~3); i += 4)
        {
            us(i, j) = alpha + a(i, j);
            us(i + 1, j) = alpha + a(i + 1, j);
            us(i + 2, j) = alpha + a(i + 2, j);
            us(i + 3, j) = alpha + a(i + 3, j);
        }
        // handle the remaining elements
        for (long i = m & ~3; i < m; i++)
        {
            us(i, j) = alpha + a(i, j);
        }
    }

    return *this;
}

// if [this] and a have the same dimension then [this] = [this] + a
// if a is a column vector, add it to all columns of [this]
// if a is a row vector, add it to all rows of [this]
// if a is a scalar, add it to all elements.
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::operator+=(const CPUMatrix<ElemType>& a)
{
    // if (a.GetNumElements() == 1)
    //    *this += a(0,0);
    // else
    ScaleAndAdd(1, a, *this);

    return *this;
}
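// As described in the comment above, operator+= broadcasts via ScaleAndAdd: a column vector
// is added to every column, a row vector to every row, and a 1x1 to every element. A minimal
// sketch (illustrative only, not part of the build):
#if 0
void BroadcastAddSketch()
{
    CPUMatrix<float> A(3, 4);
    A.SetValue(1.0f);
    CPUMatrix<float> col(3, 1); // one bias value per row
    col.SetValue(2.0f);
    A += col;                   // 'col' is added to every column; A is now all 3.0f
}
#endif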
// if [this] and a have the same dimension then OUTPUT = [this] + a
// if a is a column vector, add it to all columns of [this]
// if a is a row vector, add it to all rows of [this]
template <class ElemType>
CPUMatrix<ElemType> CPUMatrix<ElemType>::operator+(const CPUMatrix<ElemType>& a) const
{
    if (GetNumElements() == 1)
    {
        CPUMatrix<ElemType> c(a);
        c += (*this)(0, 0);
        return c;
    }
    else if (a.GetNumElements() == 1)
    {
        CPUMatrix<ElemType> c(*this);
        c += a(0, 0);
        return c;
    }
    else
    {
        CPUMatrix<ElemType> c(*this); // this implementation will introduce a copy overhead, but it reuses the code
        c += a;
        return c;
    }
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignSumOf(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b)
{
    if (a.GetNumElements() == 1)
    {
        SetValue(b);
        (*this) += a;
    }
    else
    {
        SetValue(a);
        (*this) += b;
    }
    return *this;
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::operator-=(ElemType alpha)
{
    return AssignDifferenceOf(*this, alpha);
}

template <class ElemType>
CPUMatrix<ElemType> CPUMatrix<ElemType>::operator-(ElemType alpha) const
{
    CPUMatrix<ElemType> c(GetNumRows(), GetNumCols());
    c.AssignDifferenceOf(*this, alpha);
    return c;
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignDifferenceOf(const ElemType alpha, const CPUMatrix<ElemType>& a)
{
    auto& us = *this;
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());

    long m = (long) GetNumRows(), n = (long) GetNumCols();
#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        // four-way unrolling
        for (long i = 0; i < (m & ~3); i += 4)
        {
            us(i, j) = alpha - a(i, j);
            us(i + 1, j) = alpha - a(i + 1, j);
            us(i + 2, j) = alpha - a(i + 2, j);
            us(i + 3, j) = alpha - a(i + 3, j);
        }
        // handle the remaining elements
        for (long i = m & ~3; i < m; i++)
        {
            us(i, j) = alpha - a(i, j);
        }
    }

    return *this;
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignDifferenceOf(const CPUMatrix<ElemType>& a, const ElemType alpha)
{
    auto& us = *this;
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());

    long m = (long) GetNumRows(), n = (long) GetNumCols();
#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        // four-way unrolling
        for (long i = 0; i < (m & ~3); i += 4)
        {
            us(i, j) = a(i, j) - alpha;
            us(i + 1, j) = a(i + 1, j) - alpha;
            us(i + 2, j) = a(i + 2, j) - alpha;
            us(i + 3, j) = a(i + 3, j) - alpha;
        }
        // handle the remaining elements
        for (long i = m & ~3; i < m; i++)
        {
            us(i, j) = a(i, j) - alpha;
        }
    }

    return *this;
}

// if [this] and a have the same dimension then [this] = [this] - a
// if a is a column vector, subtract it from all columns of [this]
// if a is a row vector, subtract it from all rows of [this]
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::operator-=(const CPUMatrix<ElemType>& a)
{
    ScaleAndAdd(-1, a, *this);

    return *this;
}
// if [this] and a have the same dimension then output = [this] - a
// if a is a column vector, subtract it from all columns of [this]
// if a is a row vector, subtract it from all rows of [this]
template <class ElemType>
CPUMatrix<ElemType> CPUMatrix<ElemType>::operator-(const CPUMatrix<ElemType>& a) const
{
    CPUMatrix<ElemType> c(*this); // this implementation will introduce a copy overhead, but it reuses the code
    c -= a;
    return c;
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignDifferenceOf(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b)
{
    if (this != &a)
    {
        Resize(a.GetNumRows(), a.GetNumCols());
        SetValue(a);
    }
    (*this) -= b;
    return *this;
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::operator*=(ElemType alpha)
{
    Scale(alpha, *this);
    return *this;
}

template <class ElemType>
CPUMatrix<ElemType> CPUMatrix<ElemType>::operator*(ElemType alpha) const
{
    CPUMatrix<ElemType> c(GetNumRows(), GetNumCols());
    Scale(alpha, *this, c);
    return c;
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignProductOf(const ElemType alpha, const CPUMatrix<ElemType>& a)
{
    Scale(alpha, a, *this);
    return *this;
}

// [this] = a * b
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignProductOf(const CPUMatrix<ElemType>& a, const bool transposeA, const CPUMatrix<ElemType>& b, const bool transposeB)
{
    if (a.GetNumElements() == 1)
    {
        if (transposeB)
            AssignTransposeOf(b);
        (*this) *= a(0, 0);
    }
    else if (b.GetNumElements() == 1)
    {
        if (transposeA)
            AssignTransposeOf(a);
        (*this) *= b(0, 0);
    }
    else
        Multiply(a, transposeA, b, transposeB, *this);

    return *this;
}

template <class ElemType>
CPUMatrix<ElemType> CPUMatrix<ElemType>::operator*(const CPUMatrix<ElemType>& a) const
{
    auto& us = *this;
    if (GetNumElements() == 1)
    {
        CPUMatrix<ElemType> c;
        c.AssignProductOf(us(0, 0), a);
        return c;
    }
    else if (a.GetNumElements() == 1)
    {
        CPUMatrix<ElemType> c;
        c.AssignProductOf(a(0, 0), us);
        return c;
    }
    else
    {
        CPUMatrix<ElemType> c;
        Multiply(*this, a, c);
        return c;
    }
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::operator/=(ElemType alpha)
{
    (*this) *= 1 / alpha;
    return (*this);
}

template <class ElemType>
CPUMatrix<ElemType> CPUMatrix<ElemType>::operator/(ElemType alpha) const
{
    return ((*this) * (1 / alpha));
}

// element-wise power
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::operator^=(ElemType alpha)
{
    auto& us = *this;
    ElementWisePower(alpha, us, us);
    return us;
}

// element-wise power
template <class ElemType>
CPUMatrix<ElemType> CPUMatrix<ElemType>::operator^(ElemType alpha) const
{
    CPUMatrix<ElemType> c(GetNumRows(), GetNumCols());
    ElementWisePower(alpha, *this, c);
    return c;
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignElementPowerOf(const CPUMatrix<ElemType>& a, const ElemType power)
{
    ElementWisePower(power, a, *this);
    return *this;
}

// [this] = [this] .* a (we cannot override operator .* in c++)
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::ElementMultiplyWith(const CPUMatrix<ElemType>& a)
{
    return AssignElementProductOf(*this, a);
}

// [this] = [this] ./ a (we cannot override operator ./ in c++)
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::ElementDivideBy(const CPUMatrix<ElemType>& a)
{
    return AssignElementDivisionOf(*this, a);
}

// [this] = a .* b
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignElementProductOf(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b)
{
    if (a.IsEmpty() || b.IsEmpty())
        LogicError("AssignElementProductOf: Matrix is empty.");

    assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols());
    if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()))
        InvalidArgument("AssignElementProductOf: The input matrix dimensions do not match.");

    auto& us = *this;
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());

    long m = (long) GetNumRows(), n = (long) GetNumCols();
#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        // four-way unrolling
        for (long i = 0; i < (m & ~3); i += 4)
        {
            us(i, j) = a(i, j) * b(i, j);
            us(i + 1, j) = a(i + 1, j) * b(i + 1, j);
            us(i + 2, j) = a(i + 2, j) * b(i + 2, j);
            us(i + 3, j) = a(i + 3, j) * b(i + 3, j);
        }
        // handle the remaining elements
        for (long i = m & ~3; i < m; i++)
        {
            us(i, j) = a(i, j) * b(i, j);
        }
    }

    return *this;
}
// [this] += a .* b
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AddElementProductOf(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b)
{
    if (a.IsEmpty() || b.IsEmpty())
        LogicError("AddElementProductOf: Matrix is empty.");

    assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols());
    if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()))
        InvalidArgument("AddElementProductOf : The input matrix dimensions do not match.");

    if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == GetNumCols()))
        InvalidArgument("AddElementProductOf : The input matrix dimensions do not match [this].");

    auto& us = *this;

    long m = (long) GetNumRows(), n = (long) GetNumCols();
#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        // four-way unrolling
        for (long i = 0; i < (m & ~3); i += 4)
        {
            us(i, j) += a(i, j) * b(i, j);
            us(i + 1, j) += a(i + 1, j) * b(i + 1, j);
            us(i + 2, j) += a(i + 2, j) * b(i + 2, j);
            us(i + 3, j) += a(i + 3, j) * b(i + 3, j);
        }
        // handle the remaining elements
        for (long i = m & ~3; i < m; i++)
        {
            us(i, j) += a(i, j) * b(i, j);
        }
    }

    return *this;
}

// [this] = a ./ b
// TODO: This clips the divisor by a small value. Is that really what one would want?
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignElementDivisionOf(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b)
{
    if (a.IsEmpty() || b.IsEmpty())
        LogicError("AssignElementDivisionOf: Matrix is empty.");

    assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols());
    if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()))
        InvalidArgument("AssignElementDivisionOf : The input matrix dimensions do not match.");

    auto& us = *this;
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());

    ElemType smallValue = EPS_IN_INVERSE;
#pragma omp parallel for
    foreach_coord (i, j, us)
    {
        ElemType v = b(i, j);
        if (v >= 0 && v < smallValue)
            us(i, j) = a(i, j) / smallValue;
        else if (v < 0 && v > -smallValue)
            us(i, j) = a(i, j) / (-smallValue);
        else
            us(i, j) = a(i, j) / v;
    }

    return *this;
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::ColumnElementMultiplyWith(const CPUMatrix<ElemType>& a)
{
    if (a.IsEmpty() || IsEmpty())
        LogicError("ColumnElementMultiplyWith: Matrix is empty.");

    assert(a.GetNumRows() == GetNumRows() && a.GetNumCols() == 1);
    if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == 1))
        InvalidArgument("ColumnElementMultiplyWith: The input matrix should be a col vector and match [this]'s rows.");

    auto& us = *this;

    long m = (long) GetNumRows(), n = (long) GetNumCols();
#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        // four-way unrolling
        for (long i = 0; i < (m & ~3); i += 4)
        {
            us(i, j) *= a(i, 0);
            us(i + 1, j) *= a(i + 1, 0);
            us(i + 2, j) *= a(i + 2, 0);
            us(i + 3, j) *= a(i + 3, 0);
        }
        // handle the remaining elements
        for (long i = m & ~3; i < m; i++)
        {
            us(i, j) *= a(i, 0);
        }
    }

    return *this;
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::RowElementMultiplyWith(const CPUMatrix<ElemType>& a)
{
    if (a.IsEmpty() || IsEmpty())
        LogicError("RowElementMultiplyWith: Matrix is empty.");

    assert(a.GetNumRows() == 1 && a.GetNumCols() == GetNumCols());
    if (!(a.GetNumRows() == 1 && a.GetNumCols() == GetNumCols()))
        InvalidArgument("RowElementMultiplyWith: The input matrix should be a row vector and match [this]'s columns.");

    auto& us = *this;

    long m = (long) GetNumRows(), n = (long) GetNumCols();
#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        ElemType v = a(0, j);
        // four-way unrolling
        for (long i = 0; i < (m & ~3); i += 4)
        {
            us(i, j) *= v;
            us(i + 1, j) *= v;
            us(i + 2, j) *= v;
            us(i + 3, j) *= v;
        }
        // handle the remaining elements
        for (long i = m & ~3; i < m; i++)
        {
            us(i, j) *= v;
        }
    }

    return *this;
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::RowElementDivideBy(const CPUMatrix<ElemType>& a)
{
    if (a.IsEmpty() || IsEmpty())
        LogicError("RowElementDivideBy: Matrix is empty.");

    assert(a.GetNumRows() == 1 && a.GetNumCols() == GetNumCols());
    if (!(a.GetNumRows() == 1 && a.GetNumCols() == GetNumCols()))
        InvalidArgument("RowElementDivideBy: The input matrix should be a row vector and match [this]'s columns.");

    auto& us = *this;

    long m = (long) GetNumRows(), n = (long) GetNumCols();
#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        ElemType v = a(0, j);
        if (v >= 0 && v < EPS_IN_INVERSE)
            v = EPS_IN_INVERSE;
        else if (v < 0 && v > -EPS_IN_INVERSE)
            v = (-EPS_IN_INVERSE);

        // four-way unrolling
        for (long i = 0; i < (m & ~3); i += 4)
        {
            us(i, j) /= v;
            us(i + 1, j) /= v;
            us(i + 2, j) /= v;
            us(i + 3, j) /= v;
        }
        // handle the remaining elements
        for (long i = m & ~3; i < m; i++)
        {
            us(i, j) /= v;
        }
    }

    return *this;
}
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::ColumnElementDivideBy(const CPUMatrix<ElemType>& a)
{
    if (a.IsEmpty() || IsEmpty())
        LogicError("ColumnElementDivideBy: Matrix is empty.");

    assert(a.GetNumRows() == GetNumRows() && a.GetNumCols() == 1);
    if (!(a.GetNumRows() == GetNumRows() && a.GetNumCols() == 1))
        InvalidArgument("ColumnElementDivideBy: The input matrix should be a col vector and match [this]'s rows.");

    auto& us = *this;

    long m = (long) GetNumRows(), n = (long) GetNumCols();

    ElemType smallValue = EPS_IN_INVERSE;
#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        for (long i = 0; i < m; i++)
        {
            ElemType v = a(i, 0);
            if (v >= 0 && v < smallValue)
                us(i, j) /= smallValue;
            else if (v < 0 && v > -smallValue)
                us(i, j) /= (-smallValue);
            else
                us(i, j) /= v;
        }
    }

    return *this;
}

// [this] = 1 ./ a
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::ElementInverse()
{
    return AssignElementInverseOf(*this);
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignElementInverseOf(const CPUMatrix<ElemType>& a)
{
    ElemType smallValue = EPS_IN_INVERSE;

    if (a.IsEmpty())
        LogicError("AssignElementInverseOf: Matrix a is empty.");

    auto& us = *this;
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());

#pragma omp parallel for
    foreach_coord (i, j, us)
    {
        if (a(i, j) < 0 && a(i, j) > -smallValue)
            us(i, j) = 1 / (-smallValue);
        else if (a(i, j) >= 0 && a(i, j) < smallValue)
            us(i, j) = 1 / smallValue;
        else
            us(i, j) = 1 / a(i, j);
    }

    return *this;
}

// [this] = sigmoid([this]) element wise
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::InplaceSigmoid()
{
    return AssignSigmoidOf(*this);
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignSigmoidOf(const CPUMatrix<ElemType>& a)
{
    if (a.IsEmpty())
        LogicError("AssignSigmoidOf: Matrix a is empty.");

    auto& us = *this;
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());

#pragma omp parallel for
    foreach_coord (i, j, us)
    {
        if (a(i, j) >= 0)
            us(i, j) = 1 / (1 + exp(-a(i, j)));
        else
        {
            ElemType v = exp(a(i, j));
            us(i, j) = v / (1 + v);
        }
    }

    return *this;
}
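// Numerical-stability note for AssignSigmoidOf above: the two branches are the same
// function written two ways,
//   sigmoid(x) = 1 / (1 + exp(-x)) = exp(x) / (1 + exp(x)),
// and the branch is chosen so that exp() is only ever evaluated at a non-positive
// argument: exp(-x) for x >= 0, exp(x) for x < 0. This avoids overflow for large |x|;
// both branches return values in (0, 1).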
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::InplaceLinearRectifierDerivative()
{
    return AssignLinearRectifierDerivativeOf(*this);
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignLinearRectifierDerivativeOf(const CPUMatrix<ElemType>& a)
{
    if (a.IsEmpty())
        LogicError("AssignLinearRectifierDerivativeOf: Matrix a is empty.");

    auto& us = *this;
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());

    long m = (long) GetNumRows(), n = (long) GetNumCols();
#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        // four-way unrolling
        for (long i = 0; i < (m & ~3); i += 4)
        {
            us(i, j) = a(i, j) > 0.0f ? 1.0f : 0.0f;
            us(i + 1, j) = a(i + 1, j) > 0.0f ? 1.0f : 0.0f;
            us(i + 2, j) = a(i + 2, j) > 0.0f ? 1.0f : 0.0f;
            us(i + 3, j) = a(i + 3, j) > 0.0f ? 1.0f : 0.0f;
        }
        // handle the remaining elements
        for (long i = m & ~3; i < m; i++)
        {
            us(i, j) = a(i, j) > 0.0f ? 1.0f : 0.0f;
        }
    }

    return *this;
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::InplaceSigmoidDerivative()
{
    return AssignSigmoidDerivativeOf(*this);
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignSigmoidDerivativeOf(const CPUMatrix<ElemType>& a)
{
    if (a.IsEmpty())
        LogicError("AssignSigmoidDerivativeOf: Matrix a is empty.");

    auto& us = *this;
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());

    long m = (long) GetNumRows(), n = (long) GetNumCols();
#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        // four-way unrolling
        for (long i = 0; i < (m & ~3); i += 4)
        {
            ElemType v = a(i, j);
            us(i, j) = v * (1 - v);

            ElemType v1 = a(i + 1, j);
            us(i + 1, j) = v1 * (1 - v1);

            ElemType v2 = a(i + 2, j);
            us(i + 2, j) = v2 * (1 - v2);

            ElemType v3 = a(i + 3, j);
            us(i + 3, j) = v3 * (1 - v3);
        }
        // handle the remaining elements
        for (long i = m & ~3; i < m; i++)
        {
            ElemType v = a(i, j);
            us(i, j) = v * (1 - v);
        }
    }

    return *this;
}

// [this] = tanh([this]) element wise
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::InplaceTanh()
{
    return AssignTanhOf(*this);
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignTanhOf(const CPUMatrix<ElemType>& a)
{
    if (a.IsEmpty())
        LogicError("AssignTanhOf: Matrix a is empty.");

    auto& us = *this;
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());

    long m = (long) GetNumRows(), n = (long) GetNumCols();
#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        // four-way unrolling
        for (long i = 0; i < (m & ~3); i += 4)
        {
            us(i, j) = tanh(a(i, j));
            us(i + 1, j) = tanh(a(i + 1, j));
            us(i + 2, j) = tanh(a(i + 2, j));
            us(i + 3, j) = tanh(a(i + 3, j));
        }
        // handle the remaining elements
        for (long i = m & ~3; i < m; i++)
        {
            us(i, j) = tanh(a(i, j));
        }
    }

    return *this;
}

// [this] = logsoftmax([this]) element wise
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::InplaceLogSoftmax(const bool isColWise)
{
    return AssignLogSoftmaxOf(*this, isColWise);
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignLogSoftmaxOf(const CPUMatrix<ElemType>& a, const bool isColWise)
{
    if (a.IsEmpty())
        LogicError("AssignLogSoftmaxOf: Matrix a is empty.");

    auto& us = *this;
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());

    if (isColWise)
    {
#pragma omp parallel for
        foreach_column (j, a)
        {
            // we need to extract the max before applying exp to avoid overflow
            ElemType maxV = a(0, j);
            foreach_row (i, a)
                maxV = max(maxV, a(i, j));

            ElemType sum = 0;
            foreach_row (i, a)
                sum += exp(us(i, j) = a(i, j) - maxV);
            sum = log(sum);
            foreach_row (i, us)
                us(i, j) -= sum;
        }
    }
    else
    {
#pragma omp parallel for
        foreach_row (i, a)
        {
            // we need to extract the max before applying exp to avoid overflow
            ElemType maxV = a(i, 0);
            foreach_column (j, a)
                maxV = max(maxV, a(i, j));

            ElemType sum = 0;
            foreach_column (j, a)
                sum += exp(us(i, j) = a(i, j) - maxV);
            sum = log(sum);
            foreach_column (j, us)
                us(i, j) -= sum;
        }
    }

    return *this;
}
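// Numerical-stability note for AssignLogSoftmaxOf above (the usual log-sum-exp shift):
//   logsoftmax(a)_i = a_i - log(sum_k exp(a_k))
//                   = (a_i - M) - log(sum_k exp(a_k - M)),  where M = max_k a_k,
// which is exactly what the loop computes: subtract the per-column (or per-row) max,
// exponentiate, sum, then subtract log(sum). Shifting by M keeps every exp() argument
// <= 0, so the sum cannot overflow.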
// [this] = hardmax([this])
// the max element is set to 1, all others to 0
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::InplaceHardmax(const bool isColWise)
{
    return AssignHardmaxOf(*this, isColWise);
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignHardmaxOf(const CPUMatrix<ElemType>& a, const bool isColWise)
{
    if (a.IsEmpty())
        LogicError("AssignHardmaxOf: Matrix a is empty.");

    auto& us = *this;
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());

    if (isColWise)
    {
#pragma omp parallel for
        foreach_column (j, a)
        {
            // we need to extract the max
            ElemType maxV = a(0, j);
            long maxI = 0;
            foreach_row (i, a)
            {
                if (maxV < a(i, j))
                {
                    maxV = a(i, j);
                    maxI = i;
                }
            }

            foreach_row (i, us)
                us(i, j) = (i == maxI) ? 1.0f : 0.0f;
        }
    }
    else
    {
#pragma omp parallel for
        foreach_row (i, a)
        {
            // we need to extract the max
            ElemType maxV = a(i, 0);
            long maxJ = 0;
            foreach_column (j, a)
            {
                if (maxV < a(i, j))
                {
                    maxV = a(i, j);
                    maxJ = j;
                }
            }

            foreach_column (j, us)
                us(i, j) = (j == maxJ) ? 1.0f : 0.0f;
        }
    }

    return *this;
}

// [this] = sqrt([this]) element wise
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::InplaceSqrt()
{
    return AssignSqrtOf(*this);
}

// to prevent negative values caused by floating-point operations, we force inputs to be >= 0
// this may, however, hide problems in the caller.
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignSqrtOf(const CPUMatrix<ElemType>& a)
{
    if (a.IsEmpty())
        LogicError("AssignSqrtOf: Matrix a is empty.");

    auto& us = *this;
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());

    long m = (long) GetNumRows(), n = (long) GetNumCols();
#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        // four-way unrolling
        for (long i = 0; i < (m & ~3); i += 4)
        {
            us(i, j) = sqrt(max((ElemType) 0, a(i, j)));
            us(i + 1, j) = sqrt(max((ElemType) 0, a(i + 1, j)));
            us(i + 2, j) = sqrt(max((ElemType) 0, a(i + 2, j)));
            us(i + 3, j) = sqrt(max((ElemType) 0, a(i + 3, j)));
        }
        // handle the remaining elements
        for (long i = m & ~3; i < m; i++)
        {
            us(i, j) = sqrt(max((ElemType) 0, a(i, j)));
        }
    }

    return *this;
}

// [this] = exp([this]) element wise
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::InplaceExp()
{
    return AssignExpOf(*this);
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignExpOf(const CPUMatrix<ElemType>& a)
{
    if (a.IsEmpty())
        LogicError("AssignExpOf: Matrix a is empty.");

    auto& us = *this;
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());

    long m = (long) GetNumRows(), n = (long) GetNumCols();
#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        // four-way unrolling
        for (long i = 0; i < (m & ~3); i += 4)
        {
            us(i, j) = exp(a(i, j));
            us(i + 1, j) = exp(a(i + 1, j));
            us(i + 2, j) = exp(a(i + 2, j));
            us(i + 3, j) = exp(a(i + 3, j));
        }
        // handle the remaining elements
        for (long i = m & ~3; i < m; i++)
        {
            us(i, j) = exp(a(i, j));
        }
    }

    return *this;
}

// [this] = abs([this]) element wise
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::InplaceAbs()
{
    return AssignAbsOf(*this);
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignAbsOf(const CPUMatrix<ElemType>& a)
{
    if (a.IsEmpty())
        LogicError("AssignAbsOf: Matrix a is empty.");

    auto& us = *this;
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());

    long m = (long) GetNumRows(), n = (long) GetNumCols();
#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        // four-way unrolling
        for (long i = 0; i < (m & ~3); i += 4)
        {
            us(i, j) = abs(a(i, j));
            us(i + 1, j) = abs(a(i + 1, j));
            us(i + 2, j) = abs(a(i + 2, j));
            us(i + 3, j) = abs(a(i + 3, j));
        }
        // handle the remaining elements
        for (long i = m & ~3; i < m; i++)
        {
            us(i, j) = abs(a(i, j));
        }
    }

    return *this;
}

// [this] = log([this]) element wise
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::InplaceLog()
{
    return AssignLogOf(*this);
}

// [this] = log10([this]) element wise
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::InplaceLog10()
{
    return AssignLog10Of(*this);
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignLogOf(const CPUMatrix<ElemType>& a)
{
    if (a.IsEmpty())
        LogicError("AssignLogOf: Matrix a is empty.");

    auto& us = *this;
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());

#pragma omp parallel for
    foreach_coord (i, j, a)
    {
        const ElemType v = a(i, j);
        if (v < EPS_IN_LOG)
        {
            us(i, j) = LOG_OF_EPS_IN_LOG;
        }
        else
            us(i, j) = log(v);
    }

    return *this;
}
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignLog10Of(const CPUMatrix<ElemType>& a)
{
    if (a.IsEmpty())
        LogicError("AssignLog10Of: Matrix a is empty.");

    auto& us = *this;
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());

#pragma omp parallel for
    foreach_coord (i, j, a)
    {
        const ElemType v = a(i, j);
        if (v <= 0)
            LogicError("AssignLog10Of: Log10 can only be applied to numbers larger than 0.");
        else if (v < EPS_IN_LOG)
        {
            us(i, j) = LOG10_OF_EPS_IN_LOG;
        }
        else
            us(i, j) = log10(v);
    }

    return *this;
}

// [this] = cos([this]) element wise
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::InplaceCosine()
{
    return AssignCosineOf(*this);
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignCosineOf(const CPUMatrix<ElemType>& a)
{
    if (a.IsEmpty())
        LogicError("AssignCosineOf: Matrix a is empty.");

    auto& us = *this;
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());

#pragma omp parallel for
    foreach_coord (i, j, a)
    {
        const ElemType v = a(i, j);
        us(i, j) = cos(v);
    }

    return *this;
}

// [this] = -sin([this]) element wise
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::InplaceNegativeSine()
{
    return AssignNegativeSineOf(*this);
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignNegativeSineOf(const CPUMatrix<ElemType>& a)
{
    if (a.IsEmpty())
        LogicError("AssignNegativeSineOf: Matrix a is empty.");

    auto& us = *this;
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());

#pragma omp parallel for
    foreach_coord (i, j, a)
    {
        const ElemType v = a(i, j);
        us(i, j) = -sin(v);
    }

    return *this;
}

// Threshold truncating: this[i] = max( this[i], threshold )
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::InplaceTruncateBottom(const ElemType threshold)
{
    if (IsEmpty())
        LogicError("InplaceTruncateBottom: Matrix is empty.");

    auto& us = *this;

    long m = (long) GetNumRows(), n = (long) GetNumCols();
#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        // four-way unrolling
        for (long i = 0; i < (m & ~3); i += 4)
        {
            if (us(i, j) < threshold)
                us(i, j) = threshold;

            if (us(i + 1, j) < threshold)
                us(i + 1, j) = threshold;

            if (us(i + 2, j) < threshold)
                us(i + 2, j) = threshold;

            if (us(i + 3, j) < threshold)
                us(i + 3, j) = threshold;
        }
        // handle the remaining elements
        for (long i = m & ~3; i < m; i++)
        {
            if (us(i, j) < threshold)
                us(i, j) = threshold;
        }
    }

    return *this;
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::InplaceTruncate(const ElemType threshold)
{
    if (IsEmpty())
        LogicError("InplaceTruncate: Matrix is empty.");

    auto& us = *this;
    ElemType locThresholdPos = abs(threshold);
    ElemType locThresholdNeg = -locThresholdPos;

    long m = (long) GetNumRows(), n = (long) GetNumCols();
#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        // four-way unrolling
        for (long i = 0; i < (m & ~3); i += 4)
        {
            if (us(i, j) > locThresholdPos)
                us(i, j) = locThresholdPos;
            else if (us(i, j) < locThresholdNeg)
                us(i, j) = locThresholdNeg;

            if (us(i + 1, j) > locThresholdPos)
                us(i + 1, j) = locThresholdPos;
            else if (us(i + 1, j) < locThresholdNeg)
                us(i + 1, j) = locThresholdNeg;

            if (us(i + 2, j) > locThresholdPos)
                us(i + 2, j) = locThresholdPos;
            else if (us(i + 2, j) < locThresholdNeg)
                us(i + 2, j) = locThresholdNeg;

            if (us(i + 3, j) > locThresholdPos)
                us(i + 3, j) = locThresholdPos;
            else if (us(i + 3, j) < locThresholdNeg)
                us(i + 3, j) = locThresholdNeg;
        }
        // handle the remaining elements
        for (long i = m & ~3; i < m; i++)
        {
            if (us(i, j) > locThresholdPos)
                us(i, j) = locThresholdPos;
            else if (us(i, j) < locThresholdNeg)
                us(i, j) = locThresholdNeg;
        }
    }

    return *this;
}

// x = x-threshold if x > threshold, x+threshold if x < -threshold, 0 otherwise
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::InplaceSoftThreshold(const ElemType threshold)
{
    if (IsEmpty())
        LogicError("InplaceSoftThreshold: Matrix is empty.");

    long m = (long) GetNumElements();

#pragma omp parallel for
    for (long i = 0; i < (m & ~3); i += 4) // four-way unrolling
    {
        if (m_pArray[i] > threshold)
            m_pArray[i] -= threshold;
        else if (m_pArray[i] < -threshold)
            m_pArray[i] += threshold;
        else
            m_pArray[i] = 0;

        if (m_pArray[i + 1] > threshold)
            m_pArray[i + 1] -= threshold;
        else if (m_pArray[i + 1] < -threshold)
            m_pArray[i + 1] += threshold;
        else
            m_pArray[i + 1] = 0;

        if (m_pArray[i + 2] > threshold)
            m_pArray[i + 2] -= threshold;
        else if (m_pArray[i + 2] < -threshold)
            m_pArray[i + 2] += threshold;
        else
            m_pArray[i + 2] = 0;

        if (m_pArray[i + 3] > threshold)
            m_pArray[i + 3] -= threshold;
        else if (m_pArray[i + 3] < -threshold)
            m_pArray[i + 3] += threshold;
        else
            m_pArray[i + 3] = 0;
    }
    // handle the remaining elements
    for (long i = m & ~3; i < m; i++)
    {
        if (m_pArray[i] > threshold)
            m_pArray[i] -= threshold;
        else if (m_pArray[i] < -threshold)
            m_pArray[i] += threshold;
        else
            m_pArray[i] = 0;
    }
    return *this;
}
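// InplaceSoftThreshold above computes the soft-shrinkage function, the proximal operator
// of the L1 norm:
//   soft(x, t) = sign(x) * max(|x| - t, 0)
// e.g. with t = 0.5: 1.3 -> 0.8, -0.2 -> 0, -0.7 -> -0.2. It is typically used in
// L1-regularized (sparsity-inducing) updates.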
//x= x-threshold if x>threshold, x+threshold if x<-threshold, 0 otherwise
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::InplaceSoftThreshold(const ElemType threshold)
{
    if (IsEmpty())
        LogicError("InplaceSoftThreshold: Matrix is empty.");

    long m = (long) GetNumElements();
#pragma omp parallel for
    for (long i = 0; i < (m & ~3); i += 4) // four-way unrolling
    {
        if (m_pArray[i] > threshold)
            m_pArray[i] -= threshold;
        else if (m_pArray[i] < -threshold)
            m_pArray[i] += threshold;
        else
            m_pArray[i] = 0;

        if (m_pArray[i + 1] > threshold)
            m_pArray[i + 1] -= threshold;
        else if (m_pArray[i + 1] < -threshold)
            m_pArray[i + 1] += threshold;
        else
            m_pArray[i + 1] = 0;

        if (m_pArray[i + 2] > threshold)
            m_pArray[i + 2] -= threshold;
        else if (m_pArray[i + 2] < -threshold)
            m_pArray[i + 2] += threshold;
        else
            m_pArray[i + 2] = 0;

        if (m_pArray[i + 3] > threshold)
            m_pArray[i + 3] -= threshold;
        else if (m_pArray[i + 3] < -threshold)
            m_pArray[i + 3] += threshold;
        else
            m_pArray[i + 3] = 0;
    }
    // handle remaining elements
    for (long i = m & ~3; i < m; i++)
    {
        if (m_pArray[i] > threshold)
            m_pArray[i] -= threshold;
        else if (m_pArray[i] < -threshold)
            m_pArray[i] += threshold;
        else
            m_pArray[i] = 0;
    }
    return *this;
}

//Threshold truncating: this[i] = max( a[i], threshold )
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignTruncateBottomOf(const CPUMatrix<ElemType>& a, const ElemType threshold)
{
    if (a.IsEmpty())
        LogicError("AssignTruncateBottomOf: Matrix a is empty.");

    auto& us = *this;
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());

#pragma omp parallel for
    foreach_coord (i, j, a)
    {
        if (a(i, j) < threshold)
            us(i, j) = threshold;
        else
            us(i, j) = a(i, j);
    }
    return *this;
}

//Threshold truncating: this[i] = min( this[i], threshold )
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::InplaceTruncateTop(const ElemType threshold)
{
    if (IsEmpty())
        LogicError("InplaceTruncateTop: Matrix is empty.");

    auto& us = *this;
#pragma omp parallel for
    foreach_coord (i, j, us)
    {
        if (us(i, j) > threshold)
            us(i, j) = threshold;
    }
    return *this;
}

//Threshold truncating: this[i] = min( a[i], threshold )
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignTruncateTopOf(const CPUMatrix<ElemType>& a, const ElemType threshold)
{
    if (a.IsEmpty())
        LogicError("AssignTruncateTopOf: Matrix a is empty.");

    auto& us = *this;
    if (this != &a)
        Resize(a.GetNumRows(), a.GetNumCols());

#pragma omp parallel for
    foreach_coord (i, j, a)
    {
        if (a(i, j) > threshold)
            us(i, j) = threshold;
        else
            us(i, j) = a(i, j);
    }
    return *this;
}

//Threshold truncating: this[i] = 0 if abs(this[i]) < threshold
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::SetToZeroIfAbsLessThan(const ElemType threshold)
{
    if (IsEmpty())
        LogicError("SetToZeroIfAbsLessThan: Matrix is empty.");

    auto& us = *this;
#pragma omp parallel for
    foreach_coord (i, j, us)
    {
        if (abs(us(i, j)) < threshold)
            us(i, j) = 0;
    }
    return *this;
}

//sum of all abs(elements)
template <class ElemType>
ElemType CPUMatrix<ElemType>::SumOfAbsElements() const
{
    if (IsEmpty())
        LogicError("SumOfAbsElements: Matrix is empty.");

    if (sizeof(ElemType) == sizeof(double))
    {
#ifndef USE_MKL
        return (ElemType) dasum((int) GetNumElements(), reinterpret_cast<double*>(m_pArray), 1);
#else
        return (ElemType) cblas_dasum((int) GetNumElements(), reinterpret_cast<double*>(m_pArray), 1);
#endif
    }
    else
    {
#pragma warning(suppress : 4244)
#ifndef USE_MKL
        return sasum((int) GetNumElements(), reinterpret_cast<float*>(m_pArray), 1);
#else
        return cblas_sasum((int) GetNumElements(), reinterpret_cast<float*>(m_pArray), 1);
#endif
    }
}

//sum of all elements
template <class ElemType>
ElemType CPUMatrix<ElemType>::SumOfElements() const
{
    if (IsEmpty())
        LogicError("SumOfElements: Matrix is empty.");

    ElemType sum = 0;
    long m = (long) GetNumElements(); // note: OpenMP requires loop indices to be long, not size_t

    //four-way unrolling
#pragma omp parallel for reduction(+ : sum)
    for (long i = 0; i < (m & ~3); i += 4)
    {
        sum += m_pArray[i] + m_pArray[i + 1] + m_pArray[i + 2] + m_pArray[i + 3];
    }
    // handle remaining elements
    for (long i = m & ~3; i < m; i++)
    {
        sum += m_pArray[i];
    }
    return sum;
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignSumOfElements(const CPUMatrix<ElemType>& a)
{
    if (a.IsEmpty())
        LogicError("AssignSumOfElements: Matrix a is empty.");

    auto& us = *this;
    us.Resize(1, 1);
    us(0, 0) = a.SumOfElements();
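    // For reference: SumOfElements() above uses a four-way unrolled OpenMP
    // reduction. The naive equivalent below (illustration only, not compiled)
    // produces the same value up to floating-point reassociation:
#if 0
    ElemType sum = 0;
    for (size_t k = 0; k < a.GetNumElements(); k++)
        sum += a.m_pArray[k];
#endif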
return *this; } template bool CPUMatrix::IsEqualTo(const CPUMatrix& a, const ElemType threshold /*= 1e-8*/) const { return AreEqual(*this, a, threshold); } template void CPUMatrix::VectorSum(const CPUMatrix& a, CPUMatrix& c, const bool isColWise) { if (a.IsEmpty()) LogicError("VectorSum: Input matrix a is empty."); const int m = (int) a.GetNumRows(); const int n = (int) a.GetNumCols(); assert(m > 0 && n > 0); // converting from size_t to int may cause overflow if (isColWise) // col-wise { c.Resize(1, n); #pragma omp parallel for foreach_column (j, a) { ElemType v = 0; foreach_row (i, a) { #pragma omp atomic v += a(i, j); } c(0, j) = v; } } else { c.Resize(m, 1); #pragma omp parallel for foreach_row (i, a) { ElemType v = 0; foreach_column (j, a) { #pragma omp atomic v += a(i, j); } c(i, 0) = v; } } } template void CPUMatrix::VectorNorm1(CPUMatrix& c, const bool isColWise) const { if (IsEmpty()) LogicError("VectorNorm1: Matrix is empty."); auto& us = *this; const int m = (int) us.GetNumRows(); const int n = (int) us.GetNumCols(); assert(m > 0 && n > 0); // converting from size_t to int may cause overflow if (isColWise) // col-wise { c.Resize(1, n); #pragma omp parallel for foreach_column (j, us) { ElemType v = 0; foreach_row (i, us) { #pragma omp atomic v += abs(us(i, j)); } c(0, j) = v; } } else { c.Resize(m, 1); #pragma omp parallel for foreach_row (i, us) { ElemType v = 0; foreach_column (j, us) { #pragma omp atomic v += abs(us(i, j)); } c(i, 0) = v; } } } template CPUMatrix& CPUMatrix::AssignVectorNorm1Of(CPUMatrix& a, const bool isColWise) { a.VectorNorm1(*this, isColWise); return *this; } template void CPUMatrix::VectorNorm2(CPUMatrix& c, const bool isColWise) const { if (IsEmpty()) LogicError("VectorNorm2: Matrix is empty."); auto& us = *this; const int m = (int) us.GetNumRows(); const int n = (int) us.GetNumCols(); assert(m > 0 && n > 0); // converting from size_t to int may cause overflow if (isColWise) // col-wise { c.Resize(1, n); if (sizeof(ElemType) == sizeof(double)) { #pragma omp parallel for foreach_column (j, c) { #ifndef USE_MKL c(0, j) = (ElemType) dnrm2(m, reinterpret_cast(us.m_pArray + us.LocateColumn(j)), 1); #else c(0, j) = (ElemType) cblas_dnrm2(m, reinterpret_cast(us.m_pArray + us.LocateColumn(j)), 1); #endif } } else { #pragma omp parallel for foreach_column (j, c) { #pragma warning(suppress : 4244) #ifndef USE_MKL c(0, j) = snrm2(m, reinterpret_cast(us.m_pArray + us.LocateColumn(j)), 1); #else c(0, j) = cblas_snrm2(m, reinterpret_cast(us.m_pArray + us.LocateColumn(j)), 1); #endif } } } else { c.Resize(m, 1); if (sizeof(ElemType) == sizeof(double)) { #pragma omp parallel for foreach_row (i, c) { #ifndef USE_MKL c(i, 0) = dnrm2(n, reinterpret_cast(us.m_pArray + i), m); #else c(i, 0) = cblas_dnrm2(n, reinterpret_cast(us.m_pArray + i), m); #endif } } else { #pragma omp parallel for foreach_row (i, c) { #pragma warning(suppress : 4244) #ifndef USE_MKL c(i, 0) = snrm2(n, reinterpret_cast(us.m_pArray + i), m); #else c(i, 0) = cblas_snrm2(n, reinterpret_cast(us.m_pArray + i), m); #endif } } } } template CPUMatrix& CPUMatrix::AssignVectorNorm2Of(CPUMatrix& a, const bool isColWise) { a.VectorNorm2(*this, isColWise); return *this; } template void CPUMatrix::VectorNormInf(CPUMatrix& c, const bool isColWise) const { if (IsEmpty()) LogicError("VectorNormInf: Matrix is empty."); auto& us = *this; const int m = (int) us.GetNumRows(); const int n = (int) us.GetNumCols(); assert(m > 0 && n > 0); // converting from size_t to int may cause overflow if (isColWise) // col-wise { 
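// Column-wise case: c(0, j) = max_i |us(i, j)|, i.e. the infinity norm of
// column j; the row-wise branch below computes the symmetric quantity per row.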
        c.Resize(1, n);
        // #pragma omp parallel for
        foreach_column (j, us)
        {
            ElemType v = 0;
            foreach_row (i, us)
            {
                v = max(v, abs(us(i, j)));
            }
            c(0, j) = v;
        }
    }
    else
    {
        c.Resize(m, 1);
        // #pragma omp parallel for
        foreach_row (i, us)
        {
            ElemType v = 0;
            foreach_column (j, us)
            {
                v = max(v, abs(us(i, j)));
            }
            c(i, 0) = v;
        }
    }
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignVectorNormInfOf(CPUMatrix<ElemType>& a, const bool isColWise)
{
    a.VectorNormInf(*this, isColWise);
    return *this;
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignInnerProductOf(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const bool isColWise)
{
    InnerProduct(a, b, *this, isColWise);
    return *this;
}

//column-wise cross product
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignKhatriRaoProductOf(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b)
{
    if (a.IsEmpty() || b.IsEmpty())
        LogicError("AssignKhatriRaoProductOf: Matrix is empty.");

    long cols = (long) a.GetNumCols();
    assert(cols == b.GetNumCols());
    if (cols != b.GetNumCols())
        InvalidArgument("AssignKhatriRaoProductOf: a.GetNumCols() != b.GetNumCols()");

    long rowsA = (long) a.GetNumRows();
    long rowsB = (long) b.GetNumRows();
    Resize(rowsA * rowsB, cols);

#ifdef __INTEL_COMPILER // TODO: check this
#pragma simd statement
#endif
#pragma omp parallel for
    for (long k = 0; k < cols; k++)
    {
        long jj = 0;
        for (long j = 0; j < rowsB; j++)
        {
            for (long i = 0; i < rowsA; i++)
            {
                (*this)(jj++, k) = a(i, k) * b(j, k);
            }
        }
    }
    return *this;
}

//column-wise reshaped product. Used to compute KhatriRaoProduct Gradient
//   this = reshape each column of a from (K1xK2,1) to (K1, K2)
//   if each column of a is not transposed, each (K1, K2) times each column of b (K2, frames).
//   the output is a (K1, frames) matrix
//   if each column of a is transposed, each (K1, K2)^T times each column of b (K1, frames) and output is (K2, frames)
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AddColumnReshapeProductOf(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const bool transposeAColumn)
{
    if (a.IsEmpty() || b.IsEmpty())
        LogicError("AddColumnReshapeProductOf: Matrix is empty.");

    long cols = (long) a.GetNumCols();
    assert(cols == b.GetNumCols());
    if (cols != b.GetNumCols())
        InvalidArgument("AddColumnReshapeProductOf: a.GetNumCols() != b.GetNumCols()");

    long rowsA = (long) a.GetNumRows();
    long rowsB = (long) b.GetNumRows();
    if (rowsA % rowsB != 0)
        InvalidArgument("AddColumnReshapeProductOf: number of rows in a must be a multiple of that in b.");

    long rowsC = rowsA / rowsB;
    if (rowsC != GetNumRows() || cols != GetNumCols())
        InvalidArgument("AddColumnReshapeProductOf: This matrix does not have the right size.");

    auto& us = *this;

    if (transposeAColumn)
    {
        // find nrows and ncols of the reshaped a
        long nrows = rowsB;
        long ncols = rowsC;

#ifdef __INTEL_COMPILER // TODO: check this
#pragma simd statement
#endif
#pragma omp parallel for
        foreach_column (t, a)
        {
            size_t k = 0;
            for (size_t j = 0; j < ncols; j++) // row and col are transposed
            {
                ElemType v = 0;
                for (size_t i = 0; i < nrows; i++)
                {
                    v += a(k, t) * b(i, t);
                    k++;
                }
                us(j, t) += v;
            }
        }
    }
    else
    {
        size_t ncols = rowsB;
        size_t nrows = rowsC;

#ifdef __INTEL_COMPILER // TODO: check this
#pragma simd statement
#endif
#pragma omp parallel for
        foreach_column (t, a)
        {
            size_t k = 0;
            for (size_t j = 0; j < ncols; j++)
            {
                for (size_t i = 0; i < nrows; i++)
                {
                    us(i, t) += a(k, t) * b(j, t);
                    k++;
                }
            }
        }
    }
    return *this;
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AddWithScaleOf(ElemType alpha, const CPUMatrix<ElemType>& a)
{
    ScaleAndAdd(alpha, a, *this);
    return *this;
}

template <class ElemType>
ElemType CPUMatrix<ElemType>::FrobeniusNorm() const
{
    if (IsEmpty())
        LogicError("FrobeniusNorm: Matrix is empty.");

    ElemType v = 0;

    long m = (long)
GetNumElements(); //four-way unrolling #pragma omp parallel for reduction(+ : v) for (long i = 0; i < (m & ~3); i += 4) { v += m_pArray[i] * m_pArray[i] + m_pArray[i + 1] * m_pArray[i + 1] + m_pArray[i + 2] * m_pArray[i + 2] + m_pArray[i + 3] * m_pArray[i + 3]; } // handle remaining stuffs for (long i = m & ~3; i < m; i++) { v += m_pArray[i] * m_pArray[i]; } return sqrt(v); } template CPUMatrix& CPUMatrix::AssignFrobeniusNormOf(const CPUMatrix& a) { if (a.IsEmpty()) LogicError("AssignFrobeniusNormOf: Matrix a is empty."); auto& us = *this; us.Resize(1, 1); us(0, 0) = a.FrobeniusNorm(); return us; } template ElemType CPUMatrix::MatrixNormInf() const { if (IsEmpty()) LogicError("MatrixNormInf: Matrix is empty."); auto& us = *this; ElemType v = 0; #pragma omp parallel for foreach_coord (i, j, us) { #pragma omp critical { v = max(v, abs(us(i, j))); } } return v; } template ElemType CPUMatrix::MatrixNorm0() const { if (IsEmpty()) LogicError("MatrixNorm0: Matrix is empty."); auto& us = *this; ElemType v = 0; #pragma omp parallel for foreach_coord (i, j, us) { if (us(i, j) != 0) { #pragma omp critical { ++v; } } } return v; } template ElemType CPUMatrix::MatrixNorm1() const { if (IsEmpty()) LogicError("MatrixNorm1: Matrix is empty."); auto& us = *this; ElemType sum = 0; #pragma omp parallel for reduction(+ : sum) foreach_coord (i, j, us) { sum += abs(us(i, j)); } return sum; } template CPUMatrix& CPUMatrix::AssignSignOf(const CPUMatrix& a) { if (a.IsEmpty()) LogicError("AssignSignOf: Matrix a is empty."); auto& us = *this; if (this != &a) Resize(a.GetNumRows(), a.GetNumCols()); #pragma omp parallel for foreach_column (j, us) { foreach_row (i, us) { ElemType v = a(i, j); if (!std::isnan(v)) us(i, j) = (v == (ElemType) 0 ? (ElemType) 0 : (v > 0 ? (ElemType) 1 : (ElemType)(-1))); else us(i, j) = v; } } return us; } template CPUMatrix& CPUMatrix::AddSignOf(const CPUMatrix& a) { if (a.IsEmpty()) LogicError("AddSignOf: Matrix a is empty."); auto& us = *this; if (this != &a) Resize(a.GetNumRows(), a.GetNumCols()); #pragma omp parallel for foreach_column (j, us) { foreach_row (i, us) { ElemType v = a(i, j); if (!std::isnan(v)) us(i, j) += (v == (ElemType) 0 ? (ElemType) 0 : (v > 0 ? (ElemType) 1 : (ElemType)(-1))); else us(i, j) = v; } } return us; } //I decided to use CPUMatrix& maxIndexes instead of integer vector because the result may be used to do additional calculation template void CPUMatrix::VectorMax(CPUMatrix& maxIndexes, CPUMatrix& maxValues, const bool isColWise, int topK) const { if (IsEmpty()) LogicError("VectorMax: Matrix is empty."); auto& us = *this; const int m = (int) GetNumRows(); const int n = (int) GetNumCols(); assert(topK <= m); assert(m > 0 && n > 0); // converting from size_t to int may cause overflow if (isColWise) // col-wise { maxValues.Resize(topK, n); maxIndexes.Resize(topK, n); if (topK == 1) { #pragma omp parallel for for (int j = 0; j < n; j++) { ElemType v = us(0, j); size_t index = 0; foreach_row (i, us) { if (v < us(i, j)) { index = i; v = us(i, j); } } maxValues(0, j) = v; maxIndexes(0, j) = (ElemType) index; } } else { std::vector indices(m); int i = 0; std::generate(indices.begin(), indices.end(), [&i] { return i++; }); const ElemType* curVal = m_pArray; ElemType* curIdx = maxIndexes.m_pArray; ElemType* curMax = maxValues.m_pArray; for (int icol = 0; icol < n; icol++, curVal += m, curIdx += topK, curMax += topK) { // Partial sort, descending order. 
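// std::nth_element runs in O(m) on average and ensures indices[0..topK-1]
// refer to the topK largest values under the comparer; note that it does not
// sort those topK entries among themselves.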
std::nth_element(indices.begin(), indices.begin() + topK, indices.end(), [curVal](const int& a, const int& b) { return curVal[a] > curVal[b]; }); // REVIEW alexeyk: the following produces warning (see SCL_SECURE_NO_WARNINGS) so use loop instead. // std::transform(indices.begin(), indices.begin() + topK, curIdx, [](const int& a) { return static_cast(a); }); for (int i = 0; i < topK; i++) { curIdx[i] = static_cast(indices[i]); curMax[i] = curVal[indices[i]]; } } } } else { if (topK > 1) RuntimeError("Row-wise TopK max is not supported."); maxValues.Resize(m, 1); maxIndexes.Resize(m, 1); #pragma omp parallel for for (int i = 0; i < m; i++) { ElemType v = us(i, 0); size_t index = 0; foreach_column (j, us) { if (v < us(i, j)) { index = j; v = us(i, j); } } maxValues(i, 0) = v; maxIndexes(i, 0) = (ElemType) index; } } } template void CPUMatrix::VectorMin(CPUMatrix& minIndexes, CPUMatrix& minValues, const bool isColWise) const { if (IsEmpty()) LogicError("VectorMin: Matrix is empty."); auto& us = *this; const int m = (int) GetNumRows(); const int n = (int) GetNumCols(); assert(m > 0 && n > 0); // converting from size_t to int may cause overflow if (isColWise) // col-wise { minValues.Resize(1, n); minIndexes.Resize(1, n); #pragma omp parallel for for (int j = 0; j < n; j++) { ElemType v = us(0, j); size_t index = 0; foreach_row (i, us) { if (v > us(i, j)) { index = i; v = us(i, j); } } minValues(0, j) = v; minIndexes(0, j) = (ElemType) index; } } else { minValues.Resize(m, 1); minIndexes.Resize(m, 1); #pragma omp parallel for for (int i = 0; i < m; i++) { ElemType v = us(i, 0); size_t index = 0; foreach_column (j, us) { if (v > us(i, j)) { index = j; v = us(i, j); } } minValues(i, 0) = v; minIndexes(i, 0) = (ElemType) index; } } } template CPUMatrix& CPUMatrix::AssignNumOfDiff(const CPUMatrix& a, const CPUMatrix& b, bool searchInCol) { if (a.GetNumCols() != b.GetNumCols()) throw std::invalid_argument("AssignNumOfDiff: a and b must have the same number of columns."); if (!searchInCol && a.GetNumRows() != b.GetNumRows()) throw std::invalid_argument("AssignNumOfDiff: a and b must have the same number of rows."); ElemType n = 0; if (!searchInCol) { foreach_coord (i, j, a) { n += (a(i, j) != b(i, j)); } } else { size_t crow = b.GetNumRows(); const ElemType* curCol = b.m_pArray; for (size_t icol = 0; icol < a.GetNumCols(); icol++, curCol += crow) { auto res = std::find(curCol, curCol + crow, a(0, icol)); if (res == curCol + crow) n++; } } Resize(1, 1); // result should be one element (*this)(0, 0) = n; return *this; } #pragma endregion Member BLAS Functions #pragma region Other helper Functions template void CPUMatrix::Print(const char* matrixName, size_t rowFirst, size_t rowLast, size_t colFirst, size_t colLast) const { if (matrixName != nullptr) fprintf(stderr, "\n###### %s (%lu, %lu) ######\n\n", matrixName, GetNumRows(), GetNumCols()); else fprintf(stderr, "\n###### (%lu, %lu) ######\n\n", GetNumRows(), GetNumCols()); if (IsEmpty()) fprintf(stderr, "(empty)\n"); else if (rowLast >= GetNumRows() || colLast >= GetNumCols()) InvalidArgument("Index out of range."); if (rowFirst > 0 || colFirst > 0) fprintf(stderr, "------ Print Range (%lu:%lu, %lu:%lu) ------\n", rowFirst, rowLast, colFirst, colLast); // TODO: extend this to take negative ranges, and allow to specify first=-3, last=3 which will print the first 3 and last 3 rows/cols. Also clip bounds to avoid having to test that outside. 
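    // Illustrative output (hypothetical 2x2 matrix m with m(0,0)=1, m(0,1)=2,
    // m(1,0)=3, m(1,1)=4): m.Print("W", 0, 1, 0, 1) writes to stderr:
    //   ###### W (2, 2) ######
    //   1.0000000000    2.0000000000
    //   3.0000000000    4.0000000000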
const auto& us = *this; if (rowFirst > 0) fprintf(stderr, "...\n"); for (size_t i = rowFirst; i <= rowLast; i++) { if (colFirst > 0) fprintf(stderr, "...\t"); for (size_t j = colFirst; j <= colLast; j++) fprintf(stderr, "%.10f\t", us(i, j)); if (colLast < GetNumCols() - 1) fprintf(stderr, "...\t"); fprintf(stderr, "\n"); } if (rowLast < GetNumRows() - 1) fprintf(stderr, "...\n"); } template void CPUMatrix::Print(const char* matrixName /*=nullptr*/) const { Print(matrixName, 0, GetNumRows() - 1, 0, GetNumCols() - 1); } // file I/O //matrixName is used to verify that correct matrix is read. template void CPUMatrix::ReadFromFile(FILE*, const char* /*matrixName*/) { RuntimeError("not implemented."); } //matrixName is used to verify that correct matrix is read. template void CPUMatrix::WriteToFile(FILE*, const char* /*matrixName*/) { RuntimeError("not implemented."); } //assume each column is an input sample. Each sample is stored in [channel, row, col] (r00, g00, b00, r01, g01, b01, r10, g10, b10, r11, g11, b11) template CPUMatrix& CPUMatrix::AssignPackedConvolutionInput(const CPUMatrix& inputSubBatch, const size_t inputWidth, const size_t inputHeight, const size_t inputChannels, const size_t outputWidth, const size_t outputHeight, const size_t /*outputChannels*/, const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample, const bool zeroPadding) { assert(verticalSubsample <= kernelHeight && horizontalSubsample <= kernelWidth); const size_t packedInputRows = kernelWidth * kernelHeight * inputChannels; const size_t packedInputColsPerSample = outputWidth * outputHeight; // output size per channel const size_t inputDim = inputWidth * inputHeight * inputChannels; const size_t smallBatchSize = inputSubBatch.GetNumCols(); const long inputHeightTimesChannel = (long) (inputHeight * inputChannels); Resize(packedInputRows, packedInputColsPerSample * smallBatchSize); if (zeroPadding) SetValue((ElemType) 0); const long halfKernelWidth = (long) kernelWidth / 2; const long halfKernelHeight = (long) kernelHeight / 2; #pragma omp parallel for // each input element is copied to many places for (long sample = 0; sample < smallBatchSize; sample++) { for (long id = 0; id < inputDim; id++) { // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * inputChannels) // IN_ELEM_COLPOS = sample const long y = id / inputHeightTimesChannel; // inputCol const long nXC = id % inputHeightTimesChannel; // channel + inputRow*inputChannels const long x = nXC / (long) inputChannels; // inputRow const long c = nXC % (long) inputChannels; // channel long x0 = 0, y0 = 0, x1 = 0, y1 = 0; if (zeroPadding) { x0 = (long) max((ElemType)0, ceil((x - (ElemType)kernelHeight + 1.0f + halfKernelHeight) / (ElemType)verticalSubsample)); // row : first wrow in which x is in x1 = (long) (x + halfKernelHeight - x0 * verticalSubsample); // first posxInKernel y0 = (long) max((ElemType)0, ceil((y - (ElemType)kernelWidth + 1.0f + halfKernelWidth) / (ElemType)horizontalSubsample)); // col : first wcol in which y is in y1 = (long) (y + halfKernelWidth - y0 * horizontalSubsample); // first posyInKernel } else { x0 = (long) max((ElemType)0, ceil((x - (ElemType)kernelHeight + 1) / (ElemType)verticalSubsample)); // row : first wrow in which x is in x1 = (long) (x - x0 * verticalSubsample); // first posxInKernel y0 = (long) max((ElemType)0, ceil((y - (ElemType)kernelWidth + 1) / (ElemType)horizontalSubsample)); // col : first wcol in which y is in y1 = (long) (y - y0 * 
horizontalSubsample); // first posyInKernel } assert(x1 >= 0 && x1 < kernelHeight && y1 >= 0 && y1 < kernelWidth); // PACK_ELEM_ROWPOS(channel, posxInKernel, posyInKernel) = (channel * kernelWidth * kernelHeight + posxInKernel + posyInKernel * kernelHeight) // PACK_ELEM_COLPOS(sample, wrow, wcol) = (sample*packedInputColsPerSample + outputHeight*wcol + wrow ElemType currentInputValue = inputSubBatch(id, sample); long packColBase = (long) (sample * packedInputColsPerSample + y0 * outputHeight); for (long wcol = y0, posyInKernel = y1; wcol < (long) outputWidth && posyInKernel >= 0; wcol++, posyInKernel -= (long) horizontalSubsample) { long packRowBase = (long) (c * kernelWidth * kernelHeight + posyInKernel * kernelHeight); for (long wrow = x0, posxInKernel = x1; wrow < (long) outputHeight && posxInKernel >= 0; wrow++, posxInKernel -= (long) verticalSubsample) { const long packRow = packRowBase + posxInKernel; const long packCol = packColBase + wrow; (*this)(packRow, packCol) = currentInputValue; } packColBase += (long) outputHeight; } } } return *this; } //assume each column is an input sample. Each sample is stored in [channel, row, col] (r00, g00, b00, r01, g01, b01, r10, g10, b10, r11, g11, b11) template CPUMatrix& CPUMatrix::UnpackConvolutionInput(CPUMatrix& inputSubBatch, const size_t inputWidth, const size_t inputHeight, const size_t inputChannels, const size_t outputWidth, const size_t outputHeight, const size_t /*outputChannels*/, const size_t kernelWidth, const size_t kernelHeight, const size_t horizontalSubsample, const size_t verticalSubsample, const bool zeroPadding) const { assert(verticalSubsample <= kernelHeight && horizontalSubsample <= kernelWidth); const size_t packedInputColsPerSample = outputWidth * outputHeight; // output size per channel const size_t inputDim = inputWidth * inputHeight * inputChannels; const size_t smallBatchSize = inputSubBatch.GetNumCols(); const long inputHeightTimesChannel = (long) (inputHeight * inputChannels); const long halfKernelWidth = (long) kernelWidth / 2; const long halfKernelHeight = (long) kernelHeight / 2; #pragma omp parallel for // each input element is copied to many places for (long sample = 0; sample < smallBatchSize; sample++) { for (long id = 0; id < inputDim; id++) { // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * inputChannels) // IN_ELEM_COLPOS = sample const long y = id / inputHeightTimesChannel; // inputCol const long nXC = id % inputHeightTimesChannel; // channel + inputRow*inputChannels const long x = nXC / (long) inputChannels; // inputRow const long c = nXC % (long) inputChannels; // channel long x0 = 0, y0 = 0, x1 = 0, y1 = 0; if (zeroPadding) { x0 = (long) max((ElemType)0, ceil((x - (ElemType) kernelHeight + 1.0f + halfKernelHeight) / (ElemType) verticalSubsample)); // row : first wrow in which x is in x1 = (long) (x + halfKernelHeight - x0 * verticalSubsample); // first posxInKernel y0 = (long) max((ElemType)0, ceil((y - (ElemType) kernelWidth + 1.0f + halfKernelWidth) / (ElemType) horizontalSubsample)); // col : first wcol in which y is in y1 = (long) (y + halfKernelWidth - y0 * horizontalSubsample); // first posyInKernel } else { x0 = (long) max((ElemType)0, ceil((x - (ElemType) kernelHeight + 1) / (ElemType) verticalSubsample)); // row : first wrow in which x is in x1 = (long) (x - x0 * verticalSubsample); // first posxInKernel y0 = (long) max((ElemType)0, ceil((y - (ElemType) kernelWidth + 1) / (ElemType) horizontalSubsample)); // col : first wcol in which y is in y1 = (long) (y - y0 * 
horizontalSubsample); // first posyInKernel } assert(x1 >= 0 && x1 < kernelHeight && y1 >= 0 && y1 < kernelWidth); // PACK_ELEM_ROWPOS(channel, posxInKernel, posyInKernel) = (channel * kernelWidth * kernelHeight + posxInKernel + posyInKernel * kernelHeight) // PACK_ELEM_COLPOS(sample, wrow, wcol) = (sample*packedInputColsPerSample + outputHeight*wcol + wrow ElemType currentInputValue = inputSubBatch(id, sample); long packColBase = (long) (sample * packedInputColsPerSample + y0 * outputHeight); for (long wcol = y0, posyInKernel = y1; wcol < (long) outputWidth && posyInKernel >= 0; wcol++, posyInKernel -= (long) horizontalSubsample) { long packRowBase = (long) (c * kernelWidth * kernelHeight + posyInKernel * kernelHeight); for (long wrow = x0, posxInKernel = x1; wrow < (long) outputHeight && posxInKernel >= 0; wrow++, posxInKernel -= (long) verticalSubsample) { const long packRow = packRowBase + posxInKernel; const long packCol = packColBase + wrow; currentInputValue += (*this)(packRow, packCol); } packColBase += (long) outputHeight; } inputSubBatch(id, sample) = currentInputValue; } } return inputSubBatch; } //assume each column is an input sample. Each sample is stored in (r00, g00, b00, r01, g01, b01, r10, g10, b10, r11, g11, b11) template CPUMatrix& CPUMatrix::AssignMaxPoolingResult(const CPUMatrix& inputBatch, const size_t channels, const size_t /*inputWidth*/, const size_t inputHeight, const size_t /*inputSizePerSample*/, const size_t /*outputWidth*/, const size_t outputHeight, const size_t outputSizePerSample, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample) { const long inputHeightTimesChannel = (long) (inputHeight * channels); const long outputHeightTimesChannel = (long) (outputHeight * channels); const size_t batchSize = inputBatch.GetNumCols(); Resize(outputSizePerSample, batchSize); // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * channels) // IN_ELEM_COLPOS = sample // OUT_ELEM_ROWPOS(channel, wrow, wcol) = (channel + (wrow + wcol * outputHeight) * channels) // OUT_ELEM_COLPOS = sample #pragma omp parallel for for (long sample = 0; sample < (long) batchSize; sample++) { for (long outputIndexWithinSample = 0; outputIndexWithinSample < outputSizePerSample; outputIndexWithinSample++) { const long y = outputIndexWithinSample / outputHeightTimesChannel; // wcol const long nXC = outputIndexWithinSample % outputHeightTimesChannel; // channel + wrow*channels const long x = (long) (nXC / channels); // wrow const long c = (long) (nXC % channels); // channel ElemType maxVal = -FLT_MAX; ElemType minVal = FLT_MAX; const long rowInWindowBase = (long) ((x * verticalSubsample + y * horizontalSubsample * inputHeight) * channels + c); for (long colInWindow = 0; colInWindow < windowWidth; colInWindow++) { long rowInInput = rowInWindowBase + colInWindow * inputHeightTimesChannel; for (long rowInWindow = 0; rowInWindow < windowHeight; rowInWindow++) { const ElemType val = inputBatch(rowInInput, sample); // pf[rowInWindow*channels]; maxVal = max(maxVal, val); minVal = min(minVal, val); rowInInput += (long) channels; } } (*this)(outputIndexWithinSample, sample) = maxVal; } } return *this; } template CPUMatrix& CPUMatrix::AddMaxPoolingGradient(const CPUMatrix& outputGradientBatch, const CPUMatrix& inputBatch, const CPUMatrix& outputBatch, const size_t channels, const size_t /*inputWidth*/, const size_t inputHeight, const size_t inputSizePerSample, const size_t outputWidth, const size_t outputHeight, const 
size_t /*outputSizePerSample*/, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample)
{
    size_t batchSize = inputBatch.GetNumCols();
    const long inputHeightTimesChannel = (long) (inputHeight * channels);
    const long outputHeightTimesChannel = (long) (outputHeight * channels);

    // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * channels)
    // IN_ELEM_COLPOS = sample
    // OUT_ELEM_ROWPOS(channel, wrow, wcol) = (channel + (wrow + wcol * outputHeight) * channels)
    // OUT_ELEM_COLPOS = sample

#pragma omp parallel for
    for (long sample = 0; sample < (long) batchSize; sample++)
    {
        for (long inputIndexWithinSample = 0; inputIndexWithinSample < (long) inputSizePerSample; inputIndexWithinSample++)
        {
            const long y = inputIndexWithinSample / inputHeightTimesChannel;   // col in input
            const long nXC = inputIndexWithinSample % inputHeightTimesChannel; // channel + row*channels
            const long x = (long) (nXC / channels);                            // row in input
            const long c = (long) (nXC % channels);                            // channel

            long startOutX = (long) max((ElemType)0, ceil((x - (ElemType) windowHeight + 1) / (ElemType) verticalSubsample));   // inclusive start
            long endOutX = (long) ((x / verticalSubsample < outputHeight - 1) ? x / verticalSubsample : outputHeight - 1);      // inclusive end
            long startOutY = (long) max((ElemType)0, ceil((y - (ElemType) windowWidth + 1) / (ElemType) horizontalSubsample));  // inclusive start
            long endOutY = (long) ((y / horizontalSubsample < outputWidth - 1) ? y / horizontalSubsample : outputWidth - 1);    // inclusive end

            ElemType inputValue = inputBatch(inputIndexWithinSample, sample);
            for (long outY = startOutY; outY <= endOutY; outY++)
            {
                for (long outX = startOutX; outX <= endOutX; outX++)
                {
                    long outputIndex = (long) (outY * outputHeightTimesChannel + outX * channels + c);
                    if (inputValue == outputBatch(outputIndex, sample))
                        (*this)(inputIndexWithinSample, sample) += outputGradientBatch(outputIndex, sample);
                }
            }
        }
    }
    return *this;
}
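
// Note on AddMaxPoolingGradient above: the output gradient is routed only to
// input positions whose value equals the pooled maximum; if several inputs in
// a window tie for the maximum, each tying position receives the full output
// gradient.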
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignAveragePoolingResult(const CPUMatrix<ElemType>& inputBatch, const size_t channels,
                                                                     const size_t /*inputWidth*/, const size_t inputHeight, const size_t /*inputSizePerSample*/,
                                                                     const size_t /*outputWidth*/, const size_t outputHeight, const size_t outputSizePerSample,
                                                                     const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample)
{
    const long inputHeightTimesChannel = (long) (inputHeight * channels);
    const long outputHeightTimesChannel = (long) (outputHeight * channels);
    const size_t batchSize = inputBatch.GetNumCols();
    const size_t windowSize = windowWidth * windowHeight;
    Resize(outputSizePerSample, batchSize);

    // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * channels)
    // IN_ELEM_COLPOS = sample
    // OUT_ELEM_ROWPOS(channel, wrow, wcol) = (channel + (wrow + wcol * outputHeight) * channels)
    // OUT_ELEM_COLPOS = sample

#pragma omp parallel for
    for (long sample = 0; sample < (long) batchSize; sample++)
    {
        for (long outputIndexWithinSample = 0; outputIndexWithinSample < (long) outputSizePerSample; outputIndexWithinSample++)
        {
            const long y = outputIndexWithinSample / outputHeightTimesChannel;   // wcol
            const long nXC = outputIndexWithinSample % outputHeightTimesChannel; // channel + wrow*channels
            const long x = (long) (nXC / channels);                              // wrow
            const long c = (long) (nXC % channels);                              // channel

            ElemType sum = 0;

            const long rowInWindowBase = (long) ((x * verticalSubsample + y * horizontalSubsample * inputHeight) * channels + c);
            for (long colInWindow = 0; colInWindow < (long) windowWidth; colInWindow++)
            {
                long rowInInput = rowInWindowBase + colInWindow * inputHeightTimesChannel;
                for (long rowInWindow = 0; rowInWindow < (long) windowHeight; rowInWindow++)
                {
                    sum += inputBatch(rowInInput, sample);
                    rowInInput += (long) channels;
                }
            }
            (*this)(outputIndexWithinSample, sample) = sum / windowSize;
        }
    }
    return *this;
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AddAveragePoolingGradient(const CPUMatrix<ElemType>& outputGradientBatch, const size_t channels,
                                                                    const size_t /*inputWidth*/, const size_t inputHeight, const size_t inputSizePerSample,
                                                                    const size_t outputWidth, const size_t outputHeight, const size_t /*outputSizePerSample*/,
                                                                    const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample)
{
    size_t batchSize = outputGradientBatch.GetNumCols();
    const long inputHeightTimesChannel = (long) (inputHeight * channels);
    const long outputHeightTimesChannel = (long) (outputHeight * channels);
    const long windowSize = (long) (windowWidth * windowHeight);

    // IN_ELEM_ROWPOS(channel, row, col) = (channel + (row + col * inputHeight) * channels)
    // IN_ELEM_COLPOS = sample
    // OUT_ELEM_ROWPOS(channel, wrow, wcol) = (channel + (wrow + wcol * outputHeight) * channels)
    // OUT_ELEM_COLPOS = sample

#pragma omp parallel for
    for (long sample = 0; sample < (long) batchSize; sample++)
    {
        for (long inputIndexWithinSample = 0; inputIndexWithinSample < (long) inputSizePerSample; inputIndexWithinSample++)
        {
            const long y = inputIndexWithinSample / inputHeightTimesChannel;   // col in input
            const long nXC = inputIndexWithinSample % inputHeightTimesChannel; // channel + row*channels
            const long x = nXC / (long) channels;                              // row in input
            const long c = nXC % (long) channels;                              // channel

            long startOutX = (long) max((ElemType)0, ceil((x - (ElemType) windowHeight + 1) / (ElemType) verticalSubsample));      // inclusive start
            long endOutX = (long) ((x / verticalSubsample < outputHeight - 1) ? x / (long) verticalSubsample : outputHeight - 1);  // inclusive end
            long startOutY = (long) max((ElemType)0, ceil((y - (ElemType) windowWidth + 1) / (ElemType) horizontalSubsample));     // inclusive start
            long endOutY = (long) ((y / horizontalSubsample < outputWidth - 1) ?
y / horizontalSubsample : outputWidth - 1); // inclusive end for (long outY = startOutY; outY <= endOutY; outY++) { for (long outX = startOutX; outX <= endOutX; outX++) { long outputIndex = outY * outputHeightTimesChannel + outX * (long) channels + c; (*this)(inputIndexWithinSample, sample) += outputGradientBatch(outputIndex, sample) / windowSize; } } } } return *this; } #pragma endregion Other Helper Functions #pragma region Static BLAS Functions /// Matrix-matrix multiply with col-major matrices (a and b may be transposed): c = alpha * op(a) * op(b) + beta*c /// Scalar /// Input matrix /// Whether matrix a is transposed /// Input matrix /// Whether matrix b is transposed /// Scalar /// Resulting matrix, user is responsible for allocating this template void CPUMatrix::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix& a, const bool transposeA, const CPUMatrix& b, const bool transposeB, ElemType beta, CPUMatrix& c) { if (a.IsEmpty() || b.IsEmpty()) return; int m, n, k, l; int lda, ldb, ldc; #ifndef USE_MKL char transA, transB; #else CBLAS_TRANSPOSE mklTransA; CBLAS_TRANSPOSE mklTransB; #endif if (transposeA) { m = (int) a.GetNumCols(); k = (int) a.GetNumRows(); lda = k; #ifndef USE_MKL transA = (char) MatrixTranspose::Trans; #else mklTransA = CBLAS_TRANSPOSE::CblasTrans; #endif } else { m = (int) a.GetNumRows(); k = (int) a.GetNumCols(); lda = m; #ifndef USE_MKL transA = (char) MatrixTranspose::NoTrans; #else mklTransA = CBLAS_TRANSPOSE::CblasNoTrans; #endif } if (transposeB) { l = (int) b.GetNumCols(); n = (int) b.GetNumRows(); ldb = n; #ifndef USE_MKL transB = (char) MatrixTranspose::Trans; #else mklTransB = CBLAS_TRANSPOSE::CblasTrans; #endif } else { l = (int) b.GetNumRows(); n = (int) b.GetNumCols(); ldb = l; #ifndef USE_MKL transB = (char) MatrixTranspose::NoTrans; #else mklTransB = CBLAS_TRANSPOSE::CblasNoTrans; #endif } assert(m > 0 && k > 0 && l > 0 && n > 0); // converting from size_t to int may cause overflow assert(k == l); if (k != l) InvalidArgument("CPUMatrix::MultiplyAndWeightedAdd : The inner dimensions of a and b must match."); if (beta == 0) c.Resize(m, n); else c.VerifySize(m, n); // Can't resize if beta != 0 ldc = (int) c.GetNumRows(); if (sizeof(ElemType) == sizeof(double)) { #ifndef USE_MKL dgemm(transA, transB, m, n, k, alpha, reinterpret_cast(a.m_pArray), lda, reinterpret_cast(b.m_pArray), ldb, beta, reinterpret_cast(c.m_pArray), ldc); #else cblas_dgemm((CBLAS_ORDER) BLAS_COLMAJOR mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast(a.m_pArray), lda, reinterpret_cast(b.m_pArray), ldb, beta, reinterpret_cast(c.m_pArray), ldc); #endif } else { #pragma warning(suppress : 4244) #ifndef USE_MKL sgemm(BLAS_COLMAJOR transA, transB, m, n, k, alpha, reinterpret_cast(a.m_pArray), lda, reinterpret_cast(b.m_pArray), ldb, beta, reinterpret_cast(c.m_pArray), ldc); #else cblas_sgemm((CBLAS_ORDER) BLAS_COLMAJOR mklTransA, mklTransB, m, n, k, alpha, reinterpret_cast(a.m_pArray), lda, reinterpret_cast(b.m_pArray), ldb, beta, reinterpret_cast(c.m_pArray), ldc); #endif } } template void CPUMatrix::Multiply1x1AndWeightedAdd(ElemType alpha, const CPUMatrix& a, const CPUMatrix& b, ElemType beta, CPUMatrix& c) { assert(a.GetNumElements() == 1); // a is a scalar ElemType f = alpha * a.Get00Element(); if (beta == 0) // don't even read the memory if beta is 0 #pragma omp parallel for foreach_coord (i, j, c) c(i, j) = b(i, j) * f; else #pragma omp parallel for foreach_coord (i, j, c) c(i, j) = b(i, j) * f + c(i, j) * beta; } /* compute singular value decomposition as A = U*SIGMA*VT W 
is used as temp working memory */ template void CPUMatrix::SVD(const CPUMatrix& A, CPUMatrix& SIGMA, CPUMatrix& U, CPUMatrix& VT, CPUMatrix& W) { if (A.IsEmpty()) LogicError("SVD: input matrix is empty."); int info; int m, n, lda, ldu, ldvt; m = (int) A.GetNumRows(); n = (int) A.GetNumCols(); W.GetNumRows(); // W is used as temp working memory lda = m; ldu = m; ldvt = n; U.Resize(m, m); SIGMA.Resize(min(m, n), 1); VT.Resize(n, n); if (sizeof(ElemType) == sizeof(double)) { #ifndef USE_MKL dgesvd('A', 'A', (int) m, (int) n, reinterpret_cast(A.m_pArray), (int) lda, reinterpret_cast(SIGMA.m_pArray), reinterpret_cast(U.m_pArray), (int) ldu, reinterpret_cast(VT.m_pArray), (int) ldvt, &info); #else double wkopt; int lwork = -1; dgesvd("All", "All", &m, &n, reinterpret_cast(A.m_pArray), &lda, reinterpret_cast(SIGMA.m_pArray), reinterpret_cast(U.m_pArray), &ldu, reinterpret_cast(VT.m_pArray), &ldvt, &wkopt, &lwork, &info); lwork = (int) wkopt; W.Resize(lwork, 1); dgesvd("All", "All", &m, &n, reinterpret_cast(A.m_pArray), &lda, reinterpret_cast(SIGMA.m_pArray), reinterpret_cast(U.m_pArray), &ldu, reinterpret_cast(VT.m_pArray), &ldvt, reinterpret_cast(W.m_pArray), &lwork, &info); #endif } else { #ifndef USE_MKL #pragma warning(suppress : 4244) sgesvd('A', 'A', (int) m, (int) n, reinterpret_cast(A.m_pArray), (int) lda, reinterpret_cast(SIGMA.m_pArray), reinterpret_cast(U.m_pArray), (int) ldu, reinterpret_cast(VT.m_pArray), (int) ldvt, &info); #else float wkopt; int lwork = -1; sgesvd("All", "All", &m, &n, reinterpret_cast(A.m_pArray), &lda, reinterpret_cast(SIGMA.m_pArray), reinterpret_cast(U.m_pArray), &ldu, reinterpret_cast(VT.m_pArray), &ldvt, &wkopt, &lwork, &info); lwork = (int) wkopt; W.Resize(lwork, 1); sgesvd("All", "All", &m, &n, reinterpret_cast(A.m_pArray), &lda, reinterpret_cast(SIGMA.m_pArray), reinterpret_cast(U.m_pArray), &ldu, reinterpret_cast(VT.m_pArray), &ldvt, reinterpret_cast(W.m_pArray), &lwork, &info); #endif } if (info > 0) { RuntimeError("The algorithm computing SVD failed to converge.\n"); } } /// Matrix-matrix multiply with col-major matrices (a and b may be transposed): c = op(a) * op(b) + c /// Input matrix /// Whether matrix a is transposed /// Input matrix /// Whether matrix b is transposed /// Resulting matrix, user is responsible for allocating this template void CPUMatrix::MultiplyAndAdd(const CPUMatrix& a, const bool transposeA, const CPUMatrix& b, const bool transposeB, CPUMatrix& c) { return CPUMatrix::MultiplyAndWeightedAdd(1.0, a, transposeA, b, transposeB, 1.0, c); } template void CPUMatrix::AssignSoftmaxSum(const CPUMatrix& softmax, CPUMatrix& c) { ElemType log_likelihood = 0.0; size_t batch_size = this->GetNumCols(); #pragma omp parallel for reduction(+ : log_likelihood) for (int instance_id = 0; instance_id < batch_size; instance_id++) { int sample = (int) (*this)(0, instance_id); log_likelihood += softmax(instance_id, sample); } c(0, 0) = -log_likelihood; } template void CPUMatrix::AssignNCEUnnormalizedEval(const CPUMatrix& a, const CPUMatrix& b, const CPUMatrix& bias, CPUMatrix& c) //this: samples+probs // a: hidden // b: embedding // tmp: softmax // c: loglikelihood { ElemType log_likelihood = 0.0; size_t batch_size = this->GetNumCols(); #pragma omp parallel for reduction(+ : log_likelihood) for (int instance_id = 0; instance_id < batch_size; instance_id++) { int sample = -(int) (*this)(0, instance_id); ElemType score = bias(sample, 0); for (int dim = 0; dim < b.GetNumRows(); dim++) score += b(dim, sample) * a(dim, instance_id); log_likelihood += score; } 
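    // log_likelihood now equals the sum over the batch of the unnormalized
    // scores bias(sample_i, 0) + b(:, sample_i)^T a(:, instance_i); its
    // negation is returned in c(0, 0) below.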
c(0, 0) = -log_likelihood; } //samples+prob gradient hidden embedding embedding/hidden //a.m_CPUMatrix->AssignNCEDerivative(*tmp.m_CPUMatrix, *a.m_CPUMatrix, *b.m_CPUMatrix, inputIndex, *c.m_CPUMatrix); template CPUMatrix& CPUMatrix::AssignNCEDerivative(const CPUMatrix& tmp, const CPUMatrix& a, const CPUMatrix& b, size_t inputIndex, CPUMatrix& c) { size_t sample_size = this->GetNumRows() / 2; size_t batch_size = this->GetNumCols(); if (inputIndex == 1) { #pragma omp parallel for for (int instance_id = 0; instance_id < batch_size; instance_id++) for (int sample_id = 0; sample_id < sample_size; sample_id++) { int sample = (int) (*this)(2 * sample_id, instance_id); for (int dim = 0; dim < b.GetNumRows(); dim++) c(dim, instance_id) -= b(dim, sample) * tmp(sample_id, instance_id); } } else if (inputIndex == 2) { int i_blocks = omp_get_num_threads() * 16; // Assume only one block in k direction. // We don't need to explicitly block in the j direction. #pragma omp parallel for for (int ib = 0; ib < i_blocks; ib++) for (int instance_id = 0; instance_id < batch_size; instance_id++) for (int sample_id = 0; sample_id < sample_size; sample_id++) { int sample = (int) (*this)(2 * sample_id, instance_id); if (sample % i_blocks == ib) for (int dim = 0; dim < b.GetNumRows(); dim++) c(dim, sample) -= a(dim, instance_id) * tmp(sample_id, instance_id); } } else { assert(inputIndex == 3); // Assume only one block in k direction. // We don't need to explicitly block in the j direction. for (int instance_id = 0; instance_id < batch_size; instance_id++) for (int sample_id = 0; sample_id < sample_size; sample_id++) { int sample = (int) (*this)(2 * sample_id, instance_id); c(0, sample) -= tmp(sample_id, instance_id); } } return *this; } template void CPUMatrix::AssignNoiseContrastiveEstimation(const CPUMatrix& a, const CPUMatrix& b, const CPUMatrix& bias, CPUMatrix& tmp, CPUMatrix& c) //this: samples+probs // a: hidden // b: embedding // tmp: softmax // c: loglikelihood { double log_likelihood = 0.0; size_t sample_size = this->GetNumRows() / 2; size_t batch_size = this->GetNumCols(); size_t num_noise_samples = sample_size - 1; double log_num_noise_samples = std::log(num_noise_samples); #pragma omp parallel for reduction(+ : log_likelihood) for (int instance_id = 0; instance_id < batch_size; instance_id++) for (int sample_id = 0; sample_id < sample_size; sample_id++) { int sample = (int) (*this)(2 * sample_id, instance_id); double score = bias(0, sample); for (int dim = 0; dim < b.GetNumRows(); dim++) score += a(dim, instance_id) * b(dim, sample); double sample_prob = -(*this)(2 * sample_id + 1, instance_id); if (sample_id == 0) sample_prob = -sample_prob; double score_noise = log_num_noise_samples + sample_prob; double z = LogAdd(score, score_noise); double logprob = score - z; double logprob_noise = score_noise - z; tmp(sample_id, instance_id) = (ElemType) -std::exp(logprob); if (sample_id == 0) tmp(sample_id, instance_id) += 1; log_likelihood += sample_id == 0 ? 
logprob : logprob_noise;
        }
    c(0, 0) = (ElemType) -log_likelihood;
}

/// Matrix-matrix multiply with col-major matrices (a and b may be transposed): c = op(a) * op(b)
/// a: Input matrix
/// transposeA: Whether matrix a is transposed
/// b: Input matrix
/// transposeB: Whether matrix b is transposed
/// c: Resulting matrix, user is responsible for allocating this
template <class ElemType>
void CPUMatrix<ElemType>::Multiply(const CPUMatrix<ElemType>& a, const bool transposeA, const CPUMatrix<ElemType>& b, const bool transposeB, CPUMatrix<ElemType>& c)
{
    return CPUMatrix<ElemType>::MultiplyAndWeightedAdd(1.0, a, transposeA, b, transposeB, 0.0, c);
}

/// Matrix-matrix multiply with col-major matrices (a and b are not transposed): c = a * b
/// a: Input matrix
/// b: Input matrix
/// c: Resulting matrix, user is responsible for allocating this
template <class ElemType>
void CPUMatrix<ElemType>::Multiply(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c)
{
    return CPUMatrix<ElemType>::MultiplyAndWeightedAdd(1.0, a, false, b, false, 0.0, c);
}

/// Matrix-scalar multiply with col-major matrices: c = alpha * a + c
/// if a is a column vector, add to all columns of c
/// if a is a row vector, add to all rows of c
/// if a is a scalar, add to all elements of c
/// alpha: Scalar
/// a: Input matrix
/// c: Resulting matrix, user is responsible for allocating this
template <class ElemType>
void CPUMatrix<ElemType>::ScaleAndAdd(ElemType alpha, const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& c)
{
    if (a.IsEmpty() || c.IsEmpty())
        LogicError("ScaleAndAdd: one of the input matrices is empty.");

    if (a.GetNumRows() != 1 && a.GetNumCols() != 1) // a is not a col or row vector
    {
        const int m = (int) a.GetNumRows();
        const int n = (int) a.GetNumCols();
        const int len = m * n;
        const int incx = 1;
        const int incy = 1;

        assert(m > 0 && n > 0 && len > 0); // converting from size_t to int may cause overflow
        assert((int) c.GetNumRows() == m && (int) c.GetNumCols() == n);
        if ((int) c.GetNumRows() != m || (int) c.GetNumCols() != n)
            InvalidArgument("Dimension of matrix c does not match dimension of matrix a.");

        if (sizeof(ElemType) == sizeof(double))
        {
#ifndef USE_MKL
            daxpy(len, alpha, reinterpret_cast<double*>(a.m_pArray), incx, reinterpret_cast<double*>(c.m_pArray), incy);
#else
            cblas_daxpy(len, alpha, reinterpret_cast<double*>(a.m_pArray), incx, reinterpret_cast<double*>(c.m_pArray), incy);
#endif
        }
        else
        {
#pragma warning(suppress : 4244)
#ifndef USE_MKL
            saxpy(len, alpha, reinterpret_cast<float*>(a.m_pArray), incx, reinterpret_cast<float*>(c.m_pArray), incy);
#else
            cblas_saxpy(len, alpha, reinterpret_cast<float*>(a.m_pArray), incx, reinterpret_cast<float*>(c.m_pArray), incy);
#endif
        }
    }
    else if (a.GetNumElements() == 1) // scalar, add to all elements
    {
        ElemType v = alpha * a(0, 0);
        long m = (long) c.GetNumRows(), n = (long) c.GetNumCols();
#pragma omp parallel for
        for (long j = 0; j < n; j++)
        {
            // four-way unrolling
            for (long i = 0; i < (m & ~3); i += 4)
            {
                c(i, j) += v;
                c(i + 1, j) += v;
                c(i + 2, j) += v;
                c(i + 3, j) += v;
            }
            // handle remaining elements
            for (long i = m & ~3; i < m; i++)
            {
                c(i, j) += v;
            }
        }
    }
    else if (a.GetNumCols() == 1) // col vector, add it to all columns
    {
        int m = (int) c.GetNumRows();
        assert(m == (int) a.GetNumRows());
        if (m != (int) a.GetNumRows())
            InvalidArgument("To add column vector, rows should match.");

        if (sizeof(ElemType) == sizeof(double))
        {
#pragma omp parallel for
            foreach_column (j, c)
            {
#ifndef USE_MKL
                daxpy(m, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + c.LocateColumn(j)), 1);
#else
                cblas_daxpy(m, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + c.LocateColumn(j)), 1);
#endif
            }
        }
        else
        {
#pragma omp parallel for
            foreach_column (j, c)
            {
#pragma warning(suppress : 4244)
#ifndef USE_MKL
                saxpy(m, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + c.LocateColumn(j)), 1);
#else
                cblas_saxpy(m, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + c.LocateColumn(j)), 1);
#endif
            }
        }
    }
    else // row vector, add it to all rows
    {
        int m = (int) c.GetNumRows();
        int n = (int) c.GetNumCols();
        assert(n == (int) a.GetNumCols());
        if (n != (int) a.GetNumCols())
            InvalidArgument("To add row vector, cols should match.");

        if (sizeof(ElemType) == sizeof(double))
        {
#pragma omp parallel for
            foreach_row (i, c)
            {
#ifndef USE_MKL
                daxpy(n, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + i), m);
#else
                cblas_daxpy(n, alpha, reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(c.m_pArray + i), m);
#endif
            }
        }
        else
        {
#pragma omp parallel for
            foreach_row (i, c)
            {
#pragma warning(suppress : 4244)
#ifndef USE_MKL
                saxpy(n, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + i), m);
#else
                cblas_saxpy(n, alpha, reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(c.m_pArray + i), m);
#endif
            }
        }
    }
}

/// c += alpha * (a-b)
/// a, b, c must have same dim
/// alpha: Scalar
/// a: Input matrix
/// b: Input matrix
/// c: Resulting matrix, user is responsible for allocating this
template <class ElemType>
void CPUMatrix<ElemType>::AddScaledDifference(const ElemType alpha, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c)
{
    assert(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() &&
           a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols());

    if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumRows() == c.GetNumRows() &&
          a.GetNumCols() == b.GetNumCols() && a.GetNumCols() == c.GetNumCols()))
    {
        InvalidArgument("AddScaledDifference: a, b, and c must have same dimension.");
    }

    if (a.IsEmpty())
        LogicError("AddScaledDifference: Input matrix a is empty.");

    long m = (long) c.GetNumElements();
#pragma omp parallel for
    // four-way unrolling
    for (long i = 0; i < (m & ~3); i += 4)
    {
        c.m_pArray[i] += alpha * (a.m_pArray[i] - b.m_pArray[i]);
        c.m_pArray[i + 1] += alpha * (a.m_pArray[i + 1] - b.m_pArray[i + 1]);
        c.m_pArray[i + 2] += alpha * (a.m_pArray[i + 2] - b.m_pArray[i + 2]);
        c.m_pArray[i + 3] += alpha * (a.m_pArray[i + 3] - b.m_pArray[i + 3]);
    }
    // handle remaining elements
    for (long i = m & ~3; i < m; i++)
    {
        c.m_pArray[i] += alpha * (a.m_pArray[i] - b.m_pArray[i]);
    }
}

/// c = alpha * (a-b)
/// a, b, c must have same dim
/// alpha: Scalar
/// a: Input matrix
/// b: Input matrix
/// c: Resulting matrix, user is responsible for allocating this
template <class ElemType>
void CPUMatrix<ElemType>::AssignScaledDifference(const ElemType alpha, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c)
{
    assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols());
    if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()))
    {
        InvalidArgument("AssignScaledDifference: a, b must have same dimension.");
    }

    if (a.IsEmpty())
        LogicError("AssignScaledDifference: Input matrix a is empty.");

    if (&c != &a && &c != &b)
        c.Resize(a.GetNumRows(), a.GetNumCols());

    long m = (long) c.GetNumElements();
#pragma omp parallel for
    // four-way unrolling
    for (long i = 0; i < (m & ~3); i += 4)
    {
        c.m_pArray[i] = alpha * (a.m_pArray[i] - b.m_pArray[i]);
        c.m_pArray[i + 1] = alpha * (a.m_pArray[i + 1] - b.m_pArray[i + 1]);
        c.m_pArray[i + 2] = alpha * (a.m_pArray[i + 2] - b.m_pArray[i + 2]);
        c.m_pArray[i + 3] = alpha * (a.m_pArray[i + 3] - b.m_pArray[i + 3]);
    }
    // handle remaining elements
    for (long i = m & ~3; i < m; i++)
    {
        c.m_pArray[i] = alpha * (a.m_pArray[i] - b.m_pArray[i]);
    }
}
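
// Semantics recap for the two functions above (illustration only): after
// AssignScaledDifference(alpha, a, b, c), c(i, j) == alpha * (a(i, j) - b(i, j))
// for all i, j; AddScaledDifference accumulates the same quantity into the
// existing contents of c.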
//c[ci,cj] += a[ai,aj]
template <class ElemType>
void CPUMatrix<ElemType>::AddElementToElement(const CPUMatrix<ElemType>& a, const size_t ai, const size_t aj, CPUMatrix<ElemType>& c, const size_t ci, const size_t cj)
{
    if (ai >= a.GetNumRows() || aj >= a.GetNumCols() ||
        ci >= c.GetNumRows() || cj >= c.GetNumCols())
        InvalidArgument("AddElementToElement: index out of range.");

    c(ci, cj) += a(ai, aj);
}

////c[ci,cj] += a[ai,aj]
//template <class ElemType>
//void CPUMatrix<ElemType>::AddLogElementToElement(const CPUMatrix<ElemType>& a, const size_t ai, const size_t aj, CPUMatrix<ElemType>& c, const size_t ci, const size_t cj)
//{
//    if (ai >= a.GetNumRows() || aj >= a.GetNumCols() ||
//        ci >= c.GetNumRows() || cj >= c.GetNumCols())
//        InvalidArgument("AddElementToElement: index out of range.");
//
//    ElemType v = a(ai, aj);
//    c(ci, cj) += ((v < EPS_IN_LOG) ? LOG_OF_EPS_IN_LOG : log(v));
//}

//c[ci,cj] = a[ai,aj]
template <class ElemType>
void CPUMatrix<ElemType>::AssignElementToElement(const CPUMatrix<ElemType>& a, const size_t ai, const size_t aj, CPUMatrix<ElemType>& c, const size_t ci, const size_t cj)
{
    if (ai >= a.GetNumRows() || aj >= a.GetNumCols() ||
        ci >= c.GetNumRows() || cj >= c.GetNumCols())
        InvalidArgument("AssignElementToElement: index out of range.");

    c(ci, cj) = a(ai, aj);
}

/// c += alpha * (a-b)
/// a, b, c must have same dim
/// alpha: 1X1 matrix
/// a: Input matrix
/// b: Input matrix
/// c: Resulting matrix, user is responsible for allocating this
template <class ElemType>
void CPUMatrix<ElemType>::AddScaledDifference(const CPUMatrix<ElemType>& alpha, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c)
{
    assert(alpha.GetNumElements() == 1);
    if (!(alpha.GetNumElements() == 1))
        InvalidArgument("AddScaledDifference: alpha must be a 1X1 matrix.");

    AddScaledDifference(alpha(0, 0), a, b, c);
}

/// c = alpha * (a-b)
/// a, b, c must have same dim
/// alpha: 1X1 matrix
/// a: Input matrix
/// b: Input matrix
/// c: Resulting matrix, user is responsible for allocating this
template <class ElemType>
void CPUMatrix<ElemType>::AssignScaledDifference(const CPUMatrix<ElemType>& alpha, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c)
{
    assert(alpha.GetNumElements() == 1);
    if (!(alpha.GetNumElements() == 1))
        InvalidArgument("AssignScaledDifference: alpha must be a 1X1 matrix.");

    AssignScaledDifference(alpha(0, 0), a, b, c);
}

/// Matrix-scalar multiply with col-major matrices: c = alpha * a
/// alpha: Scalar
/// a: Input matrix
/// c: Resulting matrix, user is responsible for allocating this
template <class ElemType>
void CPUMatrix<ElemType>::Scale(ElemType alpha, const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& c)
{
    if (a.IsEmpty())
        LogicError("Scale: Input matrix a is empty.");

    const int m = (int) a.GetNumRows();
    const int n = (int) a.GetNumCols();
    assert(m > 0 && n > 0); // converting from size_t to int may cause overflow
    c.Resize(m, n);

    long size = (long) c.GetNumElements();
#pragma omp parallel for
    // four-way unrolling
    for (long i = 0; i < (size & ~3); i += 4)
    {
        c.m_pArray[i] = alpha * a.m_pArray[i];
        c.m_pArray[i + 1] = alpha * a.m_pArray[i + 1];
        c.m_pArray[i + 2] = alpha * a.m_pArray[i + 2];
        c.m_pArray[i + 3] = alpha * a.m_pArray[i + 3];
    }
    // handle remaining elements
    for (long i = size & ~3; i < size; i++)
    {
        c.m_pArray[i] = alpha * a.m_pArray[i];
    }
}

/// Matrix-scalar multiply with col-major matrices: a = alpha * a
/// alpha: Scalar
/// a: Input matrix
template <class ElemType>
void CPUMatrix<ElemType>::Scale(ElemType alpha, CPUMatrix<ElemType>& a)
{
    if (a.IsEmpty())
        LogicError("Scale: Input matrix a is empty.");

    const int m = (int) a.GetNumRows();
    const int n = (int) a.GetNumCols();
    const int len = m * n;
    const int incx = 1;
    assert(m > 0 && n > 0 && len > 0); // converting from size_t to int may cause overflow

    if (sizeof(ElemType) == sizeof(double))
    {
#ifndef USE_MKL
        dscal(len, alpha, reinterpret_cast<double*>(a.m_pArray), incx);
#else
        cblas_dscal(len, alpha,
reinterpret_cast(a.m_pArray), incx); #endif } else { #pragma warning(suppress : 4244) #ifndef USE_MKL sscal(len, alpha, reinterpret_cast(a.m_pArray), incx); #else cblas_sscal(len, alpha, reinterpret_cast(a.m_pArray), incx); #endif } } /// Matrix multiply with col-major matrices: a = alpha[1,1] * a /// 1x1 matrix /// Input matrix template void CPUMatrix::Scale(CPUMatrix alpha, CPUMatrix& a) { if (a.IsEmpty()) LogicError("Scale: Input matrix a is empty."); if (alpha.GetNumElements() != 1) LogicError("Matrix alpha must be 1x1"); CPUMatrix::Scale(alpha(0, 0), a); } template void CPUMatrix::InnerProduct(const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& c, const bool isColWise) { if (a.IsEmpty() || b.IsEmpty()) LogicError("InnerProduct: one of the input matrices is empty."); const int m = (int) a.GetNumRows(); const int n = (int) a.GetNumCols(); const int k = (int) b.GetNumRows(); const int l = (int) b.GetNumCols(); assert(m > 0 && n > 0 && k > 0 && l > 0); // converting from size_t to int may cause overflow assert(m == k && n == l); // converting from size_t to int may cause overflow if (m != k || n != l) InvalidArgument("InnerProduct: Matrices a and b should have same dimension."); if ((isColWise && m == 1) || !isColWise && n == 1) // in this case it's equivalent to element-wise product { c.AssignElementProductOf(a, b); } else if (isColWise) // col-wise { c.Resize(1, n); if (sizeof(ElemType) == sizeof(double)) { #pragma omp parallel for foreach_column (j, c) { #ifndef USE_MKL c(0, j) = (ElemType) ddot(m, reinterpret_cast(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast(b.m_pArray + b.LocateColumn(j)), 1); #else c(0, j) = (ElemType) cblas_ddot(m, reinterpret_cast(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast(b.m_pArray + b.LocateColumn(j)), 1); #endif } } else { #pragma omp parallel for foreach_column (j, c) { #pragma warning(suppress : 4244) #ifndef USE_MKL c(0, j) = (ElemType) sdot(m, reinterpret_cast(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast(b.m_pArray + b.LocateColumn(j)), 1); #else c(0, j) = (ElemType) cblas_sdot(m, reinterpret_cast(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast(b.m_pArray + b.LocateColumn(j)), 1); #endif } } } else { c.Resize(m, 1); if (sizeof(ElemType) == sizeof(double)) { #pragma omp parallel for foreach_row (i, c) { #ifndef USE_MKL c(i, 0) = ddot(n, reinterpret_cast(a.m_pArray + i), m, reinterpret_cast(b.m_pArray + i), m); #else c(i, 0) = cblas_ddot(n, reinterpret_cast(a.m_pArray + i), m, reinterpret_cast(b.m_pArray + i), m); #endif } } else { #pragma omp parallel for foreach_row (i, c) { #pragma warning(suppress : 4244) #ifndef USE_MKL c(i, 0) = sdot(n, reinterpret_cast(a.m_pArray + i), m, reinterpret_cast(b.m_pArray + i), m); #else c(i, 0) = cblas_sdot(n, reinterpret_cast(a.m_pArray + i), m, reinterpret_cast(b.m_pArray + i), m); #endif } } } } // treat matrices as vectors. 
// treat matrices as vectors: do vec(a)^T vec(b)
template <class ElemType>
ElemType CPUMatrix<ElemType>::InnerProductOfMatrices(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b)
{
    if (a.IsEmpty() || b.IsEmpty())
        LogicError("InnerProductOfMatrices: one of the input matrices is empty.");

    const int m = (int) a.GetNumRows();
    const int n = (int) a.GetNumCols();
    const int k = (int) b.GetNumRows();
    const int l = (int) b.GetNumCols();

    assert(m > 0 && n > 0 && k > 0 && l > 0); // converting from size_t to int may cause overflow
    assert(m == k && n == l);                 // converting from size_t to int may cause overflow
    if (m != k || n != l)
        InvalidArgument("InnerProductOfMatrices: Matrices a and b should have same dimension.");

    if (sizeof(ElemType) == sizeof(double))
    {
#ifndef USE_MKL
        return (ElemType) ddot((int) a.GetNumElements(), reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(b.m_pArray), 1);
#else
        return (ElemType) cblas_ddot((int) a.GetNumElements(), reinterpret_cast<double*>(a.m_pArray), 1, reinterpret_cast<double*>(b.m_pArray), 1);
#endif
    }
    else
    {
#pragma warning(suppress : 4244)
#ifndef USE_MKL
        return (ElemType) sdot((int) a.GetNumElements(), reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(b.m_pArray), 1);
#else
        return (ElemType) cblas_sdot((int) a.GetNumElements(), reinterpret_cast<float*>(a.m_pArray), 1, reinterpret_cast<float*>(b.m_pArray), 1);
#endif
    }
}

template <class ElemType>
void CPUMatrix<ElemType>::ElementWisePower(ElemType alpha, const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& c)
{
    if (a.IsEmpty())
        LogicError("ElementWisePower: The input matrix a is empty.");

    c.Resize(a.GetNumRows(), a.GetNumCols());

    if (alpha == 2)
    {
#pragma omp parallel for
        foreach_coord (i, j, c)
        {
            c(i, j) = a(i, j) * a(i, j);
        }
    }
    else if (alpha == 3)
    {
#pragma omp parallel for
        foreach_coord (i, j, c)
        {
            c(i, j) = a(i, j) * a(i, j) * a(i, j);
        }
    }
    else
    {
#pragma omp parallel for
        foreach_coord (i, j, c)
        {
            c(i, j) = pow(a(i, j), alpha);
        }
    }
}

template <class ElemType>
bool CPUMatrix<ElemType>::AreEqual(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const ElemType threshold /*= 1e-8*/)
{
    if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
        return false;

    bool result = true;
#pragma omp parallel for
    foreach_coord (i, j, a)
    {
        if (abs(a(i, j) - b(i, j)) > threshold)
        {
            result = false;
            break;
        }
    }

    return result;
}

// see Matrix<ElemType>::TensorShuffleScaleAndAdd() for comments
template <class ElemType>
void CPUMatrix<ElemType>::TensorShuffleScaleAndAdd(ElemType keepWeight, const CPUMatrix<ElemType>& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c)
{
    size_t N = D * S * M * K * T;
    const ElemType* pa = a.m_pArray;
    const ElemType* pb = b.m_pArray;
    ElemType* pc = c.m_pArray;
    // Note: This code is written to match a GPU implementation. It is not super-efficient on the CPU.
    for (size_t na = 0; na < N; na++) // loop over all elements
    {
        // recover the 5 indices from the loop counter
        size_t d = na % D;
        size_t s = (na / D) % S;
        size_t m = (na / D / S) % M;
        size_t k = (na / D / S / M) % K;
        size_t t = (na / D / S / M / K) % T;
        // compute index for the a and b/c tensors
        assert(na == (((t * K + k) * M + m) * S + s) * D + d); // input tensor of dimension (D x S x M x K x T)
        size_t nb = (((t * S + s) * M + m) * K + k) * D + d;   // output tensor of dimension (D x K x M x S x T): k/K and s/S swapped
        assert(nb < N);
        // perform the computation
        ElemType cval = keepWeight ? keepWeight * pb[nb] : 0; // if weight is 0 then don't bother to read memory (efficiency) or to multiply (NaN-safe)
        cval += scaleFactor * pa[na];
        pc[nb] = cval;
    }
}
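// ------------------------------------------------------------------------
// Illustration only, kept out of the build: the index arithmetic inside
// TensorShuffleScaleAndAdd() above. A linear index na over a
// (D x S x M x K x T) tensor is decomposed into its five coordinates, then
// recomposed with the roles of s/S and k/K swapped to address the
// (D x K x M x S x T) output. The helper name is an assumption.
#if 0
static size_t ShuffleIndexSketch(size_t na, size_t D, size_t S, size_t M, size_t K, size_t T)
{
    size_t d = na % D;
    size_t s = (na / D) % S;
    size_t m = (na / D / S) % M;
    size_t k = (na / D / S / M) % K;
    size_t t = (na / D / S / M / K) % T;
    return (((t * S + s) * M + m) * K + k) * D + d; // k and s swapped vs. the input layout
}
#endif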
template <class ElemType>
CPUMatrix<ElemType> CPUMatrix<ElemType>::Ones(const size_t rows, const size_t cols)
{
    CPUMatrix<ElemType> c(rows, cols); // will initialize to 0
    c.SetValue(1);
    return c;
}

template <class ElemType>
CPUMatrix<ElemType> CPUMatrix<ElemType>::Zeros(const size_t rows, const size_t cols)
{
    CPUMatrix<ElemType> c(rows, cols); // will initialize to 0
    c.SetValue(0);
    return c;
}

template <class ElemType>
CPUMatrix<ElemType> CPUMatrix<ElemType>::Eye(const size_t rows)
{
    CPUMatrix<ElemType> c(rows, rows); // will initialize to 0
    c.SetDiagonalValue(1);
    return c;
}

template <class ElemType>
CPUMatrix<ElemType> CPUMatrix<ElemType>::RandomUniform(const size_t rows, const size_t cols, const ElemType low, const ElemType high, unsigned long seed)
{
    CPUMatrix<ElemType> c(rows, cols); // will initialize to 0
    c.SetUniformRandomValue(low, high, seed);
    return c;
}

template <class ElemType>
CPUMatrix<ElemType> CPUMatrix<ElemType>::RandomGaussian(const size_t rows, const size_t cols, const ElemType mean, const ElemType sigma, unsigned long seed)
{
    CPUMatrix<ElemType> c(rows, cols); // will initialize to 0
    c.SetGaussianRandomValue(mean, sigma, seed);
    return c;
}

template <class ElemType>
bool CPUMatrix<ElemType>::HasElement(const CPUMatrix<ElemType>& mat, const ElemType v)
{
    bool bHas = false;

    bool isvFinite = std::isfinite(v);
#pragma omp parallel for
    for (long j = 0; j < (long) mat.GetNumElements(); j++)
    {
#pragma omp flush(bHas)
        if (!bHas)
        {
            ElemType cur = mat.m_pArray[j];
            if (isvFinite && std::isfinite(cur))
            {
                if (cur == v)
                    bHas = true;
            }
            else if (std::isnan(v) && std::isnan(cur))
                bHas = true;
            else if (std::isinf(v) && std::isinf(cur) && std::signbit(v) == std::signbit(cur))
                bHas = true;
        }
    }

    return bHas;
}
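// ------------------------------------------------------------------------
// Illustration only, kept out of the build: the value-matching rule that
// HasElement() above applies per element. Finite values compare with ==,
// NaN matches NaN, and infinities must also agree in sign. The helper name
// is an assumption.
#if 0
static bool HasElementMatchSketch(float cur, float v)
{
    if (std::isfinite(v) && std::isfinite(cur))
        return cur == v;
    if (std::isnan(v) && std::isnan(cur))
        return true;
    return std::isinf(v) && std::isinf(cur) && std::signbit(v) == std::signbit(cur);
}
#endif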
// CPUMatrix<ElemType>& AssignElementProductOfWithShiftNeg(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, size_t shift, size_t negnumber);
//[this]=a .* b
// here, a and b must be two row vectors of the same size, i.e. [1,m]
// the inputs are two row vectors
// the output is a matrix of size (negnumber + 1, cols)
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignElementProductOfWithShiftNeg(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, size_t shift, size_t negnumber)
{
    if (a.IsEmpty() || b.IsEmpty())
        LogicError("AssignElementProductOfWithShiftNeg: Matrix is empty.");

    assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols());
    if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()))
        InvalidArgument("AssignElementProductOfWithShiftNeg: The input matrix dimensions do not match.");

    if (a.GetNumRows() != 1)
        InvalidArgument("AssignElementProductOfWithShiftNeg: The input matrix must be a row vector.");

    auto& us = *this;
    if (this != &a)
    {
        Resize(negnumber + 1, a.GetNumCols());
        // Resize(a.GetNumRows(), a.GetNumCols());
    }

    long m = (long) GetNumRows(), n = (long) GetNumCols(); // a and b are of size (1,n)
    // #pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        us(0, j) = a(0, j) * b(0, j);
    }
    for (long j = 0; j < n; j++)
    {
        for (long i = 1; i < m; i++)
        {
            us(i, j) = a(0, j) * b(0, (j + shift + i - 1) % n);
        }
    }

    return *this;
}
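// ------------------------------------------------------------------------
// Illustration only, kept out of the build: reference semantics of
// AssignElementProductOfWithShiftNeg() above for row vectors of length n.
// Row 0 is the aligned element product; row i > 0 pairs a(0, j) with b
// shifted by (shift + i - 1), wrapping around. The sketch writes into a
// row-major scratch buffer for brevity, unlike the column-major CPUMatrix
// layout, and the helper name is an assumption.
#if 0
static void ShiftNegProductSketch(const float* a, const float* b, float* out,
                                  size_t n, size_t shift, size_t negnumber)
{
    for (size_t j = 0; j < n; j++)
        out[j] = a[j] * b[j]; // row 0: aligned product
    for (size_t i = 1; i <= negnumber; i++)
        for (size_t j = 0; j < n; j++)
            out[i * n + j] = a[j] * b[(j + shift + i - 1) % n]; // shifted rows
}
#endif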
template <class ElemType>
void CPUMatrix<ElemType>::InnerProductWithShiftNeg(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c, const bool isColWise, size_t shift, size_t negnumber)
{
    if (a.IsEmpty() || b.IsEmpty())
        LogicError("InnerProductWithShiftNeg: one of the input matrices is empty.");

    const int m = (int) a.GetNumRows();
    const int n = (int) a.GetNumCols();
    const int k = (int) b.GetNumRows();
    const int l = (int) b.GetNumCols();

    assert(m > 0 && n > 0 && k > 0 && l > 0); // converting from size_t to int may cause overflow
    assert(m == k && n == l);                 // converting from size_t to int may cause overflow
    if (m != k || n != l)
        InvalidArgument("InnerProductWithShiftNeg: Matrices a and b should have same dimension.");

    if ((isColWise && m == 1) || (!isColWise && n == 1)) // in this case it's equivalent to element-wise product
    {
        InvalidArgument("InnerProductWithShiftNeg: Both inputs should be full matrices, not vectors.");
        // c.AssignElementProductOf(a, b);
    }
    else if (isColWise) // col-wise
    {
        c.Resize(negnumber + 1, n); // note: (negnumber + 1) rows, unlike plain InnerProduct

        if (sizeof(ElemType) == sizeof(double))
        {
            for (long j = 0; j < n; j++)
            {
#ifndef USE_MKL
                c(0, j) = (ElemType) ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn(j)), 1);
#else
                c(0, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn(j)), 1);
#endif
            }
            for (long j = 0; j < n; j++)
            {
                for (long i = 1; i < (long) negnumber + 1; i++)
                {
#ifndef USE_MKL
                    c(i, j) = (ElemType) ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
#else
                    c(i, j) = (ElemType) cblas_ddot(m, reinterpret_cast<double*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<double*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
#endif
                }
            }
        }
        else
        {
            for (long j = 0; j < n; j++)
            {
#ifndef USE_MKL
                c(0, j) = (ElemType) sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn(j)), 1);
#else
                c(0, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn(j)), 1);
#endif
            }
            for (long j = 0; j < n; j++)
            {
                for (long i = 1; i < (long) negnumber + 1; i++)
                {
#ifndef USE_MKL
                    c(i, j) = (ElemType) sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
#else
                    c(i, j) = (ElemType) cblas_sdot(m, reinterpret_cast<float*>(a.m_pArray + a.LocateColumn(j)), 1, reinterpret_cast<float*>(b.m_pArray + b.LocateColumn((j + shift + i - 1) % n)), 1);
#endif
                }
            }
        }
    }
    else
    {
        InvalidArgument("InnerProductWithShiftNeg: row-wise is not supported yet.");
        // (unreachable: row-wise path kept below for reference)
        c.Resize(m, 1);

        if (sizeof(ElemType) == sizeof(double))
        {
#pragma omp parallel for
            foreach_row (i, c)
            {
#ifndef USE_MKL
                c(i, 0) = (ElemType) ddot(n, reinterpret_cast<double*>(a.m_pArray + i), m, reinterpret_cast<double*>(b.m_pArray + i), m);
#else
                c(i, 0) = (ElemType) cblas_ddot(n, reinterpret_cast<double*>(a.m_pArray + i), m, reinterpret_cast<double*>(b.m_pArray + i), m);
#endif
            }
        }
        else
        {
#pragma omp parallel for
            foreach_row (i, c)
            {
#pragma warning(suppress : 4244)
#ifndef USE_MKL
                c(i, 0) = sdot(n, reinterpret_cast<float*>(a.m_pArray + i), m, reinterpret_cast<float*>(b.m_pArray + i), m);
#else
                c(i, 0) = cblas_sdot(n, reinterpret_cast<float*>(a.m_pArray + i), m, reinterpret_cast<float*>(b.m_pArray + i), m);
#endif
            }
        }
    }
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::GetARowByIndex(const CPUMatrix<ElemType>& a, size_t index)
{
    if (a.IsEmpty())
        LogicError("GetARowByIndex: the input matrix is empty.");

    const int m = (int) a.GetNumRows();
    const int n = (int) a.GetNumCols();

    if (index >= (size_t) m)
        LogicError("GetARowByIndex: the row index is out of range.");

    assert(m > 0 && n > 0); // converting from size_t to int may cause overflow

    auto& us = *this;
    this->Resize(1, n);
    for (long j = 0; j < n; j++)
    {
        us(0, j) = a(index, j);
    }

    return *this;
}

// input: a, a row vector
// input: b, a matrix. b.col == a.col
// input firstmatrixfixed: If true, keep a's order. Otherwise, keep b's order
// output: c, a matrix. c.size == b.size
/*
   Example:
     a = [a1 a2 a3]
     b = [b11 b12 b13; b21 b22 b23]

   if true:
     shift = 1: c = [a1*b12 a2*b13 a3*b11; a1*b22 a2*b23 a3*b21]
     shift = 2: c = [a1*b13 a2*b11 a3*b12; a1*b23 a2*b21 a3*b22]
   i.e., we do column-wise shift of b

   if false:
     shift = 1: c = [a2*b11 a3*b12 a1*b13; a2*b21 a3*b22 a1*b23]
     shift = 2: c = [a3*b11 a1*b12 a2*b13; a3*b21 a1*b22 a2*b23]
*/
template <class ElemType>
void CPUMatrix<ElemType>::ConductRowElementMultiplyWithShift(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& c, size_t shift, bool bFirstmatrixfixed)
{
    if (a.IsEmpty() || b.IsEmpty())
        LogicError("ConductRowElementMultiplyWithShift: one of the input matrices is empty.");

    const int m = (int) a.GetNumRows();
    const int n = (int) a.GetNumCols();
    const int k = (int) b.GetNumRows();
    const int l = (int) b.GetNumCols();

    assert(m > 0 && n > 0 && k > 0 && l > 0); // converting from size_t to int may cause overflow
    assert(m == 1 && n == l);                 // converting from size_t to int may cause overflow
    if (m != 1 || n != l)
        InvalidArgument("ConductRowElementMultiplyWithShift: a must be a row vector with the same number of columns as b.");

    c.Resize(k, l); // c must be the same size as b

    if (bFirstmatrixfixed)
    {
        for (long j = 0; j < l; j++)
        {
            for (long i = 0; i < k; i++)
            {
                c(i, j) = a(0, j) * b(i, (j + shift) % l);
            }
        }
    }
    else
    {
        for (long j = 0; j < l; j++)
        {
            for (long i = 0; i < k; i++)
            {
                c(i, j) = a(0, (j + shift) % l) * b(i, j);
            }
        }
    }
}
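// ------------------------------------------------------------------------
// Illustration only, kept out of the build: reference semantics of
// ConductRowElementMultiplyWithShift() above on raw column-major buffers,
// covering both settings of bFirstmatrixfixed. The helper name and raw
// buffers are assumptions of this sketch.
#if 0
static void RowShiftMultiplySketch(const float* a, const float* b, float* c,
                                   size_t k, size_t l, size_t shift, bool firstFixed)
{
    for (size_t j = 0; j < l; j++)
        for (size_t i = 0; i < k; i++)
            c[j * k + i] = firstFixed
                               ? a[j] * b[((j + shift) % l) * k + i] // keep a's order, shift b's columns
                               : a[(j + shift) % l] * b[j * k + i];  // keep b's order, shift a
}
#endif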
// CPUMatrix<ElemType>& AssignElementProductOfWithShift(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, size_t shift);
//[this]=a .* b
// here, a and b must be two row vectors of the same size, i.e. [1,m]. We will do element product with shift.
// the inputs are two row vectors
// the output is a row vector
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignElementProductOfWithShift(const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, size_t shift)
{
    if (a.IsEmpty() || b.IsEmpty())
        LogicError("AssignElementProductOfWithShift: Matrix is empty.");

    assert(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols());
    if (!(a.GetNumRows() == b.GetNumRows() && a.GetNumCols() == b.GetNumCols()))
        InvalidArgument("AssignElementProductOfWithShift: The input matrix dimensions do not match.");

    if (a.GetNumRows() != 1)
        InvalidArgument("AssignElementProductOfWithShift: The input matrix must be a row vector.");

    auto& us = *this;
    if (this != &a)
    {
        Resize(1, a.GetNumCols());
        // Resize(a.GetNumRows(), a.GetNumCols());
    }

    // long m = (long)GetNumRows(), n = (long)GetNumCols(); // a and b are of size (1,n)
    long n = (long) GetNumCols(); // a and b are of size (1,n)
#pragma omp parallel for
    for (long j = 0; j < n; j++)
    {
        us(0, j) = a(0, j) * b(0, (j + shift) % n);
    }

    return *this;
}

#pragma endregion Static BLAS Functions

// 'double' version of LogAdd
double LogAddD(double x, double y)
{
    return LogAdd(x, y);
}

template <class ElemType>
ElemType CPUMatrix<ElemType>::LogAddSumOfElements() const
{
    ElemType fAlpha = (ElemType) LZERO;
    for (int k = 0; k < (int) GetNumElements(); k++)
        fAlpha = (ElemType) LogAddD(fAlpha, m_pArray[k]);
    return fAlpha;
}

template <class ElemType>
void CPUMatrix<ElemType>::RCRFBackwardCompute(const CPUMatrix<ElemType>& alpha, CPUMatrix<ElemType>& beta,
                                              const CPUMatrix<ElemType>& lbls,
                                              const CPUMatrix<ElemType>& pair_scores)
{
    int iNumPos = (int) lbls.GetNumCols();
    int iNumLab = (int) lbls.GetNumRows();

    int lastLbl = -1; // (currently unused)
    for (int ik = 0; ik < (int) lbls.GetNumRows(); ik++)
        if (lbls(ik, iNumPos - 1) != 0)
        {
            lastLbl = ik;
            break;
        }

    beta.Resize(iNumLab, iNumPos);

    for (int t = iNumPos - 1; t >= 0; t--)
    {
#pragma omp parallel for
        for (int k = 0; k < iNumLab; k++)
        {
            _rcrfBackwardCompute(t, k, alpha, beta, pair_scores);
        }
    }
}

/// the kernel function for RCRF backward computation
template <class ElemType>
void CPUMatrix<ElemType>::_rcrfBackwardCompute(size_t t, size_t k, const CPUMatrix<ElemType>& alpha,
                                               CPUMatrix<ElemType>& beta,
                                               const CPUMatrix<ElemType>& pair_scores)
{
    size_t iNumLab = alpha.GetNumRows();
    size_t iNumPos = alpha.GetNumCols();

    ElemType fSum;
    ElemType fTmp = (ElemType) LZERO;
    if (t == iNumPos - 1)
    {
        fSum = (ElemType) LZERO;
        for (int j = 0; j < (int) iNumLab; j++)
        {
            fSum = (ElemType) LogAddD(fSum, alpha(j, t));
        }

        fTmp = alpha(k, t) - fSum;
        beta(k, t) = fTmp;
    }
    else
    {
        for (int j = 0; j < (int) iNumLab; j++)
        {
            fSum = (ElemType) LZERO;
            for (int m = 0; m < (int) iNumLab; m++)
            {
                fSum = (ElemType) LogAddD(fSum, alpha(m, t) + pair_scores(j, m));
            }

            fTmp = (ElemType) LogAddD(fTmp, beta(j, t + 1) + alpha(k, t) + pair_scores(j, k) - fSum);
        }
        beta(k, t) = fTmp;
    }
}
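// ------------------------------------------------------------------------
// Illustration only, kept out of the build: the LogAdd accumulation used by
// LogAddSumOfElements() and the RCRF routines above computes a numerically
// stable log(sum_k exp(x_k)). The helper below is a sketch; the -1e30
// initial value stands in for LZERO ("log of zero") and is an assumption.
#if 0
static double LogSumExpSketch(const double* x, size_t n)
{
    double acc = -1e30; // stand-in for LZERO
    for (size_t k = 0; k < n; k++)
    {
        // LogAdd(acc, x[k]): factor out the larger exponent to avoid overflow
        double hi = (acc > x[k]) ? acc : x[k];
        double lo = (acc > x[k]) ? x[k] : acc;
        acc = hi + log1p(exp(lo - hi));
    }
    return acc;
}
#endif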
template <class ElemType>
void CPUMatrix<ElemType>::RCRFTransGrdCompute(const CPUMatrix<ElemType>& lbls,
                                              const CPUMatrix<ElemType>& alpha,
                                              const CPUMatrix<ElemType>& beta,
                                              const CPUMatrix<ElemType>& pair_scores,
                                              CPUMatrix<ElemType>& grd)
{
    int iNumPos = (int) alpha.GetNumCols();
    int iNumLab = (int) alpha.GetNumRows();

    int firstLbl = -1;
    for (int ik = 0; ik < (int) lbls.GetNumRows(); ik++)
        if (lbls(ik, 0) != 0)
        {
            firstLbl = ik;
            break;
        }

    for (size_t tPos = 0; tPos < (size_t) iNumPos; tPos++)
    {
        CPUMatrix<ElemType> b = beta.ColumnSlice(tPos, 1);
        CPUMatrix<ElemType> a;
        if (tPos > 0)
            a = alpha.ColumnSlice(tPos - 1, 1);

#pragma omp parallel for
        for (int i = 0; i < iNumLab; i++)
        {
            _rcrfTransGrdCompute(i, lbls, alpha, beta, pair_scores, grd, tPos);
        }

        // transition score
        int i = -1;
        if (tPos == 0)
            i = firstLbl;
        else
        {
            for (int ik = 0; ik < (int) lbls.GetNumRows(); ik++)
                if (lbls(ik, tPos - 1) != 0)
                {
                    i = ik;
                    break;
                }
        }

        int j = -1;
        for (int ik = 0; ik < (int) lbls.GetNumRows(); ik++)
        {
            if (lbls(ik, tPos) != 0)
            {
                j = ik;
                break;
            }
        }

        grd(j, i) -= 1.0;
    }
}

template <class ElemType>
void CPUMatrix<ElemType>::_rcrfTransGrdCompute(size_t i,
                                               const CPUMatrix<ElemType>& lbls,
                                               const CPUMatrix<ElemType>& alpha,
                                               const CPUMatrix<ElemType>& beta,
                                               const CPUMatrix<ElemType>& pair_scores,
                                               CPUMatrix<ElemType>& grd,
                                               const size_t tPos // position
                                               )
{
    int iNumLab = (int) alpha.GetNumRows();

    int firstLbl = -1;
    for (int ik = 0; ik < (int) lbls.GetNumRows(); ik++)
        if (lbls(ik, 0) != 0)
        {
            firstLbl = ik;
            break;
        }

    CPUMatrix<ElemType> b = beta.ColumnSlice(tPos, 1);
    CPUMatrix<ElemType> a;
    if (tPos > 0)
        a = alpha.ColumnSlice(tPos - 1, 1);

    {
        ElemType fTmp = (ElemType) LZERO;
        for (int j = 0; j < iNumLab; j++)
        {
            if (tPos == 0)
            {
                if (i == firstLbl)
                    fTmp = 0;
                else
                    fTmp = (ElemType) LZERO;
            }
            else
            {
                fTmp = a(i, 0);
            }
            fTmp += pair_scores(j, i);

            ElemType fSum = (ElemType) LZERO;
            for (int k = 0; k < iNumLab; k++)
            {
                ElemType fTmp2;
                if (tPos == 0)
                {
                    if (k == firstLbl)
                        fTmp2 = 0;
                    else
                        fTmp2 = (ElemType) LZERO;
                }
                else
                {
                    fTmp2 = a(k, 0);
                }
                fSum = (ElemType) LogAddD(fSum, fTmp2 + pair_scores(j, k));
            }

            fTmp -= fSum;
            fTmp += b(j, 0);

            grd(j, i) += exp(fTmp);
        }
    }
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::DropFrame(const CPUMatrix<ElemType>& label, const CPUMatrix<ElemType>& gamma, const ElemType& threshold)
{
    auto& us = *this;
    if (us.GetNumCols() != gamma.GetNumCols() || us.GetNumRows() != gamma.GetNumRows())
        LogicError("DropFrame: target matrix is not the same size as the gamma matrix.");

#pragma omp parallel for
    foreach_column (j, label)
    {
        bool dropframe = false;
        foreach_row (i, label)
        {
            if (fabs(label(i, j) - 1.0f) < 0.1)
            {
                if (gamma(i, j) < threshold)
                    dropframe = true;
                break;
            }
        }

        if (dropframe) // only zero out the column when the frame is actually dropped
        {
            foreach_row (i, label)
            {
                us(i, j) = 0.0f;
            }
        }
    }

    return *this;
}

template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignSequenceError(const ElemType hsmoothingWeight, const CPUMatrix<ElemType>& label,
                                                              const CPUMatrix<ElemType>& dnnoutput, const CPUMatrix<ElemType>& gamma, ElemType alpha)
{
    auto& us = *this;
    foreach_coord (i, j, us)
        us(i, j) += alpha * (label(i, j) - (1 - hsmoothingWeight) * dnnoutput(i, j) - hsmoothingWeight * gamma(i, j));
    return *this;
}

// note: this function does not depend on the ElemType template parameter
template <class ElemType>
int CPUMatrix<ElemType>::SetNumThreads(int numThreads)
{
    if (numThreads == 0) // use default
        return numThreads;

    int mthreads = (int) std::thread::hardware_concurrency();

    if (numThreads <= 0)
        numThreads = max(1, mthreads + numThreads);
    if (numThreads > mthreads)
        numThreads = mthreads;

#ifdef _OPENMP
    omp_set_num_threads(numThreads);
    numThreads = omp_get_max_threads();

#ifndef USE_MKL
    acmlsetnumthreads(numThreads);
#else
    mkl_set_num_threads(numThreads);
#endif
#endif
    return numThreads;
}

// =======================================================================
// TensorView support
// =======================================================================

// To save time, this makes extensive use of templates and macros.

// -----------------------------------------------------------------------
// function to compute the value for a given output location (perform reduction if needed)
// -----------------------------------------------------------------------

// perform loop over reduction index m
// This function is declared inside a wrapper struct to allow partial specialization (m = -1).
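// ------------------------------------------------------------------------
// Illustration only, kept out of the build: a minimal sketch of the
// partial-specialization idiom used by TensorOpReduction below. The primary
// template recurses on m; the m = -1 specialization terminates the
// compile-time recursion, mirroring how the reduction loop bottoms out.
// NestedLoopSketch is a hypothetical name, not part of the real code.
#if 0
template <int m>
struct NestedLoopSketch
{
    static int Depth() { return 1 + NestedLoopSketch<m - 1>::Depth(); }
};
template <>
struct NestedLoopSketch<-1> // recursion terminator, like TensorOpReduction's m = -1
{
    static int Depth() { return 0; }
};
// NestedLoopSketch<2>::Depth() == 3
#endif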
template struct TensorOpReduction { // reduction case (non-reduction case is specialized) static inline ElemType Loop(array pointers, const OPFN& opfn, const SmallVector& reducingOpDims, const array, N>& reducingStrides) { array strides; // N-1 because last one is the result pointer, which is unused in reduction for (size_t i = 0; i < N - 1; i++) // N = a small constant, this will be unrolled strides[i] = reducingStrides[i][(size_t) m]; double /*ElemType*/ aggregate = 0; for (size_t dim = reducingOpDims[(size_t) m]; dim-- > 0;) { // need to descend into one loop deeper aggregate += TensorOpReduction::Loop(pointers, opfn, reducingOpDims, reducingStrides); // advance the pointers for (size_t i = 0; i < N - 1; i++) pointers[i] += strides[i]; // note: last pointer (result) is unused and untouched here } return (ElemType) aggregate; } }; // perform loop over reduction index m // This is the specialized version for m = -1, which terminates the recursion. template struct TensorOpReduction { static inline ElemType Loop(array pointers, const OPFN& opfn, const SmallVector&, const array, N>&) { return opfn(pointers); // finally we are doing some work!!! } }; // ----------------------------------------------------------------------- // perform loop over regular index k for N-nary operations (N counting the output) // ----------------------------------------------------------------------- // perform loop over regular index k and reducing index m for N operands (counting the output) template struct TensorOpIteration { static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN& opfn, const SmallVector& regularOpDims, const array, N>& regularStrides, const SmallVector& reducingOpDims, const array, N>& reducingStrides) { // non-scalar case: still nested result loops left array strides; for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled strides[i] = regularStrides[i][(size_t) k]; for (size_t dim = regularOpDims[(size_t) k]; dim-- > 0;) { // need to descend into one loop deeper TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); // advance the pointers for (size_t i = 0; i < N; i++) pointers[i] += strides[i]; } } }; // Special version for innermost loop with strides all being 1 and no further reduction. Compiler can use SSE. // This is a very common case, e.g. adding vectors or computing the Sigmoid. 
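// ------------------------------------------------------------------------
// Illustration only, kept out of the build: the shape of the case the
// specialization below targets. With unit strides in every operand and no
// further reduction, the innermost loop degenerates to a dense element-wise
// loop that the compiler can auto-vectorize (SSE/AVX). The helper name is
// an assumption of this sketch.
#if 0
static void AddVectorsSketch(const float* a, const float* b, float* c, size_t K)
{
#pragma omp parallel for
    for (int k = 0; k < (int) K; k++)
        c[k] = a[k] + b[k]; // unit stride in every operand
}
#endif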
template struct TensorOpIteration { static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN& opfn, const SmallVector& regularOpDims, const array, 3>& regularStrides, const SmallVector& reducingOpDims, const array, 3>& reducingStrides) { ElemType* pa = pointers[0]; ElemType* pb = pointers[1]; ElemType* pc = pointers[2]; size_t K = regularOpDims[0]; // special-case beta and alpha to allow the compiler to short-circuit it if (beta != 0) #pragma omp parallel for for (int k = 0; k < (int) K; k++) TensorOpIteration::Loop(beta, array{pa + k, pb + k, pc + k}, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); else if (alpha != 1) #pragma omp parallel for for (int k = 0; k < (int) K; k++) TensorOpIteration::Loop(0, array{pa + k, pb + k, pc + k}, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); else #pragma omp parallel for for (int k = 0; k < (int) K; k++) TensorOpIteration::Loop(0, array{pa + k, pb + k, pc + k}, 1, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); // TODO: According to Amit, the VS compiler is not able to vectorize into lambdas. Solution: change the lambda to take an N, or to implement the loop inside (with 1 element by default). // TODO: The signedness of k (required for omp) causes an extra sign-extend. // TODO: OMP adds LOTS of overhead. Do we need a guard, a min size when to use it? } }; // and unary template struct TensorOpIteration { static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN& opfn, const SmallVector& regularOpDims, const array, 2>& regularStrides, const SmallVector& reducingOpDims, const array, 2>& reducingStrides) { ElemType* pa = pointers[0]; ElemType* pb = pointers[1]; size_t K = regularOpDims[0]; // special-case beta and alpha to allow the compiler to short-circuit it if (beta != 0) #pragma omp parallel for for (int k = 0; k < (int) K; k++) TensorOpIteration::Loop(beta, array{pa + k, pb + k}, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); else if (alpha != 1) #pragma omp parallel for for (int k = 0; k < (int) K; k++) TensorOpIteration::Loop(0, array{pa + k, pb + k}, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); else #pragma omp parallel for for (int k = 0; k < (int) K; k++) TensorOpIteration::Loop(0, array{pa + k, pb + k}, 1, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } }; template struct TensorOpIteration { static inline void Loop(ElemType beta, array pointers, ElemType alpha, const OPFN& opfn, const SmallVector&, const array, N>&, const SmallVector& reducingOpDims, const array, N>& reducingStrides) { // we are at element level for the result: perform the op (there may still be reduction) ElemType val = TensorOpReduction::Loop(pointers, opfn, reducingOpDims, reducingStrides); // scale val *= alpha; // combine with previous value in target matrix, then write it out auto* pout = pointers.back(); if (beta != 0) val += beta * *pout; // save *pout = val; return; } }; // ----------------------------------------------------------------------- // map runtime parameters N to template parameters // ----------------------------------------------------------------------- // tensor operation with k+1 dimensions (-1 means scalar) template static void TensorOpWithRegularLoop(ElemType beta, const array& pointers, ElemType alpha, const OPFN& opfn, const SmallVector& regularOpDims, const array, N>& regularStrides, const SmallVector& reducingOpDims, const 
array, N>& reducingStrides) { size_t dims = reducingOpDims.size(); switch (dims) { case 2: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); case 1: return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); case 0: { // if all leading dimensions are 1, we can let the compiler do some unrolling bool leadingAllOne = true; for (size_t i = 0; i < N; i++) leadingAllOne &= k >= 0 && regularStrides[i][0] == 1; if (leadingAllOne) // special version that uses a hard-coded increment of 1 for all leading dimensions return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); else return TensorOpIteration::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } default: LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (int) dims); } } // tensor operation, generalized in number of arguments, operation already provided as a lambda // This function now expands into different k. template static void TensorOpWithFn(ElemType beta, array pointers, ElemType alpha, const OPFN& opfn, const array& offsets, const SmallVector& regularOpDims, const array, N>& regularStrides, const SmallVector& reducingOpDims, const array, N>& reducingStrides) { for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled pointers[i] += offsets[i]; size_t dims = regularOpDims.size(); switch (dims) { case 4: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); case 3: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); case 2: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); case 1: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); case 0: return TensorOpWithRegularLoop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides); default: LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int) dims); } } // ----------------------------------------------------------------------- // entry points from Matrix.cpp; also map op to a lambda // ----------------------------------------------------------------------- // perform unary operation 'op' on a giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides // This maps 'op' to a lambda. template void CPUMatrix::TensorOp(ElemType beta, const CPUMatrix& a, ElemType alpha, ElementWiseOperator op, const array& offsets, const SmallVector& regularOpDims, const array, 2>& regularStrides, const SmallVector& reducingOpDims, const array, 2>& reducingStrides) { // TODO: Change the lambda to take a pointer and a number of elements, so that we can pass it 1 or 4 elements, in order for it to SSE-vectorize. 
#define CaseUnaryTensorOp(oper) \ case ElementWiseOperator::op##oper: \ return TensorOpWithFn(beta, pointers, alpha, [](const array& pp) \ { \ return Op##oper((*(pp[0]))); \ }, \ offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides) array pointers = {a.m_pArray, m_pArray}; switch (op) { ForAllUnaryOps(CaseUnaryTensorOp); default: LogicError("TensorUnaryOp: Unknown op code %d.", (int) op); } } // perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides // This maps 'op' to a lambda. template void CPUMatrix::TensorOp(ElemType beta, const CPUMatrix& a, const CPUMatrix& b, ElemType alpha, ElementWiseOperator op, const array& offsets, const SmallVector& regularOpDims, const array, 3>& regularStrides, const SmallVector& reducingOpDims, const array, 3>& reducingStrides) { #define CaseBinaryTensorOp(oper) \ case ElementWiseOperator::op##oper: \ return TensorOpWithFn(beta, pointers, alpha, [](const array& pp) \ { \ return Op##oper((*(pp[0])), (*(pp[1]))); \ }, \ offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides) array pointers = {a.m_pArray, b.m_pArray, m_pArray}; switch (op) { ForAllBinaryOps(CaseBinaryTensorOp); default: LogicError("TensorBinaryOp: Unknown op code %d.", (int) op); } } // perform ternary operation 'op' on a, and c giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides // This maps 'op' to a lambda. template void CPUMatrix::TensorOp(ElemType beta, const CPUMatrix& a, const CPUMatrix& b, const CPUMatrix& c, ElemType alpha, ElementWiseOperator op, const array& offsets, const SmallVector& regularOpDims, const array, 4>& regularStrides, const SmallVector& reducingOpDims, const array, 4>& reducingStrides) { #define CaseTernaryTensorOp(oper) \ case ElementWiseOperator::op##oper: \ return TensorOpWithFn(beta, pointers, alpha, [](const array& pp) \ { \ return Op##oper((*(pp[0])), (*(pp[1])), (*(pp[2]))); \ }, \ offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides) array pointers = {a.m_pArray, b.m_pArray, c.m_pArray, m_pArray}; switch (op) { ForAllTernaryOps(CaseTernaryTensorOp); default: LogicError("TensorTernaryOp: Unknown op code %d.", (int) op); } } // ======================================================================= // explicit instantiations // ======================================================================= template class MATH_API CPUMatrix; template class MATH_API CPUMatrix; // We use Matrix as the backing store for QuantizedMatrix // Let's explicitly instantiate the methods we need for that purpose template CPUMatrix::CPUMatrix(const size_t numRows, const size_t numCols); template CPUMatrix::CPUMatrix(const size_t numRows, const size_t numCols, char* pArray, const size_t matrixFlags); template CPUMatrix::CPUMatrix(); template CPUMatrix::CPUMatrix(CPUMatrix const&); template CPUMatrix::CPUMatrix(CPUMatrix&&); template size_t CPUMatrix::LocateElement(size_t, size_t) const; template CPUMatrix::~CPUMatrix(); template CPUMatrix CPUMatrix::ColumnSlice(size_t startColumn, size_t numCols) const; template CPUMatrix& CPUMatrix::operator=(CPUMatrix&&); template void CPUMatrix::SetValue(const char); template void CPUMatrix::SetValue(const size_t numRows, const size_t numCols, char* pArray, size_t matrixFlags); template void CPUMatrix::SetValue(CPUMatrix const&); template void CPUMatrix::Resize(const size_t numRows, const size_t numCols, bool growOnly); } } }