https://github.com/Microsoft/CNTK
Tip revision: e9162e18e1c51b88f5c92ac6eb6623943eae353e authored by Amit Agarwal on 08 May 2016, 01:37:50 UTC
Synchronize batch norm nodes' running means and variances across parallel workers at the end of each epoch to ensure accurate restartability across epochs, and add an end-to-end test for this scenario
CPUSparseMatrix.h
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include <stdio.h>
#include "CPUMatrix.h"
//#include "GPUMatrix.h"
//#include "GPUSparseMatrix.h"
#include <map>
#include <unordered_map>
#ifdef _WIN32
#ifdef MATH_EXPORTS
#define MATH_API __declspec(dllexport)
#else
#define MATH_API __declspec(dllimport)
#endif
#endif /* Linux - already defined in CPUMatrix.h */
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
class MATH_API CPUSparseMatrix : public BaseMatrix<ElemType>
{
typedef BaseMatrix<ElemType> Base;
using Base::m_numRows;
using Base::m_numCols;
using Base::m_sliceViewOffset;
using Base::SetBuffer;
using Base::HasExternalBuffer;
using Base::GetNumStorageRows;
using Base::SetNumStorageRows;
using Base::GetNumStorageCols;
using Base::SetNumStorageCols;
using Base::SetComputeDeviceId;
using Base::SetSizeAllocated;
using Base::GetSizeAllocated;
using Base::GetCompIndex;
using Base::SetCompIndex;
using Base::GetUnCompIndex;
using Base::SetUnCompIndex;
using Base::GetCompIndexSize;
using Base::SetCompIndexSize;
using Base::GetColIdx;
using Base::SetColIdx;
using Base::GetBlockSize;
using Base::SetBlockSize;
using Base::GetBlockIds;
using Base::SetBlockIds;
using Base::GetBlockIdShift;
using Base::SetBlockIdShift;
using Base::ZeroInit;
using Base::ZeroValues;
using Base::m_sob;
using Base::ShallowCopyFrom;
using Base::VerifyResizable;
public:
using Base::VerifyWritable;
using Base::GetComputeDeviceId;
using Base::Buffer;
using Base::GetNumRows;
using Base::GetNumCols;
using Base::GetNumElements;
using Base::OwnBuffer;
using Base::GetFormat;
using Base::SetFormat;
using Base::IsEmpty;
private:
void ZeroInit();
void CheckInit(const MatrixFormat format);
public:
explicit CPUSparseMatrix(const MatrixFormat format);
CPUSparseMatrix(const MatrixFormat format, const size_t numRows, const size_t numCols, const size_t size);
CPUSparseMatrix(const CPUSparseMatrix<ElemType>& deepCopyFrom); // copy constructor, deep copy
CPUSparseMatrix<ElemType>& operator=(const CPUSparseMatrix<ElemType>& deepCopyFrom); // assignment operator, deep copy
CPUSparseMatrix(CPUSparseMatrix<ElemType>&& moveFrom); // move constructor, shallow copy
CPUSparseMatrix<ElemType>& operator=(CPUSparseMatrix<ElemType>&& moveFrom); // move assignment operator, shallow copy
~CPUSparseMatrix();
public:
void SetValue(const size_t row, const size_t col, ElemType val);
//void SetValue(const CPUMatrix<ElemType>& /*val*/);
//void SetValue(const GPUMatrix<ElemType>& /*val*/);
void SetValue(const CPUSparseMatrix<ElemType>& /*val*/);
//void SetValue(const GPUSparseMatrix<ElemType>& /*val*/);
void MaskColumnsValue(const CPUMatrix<char>& columnsMask, ElemType val);
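// Presumably sets every element of each column whose corresponding entry in columnsMask is 0 to val
// (the dense CPUMatrix counterpart is used to blank out gaps in minibatch layouts); this is a hedged
// reading of the signature, not a documented contract in this header.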
size_t BufferSize() const
{
return GetSizeAllocated() * sizeof(ElemType);
}
ElemType* Data() const;
ElemType* Data();
inline size_t GetNumElemAllocated() const
{
return GetSizeAllocated();
}
CPUSparseMatrix<ElemType> ColumnSlice(size_t startColumn, size_t numCols) const;
CPUMatrix<ElemType> CopyColumnSliceToDense(size_t startColumn, size_t numCols) const;
void AssignColumnSliceToDense(CPUMatrix<ElemType>& slice, size_t startColumn, size_t numCols) const;
CPUMatrix<ElemType> DiagonalToDense() const;
void SetGaussianRandomValue(const ElemType /*mean*/, const ElemType /*sigma*/, unsigned long /*seed*/)
{
NOT_IMPLEMENTED;
}
void SetMatrixFromCSCFormat(const CPUSPARSE_INDEX_TYPE* h_CSCCol, const CPUSPARSE_INDEX_TYPE* h_Row, const ElemType* h_Val,
const size_t nz, const size_t numRows, const size_t numCols);
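// Usage sketch (illustrative, not part of the original header; the values are hypothetical):
// populating a 4x3 CSC matrix. In CSC layout, h_CSCCol holds numCols+1 start offsets into
// h_Row/h_Val, h_Row holds the row index of each stored value, and h_Val holds the values:
//
//     CPUSparseMatrix<float> m(matrixFormatSparseCSC);
//     CPUSPARSE_INDEX_TYPE cols[] = { 0, 2, 3, 5 };    // column k's values live in [cols[k], cols[k+1])
//     CPUSPARSE_INDEX_TYPE rows[] = { 0, 2, 1, 0, 3 }; // row index of each stored value
//     float vals[]                = { 1, 2, 3, 4, 5 };
//     m.SetMatrixFromCSCFormat(cols, rows, vals, /*nz=*/5, /*numRows=*/4, /*numCols=*/3);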
static void MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix<ElemType>& lhs, const bool transposeA,
const CPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, CPUMatrix<ElemType>& c);
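// The name and signature suggest GEMM-style semantics, c = alpha * op(lhs) * op(rhs) + beta * c,
// with op(.) the optional transpose selected by transposeA/transposeB; stated here as a reading
// of the interface rather than as documentation from the original header.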
static void MultiplyAndAdd(ElemType alpha, const CPUMatrix<ElemType>& lhs, const bool transposeA,
const CPUSparseMatrix<ElemType>& rhs, const bool transposeB, CPUSparseMatrix<ElemType>& c);
static void ScaleAndAdd(const ElemType alpha, const CPUSparseMatrix<ElemType>& lhs, CPUMatrix<ElemType>& c);
static bool AreEqual(const CPUSparseMatrix<ElemType>& a, const CPUSparseMatrix<ElemType>& b, const ElemType threshold = 1e-8);
// sum(vec(a).*vec(b))
static ElemType InnerProductOfMatrices(const CPUSparseMatrix<ElemType>& /*a*/, const CPUMatrix<ElemType>& /*b*/)
{
NOT_IMPLEMENTED;
}
static void AddScaledDifference(const ElemType /*alpha*/, const CPUSparseMatrix<ElemType>& /*a*/, const CPUMatrix<ElemType>& /*b*/, CPUMatrix<ElemType>& /*c*/,
bool /*bDefaultZero*/)
{
NOT_IMPLEMENTED;
}
static void AddScaledDifference(const ElemType /*alpha*/, const CPUMatrix<ElemType>& /*a*/, const CPUSparseMatrix<ElemType>& /*b*/, CPUMatrix<ElemType>& /*c*/,
bool /*bDefaultZero*/)
{
NOT_IMPLEMENTED;
}
int GetComputeDeviceId() const
{
return -1;
}
// Allocate actually allocates the storage space for numNZElemToReserve elements. This is different from resizing, which changes the dimensions of the underlying matrix.
// Unfortunately, numRows/numCols still need to be passed in for certain matrix formats (e.g., SparseCSC), because some of the allocated dimensions depend on the
// dimensions of the matrix.
void Allocate(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve = 10000, const bool growOnly = true, bool keepExistingValues = false); // matrix format will affect the size to allocate
// RequireSizeAndAllocate is required by SparseMatrix since resizing the dimensions and allocating storage are different operations. Since a Resize can entail changing
// the MatrixFormat, we offer an overload for that case.
void RequireSizeAndAllocate(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const MatrixFormat matrixFormat, const bool growOnly = true, bool keepExistingValues = true); // matrix format will affect the size to allocate
// Otherwise we will just use the current MatrixFormat.
void RequireSizeAndAllocate(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve = 10000, const bool growOnly = true, bool keepExistingValues = false);
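// Usage sketch (illustrative assumption, not from the original header): the common pattern is a
// single RequireSizeAndAllocate call, which sets the dimensions and reserves storage for the
// expected number of nonzeros in one step:
//
//     CPUSparseMatrix<float> m(matrixFormatSparseCSC);
//     m.RequireSizeAndAllocate(/*numRows=*/1000, /*numCols=*/500, /*numNZElemToReserve=*/20000);
//
// which should behave like RequireSize(1000, 500) followed by Allocate(1000, 500, 20000).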
// Sparse matrix RequireSize is similar to dense matrix RequireSize in that it will only allocate the minimum amount of storage required to successfully create the matrix.
// This is required because some formats (e.g., SparseCSC) require the SecondaryIndexLocation to have valid data in order to compute m_nz. Otherwise this method would not
// update the storage at all.
void RequireSize(const size_t numRows, const size_t numCols, const MatrixFormat format, const bool growOnly = true);
// Allows RequireSize to be called without modifying the MatrixFormat.
void RequireSize(const size_t numRows, const size_t numCols, const bool growOnly = true);
// Resizes the dimensions of the underlying sparse matrix object. Since the caller may have a hint for m_nz, we allow that to be passed, but this is terrible design. In the
// future if we want better separation between allocation and resizing, this interface should be updated to be less clunky.
void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const MatrixFormat matrixFormat, const bool growOnly = true); // matrix format will affect the size to allocate
void Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve = 10000, const bool growOnly = true);
void Reset();
const ElemType operator()(const size_t row, const size_t col) const
{
if (col >= m_numCols || row >= m_numRows)
{
RuntimeError("Position outside matrix dimensions");
}
if (GetFormat() == MatrixFormat::matrixFormatSparseCSC)
{
size_t start = SecondaryIndexLocation()[col];
size_t end = SecondaryIndexLocation()[col + 1];
for (size_t p = start; p < end; p++)
{
size_t i = MajorIndexLocation()[p];
if (i == row)
{
return ((ElemType*)Buffer())[p];
}
}
return 0;
}
else
{
NOT_IMPLEMENTED;
}
}
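// Read-access sketch (illustrative): as the implementation above shows, a CSC lookup is a linear
// scan of the stored row indices within the requested column, so a single read costs
// O(nonzeros in that column) and absent entries read as 0:
//
//     float v = m(2, 0); // scans column 0's run of MajorIndexLocation() for row 2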
public:
void NormalGrad(CPUMatrix<ElemType>& c, const ElemType momentum);
ElemType Adagrad(CPUMatrix<ElemType>& c, const bool needAveMultiplier);
public:
CPUSparseMatrix<ElemType>& InplaceTruncateTop(const ElemType threshold);
CPUSparseMatrix<ElemType>& InplaceTruncateBottom(const ElemType threshold);
CPUSparseMatrix<ElemType>& InplaceTruncate(const ElemType threshold);
CPUSparseMatrix<ElemType>& InplaceSoftThreshold(const ElemType threshold);
ElemType FrobeniusNorm() const; // useful for comparing CPU and GPU results
ElemType SumOfAbsElements() const; // sum of all abs(elements)
ElemType SumOfElements() const; // sum of all elements
public:
void Print(const char* matrixName, ptrdiff_t rowStart, ptrdiff_t rowEnd, ptrdiff_t colStart, ptrdiff_t colEnd) const;
void Print(const char* matrixName = NULL) const; // print whole matrix. can be expensive
public:
const ElemType* NzValues() const
{
return Data();
}
inline ElemType* NzValues()
{
return Data();
}
size_t NzCount() const
{
if (GetFormat() == matrixFormatSparseCSC)
return GetCompIndex()[GetNumCols()] - GetCompIndex()[0];
else if (GetFormat() == matrixFormatSparseCSR)
return GetCompIndex()[GetNumRows()] - GetCompIndex()[0];
else if (GetFormat() == matrixFormatSparseBlockCol)
return GetBlockSize() * GetNumRows();
else
NOT_IMPLEMENTED;
}
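// Worked example (illustrative): with the CSC compressed column index { 0, 2, 3, 5 } over
// 3 columns, NzCount() = GetCompIndex()[3] - GetCompIndex()[0] = 5, i.e. the number of stored
// values; the block-column count above is instead derived from the number of stored column
// blocks times the row count.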
size_t NzSize() const
{
return sizeof(ElemType) * NzCount();
} // actual number of element bytes in use
CPUSPARSE_INDEX_TYPE* MajorIndexLocation() const
{
return GetUnCompIndex() + GetCompIndex()[m_sliceViewOffset];
} // this is the major index, row/col ids in CSC/CSR format
size_t MajorIndexCount() const
{
return NzCount();
}
size_t MajorIndexSize() const
{
return sizeof(CPUSPARSE_INDEX_TYPE) * MajorIndexCount();
} // actual number of major index bytes in use
CPUSPARSE_INDEX_TYPE* SecondaryIndexLocation() const
{
return GetCompIndex() + m_sliceViewOffset;
} // this is the compressed index, col/row in CSC/CSR format
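// Restating the two accessors above: in CSC the secondary index is the per-column array of start
// offsets and the major index holds row ids; in CSR the roles swap, which is what the
// RowLocation()/ColLocation() helpers below hide from callers.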
size_t SecondaryIndexCount() const
{
if (GetFormat() & matrixFormatCompressed)
{
size_t cnt = (GetFormat() & matrixFormatRowMajor) ? m_numRows : m_numCols;
if (cnt > 0)
cnt++; // add an extra element on the end for the "max" value
return cnt;
}
else
return NzCount(); // COO format
}
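// Worked example (illustrative): a 4x3 CSC matrix is compressed along columns, so
// SecondaryIndexCount() = numCols + 1 = 4 (one start offset per column plus the trailing
// one-past-the-end entry); the same matrix stored as COO would return NzCount() instead.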
// get size for compressed index
size_t SecondaryIndexSize() const
{
return (SecondaryIndexCount()) * sizeof(CPUSPARSE_INDEX_TYPE);
}
// the column and row locations will swap based on what format we are in. Full index always follows the data array
CPUSPARSE_INDEX_TYPE* RowLocation() const
{
return (GetFormat() & matrixFormatRowMajor) ? SecondaryIndexLocation() : MajorIndexLocation();
}
size_t RowSize() const
{
return (GetFormat() & matrixFormatRowMajor) ? SecondaryIndexSize() : MajorIndexSize();
}
CPUSPARSE_INDEX_TYPE* ColLocation() const
{
return (GetFormat() & matrixFormatRowMajor) ? MajorIndexLocation() : SecondaryIndexLocation();
}
size_t ColSize() const
{
return (GetFormat() & matrixFormatRowMajor) ? MajorIndexSize() : SecondaryIndexSize();
} // actual number of bytes in use
};
typedef CPUSparseMatrix<float> CPUSingleSparseMatrix;
typedef CPUSparseMatrix<double> CPUDoubleSparseMatrix;
} } }