https://github.com/Microsoft/CNTK
MatrixPool.h
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include <string>
#include <stdexcept>
#include <vector>
#include <set>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <algorithm>
#include <stdlib.h>
#include "Basics.h"
#include "Matrix.h"
#include "ComputationNode.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
struct MemRequestInfo
{
DEVICEID_TYPE deviceId; // which device to allocate the data on
std::vector<shared_ptr<Matrix<ElemType>>*> pMatrixPtrs; // memory pointers
size_t matrixSize; // memory size
bool mbScale; // whether the memory shall be scaled by minibatch size
bool isWorkSpace; // whether this is workspace memory; by workspace we mean memory that will be released very shortly after allocation
int allocStep; // at what step counter memory allocation is requested
int releaseStep; // at what step counter memory release is requested
int memoryId; // ID of the shared memory buffer assigned to this request
MemRequestInfo(DEVICEID_TYPE deviceId, shared_ptr<Matrix<ElemType>>*pMatrixPtr, size_t matrixSize, bool mbScale, bool isWorkSpace, int allocStep)
:deviceId(deviceId), matrixSize(matrixSize), mbScale(mbScale), isWorkSpace(isWorkSpace), allocStep(allocStep), releaseStep(INT_MAX), memoryId(-1)
{
pMatrixPtrs.push_back(pMatrixPtr);
}
void SetReleaseStep(int step) { releaseStep = step; }
void SetMemoryId(int id) { memoryId = id; }
};
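// For illustration (made-up step numbers): a request recorded at step 3 whose owner calls
// RequestRelease at step 7 ends up with allocStep = 3 and releaseStep = 7, i.e. it needs its
// buffer only during the step interval [3, 7]; two requests may share one buffer only if their
// intervals do not overlap.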
template <class ElemType>
struct greater_than_mem_req_size
{
inline bool operator() (const MemRequestInfo<ElemType>& info1, const MemRequestInfo<ElemType>& info2)
{
return (info1.matrixSize > info2.matrixSize);
}
};
struct MemAllocInfo
{
int memoryId;
size_t memorySize;
vector<pair<int, int>> occupancy;
MemAllocInfo(int memoryId, size_t memorySize, vector<pair<int, int>> occ)
:memoryId(memoryId), memorySize(memorySize), occupancy(occ)
{
}
};
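// For illustration (hypothetical values): occupancy = { {3, 7}, {10, 12} } means this physical
// buffer is claimed by one request during steps 3..7 and by another during steps 10..12; a further
// request can reuse the buffer only if its own [allocStep, releaseStep] interval avoids both.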
// MatrixPool -- class to support memory sharing
// Despite the rather general name of this class, it is specifically designed to support the memory sharing of ComputationNodes.
// Note: see #define SUPRESS_MEMSHARING below for how to temporarily disable memory sharing altogether, for debugging
class MatrixPool
{
public:
typedef const void* AliasNodePtr; // used as an identifier in place of ComputationNodeBasePtr to avoid include-order issues
protected:
vector<MemRequestInfo<float>> m_memRequestInfoFloatVec;
vector<MemRequestInfo<double>> m_memRequestInfoDoubleVec;
set<DEVICEID_TYPE> m_deviceIDSet;
int m_stepCounter;
template <class ElemType>
vector<MemRequestInfo<ElemType>>& GetMemRequestInfoVec();
// MatrixPool allows a group of nodes to share one matrix
struct AliasInfo
{
void* pMatrixPtr;
size_t totalCount;
size_t releaseCount;
AliasInfo(size_t total = 0)
: pMatrixPtr(nullptr), totalCount(total), releaseCount(0)
{
}
};
unordered_map<AliasNodePtr, AliasInfo> m_aliasGroups;
unordered_map<AliasNodePtr, AliasNodePtr> m_aliasLookup;
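// For illustration (hypothetical nodes): if nodes B and C alias node A's output, SetAliasInfo
// would be called with groupMap = { A -> {B, C} } and rootLookupMap = { B -> A, C -> A }; the
// group then shares a single matrix, allocated on the group's first RequestAliasedAllocate and
// released once every member has called RequestAliasedRelease.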
public:
void Reset()
{
m_stepCounter = 0;
m_aliasGroups.clear();
m_aliasLookup.clear();
}
template <class ElemType>
MemRequestInfo<ElemType>* GetMemInfo(shared_ptr<Matrix<ElemType>> *pMatrixPtr)
{
vector<MemRequestInfo<ElemType>>& memInfoVec = GetMemRequestInfoVec<ElemType>();
// iterate through the vector and find the entry that owns this matrix pointer
for (auto& memInfo : memInfoVec)
{
if (memInfo.pMatrixPtrs[0] == pMatrixPtr)
return &memInfo;
}
return nullptr;
}
template <class ElemType>
void RequestRelease(shared_ptr<Matrix<ElemType>> *pMatrixPtr)
{
auto memInfo = GetMemInfo(pMatrixPtr);
if (memInfo != nullptr)
{
memInfo->SetReleaseStep(m_stepCounter);
}
m_stepCounter++;
}
// isWorkSpace is a flag indicating that a memory block is temporary and will be released very shortly. In the current implementation, all workspace
// memories have their own pool. This design has proven useful for the workspace memory in convolution.
// matrixSize is an estimate of the required memory to be allocated. Note that we don't allocate any memory at the time of request. Instead, a
// global memory allocation optimization is run later to improve memory efficiency.
// mbScale is another flag indicating whether the size of the memory will scale with the minibatch size. Unfortunately, at the time of the memory
// request and pointer assignment, we don't know the minibatch size. Thus our memory sharing algorithm is sub-optimal.
template <class ElemType>
void RequestAllocate(DEVICEID_TYPE deviceId, shared_ptr<Matrix<ElemType>>*pMatrixPtr, size_t matrixSize, bool mbScale, bool isWorkSpace)
{
vector<MemRequestInfo<ElemType>>& memInfoVec = GetMemRequestInfoVec<ElemType>();
MemRequestInfo<ElemType> memInfo(deviceId, pMatrixPtr, matrixSize, mbScale, isWorkSpace, m_stepCounter);
memInfoVec.push_back(memInfo);
m_deviceIDSet.insert(deviceId);
m_stepCounter++;
// assign a temporary pointer; it will be replaced later unless the matrix is sparse
*pMatrixPtr = make_shared<Matrix<ElemType>>(deviceId);
}
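// A minimal usage sketch (hypothetical caller; the real callers are ComputationNodes during
// memory-sharing planning): the pool only records requests here and hands out placeholder
// matrices; the real, shared buffers are assigned later by OptimizedMemoryAllocation().
//
//   MatrixPool pool;
//   pool.Reset();
//   shared_ptr<Matrix<float>> pOutput;
//   pool.RequestAllocate<float>(/*deviceId*/ 0, &pOutput, /*matrixSize*/ 1024 * 128, /*mbScale*/ true, /*isWorkSpace*/ false);
//   // ... later, once the last reader of pOutput has been scheduled:
//   pool.RequestRelease<float>(&pOutput);
//   // after all requests for the whole graph have been recorded:
//   pool.OptimizedMemoryAllocation();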
void OptimizedMemoryAllocation()
{
// MatrixPool is not templated, so we call both float and double versions here
OptimizedMemoryAllocationFunc<float>();
OptimizedMemoryAllocationFunc<double>();
return;
}
void SetAliasInfo(
const unordered_map<AliasNodePtr, unordered_set<AliasNodePtr>>& groupMap,
const unordered_map<AliasNodePtr, AliasNodePtr>& rootLookupMap)
{
m_aliasLookup.clear();
for (const auto& pair : groupMap)
{
m_aliasGroups.insert(std::make_pair(pair.first, AliasInfo(pair.second.size())));
for (const auto& child : pair.second)
{
if (rootLookupMap.find(child) == rootLookupMap.end())
InvalidArgument("group nodes should be in lookupMap");
}
}
for (const auto& pair : rootLookupMap)
{
if (groupMap.find(pair.second) == groupMap.end())
InvalidArgument("lookup root should be group key");
}
m_aliasLookup = rootLookupMap;
}
template <class ElemType>
void RequestAliasedRelease(AliasNodePtr node)
{
const auto iter = m_aliasLookup.find(node);
if (iter == m_aliasLookup.end())
LogicError("node not aliased");
auto parent = iter->second;
auto& aliasInfo = m_aliasGroups[parent];
if (aliasInfo.pMatrixPtr == nullptr)
LogicError("double releasing aliased matrix, or releasing before any allocation for the matrix");
if (aliasInfo.releaseCount >= aliasInfo.totalCount)
LogicError("number of alias instances exceeded expectation");
aliasInfo.releaseCount++;
if (aliasInfo.releaseCount == aliasInfo.totalCount)
{
RequestRelease((shared_ptr<Matrix<ElemType>>*)aliasInfo.pMatrixPtr);
aliasInfo.pMatrixPtr = nullptr;
}
}
template <class ElemType>
void RequestAliasedAllocate(DEVICEID_TYPE deviceId, AliasNodePtr node, shared_ptr<Matrix<ElemType>>*pMatrixPtr, size_t matrixSize, bool mbScale)
{
const auto iter = m_aliasLookup.find(node);
if (iter == m_aliasLookup.end())
LogicError("node not aliased");
auto parent = iter->second;
auto& aliasInfo = m_aliasGroups[parent];
if (aliasInfo.pMatrixPtr == nullptr)
{
// first allocation for the group
aliasInfo.pMatrixPtr = pMatrixPtr;
RequestAllocate(deviceId, pMatrixPtr, matrixSize, mbScale, false);
}
else
{
auto aliasRootMatrixPtr = (shared_ptr<Matrix<ElemType>>*)aliasInfo.pMatrixPtr;
*pMatrixPtr = *aliasRootMatrixPtr;
GetMemInfo<ElemType>(aliasRootMatrixPtr)->pMatrixPtrs.push_back(pMatrixPtr);
}
}
private:
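// CheckOverlap returns true iff the step interval 'occ' intersects any interval already recorded in
// 'occVec'. Worked example (made-up numbers): occ = (5, 9) against occVec = { (2, 6), (11, 14) }
// overlaps, since 5 <= 6 && 9 >= 2; against occVec = { (0, 3), (11, 14) } it does not.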
bool CheckOverlap(pair<int, int> occ, vector<pair<int, int>>& occVec)
{
bool bRet = false;
for (auto& o : occVec)
{
if (occ.first <= o.second && occ.second >= o.first)
{
bRet = true;
break;
}
}
//#define SUPRESS_MEMSHARING // #define this to disable memory sharing by always returning true
// TODO: Make this a runtime option.
#ifdef SUPRESS_MEMSHARING
bRet = true;
#endif
return bRet;
}
template <class ElemType>
void OptimizedMemoryAllocationFunc()
{
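// The assignment below is a greedy interval-packing pass: (1) drop sparse requests, (2) sort the
// remaining requests by size, largest first, (3) per device and per workspace/non-workspace group,
// place the minibatch-scaled requests first and then the fixed-size ones, reusing an existing buffer
// whenever its occupancy intervals do not overlap the request's [allocStep, releaseStep] and opening
// a new buffer otherwise, and (4) finally create one Matrix per buffer ID and point every
// participating request at it.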
vector<MemRequestInfo<ElemType>>& memInfoVec = GetMemRequestInfoVec<ElemType>();
if (memInfoVec.empty())
return;
// remove all requests that have been marked as sparse matrices; those will not participate in memory sharing
for (auto iter = memInfoVec.begin(); iter != memInfoVec.end(); )
{
bool hasSparse = false;
for (auto matPtr : iter->pMatrixPtrs)
{
if ((*matPtr)->GetMatrixType() == SPARSE)
{
hasSparse = true;
break;
}
}
if (hasSparse)
iter = memInfoVec.erase(iter); // erase() returns the next valid iterator; the erased one must not be reused
else
iter++;
}
// sort the memory requests from largest size to smallest
std::sort(memInfoVec.begin(), memInfoVec.end(), greater_than_mem_req_size<ElemType>());
std::vector<bool> workspaceFlagVec = {true, false};
for (auto& devId : m_deviceIDSet)
{
for (auto wsFlag : workspaceFlagVec) // we allocate the workspace memory pointers first, and they are not shared with the non-workspace memory requests
{
// memAllocInfoVec is a sorted list of memory allocations from smallest to largest in memory size
vector<MemAllocInfo> memAllocInfoVec;
int memoryCounter = 0;
// we start with the memory requests that scale with the minibatch size (usually those require larger memory)
for (auto& memInfo : memInfoVec)
{
// check if it's the proper device
if (memInfo.deviceId != devId || memInfo.isWorkSpace != wsFlag || !memInfo.mbScale)
continue;
if (!memAllocInfoVec.empty())
{
// since we assign from highest memory to lowest, every memory that has been allocated can accommodate the
// current memory request, unless there is a conflict (overlap)
auto iter = memAllocInfoVec.begin();
while (iter != memAllocInfoVec.end() && CheckOverlap(make_pair(memInfo.allocStep, memInfo.releaseStep), iter->occupancy))
iter++;
if (iter == memAllocInfoVec.end())
{
// no current memory can be assigned, need to create a new one
vector<pair<int, int>> occ;
occ.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
MemAllocInfo ma(memoryCounter, memInfo.matrixSize, occ);
// insert in the front of the vector to maintain sorted order
memAllocInfoVec.insert(memAllocInfoVec.begin(), ma);
memInfo.SetMemoryId(memoryCounter);
memoryCounter++;
}
else
{
iter->occupancy.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
memInfo.SetMemoryId(iter->memoryId);
}
}
else
{
vector<pair<int, int>> occ;
occ.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
MemAllocInfo ma(memoryCounter, memInfo.matrixSize, occ);
memAllocInfoVec.push_back(ma);
memInfo.SetMemoryId(memoryCounter);
memoryCounter++;
}
}
// rescan the request list and this time allocate for those that don't depend on the minibatch size
for (auto& memInfo : memInfoVec)
{
// check if it's the proper device
if (memInfo.deviceId != devId || memInfo.isWorkSpace != wsFlag || memInfo.mbScale)
continue;
if (!memAllocInfoVec.empty())
{
// the memory allocation vector is sorted by size. We find the largest available buffer that doesn't have time overlap
auto workingAlloc = memAllocInfoVec.end();
for (auto iter = memAllocInfoVec.begin(); iter != memAllocInfoVec.end(); iter++)
{
if (!CheckOverlap(make_pair(memInfo.allocStep, memInfo.releaseStep), iter->occupancy))
workingAlloc = iter;
}
if (workingAlloc == memAllocInfoVec.end()) // nothing works
{
vector<pair<int, int>> occ;
occ.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
MemAllocInfo ma(memoryCounter, memInfo.matrixSize, occ);
memAllocInfoVec.push_back(ma); // add as the last one
memInfo.SetMemoryId(memoryCounter);
memoryCounter++;
}
else
{
workingAlloc->occupancy.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
memInfo.SetMemoryId(workingAlloc->memoryId);
}
}
else
{
vector<pair<int, int>> occ;
occ.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
MemAllocInfo ma(memoryCounter, memInfo.matrixSize, occ);
memAllocInfoVec.push_back(ma);
memInfo.SetMemoryId(memoryCounter);
memoryCounter++;
}
}
// now assign the actual pointers
for (int i = 0; i < memoryCounter; i++)
{
auto matrixPtr = make_shared<Matrix<ElemType>>(devId);
if (!matrixPtr) // this can't really happen, because we haven't started allocating memory yet
LogicError("MatrixPool: failed to get a valid matrix.");
for (auto& memInfo : memInfoVec)
{
if (memInfo.deviceId == devId && memInfo.isWorkSpace == wsFlag && memInfo.memoryId == i)
{
for (auto pOutMatrixPtr : memInfo.pMatrixPtrs)
{
*pOutMatrixPtr = matrixPtr;
}
}
}
}
}
}
}
};
}}}