https://github.com/Microsoft/CNTK
Tip revision: 16a41cef30894ca92667bd93079cd6fa11b3e92d authored by Sayan Pathak on 02 November 2017, 16:10:10 UTC
Added super resolution tutorial contributed by Borna with added code to minimize test downloads, fix tests, added documentation and small editorial changes to LSGAN tutorial
Added super resolution tutorial contributed by Borna with added code to minimize test downloads, fix tests, added documentation and small editorial changes to LSGAN tutorial
Tip revision: 16a41ce
MatrixPool.h
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include <string>
#include <stdexcept>
#include <vector>
#include <set>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <algorithm>
#include <stdlib.h>
#include "Basics.h"
#include "Matrix.h"
#include "ComputationNode.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
struct MemRequestInfo
{
DEVICEID_TYPE deviceId; // which device to allocate data
std::vector<shared_ptr<Matrix<ElemType>>*> pMatrixPtrs; // memory pointers
size_t matrixSize; // memory size
bool mbScale; // whether the memory shall be scaled by minibatch size
bool isWorkSpace; // workspace memory or not, by workspace we indicate whether a memory space will be released very shortly after allocation
int allocStep; // at what step counter memory allocation is requested
int releaseStep; // at what step counter memory release is requested
int memoryId; // integer indexing the memory buffer ID
MemRequestInfo(DEVICEID_TYPE deviceId, shared_ptr<Matrix<ElemType>>*pMatrixPtr, size_t matrixSize, bool mbScale, bool isWorkSpace, int allocStep)
:deviceId(deviceId), matrixSize(matrixSize), mbScale(mbScale), isWorkSpace(isWorkSpace), allocStep(allocStep), releaseStep(INT_MAX), memoryId(-1)
{
pMatrixPtrs.push_back(pMatrixPtr);
}
void SetReleaseStep(int step) { releaseStep = step; }
void SetMemoryId(int id) { memoryId = id; }
};
template <class ElemType>
struct greater_than_mem_req_size
{
inline bool operator() (const MemRequestInfo<ElemType>& info1, const MemRequestInfo<ElemType>& info2)
{
return (info1.matrixSize > info2.matrixSize);
}
};
struct MemAllocInfo
{
int memoryId;
size_t memorySize;
vector<pair<int, int>> occupancy;
MemAllocInfo(int memoryId, size_t memorySize, vector<pair<int, int>> occ)
:memoryId(memoryId), memorySize(memorySize), occupancy(occ)
{
}
};
// MatrixPool -- class to support memory sharing
// Despite the gather general name of this class, it is specifically designed to support the memory sharing of ComputationNodes.
// Note: see #define SUPRESS_MEMSHARING below as for how to temporarily disable memory sharing altogether, for debugging
class MatrixPool
{
public:
typedef const void* AliasNodePtr; // use as an identifier in place of ComputationNodeBasePtr to avoid include order issue
protected:
vector<MemRequestInfo<float>> m_memRequestInfoFloatVec;
vector<MemRequestInfo<double>> m_memRequestInfoDoubleVec;
set<DEVICEID_TYPE> m_deviceIDSet;
int m_stepCounter;
template <class ElemType>
vector<MemRequestInfo<ElemType>>& GetMemRequestInfoVec();
// MatrixPool allows a bunch of node to share one matrix
struct AliasInfo
{
void* pMatrixPtr;
size_t totalCount;
size_t releaseCount;
AliasInfo(size_t total = 0)
: pMatrixPtr(nullptr), totalCount(total), releaseCount(0)
{
}
};
unordered_map<AliasNodePtr, AliasInfo> m_aliasGroups;
unordered_map<AliasNodePtr, AliasNodePtr> m_aliasLookup;
public:
void Reset()
{
m_stepCounter = 0;
m_aliasGroups.clear();
m_aliasLookup.clear();
};
template <class ElemType>
MemRequestInfo<ElemType>* GetMemInfo(shared_ptr<Matrix<ElemType>> *pMatrixPtr)
{
vector<MemRequestInfo<ElemType>>& memInfoVec = GetMemRequestInfoVec<ElemType>();
// iterate through the vector and find the pointer memInfo
for (auto& memInfo : memInfoVec)
{
if (memInfo.pMatrixPtrs[0] == pMatrixPtr)
return &memInfo;
}
return nullptr;
}
template <class ElemType>
void RequestRelease(shared_ptr<Matrix<ElemType>> *pMatrixPtr)
{
auto memInfo = GetMemInfo(pMatrixPtr);
if (memInfo != nullptr)
{
memInfo->SetReleaseStep(m_stepCounter);
}
m_stepCounter++;
}
// isWorkSpace is a flag indicating a memory is temporary and will be released very shortly. In the current implementation, all workspace
// memories will have their own pool. This is a design proven to be useful for the workspace memory in convolution.
// matrixSize is an estimate of the required memory to be allocated. Note we don't allocate any memory at the time of request. Instead, a
// global memory allocation optimziation is run to improve memory efficiency
// mbScale is another flag indicating if the size of the memory will scale w.r.t. the minibatch size. Unfortunately, at the time of memory
// request and pointer assignment, we don't known the minibatch size. Thus our memory sharing algorithm is sub-optimal.
template <class ElemType>
void RequestAllocate(DEVICEID_TYPE deviceId, shared_ptr<Matrix<ElemType>>*pMatrixPtr, size_t matrixSize, bool mbScale, bool isWorkSpace)
{
vector<MemRequestInfo<ElemType>>& memInfoVec = GetMemRequestInfoVec<ElemType>();
MemRequestInfo<ElemType> memInfo(deviceId, pMatrixPtr, matrixSize, mbScale, isWorkSpace, m_stepCounter);
memInfoVec.push_back(memInfo);
m_deviceIDSet.insert(deviceId);
m_stepCounter++;
// assign some temporary pointer, they will be replaced later unless the matrix is sparse
*pMatrixPtr = make_shared<Matrix<ElemType>>(deviceId);
}
void OptimizedMemoryAllocation()
{
// MatrixPool is not templated, so we call both float and double versions here
OptimizedMemoryAllocationFunc<float>();
OptimizedMemoryAllocationFunc<double>();
return;
}
void SetAliasInfo(
const unordered_map<AliasNodePtr, unordered_set<AliasNodePtr>>& groupMap,
const unordered_map<AliasNodePtr, AliasNodePtr>& rootLookupMap)
{
m_aliasLookup.clear();
for (const auto& pair : groupMap)
{
m_aliasGroups.insert(std::make_pair(pair.first, AliasInfo(pair.second.size())));
for (const auto& child : pair.second)
{
if (rootLookupMap.find(child) == rootLookupMap.end())
InvalidArgument("group nodes should be in lookupMap");
}
}
for (const auto& pair : rootLookupMap)
{
if (groupMap.find(pair.second) == groupMap.end())
InvalidArgument("lookup root should be group key");
}
m_aliasLookup = rootLookupMap;
}
template <class ElemType>
void RequestAliasedRelease(AliasNodePtr node)
{
const auto iter = m_aliasLookup.find(node);
if (iter == m_aliasLookup.end())
LogicError("node not aliased");
auto parent = iter->second;
auto& aliasInfo = m_aliasGroups[parent];
if (aliasInfo.pMatrixPtr == nullptr)
LogicError("double releasing aliased matrix, or releasing before any allocation for the matrix");
if (aliasInfo.releaseCount >= aliasInfo.totalCount)
LogicError("number of alias instances exceeded expectation");
aliasInfo.releaseCount++;
if (aliasInfo.releaseCount == aliasInfo.totalCount)
{
RequestRelease((shared_ptr<Matrix<ElemType>>*)aliasInfo.pMatrixPtr);
aliasInfo.pMatrixPtr = nullptr;
}
}
template <class ElemType>
void RequestAliasedAllocate(DEVICEID_TYPE deviceId, AliasNodePtr node, shared_ptr<Matrix<ElemType>>*pMatrixPtr, size_t matrixSize, bool mbScale)
{
const auto iter = m_aliasLookup.find(node);
if (iter == m_aliasLookup.end())
LogicError("node not aliased");
auto parent = iter->second;
auto& aliasInfo = m_aliasGroups[parent];
if (aliasInfo.pMatrixPtr == nullptr)
{
// first allocation for the group
aliasInfo.pMatrixPtr = pMatrixPtr;
RequestAllocate(deviceId, pMatrixPtr, matrixSize, mbScale, false);
}
else
{
auto aliasRootMatrixPtr = (shared_ptr<Matrix<ElemType>>*)aliasInfo.pMatrixPtr;
*pMatrixPtr = *aliasRootMatrixPtr;
GetMemInfo<ElemType>(aliasRootMatrixPtr)->pMatrixPtrs.push_back(pMatrixPtr);
}
}
private:
bool CheckOverlap(pair<int, int>occ, vector<pair<int, int>>&occVec)
{
bool bRet = false;
for (auto& o : occVec)
{
if (occ.first <= o.second && occ.second >= o.first)
{
bRet = true;
break;
}
}
//#define SUPRESS_MEMSHARING // #define this to disable memory sharing by always return true
// TODO: Make this a runtime option.
#ifdef SUPRESS_MEMSHARING
bRet = true;
#endif
return bRet;
}
template <class ElemType>
void OptimizedMemoryAllocationFunc()
{
vector<MemRequestInfo<ElemType>>& memInfoVec = GetMemRequestInfoVec<ElemType>();
if (memInfoVec.empty())
return;
// remove all requests that has been marked as sparse matrices, those will not participate in memory sharing
for (auto iter = memInfoVec.begin(); iter != memInfoVec.end(); )
{
bool hasSparse = false;
for (auto matPtr : iter->pMatrixPtrs)
{
if ((*matPtr)->GetMatrixType() == SPARSE)
{
hasSparse = true;
break;
}
}
if (hasSparse)
memInfoVec.erase(iter);
else
iter++;
}
// sort the memory request from largest size to smallest
std::sort(memInfoVec.begin(), memInfoVec.end(), greater_than_mem_req_size<ElemType>());
std::vector<bool> workspaceFlagVec = {true, false};
for (auto& devId : m_deviceIDSet)
{
for (auto wsFlag : workspaceFlagVec) // we allocate the workspace memory pointers first, and they are not shared with the non-workspace memory requests
{
// memAllocInfoVec is a sorted list of memory allocations from smallest to largest in memory size
vector<MemAllocInfo> memAllocInfoVec;
int memoryCounter = 0;
// we start with memory request that is scalable with minibatch size(usually those require larger memory size)
for (auto& memInfo : memInfoVec)
{
// check if it's the proper device
if (memInfo.deviceId != devId || memInfo.isWorkSpace != wsFlag || !memInfo.mbScale)
continue;
if (!memAllocInfoVec.empty())
{
// since we assign from highest memory to lowest, every memory that has been allocated can accommodate the
// current memory request, unless there is a conflict (overlap)
auto iter = memAllocInfoVec.begin();
while (iter != memAllocInfoVec.end() && CheckOverlap(make_pair(memInfo.allocStep, memInfo.releaseStep), iter->occupancy))
iter++;
if (iter == memAllocInfoVec.end())
{
// no current memory can be assigned, need to create a new one
vector<pair<int, int>> occ;
occ.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
MemAllocInfo ma(memoryCounter, memInfo.matrixSize, occ);
// insert in the front of the vector to maintain sorted order
memAllocInfoVec.insert(memAllocInfoVec.begin(), ma);
memInfo.SetMemoryId(memoryCounter);
memoryCounter++;
}
else
{
iter->occupancy.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
memInfo.SetMemoryId(iter->memoryId);
}
}
else
{
vector<pair<int, int>> occ;
occ.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
MemAllocInfo ma(memoryCounter, memInfo.matrixSize, occ);
memAllocInfoVec.push_back(ma);
memInfo.SetMemoryId(memoryCounter);
memoryCounter++;
}
}
// rescan the request list and this time allocate for those that doesn't depend on minibatch size
for (auto& memInfo : memInfoVec)
{
// check if it's the proper device
if (memInfo.deviceId != devId || memInfo.isWorkSpace != wsFlag || memInfo.mbScale)
continue;
if (!memAllocInfoVec.empty())
{
// the memory allocation vector is sorted by size. We find the largest available buffer that doesn't have time overlap
auto workingAlloc = memAllocInfoVec.end();
for (auto iter = memAllocInfoVec.begin(); iter != memAllocInfoVec.end(); iter++)
{
if (!CheckOverlap(make_pair(memInfo.allocStep, memInfo.releaseStep), iter->occupancy))
workingAlloc = iter;
}
if (workingAlloc == memAllocInfoVec.end()) // nothing works
{
vector<pair<int, int>> occ;
occ.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
MemAllocInfo ma(memoryCounter, memInfo.matrixSize, occ);
memAllocInfoVec.push_back(ma); // add as the last one
memInfo.SetMemoryId(memoryCounter);
memoryCounter++;
}
else
{
workingAlloc->occupancy.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
memInfo.SetMemoryId(workingAlloc->memoryId);
}
}
else
{
vector<pair<int, int>> occ;
occ.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
MemAllocInfo ma(memoryCounter, memInfo.matrixSize, occ);
memAllocInfoVec.push_back(ma);
memInfo.SetMemoryId(memoryCounter);
memoryCounter++;
}
}
// now assign the actual pointers
for (int i = 0; i < memoryCounter; i++)
{
auto matrixPtr = make_shared<Matrix<ElemType>>(devId);
if (!matrixPtr) // this can't really happen, because we haven't started allocating memory yet
LogicError("MatrixPool: failed to get a valid matrix.");
for (auto& memInfo : memInfoVec)
{
if (memInfo.deviceId == devId && memInfo.isWorkSpace == wsFlag && memInfo.memoryId == i)
{
for (auto pOutMatrixPtr : memInfo.pMatrixPtrs)
{
*pOutMatrixPtr = matrixPtr;
}
}
}
}
}
}
}
};
}}}