https://github.com/Microsoft/CNTK
Tip revision: 2a409312994822702b865ca9f7e7ee3a31a35ceb authored by Mark Hillebrand on 18 January 2016, 08:34:43 UTC
License change
License change
Tip revision: 2a40931
LUSequenceReader.h
//
// <copyright file="LUSequenceReader.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
// LUSequenceReader.h - Include file for the MTK and MLF format of features and samples
#pragma once
//#define LEAKDETECT
#include "DataReader.h"
#include "DataWriter.h"
#include "LUSequenceParser.h"
#include "commandArgUtil.h" // for intargvector
#include <string>
#include <map>
#include <vector>
#include "minibatchsourcehelpers.h"
namespace Microsoft { namespace MSR { namespace CNTK {
#ifdef DBG_SMT
#define CACHE_BLOG_SIZE 2
#else
#define CACHE_BLOG_SIZE 50000
#endif
#define STRIDX2CLS L"idx2cls"
#define CLASSINFO L"classinfo"
#define MAX_STRING 100000
#define NULLLABEL 65532
enum LabelKind
{
labelNone = 0, // no labels to worry about
labelCategory = 1, // category labels, creates mapping tables
labelNextWord = 2, // sentence mapping (predicts next word)
labelOther = 3, // some other type of label
};
enum ReaderMode
{
Plain = 0, // no class info
Class = 1, // category labels, creates mapping tables
};
template<class ElemType>
class LUSequenceReader : public IDataReader<ElemType>
{
protected:
bool m_idx2clsRead;
bool m_clsinfoRead;
std::wstring m_file;
public:
using LabelType = wstring;
using LabelIdType = long;
long nwords, dims, nsamps, nglen, nmefeats;
int m_seed;
bool mRandomize;
public:
/// deal with OOV
map<LabelType, LabelType> mWordMapping;
string mWordMappingFn;
LabelType mUnkStr;
public:
/// accumulated number of sentneces read so far
unsigned long mTotalSentenceSofar;
protected:
LUBatchLUSequenceParser<ElemType, LabelType> m_parser;
size_t m_mbSize; // size of minibatch requested
size_t m_mbStartSample; // starting sample # of the next minibatch
size_t m_epochSize; // size of an epoch
size_t m_epoch; // which epoch are we on
size_t m_epochStartSample; // the starting sample for the epoch
size_t m_totalSamples; // number of samples in the dataset
size_t m_featureDim; // feature dimensions for extra features
size_t m_featureCount; // total number of non-zero features (in labelsDim + extra features dim)
/// for language modeling, the m_featureCount = 1, since there is only one nonzero element
size_t m_readNextSampleLine; // next sample to read Line
size_t m_readNextSample; // next sample to read
size_t m_seqIndex; // index into the m_sequence array
bool m_labelFirst; // the label is the first element in a line
intargvector m_wordContext;
enum LabelInfoType
{
labelInfoMin = 0,
labelInfoIn = labelInfoMin,
labelInfoOut,
labelInfoMax
};
std::wstring m_labelsName[labelInfoMax];
std::wstring m_featuresName;
std::wstring m_labelsCategoryName[labelInfoMax];
std::wstring m_labelsMapName[labelInfoMax];
std::wstring m_sequenceName;
ElemType* m_featuresBuffer;
ElemType* m_labelsBuffer;
LabelIdType* m_labelsIdBuffer;
size_t* m_sequenceBuffer;
bool m_endReached;
int m_traceLevel;
// feature and label data are parallel arrays
std::vector<std::vector<vector<LabelIdType>>> m_featureWordContext;
std::vector<vector<LabelIdType>> m_featureData;
std::vector<LabelIdType> m_labelIdData;
std::vector<ElemType> m_labelData;
std::vector<size_t> m_sequence;
// we have two one for input and one for output
struct LabelInfo
{
LabelKind type; // labels are categories, create mapping table
map<LabelType, LabelIdType> word4idx;
map<LabelIdType, LabelType> idx4word;
LabelIdType idMax; // maximum label ID we have encountered so far
long dim; // maximum label ID we will ever see (used for array dimensions)
LabelType beginSequence; // starting sequence string (i.e. <s>)
LabelType endSequence; // ending sequence string (i.e. </s>)
bool busewordmap; /// whether using wordmap to map unseen words to unk
std::wstring mapName;
std::wstring fileToWrite; // set to the path if we need to write out the label file
bool isproposal; /// whether this is for proposal generation
ReaderMode readerMode;
/**
word class info saved in file in format below
! 29
# 58
$ 26
where the first column is the word and the second column is the class id, base 0
*/
map<wstring, long> word4cls;
map<long, long> idx4class;
Matrix<ElemType>* m_id2classLocal; // CPU version
Matrix<ElemType>* m_classInfoLocal; // CPU version
int mNbrClasses;
bool m_clsinfoRead;
} m_labelInfo[labelInfoMax];
// caching support
DataReader<ElemType>* m_cachingReader;
DataWriter<ElemType>* m_cachingWriter;
ConfigParameters m_readerConfig;
void InitCache(const ConfigParameters& config);
void UpdateDataVariables();
void LMSetupEpoch();
size_t RecordsToRead(size_t mbStartSample, bool tail=false);
void ReleaseMemory();
void WriteLabelFile();
void LoadLabelFile(const std::wstring &filePath, std::vector<LabelType>& retLabels);
LabelIdType GetIdFromLabel(const LabelType& label, LabelInfo& labelInfo);
bool GetIdFromLabel(const vector<LabelIdType>& label, vector<LabelIdType>& val);
bool CheckIdFromLabel(const LabelType& labelValue, const LabelInfo& labelInfo, unsigned & labelId);
bool SentenceEnd();
public:
void Init(const ConfigParameters& ){};
void ChangeMaping(const map<LabelType, LabelType>& maplist,
const LabelType& unkstr,
map<LabelType, LabelIdType> & word4idx);
void Destroy() {};
LUSequenceReader() {
m_featuresBuffer=NULL; m_labelsBuffer=NULL; m_clsinfoRead = false; m_idx2clsRead = false;
}
~LUSequenceReader(){};
void StartMinibatchLoop(size_t , size_t , size_t = requestDataSize) {};
void SetNumParallelSequences(const size_t /*mz*/) {};
void SentenceEnd(std::vector<size_t> &/*sentenceEnd*/) {};
virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart = 0);
public:
int GetSentenceEndIdFromOutputLabel();
};
template<class ElemType>
class BatchLUSequenceReader : public LUSequenceReader<ElemType>
{
public:
using LabelType = wstring;
using LabelIdType = long;
using LUSequenceReader<ElemType>::mWordMappingFn;
using LUSequenceReader<ElemType>::m_cachingReader;
using LUSequenceReader<ElemType>::mWordMapping;
using LUSequenceReader<ElemType>::mUnkStr;
using LUSequenceReader<ElemType>::m_cachingWriter;
using LUSequenceReader<ElemType>::m_featuresName;
using LUSequenceReader<ElemType>::m_labelsName;
using LUSequenceReader<ElemType>::labelInfoMin;
using LUSequenceReader<ElemType>::labelInfoMax;
using LUSequenceReader<ElemType>::m_featureDim;
using LUSequenceReader<ElemType>::m_labelInfo;
// using LUSequenceReader<ElemType>::m_labelInfoIn;
using LUSequenceReader<ElemType>::m_mbStartSample;
using LUSequenceReader<ElemType>::m_epoch;
using LUSequenceReader<ElemType>::m_totalSamples;
using LUSequenceReader<ElemType>::m_epochStartSample;
using LUSequenceReader<ElemType>::m_seqIndex;
using LUSequenceReader<ElemType>::m_endReached;
using LUSequenceReader<ElemType>::m_readNextSampleLine;
using LUSequenceReader<ElemType>::m_readNextSample;
using LUSequenceReader<ElemType>::m_traceLevel;
using LUSequenceReader<ElemType>::m_wordContext;
using LUSequenceReader<ElemType>::m_featureCount;
using typename LUSequenceReader<ElemType>::LabelInfo;
using LUSequenceReader<ElemType>::labelInfoIn;
using LUSequenceReader<ElemType>::labelInfoOut;
// using LUSequenceReader<ElemType>::arrayLabels;
using LUSequenceReader<ElemType>::m_readerConfig;
using LUSequenceReader<ElemType>::m_featuresBuffer;
using LUSequenceReader<ElemType>::m_labelsBuffer;
using LUSequenceReader<ElemType>::m_labelsIdBuffer;
using LUSequenceReader<ElemType>::m_mbSize;
using LUSequenceReader<ElemType>::m_epochSize;
using LUSequenceReader<ElemType>::m_featureData;
using LUSequenceReader<ElemType>::m_sequence;
using LUSequenceReader<ElemType>::m_labelData;
using LUSequenceReader<ElemType>::m_labelIdData;
using LUSequenceReader<ElemType>::m_idx2clsRead;
using LUSequenceReader<ElemType>::m_clsinfoRead;
using LUSequenceReader<ElemType>::m_featureWordContext;
using LUSequenceReader<ElemType>::LoadLabelFile;
using LUSequenceReader<ElemType>::ReleaseMemory;
using LUSequenceReader<ElemType>::LMSetupEpoch;
using LUSequenceReader<ElemType>::ChangeMaping;
using LUSequenceReader<ElemType>::GetIdFromLabel;
using LUSequenceReader<ElemType>::InitCache;
using LUSequenceReader<ElemType>::mRandomize;
using LUSequenceReader<ElemType>::m_seed;
using LUSequenceReader<ElemType>::mTotalSentenceSofar;
using LUSequenceReader<ElemType>::GetSentenceEndIdFromOutputLabel;
private:
size_t mLastProcssedSentenceId ;
size_t mBlgSize;
size_t mPosInSentence;
vector<size_t> mToProcess;
size_t mLastPosInSentence;
size_t mNumRead ;
std::vector<vector<LabelIdType>> m_featureTemp;
std::vector<LabelIdType> m_labelTemp;
bool mSentenceEnd;
bool mSentenceBegin;
public:
vector<bool> mProcessed;
LUBatchLUSequenceParser<ElemType, LabelType> m_parser;
BatchLUSequenceReader() : m_pMBLayout(make_shared<MBLayout>()){
mLastProcssedSentenceId = 0;
mBlgSize = 1;
mLastPosInSentence = 0;
mNumRead = 0;
mSentenceEnd = false;
mSentenceBegin = true;
mIgnoreSentenceBeginTag = false;
}
~BatchLUSequenceReader();
void Init(const ConfigParameters& readerConfig);
void Reset();
/// return length of sentences size
size_t FindNextSentences(size_t numSentences);
bool DataEnd(EndDataType endDataType);
void SetSentenceEnd(int wrd, int pos, int actualMbSize);
void SetSentenceBegin(int wrd, int pos, int actualMbSize);
void SetSentenceBegin(int wrd, size_t pos, size_t actualMbSize) { SetSentenceBegin(wrd, (int)pos, (int)actualMbSize); } // TODO: clean this up
void SetSentenceEnd(int wrd, size_t pos, size_t actualMbSize) { SetSentenceEnd(wrd, (int)pos, (int)actualMbSize); }
void SetSentenceBegin(size_t wrd, size_t pos, size_t actualMbSize) { SetSentenceBegin((int)wrd, (int)pos, (int)actualMbSize); }
void SetSentenceEnd(size_t wrd, size_t pos, size_t actualMbSize) { SetSentenceEnd((int)wrd, (int)pos, (int)actualMbSize); }
size_t GetLabelOutput(std::map<std::wstring,
Matrix<ElemType>*>& matrices, LabelInfo& labelInfo, size_t actualmbsize);
void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices);
bool EnsureDataAvailable(size_t mbStartSample);
size_t GetNumParallelSequences();
void SetNumParallelSequences(const size_t mz);
void CopyMBLayoutTo(MBLayoutPtr pMBLayout);
public:
void GetClassInfo(LabelInfo& lblInfo);
void ReadLabelInfo(const wstring & vocfile,
map<wstring, long> & word4idx,
bool readClass,
map<wstring, long>& word4cls,
map<long, wstring>& idx4word,
map<long, long>& idx4class,
int & mNbrCls);
void LoadWordMapping(const ConfigParameters& readerConfig);
bool CanReadFor(wstring nodeName); /// return true if this reader can output for a node with name nodeName
vector<size_t> ReturnToProcessId() { return mToProcess; }
void SetToProcessId(const vector<size_t>& tp) { mToProcess = tp; }
void SetRandomSeed(int seed)
{
m_seed = seed;
}
public:
/**
for sequential reading data, useful for beam search decoding
*/
/// this is for frame-by-frame reading of data.
/// data is first read into these matrices and then if needed is column-by-column retrieved
map<wstring, Matrix<ElemType>> mMatrices;
bool GetFrame(std::map<std::wstring, Matrix<ElemType>*>& matrices, const size_t tidx, vector<size_t>& history);
/// create proposals
void InitProposals(map<wstring, Matrix<ElemType>*>& pMat);
public:
bool mEqualLengthOutput;
bool mAllowMultPassData;
/// return length of sentences size
vector<size_t> mSentenceLength;
size_t mMaxSentenceLength;
vector<int> mSentenceBeginAt;
vector<int> mSentenceEndAt;
MBLayoutPtr m_pMBLayout;
/// by default it is false
/// if true, reader will set to ((int) MinibatchPackingFlags::None) for time positions that are orignally correspond to ((int) MinibatchPackingFlags::SequenceStart)
/// set to true so that a current minibatch can uses state activities from the previous minibatch.
/// default will have truncated BPTT, which only does BPTT inside a minibatch
bool mIgnoreSentenceBeginTag;
};
template<class ElemType>
class MultiIOBatchLUSequenceReader : public BatchLUSequenceReader<ElemType>
{
private:
map<wstring, BatchLUSequenceReader<ElemType>*> mReader;
bool mCheckDictionaryKeys;
std::map<std::wstring, BatchLUSequenceReader<ElemType>*> nameToReader;
public:
MultiIOBatchLUSequenceReader()
{
mCheckDictionaryKeys = true;
nameToReader.clear();
}
~MultiIOBatchLUSequenceReader()
{
for (typename map<wstring, BatchLUSequenceReader<ElemType>*>::iterator p = mReader.begin(); p != mReader.end(); p++)
{
delete[] p->second;
}
};
bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices);
void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples);
void CopyMBLayoutTo(MBLayoutPtr pMBLayout);
size_t GetNumParallelSequences();
void Init(const ConfigParameters& readerConfig);
public:
void SetRandomSeed(int);
public:
int GetSentenceEndIdFromOutputLabel();
bool DataEnd(EndDataType endDataType);
/// create proposals
void InitProposals(map<wstring, Matrix<ElemType>*>& pMat);
bool GetProposalObs(std::map<std::wstring, Matrix<ElemType>*>& matrices, const size_t tidx, vector<size_t>& history);
};
}}}