// // // Copyright (c) Microsoft Corporation. All rights reserved. // // // LUSequenceReader.h - Include file for the MTK and MLF format of features and samples #pragma once //#define LEAKDETECT #include "DataReader.h" #include "DataWriter.h" #include "LUSequenceParser.h" #include "commandArgUtil.h" // for intargvector #include #include #include #include "minibatchsourcehelpers.h" namespace Microsoft { namespace MSR { namespace CNTK { #ifdef DBG_SMT #define CACHE_BLOG_SIZE 2 #else #define CACHE_BLOG_SIZE 50000 #endif #define STRIDX2CLS L"idx2cls" #define CLASSINFO L"classinfo" #define MAX_STRING 100000 #define NULLLABEL 65532 enum LabelKind { labelNone = 0, // no labels to worry about labelCategory = 1, // category labels, creates mapping tables labelNextWord = 2, // sentence mapping (predicts next word) labelOther = 3, // some other type of label }; enum ReaderMode { Plain = 0, // no class info Class = 1, // category labels, creates mapping tables }; template class LUSequenceReader : public IDataReader { protected: bool m_idx2clsRead; bool m_clsinfoRead; std::wstring m_file; public: using LabelType = wstring; using LabelIdType = long; long nwords, dims, nsamps, nglen, nmefeats; int m_seed; bool mRandomize; public: /// deal with OOV map mWordMapping; string mWordMappingFn; LabelType mUnkStr; public: /// accumulated number of sentneces read so far unsigned long mTotalSentenceSofar; protected: LUBatchLUSequenceParser m_parser; size_t m_mbSize; // size of minibatch requested size_t m_mbStartSample; // starting sample # of the next minibatch size_t m_epochSize; // size of an epoch size_t m_epoch; // which epoch are we on size_t m_epochStartSample; // the starting sample for the epoch size_t m_totalSamples; // number of samples in the dataset size_t m_featureDim; // feature dimensions for extra features size_t m_featureCount; // total number of non-zero features (in labelsDim + extra features dim) /// for language modeling, the m_featureCount = 1, since there is only one nonzero element size_t m_readNextSampleLine; // next sample to read Line size_t m_readNextSample; // next sample to read size_t m_seqIndex; // index into the m_sequence array bool m_labelFirst; // the label is the first element in a line intargvector m_wordContext; enum LabelInfoType { labelInfoMin = 0, labelInfoIn = labelInfoMin, labelInfoOut, labelInfoMax }; std::wstring m_labelsName[labelInfoMax]; std::wstring m_featuresName; std::wstring m_labelsCategoryName[labelInfoMax]; std::wstring m_labelsMapName[labelInfoMax]; std::wstring m_sequenceName; ElemType* m_featuresBuffer; ElemType* m_labelsBuffer; LabelIdType* m_labelsIdBuffer; size_t* m_sequenceBuffer; bool m_endReached; int m_traceLevel; // feature and label data are parallel arrays std::vector>> m_featureWordContext; std::vector> m_featureData; std::vector m_labelIdData; std::vector m_labelData; std::vector m_sequence; // we have two one for input and one for output struct LabelInfo { LabelKind type; // labels are categories, create mapping table map word4idx; map idx4word; LabelIdType idMax; // maximum label ID we have encountered so far long dim; // maximum label ID we will ever see (used for array dimensions) LabelType beginSequence; // starting sequence string (i.e. ~~) LabelType endSequence; // ending sequence string (i.e.~~ ) bool busewordmap; /// whether using wordmap to map unseen words to unk std::wstring mapName; std::wstring fileToWrite; // set to the path if we need to write out the label file bool isproposal; /// whether this is for proposal generation ReaderMode readerMode; /** word class info saved in file in format below ! 29 # 58 $ 26 where the first column is the word and the second column is the class id, base 0 */ map word4cls; map idx4class; Matrix* m_id2classLocal; // CPU version Matrix* m_classInfoLocal; // CPU version int mNbrClasses; bool m_clsinfoRead; } m_labelInfo[labelInfoMax]; // caching support DataReader* m_cachingReader; DataWriter* m_cachingWriter; ConfigParameters m_readerConfig; void InitCache(const ConfigParameters& config); void UpdateDataVariables(); void LMSetupEpoch(); size_t RecordsToRead(size_t mbStartSample, bool tail=false); void ReleaseMemory(); void WriteLabelFile(); void LoadLabelFile(const std::wstring &filePath, std::vector& retLabels); LabelIdType GetIdFromLabel(const LabelType& label, LabelInfo& labelInfo); bool GetIdFromLabel(const vector& label, vector& val); bool CheckIdFromLabel(const LabelType& labelValue, const LabelInfo& labelInfo, unsigned & labelId); bool SentenceEnd(); public: void Init(const ConfigParameters& ){}; void ChangeMaping(const map& maplist, const LabelType& unkstr, map & word4idx); void Destroy() {}; LUSequenceReader() { m_featuresBuffer=NULL; m_labelsBuffer=NULL; m_clsinfoRead = false; m_idx2clsRead = false; } ~LUSequenceReader(){}; void StartMinibatchLoop(size_t , size_t , size_t = requestDataSize) {}; void SetNbrSlicesEachRecurrentIter(const size_t /*mz*/) {}; void SentenceEnd(std::vector &/*sentenceEnd*/) {}; virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart = 0); public: int GetSentenceEndIdFromOutputLabel(); }; template class BatchLUSequenceReader : public LUSequenceReader { public: using LabelType = wstring; using LabelIdType = long; using LUSequenceReader::mWordMappingFn; using LUSequenceReader::m_cachingReader; using LUSequenceReader::mWordMapping; using LUSequenceReader::mUnkStr; using LUSequenceReader::m_cachingWriter; using LUSequenceReader::m_featuresName; using LUSequenceReader::m_labelsName; using LUSequenceReader::labelInfoMin; using LUSequenceReader::labelInfoMax; using LUSequenceReader::m_featureDim; using LUSequenceReader::m_labelInfo; // using LUSequenceReader::m_labelInfoIn; using LUSequenceReader::m_mbStartSample; using LUSequenceReader::m_epoch; using LUSequenceReader::m_totalSamples; using LUSequenceReader::m_epochStartSample; using LUSequenceReader::m_seqIndex; using LUSequenceReader::m_endReached; using LUSequenceReader::m_readNextSampleLine; using LUSequenceReader::m_readNextSample; using LUSequenceReader::m_traceLevel; using LUSequenceReader::m_wordContext; using LUSequenceReader::m_featureCount; using typename LUSequenceReader::LabelInfo; using LUSequenceReader::labelInfoIn; using LUSequenceReader::labelInfoOut; // using LUSequenceReader::arrayLabels; using LUSequenceReader::m_readerConfig; using LUSequenceReader::m_featuresBuffer; using LUSequenceReader::m_labelsBuffer; using LUSequenceReader::m_labelsIdBuffer; using LUSequenceReader::m_mbSize; using LUSequenceReader::m_epochSize; using LUSequenceReader::m_featureData; using LUSequenceReader::m_sequence; using LUSequenceReader::m_labelData; using LUSequenceReader::m_labelIdData; using LUSequenceReader::m_idx2clsRead; using LUSequenceReader::m_clsinfoRead; using LUSequenceReader::m_featureWordContext; using LUSequenceReader::LoadLabelFile; using LUSequenceReader::ReleaseMemory; using LUSequenceReader::LMSetupEpoch; using LUSequenceReader::ChangeMaping; using LUSequenceReader::GetIdFromLabel; using LUSequenceReader::InitCache; using LUSequenceReader::mRandomize; using LUSequenceReader::m_seed; using LUSequenceReader::mTotalSentenceSofar; using LUSequenceReader::GetSentenceEndIdFromOutputLabel; private: size_t mLastProcssedSentenceId ; size_t mBlgSize; size_t mPosInSentence; vector mToProcess; size_t mLastPosInSentence; size_t mNumRead ; std::vector> m_featureTemp; std::vector m_labelTemp; bool mSentenceEnd; bool mSentenceBegin; public: vector mProcessed; LUBatchLUSequenceParser m_parser; BatchLUSequenceReader() : mtSentenceBegin(CPUDEVICE){ mLastProcssedSentenceId = 0; mBlgSize = 1; mLastPosInSentence = 0; mNumRead = 0; mSentenceEnd = false; mSentenceBegin = true; mIgnoreSentenceBeginTag = false; } ~BatchLUSequenceReader(); void Init(const ConfigParameters& readerConfig); void Reset(); /// return length of sentences size size_t FindNextSentences(size_t numSentences); bool DataEnd(EndDataType endDataType); void SetSentenceEnd(int wrd, int pos, int actualMbSize); void SetSentenceBegin(int wrd, int pos, int actualMbSize); void SetSentenceBegin(int wrd, size_t pos, size_t actualMbSize) { SetSentenceBegin(wrd, (int)pos, (int)actualMbSize); } // TODO: clean this up void SetSentenceEnd(int wrd, size_t pos, size_t actualMbSize) { SetSentenceEnd(wrd, (int)pos, (int)actualMbSize); } void SetSentenceBegin(size_t wrd, size_t pos, size_t actualMbSize) { SetSentenceBegin((int)wrd, (int)pos, (int)actualMbSize); } void SetSentenceEnd(size_t wrd, size_t pos, size_t actualMbSize) { SetSentenceEnd((int)wrd, (int)pos, (int)actualMbSize); } size_t GetLabelOutput(std::map*>& matrices, LabelInfo& labelInfo, size_t actualmbsize); void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize); bool GetMinibatch(std::map*>& matrices); bool EnsureDataAvailable(size_t mbStartSample); size_t NumberSlicesInEachRecurrentIter(); void SetNbrSlicesEachRecurrentIter(const size_t mz); void SetSentenceSegBatch(Matrix & sentenceBegin, vector& minibatchPackingFlag); public: void GetClassInfo(LabelInfo& lblInfo); void ReadLabelInfo(const wstring & vocfile, map & word4idx, bool readClass, map& word4cls, map& idx4word, map& idx4class, int & mNbrCls); void LoadWordMapping(const ConfigParameters& readerConfig); bool CanReadFor(wstring nodeName); /// return true if this reader can output for a node with name nodeName vector ReturnToProcessId() { return mToProcess; } void SetToProcessId(const vector& tp) { mToProcess = tp; } void SetRandomSeed(int seed) { m_seed = seed; } public: /** for sequential reading data, useful for beam search decoding */ /// this is for frame-by-frame reading of data. /// data is first read into these matrices and then if needed is column-by-column retrieved map> mMatrices; bool GetFrame(std::map*>& matrices, const size_t tidx, vector& history); /// create proposals void InitProposals(map*>& pMat); public: bool mEqualLengthOutput; bool mAllowMultPassData; /// return length of sentences size vector mSentenceLength; size_t mMaxSentenceLength; vector mSentenceBeginAt; vector mSentenceEndAt; /// a matrix of n_stream x n_length /// n_stream is the number of streams /// n_length is the maximum lenght of each stream /// for example, two sentences used in parallel in one minibatch would be /// [2 x 5] if the max length of one of the sentences is 5 /// the elements of the matrix is 0, 1, or -1, defined as SENTENCE_BEGIN, SENTENCE_MIDDLE, NO_LABELS in cbasetype.h /// 0 1 1 0 1 /// 1 0 1 0 0 /// for two parallel data streams. The first has two sentences, with 0 indicating begining of a sentence /// the second data stream has two sentences, with 0 indicating begining of sentences /// you may use 1 even if a sentence begins at that position, in this case, the trainer will carry over hidden states to the following /// frame. Matrix mtSentenceBegin; /// a matrix of 1 x n_length /// 1 denotes the case that there exists sentnece begin or no_labels case in this frame /// 0 denotes such case is not in this frame vector m_minibatchPackingFlag; /// by default it is false /// if true, reader will set to SENTENCE_MIDDLE for time positions that are orignally correspond to SENTENCE_BEGIN /// set to true so that a current minibatch can uses state activities from the previous minibatch. /// default will have truncated BPTT, which only does BPTT inside a minibatch bool mIgnoreSentenceBeginTag; }; template class MultiIOBatchLUSequenceReader : public BatchLUSequenceReader { private: map*> mReader; bool mCheckDictionaryKeys; std::map*> nameToReader; public: MultiIOBatchLUSequenceReader() { mCheckDictionaryKeys = true; nameToReader.clear(); } ~MultiIOBatchLUSequenceReader() { for (typename map*>::iterator p = mReader.begin(); p != mReader.end(); p++) { delete[] p->second; } }; bool GetMinibatch(std::map*>& matrices); void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples); void SetSentenceSegBatch(Matrix & sentenceBegin, vector& minibatchPackingFlag); size_t NumberSlicesInEachRecurrentIter(); void Init(const ConfigParameters& readerConfig); public: void SetRandomSeed(int); public: int GetSentenceEndIdFromOutputLabel(); bool DataEnd(EndDataType endDataType); /// create proposals void InitProposals(map*>& pMat); bool GetProposalObs(std::map*>& matrices, const size_t tidx, vector& history); }; }}}