Content - 431c77d8adead9ea73b690f12d7b0278567f0686 - f04e53d/LUSequenceReader.h

LUSequenceReader.h
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// LUSequenceReader.h - Include file for the MTK and MLF format of features and samples
//
#pragma once
//#define LEAKDETECT

#include "Basics.h"
#include "DataReader.h"
#include "DataWriter.h"
#include "LUSequenceParser.h"
#include "Config.h" // for intargvector
#include "ScriptableObjects.h"
#include <string>
#include <map>
#include <vector>

namespace Microsoft { namespace MSR { namespace CNTK {

// Cache block size: 2 when DBG_SMT is defined, 50000 otherwise.
// NOTE(review): the unit (samples, sentences, ...) is not visible in this header - confirm at the use site.
#ifdef DBG_SMT
#define CACHE_BLOG_SIZE 2
#else
#define CACHE_BLOG_SIZE 50000
#endif

// Wide-string names; presumably section/matrix names for the index-to-class
// and class-info data - TODO confirm against the implementation file.
#define STRIDX2CLS L"idx2cls"
#define CLASSINFO L"classinfo"

// NOTE(review): looks like an upper bound on a string/line buffer length - confirm at the use site.
#define MAX_STRING 100000

// NOTE(review): presumably a sentinel label id meaning "no label" - confirm at the use site.
#define NULLLABEL 65532

// Kind of label carried by a label stream.
enum LabelKind
{
    // no labels to worry about
    labelNone = 0,
    // category labels, creates mapping tables
    labelCategory = 1,
    // sentence mapping (predicts next word)
    labelNextWord = 2,
    // some other type of label
    labelOther = 3
};

// Reader mode with respect to class information.
enum ReaderMode
{
    // no class info
    Plain = 0,
    // NOTE(review): the previous comment here ("category labels, creates mapping tables")
    // duplicated the one on LabelKind::labelCategory; presumably this is the mode
    // that uses class info - confirm.
    Class = 1
};

// Common state and helper declarations shared by the LU sequence readers in this header
// (BatchLUSequenceReader derives from this class).
//
// ElemType is the element type of the feature/label buffers and matrices.
//
// Most member functions are only declared here and defined elsewhere; the ones
// defined inline (Init, Destroy, StartMinibatchLoop, SetNumParallelSequences,
// SentenceEnd(vector&)) are no-ops in this class.
template <class ElemType>
class LUSequenceReader : public IDataReader<ElemType>
{
protected:
    bool m_idx2clsRead;
    bool m_clsinfoRead;

    std::wstring m_file;

public:
    using LabelType = std::wstring;
    using LabelIdType = long;
    long nwords, dims, nsamps, nglen, nmefeats;

    int m_seed;
    bool mRandomize;

public:
    // deal with OOV
    std::map<LabelType, LabelType> mWordMapping;
    std::wstring mWordMappingFn;
    LabelType mUnkStr;

public:
    // accumulated number of sentences read so far
    unsigned long mTotalSentenceSofar;

protected:
    BatchLUSequenceParser<ElemType, LabelType> m_parser;
    size_t m_mbSize;           // size of minibatch requested
    size_t m_mbStartSample;    // starting sample # of the next minibatch
    size_t m_epochSize;        // size of an epoch
    size_t m_epoch;            // which epoch are we on
    size_t m_epochStartSample; // the starting sample for the epoch
    size_t m_totalSamples;     // number of samples in the dataset
    size_t m_featureDim;       // feature dimensions for extra features
    size_t m_featureCount;     // total number of non-zero features (in labelsDim + extra features dim)
    // for language modeling, the m_featureCount = 1, since there is only one nonzero element
    size_t m_readNextSampleLine; // next sample to read Line
    size_t m_readNextSample;     // next sample to read
    size_t m_seqIndex;           // index into the m_sequence array
    bool m_labelFirst;           // the label is the first element in a line
    intargvector m_wordContext;

    // Index of a label stream; used to size and index the per-stream arrays below.
    enum LabelInfoType
    {
        labelInfoMin = 0,
        labelInfoIn = labelInfoMin,
        labelInfoOut,
        labelInfoMax
    };

    std::wstring m_labelsName[labelInfoMax];
    std::wstring m_featuresName;
    std::wstring m_labelsCategoryName[labelInfoMax];
    std::wstring m_labelsMapName[labelInfoMax];
    std::wstring m_sequenceName;

    // Raw buffers; set to nullptr by the constructor.
    // NOTE(review): ownership/release is not visible in this header (presumably ReleaseMemory()) - confirm.
    ElemType* m_featuresBuffer;
    ElemType* m_labelsBuffer;
    LabelIdType* m_labelsIdBuffer;
    size_t* m_sequenceBuffer;

    bool m_endReached;
    int m_traceLevel;

    // feature and label data are parallel arrays
    // The following two hold the actual MB data internally, created by EnsureDataAvailable().
    std::vector<std::vector<std::vector<LabelIdType>>> m_featureWordContext; // [parSeq + t * numParSeq] word n-tuple in order of storage in m_value matrix
    std::vector<LabelIdType> m_labelIdData;

    std::vector<ElemType> m_labelData;
    std::vector<size_t> m_sequence;

    // Per-stream label bookkeeping; there are two instances, one for input and one for output.
    struct LabelInfo
    {
        LabelKind type; // labels are categories, create mapping table
        std::map<LabelType, LabelIdType> word4idx;
        std::map<LabelIdType, LabelType> idx4word;
        LabelIdType idMax;       // maximum label ID we have encountered so far
        long dim;                // maximum label ID we will ever see (used for array dimensions)
        LabelType beginSequence; // starting sequence string (i.e. <s>)
        LabelType endSequence;   // ending sequence string (i.e. </s>)
        bool busewordmap;        // whether using wordmap to map unseen words to unk
        std::wstring mapName;
        std::wstring fileToWrite; // set to the path if we need to write out the label file

        bool isproposal; // whether this is for proposal generation

        ReaderMode readerMode;
        /**
        word class info saved in file in format below
        ! 29
        # 58
        $ 26
        where the first column is the word and the second column is the class id, base 0
        */
        std::map<std::wstring, long> word4cls;
        std::map<long, long> idx4class;
        Matrix<ElemType>* m_id2classLocal;  // CPU version
        Matrix<ElemType>* m_classInfoLocal; // CPU version
        int mNbrClasses;
        bool m_clsinfoRead;
    } m_labelInfo[labelInfoMax];

    // caching support
    DataReader<ElemType>* m_cachingReader;
    DataWriter<ElemType>* m_cachingWriter;
    ConfigParameters m_readerConfig;
    void InitCache(const ConfigParameters& config);

    void UpdateDataVariables();
    void LMSetupEpoch();
    size_t RecordsToRead(size_t mbStartSample, bool tail = false);
    void ReleaseMemory();
    void WriteLabelFile();
    void LoadLabelFile(const std::wstring& filePath, std::vector<LabelType>& retLabels);

    LabelIdType GetIdFromLabel(const LabelType& label, LabelInfo& labelInfo);
    bool GetIdFromLabel(const std::vector<LabelIdType>& label, std::vector<LabelIdType>& val);
    bool CheckIdFromLabel(const LabelType& labelValue, const LabelInfo& labelInfo, unsigned& labelId);

    bool SentenceEnd();

public:
    // Initialization is a no-op at this level; BatchLUSequenceReader overrides both overloads.
    void Init(const ConfigParameters&) {}
    void Init(const ScriptableObjects::IConfigRecord&) {}
    void ChangeMaping(const std::map<LabelType, LabelType>& maplist,
                      const LabelType& unkstr,
                      std::map<LabelType, LabelIdType>& word4idx);

    // No-op in this class.
    void Destroy() {}

    // Puts every scalar and pointer member into a defined state.
    // Previously only m_featuresBuffer, m_labelsBuffer and the two *Read flags were
    // set; the remaining raw pointers, counters and flags were left indeterminate,
    // so any read (or release) of them before initialization was undefined behavior.
    // Containers, strings and other class-type members default-construct themselves.
    LUSequenceReader()
    {
        m_idx2clsRead = false;
        m_clsinfoRead = false;

        nwords = 0;
        dims = 0;
        nsamps = 0;
        nglen = 0;
        nmefeats = 0;

        m_seed = 0;
        mRandomize = false;
        mTotalSentenceSofar = 0;

        m_mbSize = 0;
        m_mbStartSample = 0;
        m_epochSize = 0;
        m_epoch = 0;
        m_epochStartSample = 0;
        m_totalSamples = 0;
        m_featureDim = 0;
        m_featureCount = 0;
        m_readNextSampleLine = 0;
        m_readNextSample = 0;
        m_seqIndex = 0;
        m_labelFirst = false;

        m_featuresBuffer = nullptr;
        m_labelsBuffer = nullptr;
        m_labelsIdBuffer = nullptr;
        m_sequenceBuffer = nullptr;

        m_endReached = false;
        m_traceLevel = 0;

        m_cachingReader = nullptr;
        m_cachingWriter = nullptr;

        // Zero-like defaults for the per-stream label info (input and output).
        for (int i = labelInfoMin; i < labelInfoMax; ++i)
        {
            LabelInfo& info = m_labelInfo[i];
            info.type = labelNone;
            info.idMax = 0;
            info.dim = 0;
            info.busewordmap = false;
            info.isproposal = false;
            info.readerMode = Plain;
            info.m_id2classLocal = nullptr;
            info.m_classInfoLocal = nullptr;
            info.mNbrClasses = 0;
            info.m_clsinfoRead = false;
        }
    }
    ~LUSequenceReader() {}

    // No-op in this class; BatchLUSequenceReader declares its own StartMinibatchLoop.
    void StartMinibatchLoop(size_t, size_t, size_t = requestDataSize) {}

    // No-ops in this class.
    void SetNumParallelSequences(const size_t /*mz*/) {}
    void SentenceEnd(std::vector<size_t>& /*sentenceEnd*/) {}

    virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart = 0);

public:
    int GetSentenceEndIdFromOutputLabel();
};

// Minibatch-oriented LU sequence reader built on top of LUSequenceReader.
//
// ElemType is the element type of the matrices handed to GetMinibatch()/GetFrame().
// Most member functions are only declared here and are defined elsewhere.
template <class ElemType>
class BatchLUSequenceReader : public LUSequenceReader<ElemType>
{
public:
    using LabelType = std::wstring;
    using LabelIdType = long;
    // Bring dependent base-class members into scope so they can be used unqualified.
    using LUSequenceReader<ElemType>::mWordMappingFn;
    using LUSequenceReader<ElemType>::m_cachingReader;
    using LUSequenceReader<ElemType>::mWordMapping;
    using LUSequenceReader<ElemType>::mUnkStr;
    using LUSequenceReader<ElemType>::m_cachingWriter;
    using LUSequenceReader<ElemType>::m_featuresName;
    using LUSequenceReader<ElemType>::m_labelsName;
    using LUSequenceReader<ElemType>::labelInfoMin;
    using LUSequenceReader<ElemType>::labelInfoMax;
    using LUSequenceReader<ElemType>::m_featureDim;
    using LUSequenceReader<ElemType>::m_labelInfo;
    //  using LUSequenceReader<ElemType>::m_labelInfoIn;
    using LUSequenceReader<ElemType>::m_mbStartSample;
    using LUSequenceReader<ElemType>::m_epoch;
    using LUSequenceReader<ElemType>::m_totalSamples;
    using LUSequenceReader<ElemType>::m_epochStartSample;
    using LUSequenceReader<ElemType>::m_seqIndex;
    using LUSequenceReader<ElemType>::m_endReached;
    using LUSequenceReader<ElemType>::m_readNextSampleLine;
    using LUSequenceReader<ElemType>::m_readNextSample;
    using LUSequenceReader<ElemType>::m_traceLevel;
    using LUSequenceReader<ElemType>::m_wordContext;
    using LUSequenceReader<ElemType>::m_featureCount;
    using typename LUSequenceReader<ElemType>::LabelInfo;
    using LUSequenceReader<ElemType>::labelInfoIn;
    using LUSequenceReader<ElemType>::labelInfoOut;
    //  using LUSequenceReader<ElemType>::arrayLabels;
    using LUSequenceReader<ElemType>::m_readerConfig;
    using LUSequenceReader<ElemType>::m_featuresBuffer;
    using LUSequenceReader<ElemType>::m_labelsBuffer;
    using LUSequenceReader<ElemType>::m_labelsIdBuffer;
    using LUSequenceReader<ElemType>::m_mbSize;
    using LUSequenceReader<ElemType>::m_epochSize;
    using LUSequenceReader<ElemType>::m_sequence;
    using LUSequenceReader<ElemType>::m_labelData;
    using LUSequenceReader<ElemType>::m_labelIdData;
    using LUSequenceReader<ElemType>::m_idx2clsRead;
    using LUSequenceReader<ElemType>::m_clsinfoRead;
    using LUSequenceReader<ElemType>::m_featureWordContext;
    using LUSequenceReader<ElemType>::LoadLabelFile;
    using LUSequenceReader<ElemType>::ReleaseMemory;
    using LUSequenceReader<ElemType>::LMSetupEpoch;
    using LUSequenceReader<ElemType>::ChangeMaping;
    using LUSequenceReader<ElemType>::GetIdFromLabel;
    using LUSequenceReader<ElemType>::InitCache;
    using LUSequenceReader<ElemType>::mRandomize;
    using LUSequenceReader<ElemType>::m_seed;
    using LUSequenceReader<ElemType>::mTotalSentenceSofar;
    using LUSequenceReader<ElemType>::GetSentenceEndIdFromOutputLabel;

private:
    size_t mLastProcessedSentenceId;
    size_t mRequestedNumParallelSequences;
    size_t mPosInSentence;
    std::vector<size_t> mToProcess; // [seqIndex] utterance id of utterance in this minibatch's position [seqIndex]
    size_t mLastPosInSentence;      // BPTT cursor
    size_t mNumRead;

    std::vector<std::vector<LabelIdType>> m_featureTemp;
    std::vector<LabelIdType> m_labelTemp;

    bool mSentenceEnd;
    bool mSentenceBegin;

public:
    std::vector<bool> mProcessed;
    // NOTE(review): this member has the same name and type as the protected m_parser in
    // LUSequenceReader and hides it inside this class; both objects exist. Left untouched
    // because removing either one would change which parser each class's code uses.
    BatchLUSequenceParser<ElemType, LabelType> m_parser;

    // Creates a fresh MBLayout and puts every scalar member into a defined state.
    // mPosInSentence, mEqualLengthOutput, mAllowMultPassData and mMaxSentenceLength were
    // previously left indeterminate by this constructor.
    BatchLUSequenceReader()
        : m_pMBLayout(std::make_shared<MBLayout>())
    {
        mLastProcessedSentenceId = 0;
        mRequestedNumParallelSequences = 1;
        mPosInSentence = 0;
        mLastPosInSentence = 0;
        mNumRead = 0;
        mSentenceEnd = false;
        mSentenceBegin = true;
        // NOTE(review): placeholder values so the flags are never read uninitialized;
        // presumably overwritten from the configuration during Init - confirm the intended defaults.
        mEqualLengthOutput = false;
        mAllowMultPassData = false;
        mMaxSentenceLength = 0;
        mIgnoreSentenceBeginTag = false;
    }

    ~BatchLUSequenceReader();

    // Shared initialization for both configuration record types; the two Init overrides forward here.
    template <class ConfigRecordType>
    void InitFromConfig(const ConfigRecordType&);
    virtual void Init(const ConfigParameters& config) override
    {
        InitFromConfig(config);
    }
    virtual void Init(const ScriptableObjects::IConfigRecord& config) override
    {
        InitFromConfig(config);
    }
    void Reset();

    // return length of sentences size
    size_t FindNextSentences(size_t numSentences);
    bool DataEnd(EndDataType endDataType);
    void SetSentenceEnd(int wrd, int pos, int actualMbSize);
    void SetSentenceBegin(int wrd, int pos, int actualMbSize);
    // Convenience overloads: narrow size_t arguments to int and forward to the
    // (int, int, int) versions above. TODO: clean this up
    void SetSentenceBegin(int wrd, size_t pos, size_t actualMbSize)
    {
        SetSentenceBegin(wrd, static_cast<int>(pos), static_cast<int>(actualMbSize));
    }
    void SetSentenceEnd(int wrd, size_t pos, size_t actualMbSize)
    {
        SetSentenceEnd(wrd, static_cast<int>(pos), static_cast<int>(actualMbSize));
    }
    void SetSentenceBegin(size_t wrd, size_t pos, size_t actualMbSize)
    {
        SetSentenceBegin(static_cast<int>(wrd), static_cast<int>(pos), static_cast<int>(actualMbSize));
    }
    void SetSentenceEnd(size_t wrd, size_t pos, size_t actualMbSize)
    {
        SetSentenceEnd(static_cast<int>(wrd), static_cast<int>(pos), static_cast<int>(actualMbSize));
    }

    size_t GetLabelOutput(std::map<std::wstring,
                                   Matrix<ElemType>*>& matrices,
                          LabelInfo& labelInfo, size_t actualmbsize);

    void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize);
    bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices);

    bool EnsureDataAvailable(size_t mbStartSample);
    size_t GetNumParallelSequences();
    void SetNumParallelSequences(const size_t mz);

    void CopyMBLayoutTo(MBLayoutPtr pMBLayout);

public:
    void GetClassInfo(LabelInfo& lblInfo);
    void ReadLabelInfo(const std::wstring& vocfile,
                       std::map<std::wstring, long>& word4idx,
                       bool readClass,
                       std::map<std::wstring, long>& word4cls,
                       std::map<long, std::wstring>& idx4word,
                       std::map<long, long>& idx4class,
                       int& mNbrCls);

    template <class ConfigRecordType>
    void LoadWordMapping(const ConfigRecordType& config);
    bool CanReadFor(std::wstring nodeName); // return true if this reader can output for a node with name nodeName

    // Returns a copy of mToProcess.
    std::vector<size_t> ReturnToProcessId()
    {
        return mToProcess;
    }
    // Replaces mToProcess with a copy of tp.
    void SetToProcessId(const std::vector<size_t>& tp)
    {
        mToProcess = tp;
    }

    // Stores the seed in the inherited m_seed member.
    void SetRandomSeed(int seed)
    {
        m_seed = seed;
    }

public:
    /**
    for sequential reading data, useful for beam search decoding
    */
    // this is for frame-by-frame reading of data.
    // data is first read into these matrices and then if needed is column-by-column retrieved
    std::map<std::wstring, Matrix<ElemType>> mMatrices;
    bool GetFrame(std::map<std::wstring, Matrix<ElemType>*>& matrices, const size_t tidx, std::vector<size_t>& history);

    // create proposals
    void InitProposals(std::map<std::wstring, Matrix<ElemType>*>& pMat);

public:
    bool mEqualLengthOutput;
    bool mAllowMultPassData;

    // return length of sentences size
    std::vector<size_t> mSentenceLengths; // [seqIndex] lengths of all sentences in a minibatch
    size_t mMaxSentenceLength;            // max over mSentenceLength[]  --TODO: why not compute on the fly?
    std::vector<int> mSentenceBeginAt;    // [seqIndex] index of first token
    const int NO_INPUT = -2;
    std::vector<int> mSentenceEndAt; // [seqIndex] index of last token

    MBLayoutPtr m_pMBLayout;

    // if true, reader will set to ((int) MinibatchPackingFlags::None) for time positions that originally correspond to ((int) MinibatchPackingFlags::SequenceStart)
    // set to true so that a current minibatch can use state activities from the previous minibatch.
    // default will have truncated BPTT, which only does BPTT inside a minibatch
    // by default it is false
    bool mIgnoreSentenceBeginTag;
};

// Reader that holds several named BatchLUSequenceReader instances (mReader).
// The member functions declared here without a body are defined elsewhere.
template <class ElemType>
class MultiIOBatchLUSequenceReader : public BatchLUSequenceReader<ElemType>
{
private:
    // Named sub-readers; the pointers are released by the destructor below.
    std::map<std::wstring, BatchLUSequenceReader<ElemType>*> mReader;

    bool mCheckDictionaryKeys;
    std::map<std::wstring, BatchLUSequenceReader<ElemType>*> nameToReader;

public:
    // Both maps start empty by default construction.
    MultiIOBatchLUSequenceReader()
        : mCheckDictionaryKeys(true)
    {
    }

    // Releases every sub-reader stored in mReader.
    // Each map value points to a single BatchLUSequenceReader, so scalar `delete` is
    // used; the previous `delete[]` on a non-array object is undefined behavior.
    // NOTE(review): assumes the readers are allocated with scalar `new` where mReader
    // is populated (not visible in this header) - confirm.
    ~MultiIOBatchLUSequenceReader()
    {
        for (auto& entry : mReader)
        {
            delete entry.second;
            entry.second = nullptr;
        }
        mReader.clear();
    }

    bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices);

    void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples);

    void CopyMBLayoutTo(MBLayoutPtr pMBLayout);

    size_t GetNumParallelSequences();

    // Shared initialization for both configuration record types; the two Init overrides forward here.
    template <class ConfigRecordType>
    void InitFromConfig(const ConfigRecordType&);
    virtual void Init(const ConfigParameters& config) override
    {
        InitFromConfig(config);
    }
    virtual void Init(const ScriptableObjects::IConfigRecord& config) override
    {
        InitFromConfig(config);
    }

public:
    void SetRandomSeed(int);

public:
    int GetSentenceEndIdFromOutputLabel();
    bool DataEnd(EndDataType endDataType);

    // create proposals
    void InitProposals(std::map<std::wstring, Matrix<ElemType>*>& pMat);
    bool GetProposalObs(std::map<std::wstring, Matrix<ElemType>*>& matrices, const size_t tidx, std::vector<size_t>& history);
};
} } }