Content - 715a1d107fd117822267785c71a901e1ee49fae2 - 9908c01/Source/EvalDll/EvalReader.h

visit type:
Tip revision: 8fef21be1d80b02849548dd05993abcebafd4837 authored by Dong Yu on 09 August 2016, 21:51:45 UTC
support minibatchSize (for larger models that will be out of memory without it) and frameMode (for RNNNode based Windowed RNN) for HTKMLFWriter (and HTKMLFReader).
Tip revision: 8fef21b
EvalReader.h
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once

#define DATAREADER_LOCAL
#include "DataReader.h"

namespace Microsoft { namespace MSR { namespace CNTK {

// Evaluation Reader class
// interface to pass to evaluation DLL
template <class ElemType>
class EvalReader : public DataReaderBase
{
    std::map<std::wstring, std::vector<ElemType>*>* m_inputs; // our input data
    std::map<std::wstring, size_t>* m_dimensions;             // the number of rows for the input data
    size_t m_recordCount;                                     // count of records in this data
    size_t m_currentRecord;                                   // next record number to read
    size_t m_mbSize;
    vector<size_t> m_switchFrame;
    size_t m_oldSig;

public:
    // Method to setup the data for the reader
    void SetData(std::map<std::wstring, std::vector<ElemType>*>* inputs, std::map<std::wstring, size_t>* dimensions)
    {
        m_inputs = inputs;
        m_dimensions = dimensions;
        m_currentRecord = 0;
        m_recordCount = 0;
        for (auto iter = inputs->begin(); iter != inputs->end(); ++iter)
        {
            // figure out the dimension of the data
            const std::wstring& val = iter->first;
            size_t count = (*inputs)[val]->size();
            size_t rows = (*dimensions)[val];
            size_t recordCount = count / rows;

            if (m_recordCount != 0)
            {
                // record count must be the same for all the data
                if (recordCount != m_recordCount)
                    RuntimeError("Record Count of %ls (%lux%lu) does not match the record count of previous entries (%lu).", val.c_str(), rows, recordCount, m_recordCount);
            }
            else
            {
                m_recordCount = recordCount;
            }
        }
    }

    void SetBoundary(size_t newSig)
    {
        if (m_switchFrame.size() == 0)
        {
            m_oldSig = newSig;
            m_switchFrame.assign(1, 0);
        }
        else
        {
            if (m_oldSig == newSig)
            {
                m_switchFrame[0] = m_mbSize + 8888; // TODO: WTF??
            }
            else
            {
                m_switchFrame[0] = 0;
                m_oldSig = newSig;
            }
        }
    }

    virtual void Init(const ConfigParameters& /*config*/) override
    {
    }
    virtual void Init(const ScriptableObjects::IConfigRecord& /*config*/) override
    {
    }

    // Destroy - cleanup and remove this class
    // NOTE: this destroys the object, and it can't be used past this point
    virtual void Destroy()
    {
        delete this;
    }

    // EvalReader Constructor
    // config - [in] configuration parameters for the datareader
    template <class ConfigRecordType>
    EvalReader(const ConfigRecordType& config)
    {
        m_recordCount = m_currentRecord = 0;
        Init(config);
    }

    // Destructor - free up the matrix values we allocated
    virtual ~EvalReader()
    {
    }

    // StartMinibatchLoop - Startup a minibatch loop
    // mbSize - [in] size of the minibatch (number of frames, etc.)
    // epoch - [in] epoch number for this loop
    // requestedEpochSamples - [in] number of samples to randomize, defaults to requestDataSize which uses the number of samples there are in the dataset
    virtual void StartMinibatchLoop(size_t mbSize, size_t /*epoch*/, size_t /*requestedEpochSamples=requestDataSize*/)
    {
        m_mbSize = min(mbSize, m_recordCount);
    }

    // TryGetMinibatch - Get the next minibatch (features and labels)
    // matrices - [in] a map with named matrix types (i.e. 'features', 'labels') mapped to the corresponding matrix,
    //             [out] each matrix resized if necessary containing data.
    // returns - true if there are more minibatches, false if no more minibatchs remain
    virtual bool TryGetMinibatch(StreamMinibatchInputs& matrices)
    {
        // how many records are we reading this time
        size_t recordCount = min(m_mbSize, m_recordCount - m_currentRecord);

        // check to see if we are out of records in this current dataset
        if (m_currentRecord >= m_recordCount)
            return false;

        // loop through all the input vectors to copy the data over
        for (auto iter = m_inputs->begin(); iter != m_inputs->end(); ++iter)
        {
            // figure out the dimension of the data
            const auto& name = iter->first;
            size_t rows = (*m_dimensions)[name];
            // size_t count = rows*recordCount;

            // find the output matrix we want to fill
            if (!matrices.HasInput(name))
                RuntimeError("No matrix data found for key '%ls'.", name.c_str());

            // allocate the matrix if we don't have one yet
            auto& matrix = matrices.GetInputMatrix<ElemType>(name);

            // copy over the data
            std::vector<ElemType>* data = iter->second;
            ElemType* dataPtr = data->data() + (m_currentRecord * rows);
            matrix.SetValue(rows, recordCount, matrix.GetDeviceId(), dataPtr, matrixFlagNormal);
        }

        // increment our record pointer
        m_currentRecord += recordCount;

        // return true if we returned any data whatsoever
        return true;
    }

    size_t GetNumParallelSequencesForFixingBPTTMode()
    {
        return 1;
    }

    void SetNumParallelSequences(const size_t)
    {
    }
    void SetSentenceSegBatch(std::vector<size_t>& sentenceEnd)
    {
        sentenceEnd.resize(m_switchFrame.size());
        for (size_t i = 0; i < m_switchFrame.size(); i++)
        {
            sentenceEnd[i] = m_switchFrame[i];
        }
    }
    void CopyMBLayoutTo(MBLayoutPtr pMBLayout)
    {
        assert(m_switchFrame.size() == 1);
        pMBLayout->Init(1, m_mbSize);

        // BUGBUG: The following code is somewhat broken in that the structure of this module only keeps track of new sentence starts,
        //         but not of ends. But end markers are now required by the MBLayout. So we must fake the end markers.
        //         That will fail if the previous sentence end fell on the boundary; then we will miss the end flag.
        //         This still works for a left-to-right model since for eval we only really look at the start flag.
        //         So we get lucky, sort of. Not nice.
        //         The correct solution is to rewrite this entire module to be more direct; no Reader needed, we can call ForwardProp() directly.
        // BUGBUG: The module also does not keep track of the actual start in the past. So we fake the start, too.
        //         There are boundary cases where this will be incorrect for models with a delay of >1 step.
        if (m_switchFrame[0] < m_mbSize) /* there is a switch frame within the minibatch */
        {
            // finish the current sequence
            if (m_switchFrame[0] > 0) // BUGBUG: gonna miss the previous end flag if starting on frame [0], see above.
                pMBLayout->AddSequence(0, 0, -1, m_switchFrame[0] - 1);
            // start the new sequence
            // We use a fake end of 1 frame beyond the actual end of the minibatch.
            pMBLayout->AddSequence(0, 0, m_switchFrame[0], m_mbSize + 1);
            // pMBLayout->Set(0, m_switchFrame[0], MinibatchPackingFlags::SequenceStart);
            // if (m_switchFrame[0] > 0)
            //    pMBLayout->Set(0, m_switchFrame[0] - 1, MinibatchPackingFlags::SequenceEnd);   // TODO: can't we use Set()?
        }
        else // all frames in this MB belong to the same utterance
        {
            // no boundary inide the MB: fake a sequence that spans 1 frame on each side.  BUGBUG: That's wrong for delays of > 1 step, see above.
            pMBLayout->AddSequence(0, 0, -1, m_mbSize + 1); // BUGBUG: gonna miss the end flag if it ends at end of this MB, see above
        }
    }

    void GetSentenceBoundary(std::vector<size_t> boundaryInfo)
    {
        m_switchFrame.resize(boundaryInfo.size());
        for (size_t i = 0; i < m_switchFrame.size(); i++)
            m_switchFrame[i] = boundaryInfo[i];
    }

    void SetRandomSeed(int)
    {
        NOT_IMPLEMENTED;
    }

    // GetLabelMapping - Gets the label mapping from integer index to label type
    // returns - a map from numeric datatype to native label type
    virtual const std::map<typename EvalReader<ElemType>::LabelIdType, typename EvalReader<ElemType>::LabelType>& GetLabelMapping(const std::wstring& /*sectionName*/)
    {
        static std::map<typename EvalReader<ElemType>::LabelIdType, typename EvalReader<ElemType>::LabelType> labelMap;
        return labelMap;
    }

    // SetLabelMapping - Sets the label mapping from integer index to label
    // labelMapping - mapping table from label values to IDs (must be 0-n)
    // note: for tasks with labels, the mapping table must be the same between a training run and a testing run
    virtual void SetLabelMapping(const std::wstring& /*sectionName*/, const std::map<typename EvalReader<ElemType>::LabelIdType, typename EvalReader<ElemType>::LabelType>& /*labelMapping*/)
    {
    }

    // GetData - Gets metadata from the specified section (into CPU memory)
    // sectionName - section name to retrieve data from
    // numRecords - number of records to read
    // data - pointer to data buffer, if NULL, dataBufferSize will be set to size of required buffer to accomidate request
    // dataBufferSize - [in] size of the databuffer in bytes
    //                  [out] size of buffer filled with data
    // recordStart - record to start reading from, defaults to zero (start of data)
    // returns: true if data remains to be read, false if the end of data was reached
    virtual bool GetData(const std::wstring& /*sectionName*/, size_t /*numRecords*/, void* /*data*/, size_t& /*dataBufferSize*/, size_t /*recordStart=0*/)
    {
        return false;
    }

    virtual bool DataEnd()
    {
        return m_currentRecord < m_recordCount;
    }

    virtual bool GetMinibatch4SE(std::vector<shared_ptr<const msra::dbn::latticepair>>& /*latticeinput*/, vector<size_t>& /*uids*/,
                                 vector<size_t>& /*boundaries*/, vector<size_t>& /*extrauttmap*/)
    {
        return true;
    }

    virtual bool GetHmmData(msra::asr::simplesenonehmm* /*hmm*/)
    {
        return true;
    }

    virtual void SetValidFrameInBatch(vector<size_t>& /*validFrame*/)
    {
        return;
    }
};
} } }
Browse the archive

https://github.com/Microsoft/CNTK