//
// <copyright file="utterancesourcemulti.h" company="Microsoft">
//     Copyright (c) Microsoft Corporation.  All rights reserved.
// </copyright>
//
// utterancesourcemulti.h -- implementation of utterancesource.h that supports multiple feature and label sets
//

#pragma once

#include "basetypes.h"                  // for attempt()
#include "htkfeatio.h"                  // for htkmlfreader
#include "latticearchive.h"             // for reading HTK phoneme lattices (MMI training)
#include "minibatchsourcehelpers.h"
#include "minibatchiterator.h"

namespace msra { namespace dbn {

// ---------------------------------------------------------------------------
// minibatchutterancesource -- feature source to provide randomized utterances
// This also implements a frame-wise mode, which is layered on top of the utterance-wise mode
// and thus benefits from its goodies such as corpus-wide high-level randomization and chunk paging.
// ---------------------------------------------------------------------------
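// Illustrative usage sketch (informational comment only; the setup variables below are hypothetical
// and simplified -- the actual driving code lives in the readers that own this source):
//
//   // 'featuresections', 'infiles', 'labels', 'vdim', 'udim', 'leftcontext', 'rightcontext',
//   // 'lattices' and 'transcripts' are assumed to have been built from the reader configuration
//   msra::dbn::minibatchutterancesourcemulti source (featuresections, infiles, labels,
//                                                    vdim, udim, leftcontext, rightcontext,
//                                                    /*randomizationrange=*/ 48 * 3600 * 100,  // full window, e.g. 48 hours at 100 frames/sec
//                                                    lattices, transcripts, /*framemode=*/ true);
//   // pass empty 'labels' to run unsupervised (getbatch() will then not return uids);
//   // minibatches are then pulled through the minibatchsource/minibatchiterator interface (see minibatchiterator.h)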
class minibatchutterancesourcemulti : public minibatchsource
{
    void operator=(const minibatchutterancesourcemulti & other); // non-assignable
    std::vector<size_t> vdim;                    // feature dimension after augmenting neighbors
    std::vector<size_t> leftcontext;                // number of frames to the left of the target frame in the context window
    std::vector<size_t> rightcontext;               // number of frames to the right of the target frame in the context window
    std::vector<unsigned int> sampperiod;        // (for reference and to check against model)
    std::vector<string> featkind;
    std::vector<size_t> featdim;
    const bool framemode;           // true -> actually return frame-level randomized frames (not possible in lattice mode)
    std::vector<std::vector<size_t>> counts;     // [s] occurrence count for all states (used for priors)
    int verbosity;
    // lattice reader
    //const std::vector<unique_ptr<latticesource>> &lattices;
    const latticesource & lattices;

    //std::vector<latticesource> lattices;
    // word-level transcripts (for MMI mode when adding best path to lattices)
    const map<wstring,msra::lattices::lattice::htkmlfwordsequence> & allwordtranscripts; // (used for getting word-level transcripts)
    //std::vector<map<wstring,msra::lattices::lattice::htkmlfwordsequence>> allwordtranscripts;
    // data store (incl. paging in/out of features and lattices)
    struct utterancedesc            // data descriptor for one utterance
    {
        msra::asr::htkfeatreader::parsedpath parsedpath;    // archive filename and frame range in that file
        size_t classidsbegin;       // index into allclassids[] array (first frame)

        utterancedesc (msra::asr::htkfeatreader::parsedpath && ppath, size_t classidsbegin) : parsedpath (std::move (ppath)), classidsbegin (classidsbegin) {}

        const wstring & logicalpath() const { return parsedpath; /*type cast will return logical path*/ }
        size_t numframes() const { return parsedpath.numframes(); }
        const wstring key() const                           // key used for looking up lattice (not stored to save space)
        {
            // the logical path is the uttid, not a filename. don't need to remove extension
            return logicalpath();
        }
    };
    struct utterancechunkdata       // data for a chunk of utterances
    {
        std::vector<utterancedesc> utteranceset;    // utterances in this set
        size_t numutterances() const { return utteranceset.size(); }

        std::vector<size_t> firstframes;    // [utteranceindex] first frame for given utterance
        mutable msra::dbn::matrix frames;   // stores all frames consecutively (mutable since this is a cache)
        size_t totalframes;         // total #frames for all utterances in this chunk
        mutable std::vector<shared_ptr<const latticesource::latticepair>> lattices;   // (may be empty if none)

        // construction
        utterancechunkdata() : totalframes (0) {}
        //utterancechunkdata (const utterancechunkdata& other) : utteranceset(other.utteranceset), firstframes(other.firstframes), frames (other.frames), totalframes (other.totalframes), lattices (other.lattices){};
        void push_back (utterancedesc &&/*destructive*/ utt)
        {
            //printf ("start push %d %d\n",frames.rows(), frames.cols());
            if (isinram())
                throw std::logic_error ("utterancechunkdata: frames already paged into RAM--too late to add data");
            firstframes.push_back (totalframes);
            totalframes += utt.numframes();
            utteranceset.push_back (utt);
        }

        // accessors to an utterance's data
        size_t numframes (size_t i) const { return utteranceset[i].numframes(); }
        size_t getclassidsbegin (size_t i) const { return utteranceset[i].classidsbegin; }
        msra::dbn::matrixstripe getutteranceframes (size_t i) const // return the frame set for a given utterance
        {
            if (!isinram())
                throw std::logic_error ("getutteranceframes: called when data have not been paged in");
            const size_t ts = firstframes[i];
            const size_t n = numframes(i);
            return msra::dbn::matrixstripe (frames, ts, n);
        }
        shared_ptr<const latticesource::latticepair> getutterancelattice (size_t i) const // return the lattice for a given utterance
        {
            if (!isinram())
                throw std::logic_error ("getutterancelattice: called when data have not been paged in");
            return lattices[i];
        }

        // paging
        // test if data is in memory at the moment
        bool isinram() const { return !frames.empty(); }
        // page in data for this chunk
        // We pass in the feature info variables by ref which will be filled lazily upon first read
        void requiredata (string & featkind, size_t & featdim, unsigned int & sampperiod, const latticesource & latticesource, int verbosity=0) const
        {

            if (numutterances() == 0)
                throw std::logic_error ("requiredata: cannot page in virgin block");
            if (isinram())
                throw std::logic_error ("requiredata: called when data is already in memory");
            try             // this function supports retrying since we read from the unreliable network, i.e. do not return in a broken state
            {
                msra::asr::htkfeatreader reader;    // feature reader (we reinstantiate it for each block, i.e. we reopen the file actually)
                // if this is the first feature read ever, we explicitly open the first file to get the information such as feature dimension
                if (featdim == 0)
                {
                    reader.getinfo (utteranceset[0].parsedpath, featdim);
                    fprintf (stderr, "requiredata: determined feature kind as %zu-dimensional\n", featdim);
                }
                // read all utterances; if they are in the same archive, htkfeatreader will be efficient in not closing the file
                frames.resize (featdim, totalframes);
                if (!latticesource.empty())
                    lattices.resize (utteranceset.size());
                foreach_index (i, utteranceset)
                {
                    //fprintf (stderr, ".");
                    // read features for this file
                    auto uttframes = getutteranceframes (i);    // matrix stripe for this utterance (currently unfilled)
                    reader.readNoAlloc (utteranceset[i].parsedpath, (const string &) featkind, sampperiod, uttframes);  // note: file info here used for checking only
                    // page in lattice data
                    if (!latticesource.empty())
                        latticesource.getlattices (utteranceset[i].key(), lattices[i], uttframes.cols());
                }
                //fprintf (stderr, "\n");
                if (verbosity)
                {
                    fprintf (stderr, "requiredata: %zu utterances read\n", utteranceset.size());
                }
            }
            catch (...)
            {
                releasedata();
                throw;
            }
        }
        // page out data for this chunk
        void releasedata() const
        {
            if (numutterances() == 0)
                throw std::logic_error ("releasedata: cannot page out virgin block");
            if (!isinram())
                throw std::logic_error ("releasedata: called when data is not memory");
            // release frames
            frames.resize (0, 0);
            // release lattice data
            lattices.clear();
        }
    };
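    // Typical paging lifecycle of one utterancechunkdata (informational comment only; 'chunkdata',
    // 'm' and 'i' are hypothetical names for a chunk, its feature stream and an utterance index):
    //
    //   if (!chunkdata.isinram())
    //       chunkdata.requiredata (featkind[m], featdim[m], sampperiod[m], lattices, verbosity);  // page features (and lattices) in
    //   auto uttframes = chunkdata.getutteranceframes (i);    // matrix stripe for utterance i
    //   ...                                                   // consume frames while the chunk is inside the paging window
    //   chunkdata.releasedata();                              // page out again once the window has moved past this chunk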
    std::vector<std::vector<utterancechunkdata>> allchunks;          // set of utterances organized in chunks, referred to by an iterator (not an index)
    std::vector<unique_ptr<biggrowablevector<CLASSIDTYPE>>> classids;            // [classidsbegin+t] concatenation of all state sequences
    bool issupervised() const { return !classids.empty(); }
    size_t numutterances;           // total number of utterances
    size_t _totalframes;             // total frames (same as classids.size() if we have labels)
    double timegetbatch;            // [v-hansu] for time measurement
    // sequence in random order of actual use (randomized, where randomization is cached)
    const size_t randomizationrange;// parameter remembered; this is the full window (e.g. 48 hours), not the half window
    size_t currentsweep;            // randomization is currently cached for this sweep; if it changes, rebuild all below
    struct chunk                    // chunk as used in actual processing order (randomized sequence)
    {
        // the underlying chunk (as a non-indexed reference into the chunk set)
        std::vector<utterancechunkdata>::const_iterator uttchunkdata;
        const utterancechunkdata & getchunkdata() const { return *uttchunkdata; }
        size_t numutterances() const { return uttchunkdata->numutterances(); }
        size_t numframes() const { return uttchunkdata->totalframes; }

        // position in utterance-position space
        size_t utteranceposbegin;
        size_t utteranceposend() const { return utteranceposbegin + numutterances(); }

        // position on global time line
        size_t globalts;            // start frame on global timeline (after randomization)
        size_t globalte() const { return globalts + numframes(); }

        // randomization range limits
        size_t windowbegin;         // randomizedchunk index of earliest chunk that utterances in here can be randomized with
        size_t windowend;           // and end index [windowbegin, windowend)
        chunk (std::vector<utterancechunkdata>::const_iterator uttchunkdata, size_t utteranceposbegin, size_t globalts) : uttchunkdata (uttchunkdata), utteranceposbegin (utteranceposbegin), globalts (globalts) {}
    };
    std::vector<std::vector<chunk>> randomizedchunks;  // utterance chunks after being brought into random order (we randomize within a rolling window over them)
    size_t chunksinram;             // (for diagnostics messages)
    struct utteranceref             // describes the underlying random utterance associated with an utterance position
    {
        size_t chunkindex;          // lives in this chunk (index into randomizedchunks[])
        size_t utteranceindex;      // utterance index in that chunk
        size_t numframes;           // (cached since we cannot directly access the underlying data from here)
        size_t globalts;            // start frame in global space after randomization (for mapping frame index to utterance position)
        size_t globalte() const { return globalts + numframes; }            // end frame
        utteranceref (size_t chunkindex, size_t utteranceindex) : chunkindex (chunkindex), utteranceindex (utteranceindex), globalts (SIZE_MAX), numframes (0) {}
        void swap (utteranceref & other)   // used in randomization
        {
            ::swap (chunkindex, other.chunkindex);
            ::swap (utteranceindex, other.utteranceindex);
            assert (globalts == SIZE_MAX && other.globalts == SIZE_MAX && numframes == 0 && other.numframes == 0);    // can only swap before assigning these
        }
    };
    std::vector<utteranceref> randomizedutterancerefs;          // [pos] randomized utterance ids
    std::unordered_map<size_t,size_t> randomizedutteranceposmap;     // [globalts] -> pos lookup table
    struct positionchunkwindow       // chunk window required in memory when at a certain position, for controlling paging
    {
        std::vector<chunk>::const_iterator definingchunk;       // the chunk in randomizedchunks[] that defined the utterance position of this utterance
        size_t windowbegin() const { return definingchunk->windowbegin; }
        size_t windowend() const { return definingchunk->windowend; }
        bool isvalidforthisposition (const utteranceref & utt) const
        {
            return utt.chunkindex >= windowbegin() && utt.chunkindex < windowend(); // check if 'utt' lives in the allowed range for this position
        }
        positionchunkwindow (std::vector<chunk>::iterator definingchunk) : definingchunk (definingchunk) {}
    };
    std::vector<positionchunkwindow> positionchunkwindows;      // [utterance position] -> [windowbegin, windowend) for controlling paging

    // frame-level randomization layered on top of utterance chunking (randomized, where randomization is cached)
    struct frameref
    {
#ifdef  _WIN64 // (sadly, the compiler makes this 8 bytes, not 6)
        unsigned short chunkindex;           // lives in this chunk (index into randomizedchunks[])
        unsigned short utteranceindex;       // utterance index in that chunk
        static const size_t maxutterancesperchunk = 65535;
        unsigned short frameindex;           // frame index within the utterance
        static const size_t maxframesperutterance = 65535;
#elif __unix__ // (sadly, the compiler makes this 8 bytes, not 6)
        unsigned short chunkindex;           // lives in this chunk (index into randomizedchunks[])
        unsigned short utteranceindex;       // utterance index in that chunk
        static const size_t maxutterancesperchunk = 65535;
        unsigned short frameindex;           // frame index within the utterance
        static const size_t maxframesperutterance = 65535;
#else   // For Win32, we care to keep it inside 32 bits. We have already encountered setups where that's not enough.
        unsigned int chunkindex : 13;           // lives in this chunk (index into randomizedchunks[])
        unsigned int utteranceindex : 8;        // utterance index in that chunk
        static const size_t maxutterancesperchunk = 255;
        unsigned int frameindex : 11;           // frame index within the utterance
        static const size_t maxframesperutterance = 2047;
#endif
        frameref (size_t ci, size_t ui, size_t fi) : chunkindex ((unsigned short) ci), utteranceindex ((unsigned short) ui), frameindex ((unsigned short) fi)
        {
#if !defined(_WIN64) && !defined(__unix__)  // the size check applies only to the 32-bit bit-field layout above
            static_assert (sizeof (frameref) == 4, "frameref: bit fields too large to fit into 32-bit integer");
#endif
            if (ci == chunkindex && ui == utteranceindex && fi == frameindex)
                return;
            throw std::logic_error ("frameref: bit fields too small");
        }
        frameref() : chunkindex (0), utteranceindex (0), frameindex (0) {}
    };
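    // Capacity note (informational): on 64-bit Windows and on Unix builds a frameref uses three 16-bit fields,
    // allowing up to 65535 utterances per chunk and 65535 frames per utterance; on 32-bit Windows the packed
    // bit fields limit this to 255 utterances per chunk and 2047 frames (~20 seconds at 100 frames/sec) per
    // utterance. The constructor round-trips each field and throws if a value was truncated, e.g. (hypothetical values):
    //   frameref fr (/*ci=*/ 12, /*ui=*/ 3, /*fi=*/ 250);   // throws std::logic_error if any field overflows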
    biggrowablevector<frameref> randomizedframerefs;  // [globalt-sweepts] -> (chunk, utt, frame) lookup table for randomized frames  --this can be REALLY big!

    // TODO: this may go away if we store classids directly in the utterance data
    template<class VECTOR> class shiftedvector  // accessing a vector with a non-0 starting index
    {
        void operator= (const shiftedvector &);
        VECTOR & v;
        size_t first;
        size_t n;
        void check (size_t i) const { if (i >= n) throw std::logic_error ("shiftedvector: index out of bounds"); }
    public:
        shiftedvector (VECTOR & v, size_t first, size_t n) : v (v), first (first), n (n) { }
        // TODO: the following is not templated--do it if needed; also should return a const reference then
        size_t operator[] (size_t i) const { check (i); return v[first + i]; }
    };
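    // Illustrative use of shiftedvector (informational comment only; 'cids', 'classidsbegin' and
    // 'numframes' are hypothetical locals):
    //   biggrowablevector<CLASSIDTYPE> cids;                                       // global concatenation of state sequences
    //   shiftedvector<biggrowablevector<CLASSIDTYPE>> utt (cids, classidsbegin, numframes);
    //   size_t firststate = utt[0];   // == cids[classidsbegin + 0], with range check against 'numframes'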
    template<class UTTREF> std::vector<shiftedvector<biggrowablevector<CLASSIDTYPE>>> getclassids (const UTTREF & uttref)  // return sub-vector of classids[] for a given utterance
    {
        std::vector<shiftedvector<biggrowablevector<CLASSIDTYPE>>> allclassids;
        allclassids.clear();

        if (!issupervised())
        {
            foreach_index(i,classids)
                allclassids.push_back(std::move(shiftedvector<biggrowablevector<CLASSIDTYPE>> ((*classids[i]), 0, 0)));
            return allclassids;     // nothing to return
        }
        const auto & chunk = randomizedchunks[0][uttref.chunkindex];
        const auto & chunkdata = chunk.getchunkdata();
        const size_t classidsbegin = chunkdata.getclassidsbegin (uttref.utteranceindex); // index of first state label in global concatenated classids[] array
        const size_t n = chunkdata.numframes (uttref.utteranceindex);
        foreach_index(i,classids)
        {
            if ((*classids[i])[classidsbegin + n] != (CLASSIDTYPE) -1)
                throw std::logic_error ("getclassids: expected boundary marker not found, internal data structure screwed up");
            allclassids.push_back(std::move(shiftedvector<biggrowablevector<CLASSIDTYPE>> ((*classids[i]), classidsbegin, n)));
        }
        return allclassids;
    }
public:
    // constructor
    // Pass empty labels to denote unsupervised training (so getbatch() will not return uids).
    // This mode requires utterances with time stamps.
    minibatchutterancesourcemulti (std::vector<msra::asr::FeatureSection *> & featuresections, const std::vector<std::vector<wstring>> & infiles, const std::vector<map<wstring,std::vector<msra::asr::htkmlfentry>>> & labels,
                              std::vector<size_t> vdim, std::vector<size_t> udim, std::vector<size_t> leftcontext, std::vector<size_t> rightcontext, size_t randomizationrange,
                              const latticesource & lattices, const map<wstring,msra::lattices::lattice::htkmlfwordsequence> & allwordtranscripts, const bool framemode)
        : vdim (vdim), leftcontext(leftcontext), rightcontext(rightcontext), sampperiod (0), featdim (0), randomizationrange (randomizationrange), currentsweep (SIZE_MAX),
          lattices (lattices), allwordtranscripts (allwordtranscripts), framemode (framemode), chunksinram (0), timegetbatch (0), verbosity(2)    
        // [v-hansu] change framemode (lattices.empty()) into framemode (false) to run utterance mode without lattice
        // you also need to change another line, search : [v-hansu] comment out to run utterance mode without lattice
    {    
        // process infiles to know dimensions of things (but not loading features)
        std::vector<utterancedesc> utteranceset;// read all utterances to here first; at the end, distribute to chunks
        utteranceset.reserve (infiles.size());
        size_t nomlf = 0;                       // number of entries missing in MLF (diagnostics)
        size_t nolat = 0;                       // number of entries missing in lattice archive (diagnostics)
        std::vector<size_t> numclasses;                  // number of output classes as found in the label file (diagnostics)
        _totalframes = 0;
        wstring key;
        size_t numutts=0;
        
        std::vector<bool> uttisvalid;    // boolean flag to check that utterance is valid. valid means number of
                                         // frames is consistent across all feature and label streams
        std::vector<size_t> uttduration; // track utterance durations to determine utterance validity

        std::vector<size_t> classidsbegin;
        if (!lattices.empty())
        {
            LogicError("lattices not supported in utterancereadermulti");
        }

        allchunks = std::vector<std::vector<utterancechunkdata>>(infiles.size(), std::vector<utterancechunkdata>());
        featdim = std::vector<size_t>(infiles.size(), 0);
        sampperiod = std::vector<unsigned int>(infiles.size(), 0);
        featkind = std::vector<string>(infiles.size(), "");
        
        numclasses = std::vector<size_t>(labels.size(), 0);
        counts = std::vector<std::vector<size_t>>(labels.size(), std::vector<size_t>());
        foreach_index (i, labels)
        {
            classids.push_back(unique_ptr<biggrowablevector<CLASSIDTYPE>>(new biggrowablevector<CLASSIDTYPE>()));
            //std::pair<std::vector<wstring>,std::vector<wstring>> latticetocs;
            //std::unordered_map<std::string,size_t> modelsymmap;
            //lattices.push_back(shared_ptr<latticesource>(new latticesource(latticetocs, modelsymmap)));
    
        }


        // first check consistency across feature streams
        // We'll go through the SCP files for each stream to make sure the duration is consistent
        // If not, we'll plan to ignore the utterance, and inform the user
        // m indexes the feature stream
        // i indexes the files within a stream (i.e. within the SCP file)
        foreach_index(m, infiles){
            if (m == 0){
                numutts = infiles[m].size();
                uttisvalid = std::vector<bool>(numutts, true);
                uttduration = std::vector<size_t>(numutts, 0);
            }
            else if (infiles[m].size()!=numutts)
                throw std::runtime_error("minibatchutterancesourcemulti: all feature files must have same number of utterances");

            foreach_index(i, infiles[m]){
                utterancedesc utterance (msra::asr::htkfeatreader::parsedpath (infiles[m][i], featuresections[m]), 0);  //mseltzer - is this foolproof for multiio? is classids always non-empty?
                const size_t uttframes = utterance.numframes(); // will throw if frame bounds not given --required to be given in this mode
                // we need at least 2 frames for boundary markers to work
                if (uttframes < 2)
                {
                    //throw std::runtime_error("minibatchutterancesource: utterances < 2 frames not supported");
                    fprintf(stderr, "minibatchutterancesource: skipping %d-th file (%zd frames) because it has fewer than 2 frames (needed for boundary markers): %S\n",
                            i, uttframes, utterance.key().c_str());
                    uttduration[i] = 0;
                    uttisvalid[i] = false;
                }
                else if (uttframes > frameref::maxframesperutterance)
                {
                    fprintf(stderr, "minibatchutterancesource: skipping %d-th file (%zd frames) because it exceeds max. frames (%zd) for frameref bit field: %S\n",
                            i, uttframes, frameref::maxframesperutterance, utterance.key().c_str());
                    uttduration[i] = 0;
                    uttisvalid[i] = false;
                }
                else{
                    if (m == 0){
                        uttduration[i] = uttframes;
                        uttisvalid[i] = true;
                    }
                    else if (uttduration[i] != uttframes){
                        fprintf(stderr, "minibatchutterancesource: skipping %d-th file due to inconsistency in duration in different feature streams (%zd vs %zd frames)\n",
                                i, uttduration[i], uttframes);
                        uttduration[i] = 0;
                        uttisvalid[i] = false;
                    }
                }
            }
        }
        size_t invalidutts=0;
        foreach_index(i, uttisvalid){
            if (!uttisvalid[i])
                invalidutts++;
        }
        if (invalidutts > uttisvalid.size() / 2)
            throw std::runtime_error("minibatchutterancesource: too many files not found in with inconsistent durations, assuming broken configuration\n");
        else if (invalidutts>0)
            fprintf(stderr, "Found inconsistent durations across feature streams in %zd out of %zd files.\n", invalidutts, uttisvalid.size());


        // now process the features and labels
        size_t utterancesetsize = 0;
        foreach_index (m, infiles)
        {
            utteranceset.clear();
            //if (m==0)
            //    numutts = infiles[m].size();
            //else
            //    if (infiles[m].size()!=numutts)
            //        throw std::runtime_error("minibatchutterancesourcemulti: all feature files must have same number of utterances\n");
            if (m==0)
                classidsbegin.clear();
            
            foreach_index (i, infiles[m])
            {
                if (i % (infiles[m].size() / 100 + 1) == 0) { fprintf (stderr, "."); fflush (stderr); }
                // build utterance descriptor
                if (m == 0 && !labels.empty())
                    classidsbegin.push_back(classids[0]->size());
                    
                if (uttisvalid[i]){
                utterancedesc utterance (msra::asr::htkfeatreader::parsedpath (infiles[m][i], featuresections[m]), labels.empty() ? 0 : classidsbegin[i] );  //mseltzer - is this foolproof for multiio? is classids always non-empty? 
                const size_t uttframes = utterance.numframes(); // will throw if frame bounds not given --required to be given in this mode
                    assert(uttframes == uttduration[i]); // ensure nothing funky happened

                    // already performed these checks above
                    // we need at least 2 frames for boundary markers to work
                    //if (uttframes < 2)
                    //    throw std::runtime_error ("minibatchutterancesource: utterances < 2 frames not supported");
                    //if (uttframes > frameref::maxframesperutterance)
                    //{
                    //    fprintf (stderr, "minibatchutterancesource: skipping %d-th file (%d frames) because it exceeds max. frames (%d) for frameref bit field: %S", i, uttframes, frameref::maxframesperutterance, key.c_str());
                    //    continue;
                    //}

                // check whether we have the ref transcript
                //auto labelsiter = labels[0].end();
                bool lacksmlf = true;
                if (!labels.empty())    // empty means unsupervised mode (don't load any)
                {
                    key = utterance.key();
                    // check if labels are available (if not, it normally means that no path was found in realignment)
                    auto labelsiter = labels[0].find (key);
                    //const bool lacksmlf = (labelsiter == labels[0].end());
                    lacksmlf = (labelsiter == labels[0].end());
                    if (lacksmlf)
                        if (nomlf++ < 5)
                            fprintf (stderr, " [no labels for  %S]", key.c_str());
                    // check if lattice is available (when in lattice mode)
                    // TODO: also check the #frames here; requires a design change of the TOC format & a rerun
                    const bool lackslat = !lattices.empty() && !lattices.haslattice (key); // true if we use lattices but have none for this utterance
                    if (lackslat)
                        if (nolat++ < 5)
                            fprintf (stderr, " [no lattice for %S]", key.c_str());
                    // skip if either one is missing
                    if (lacksmlf || lackslat)
                    {
                        uttisvalid[i] = false;
                        continue;   // skip this utterance at all
                    }
                }
                // push the label sequence into classids[], since we already looked it up
                // TODO: we can store labels more efficiently now since we don't do frame-wise random access anymore.
    
                // OK, utterance has all we need --remember it

                if (m==0)
                {
                    if (!labels.empty() && !lacksmlf)
                    //if (!labels.empty() && labelsiter != labels[0].end())
                    {
                         // first verify that all the label files have the proper duration
                        foreach_index (j, labels)
                        {
                            const auto & labseq = labels[j].find(key)->second;
                            // check if durations match; skip if not
                            size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size()-1].firstframe + labseq[labseq.size()-1].numframes);
                            if (labframes != uttframes)
                            {
                                fprintf (stderr, " [duration mismatch (%zu in label vs. %zu in feat file), skipping %S]", labframes, uttframes, key.c_str());
                                nomlf++;
                                uttisvalid[i] = false;
                                break; // continue;   // skip this utterance at all
                            }
                        }
                        if (uttisvalid[i])
                        {
                            utteranceset.push_back(std::move(utterance));
                            _totalframes += uttframes;
                            // then parse each mlf if the durations are consistent
                            foreach_index(j, labels)
                            {
                                const auto & labseq = labels[j].find(key)->second;
                        
                                // expand classid sequence into flat array
                                foreach_index (i, labseq)
                                {
                                    const auto & e = labseq[i];
                                    if ((i > 0 && labseq[i-1].firstframe + labseq[i-1].numframes != e.firstframe) || (i == 0 && e.firstframe != 0))
                                        throw std::runtime_error (msra::strfun::strprintf ("minibatchutterancesource: labels not in consecutive order MLF in label set: %S", key.c_str()));
                                    if (e.classid >= udim[j])
                                    {
                                        throw std::runtime_error (msra::strfun::strprintf ("minibatchutterancesource: class id exceeds model output dimension"));
                                    }
                                    if (e.classid != (CLASSIDTYPE) e.classid)
                                        throw std::runtime_error ("CLASSIDTYPE has too few bits");
                                    for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++)
                                        classids[j]->push_back ((CLASSIDTYPE) e.classid);
                                    numclasses[j] = max (numclasses[j], (size_t)(1u + e.classid));
                                    counts[j].resize (numclasses[j], 0);
                                    counts[j][e.classid] += e.numframes;
                                }
                                classids[j]->push_back ((CLASSIDTYPE) -1);  // append a boundary marker for checking
    
                                if (!labels[j].empty() && classids[j]->size() != _totalframes + utteranceset.size())
                                    throw std::logic_error (msra::strfun::strprintf ("minibatchutterancesource: label duration inconsistent with feature file in MLF label set: %S", key.c_str()));
                                assert (labels[j].empty() || classids[j]->size() == _totalframes + utteranceset.size());
                            }
                        }
                    }
                    else
                    {
                        assert(classids.empty() && labels.empty());
                        utteranceset.push_back(std::move(utterance));
                        _totalframes += uttframes;
                    }
                }
                else
                {
                    utteranceset.push_back(std::move(utterance));
                }
                }   // end if (uttisvalid[i])
            }
            if (m == 0) 
                utterancesetsize = utteranceset.size();
            else 
                assert(utteranceset.size() == utterancesetsize);
            fprintf (stderr, "feature set %d: %zu frames in %zu out of %zu utterances\n", m, _totalframes, utteranceset.size(),infiles[m].size());

            if (!labels.empty()){
                foreach_index (j, labels){
                    biggrowablevector<CLASSIDTYPE> & cid = *classids[j];
                    foreach_index (i, utteranceset){
                        //if ((*classids[j])[utteranceset[i].classidsbegin + utteranceset[i].numframes()] != (CLASSIDTYPE) -1)
                        //printf("index = %d\n",utteranceset[i].classidsbegin + utteranceset[i].numframes());
                        //printf("cid[index] = %d\n",cid[utteranceset[i].classidsbegin + utteranceset[i].numframes()]);
                        //printf("CLASSIDTYPE(-1) = %d\n",(CLASSIDTYPE) -1);
                        if (cid[utteranceset[i].classidsbegin + utteranceset[i].numframes()] != (CLASSIDTYPE) -1)
                            throw std::logic_error ("minibatchutterancesource: classids[] out of sync");
                    }
                }
            }
            if (nomlf + nolat > 0)
            {
                fprintf (stderr, "minibatchutterancesource: out of %zu files, %zu files not found in label set and %zu have no lattice\n", infiles.size(), nomlf, nolat);
                if (nomlf + nolat > infiles[m].size() / 2)
                    throw std::runtime_error ("minibatchutterancesource: too many files not found in label set--assuming broken configuration\n");
            }

            if (m==0) {foreach_index(j, numclasses) { fprintf(stderr,"label set %d: %zu classes\n",j, numclasses[j]); } }
            // distribute them over chunks
            // We simply count off frames until we reach the chunk size.
            // Note that we first randomize the chunks, i.e. when used, chunks are non-consecutive and thus cause the disk head to seek for each chunk.
            const size_t framespersec = 100;                    // we just assume this; our efficiency calculation is based on this
            const size_t chunkframes = 15 * 60 * framespersec;  // number of frames to target for each chunk
            // Loading an initial 24-hour range will involve 96 disk seeks, acceptable.
            // When paging chunk by chunk, chunk size ~14 MB.
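            // Arithmetic behind the numbers above (informational; the feature dimension is an assumed example):
            //   chunkframes = 15 min * 60 s * 100 frames/s = 90,000 frames per chunk
            //   24 h of data = 24 * 3600 * 100 = 8,640,000 frames -> 8,640,000 / 90,000 = 96 chunks, i.e. 96 seeks
            //   with e.g. 39-dimensional float features: 90,000 frames * 39 * 4 bytes ~= 14 MB per chunk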
            std::vector<utterancechunkdata> & thisallchunks = allchunks[m];
            //std::vector<utterancechunkdata>  thisallchunks;

            thisallchunks.resize (0);
            thisallchunks.reserve (_totalframes / chunkframes);
            foreach_index (i, utteranceset)
            {
                // if exceeding current entry--create a new one
                // I.e. our chunks are a little larger than wanted (on av. half the av. utterance length).
                if (thisallchunks.empty() || thisallchunks.back().totalframes > chunkframes || thisallchunks.back().numutterances() >= frameref::maxutterancesperchunk)
                {
                    thisallchunks.push_back (utterancechunkdata());
                }
                // append utterance to last chunk
                utterancechunkdata & currentchunk = thisallchunks.back();

                currentchunk.push_back (std::move (utteranceset[i]));    // move it out from our temp array into the chunk
                // TODO: above push_back does not actually 'move' because the internal push_back does not accept that
            }

            numutterances = utteranceset.size();
            fprintf (stderr, "minibatchutterancesource: %zu utterances grouped into %zu chunks, av. chunk size: %.1f utterances, %.1f frames\n",
                numutterances, thisallchunks.size(), numutterances / (double) thisallchunks.size(), _totalframes / (double) thisallchunks.size());
            // Now utterances are stored exclusively in allchunks[]. They are never referred to by a sequential utterance id at this point, only by chunk/within-chunk index.
        }
        // preliminary mem allocation for frame references (if in frame mode)
        if (framemode)
            randomizedframerefs.resize (_totalframes);
    }

private:
    // shuffle a vector into random order by randomly swapping elements

    template<typename VECTOR> static void randomshuffle (VECTOR & v, size_t randomseed)
    {
        if (v.size() > RAND_MAX * (size_t) RAND_MAX)
            throw std::runtime_error ("randomshuffle: too large set: need to change to different random generator!");
        srand ((unsigned int) randomseed);
        foreach_index (i, v)
        {
            // pick a random location
            const size_t irand = msra::dbn::rand (0, v.size());

            // swap element i with it
            if (irand == (size_t) i)
                continue;
            ::swap (v[i], v[irand]);
        }
    }
#if 0
    template<typename VECTOR> static void randomshuffle(std::vector<VECTOR &> v, size_t randomseed)
    {
        foreach_index(j, v)
        {
           if (v[j].size() > RAND_MAX * (size_t) RAND_MAX)
            throw std::runtime_error ("randomshuffle: too large set: need to change to different random generator!");
        }
        srand ((unsigned int) randomseed);
        
        foreach_index (i, v[0])
        {
           // pick a random location
            const size_t irand = msra::dbn::rand (0, v[0].size());

            foreach_index(j, v){
            // swap element i with it
                if (irand == (size_t) i)
                    continue;
                ::swap (v[j][i], v[j][irand]);
            }
        }
    }
#endif //0
    static void checkoverflow (size_t fieldval, size_t targetval, const char * fieldname)
    {
        if (fieldval != targetval)
            throw std::runtime_error (msra::strfun::strprintf ("checkoverflow: bit field %s too small for value 0x%x (cut from 0x%x)", fieldname, targetval, fieldval));
    }

    // helper for testing whether a swapped frame position is valid (w.r.t. being in RAM when being at position 't')
    bool isframepositionvalid (const size_t t, const biggrowablevector<unsigned short> & ttochunk) const
    {
        // look up valid range for time position
        const size_t positionchunkindex = ttochunk[t];              // position 't' lies within this original chunk (relationship is monotonic, not random)
        const auto & chunk = randomizedchunks[0][positionchunkindex];
        // get in-RAM chunk range for this frame position (shared across all frame positions within the same chunk)
        const size_t poswindowbegin = chunk.windowbegin;            // rolling window over chunks (which under the hood have been randomized)
        const size_t poswindowend =   chunk.windowend;
        // The chunk's window implies that if we are at position 't', we are guaranteed to have chunks [poswindowbegin, poswindowend) in RAM.

        // now see if the randomized location is within that window
        const size_t actualchunkindexforpos = randomizedframerefs[t].chunkindex;    // where this frame pos has been mapped to
        return actualchunkindexforpos >= poswindowbegin && actualchunkindexforpos < poswindowend;
        // We only need to test the chunk index. Utterance and frame can be randomized within a chunk as we want, as long as it is in RAM.
    }

    // big long helper to update all cached randomization information
    // This is a rather complex process since we randomize on two levels:
    //  - chunks of consecutive data in the feature archive
    //  - within a range of chunks that is paged into RAM
    //     - utterances (in utt mode), or
    //     - frames (in frame mode)
    // The 'globalts' parameter is the start time that triggered the rerandomization; it is NOT the base time of the randomized area.
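    // Worked example (informational): with _totalframes = 1,000,000 and globalts = 2,345,678, the call
    // belongs to sweep = 2,345,678 / 1,000,000 = 2, whose first global frame is sweepts = 2 * 1,000,000 = 2,000,000.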
    size_t lazyrandomization (const size_t globalts)
    {
        const size_t sweep = globalts / _totalframes;    // which sweep (this determines randomization)
        if (sweep == currentsweep)                       // already got this one--nothing to do
            return sweep;

        currentsweep = sweep;
        if (verbosity>0)
            fprintf (stderr, "lazyrandomization: re-randomizing for sweep %zu in %s mode\n", currentsweep, framemode ? "frame" : "utterance");

        const size_t sweepts = sweep * _totalframes;     // first global frame index for this sweep

        // first randomize chunks
        std::vector<std::vector<std::vector<utterancechunkdata>::const_iterator>> randomizedchunkrefs;
        foreach_index (i, allchunks)
            randomizedchunkrefs.push_back(std::vector<std::vector<utterancechunkdata>::const_iterator>());

        foreach_index (i, allchunks)
            randomizedchunkrefs[i].reserve (allchunks[i].size());

        foreach_index (i, allchunks)    // TODO: this cries for iterating using the iterator!
        {
            foreach_index(j, allchunks[i])
                randomizedchunkrefs[i].push_back (allchunks[i].begin() + j);
            assert (randomizedchunkrefs[i].size() == allchunks[i].size());

            // note that since randomshuffle() uses sweep as seed, this will keep the randomization common across all feature streams
            randomshuffle (randomizedchunkrefs[i], sweep); // bring into random order (with random seed depending on sweep)
        }

        // place them onto the global timeline -> randomizedchunks[]
        // We are processing with randomization within a rolling window over this chunk sequence.
        // Paging will happen on a chunk-by-chunk basis.
        // The global time stamp is needed to determine the paging window.
        randomizedchunks.clear();               // data chunks after being brought into random order (we randomize within a rolling window over them)

        foreach_index(i, allchunks)
            randomizedchunks.push_back(std::vector<chunk>());
        
        foreach_index(i, allchunks)
        {
            randomizedchunks[i].reserve (randomizedchunkrefs[i].size());
            foreach_index (k, randomizedchunkrefs[i])
                randomizedchunks[i].push_back (chunk (randomizedchunkrefs[i][k], randomizedchunks[i].empty() ? 0 : randomizedchunks[i].back().utteranceposend(), randomizedchunks[i].empty() ? sweepts : randomizedchunks[i].back().globalte()));
            assert (randomizedchunks[i].size() == allchunks[i].size());

            assert (randomizedchunks[i].empty() || (randomizedchunks[i].back().utteranceposend() == numutterances && randomizedchunks[i].back().globalte() == sweepts + _totalframes));
        }
        // for each chunk, compute the randomization range (w.r.t. the randomized chunk sequence)
        foreach_index (i, randomizedchunks)
        {
            foreach_index (k, randomizedchunks[i])
            {
                chunk & chunk = randomizedchunks[i][k];
                // start with the range of left neighbor
                if (k == 0)
                {
                    chunk.windowbegin = 0;
                    chunk.windowend = 1;
                }
                else
                {
                    chunk.windowbegin = randomizedchunks[i][k-1].windowbegin;  // might be too early
                    chunk.windowend = randomizedchunks[i][k-1].windowend;      // might have more space
                }
                while (chunk.globalts - randomizedchunks[i][chunk.windowbegin].globalts > randomizationrange/2)
                    chunk.windowbegin++;            // too early
                while (chunk.windowend < randomizedchunks[i].size() && randomizedchunks[i][chunk.windowend].globalte() - chunk.globalts < randomizationrange/2)
                    chunk.windowend++;              // got more space
            }
        }
        if (!framemode)     // utterance mode
        {
            // This sets up the following members:
            //  - positionchunkwindows
            //  - randomizedutterancerefs
            //  - randomizedutteranceposmap

            // We will now introduce the concept of utterance *position*.
            // During processing, utterances will be indexed by position (which is in turn derived from a frame index in getbatch()),
            // and it is assumed (required) that positions are requested consecutively.
            // Each utterance position has an underlying associated utterance, which is represented as (chunkid, within-chunk index) and randomly assigned.
            // Each utterance position also has an associated range of chunks that are kept in memory,
            // and the associated underlying utterance is guaranteed to be found within that associated range of chunks.
            // That allows to page out/in data when processing utterance positions in a consecutive manner.

            // compute chunk windows for every utterance position -> positionchunkwindows[]
            // Utterance positions can only reference underlying utterance data within the chunk window.
            // Utterance positions are defined by the randomized chunk sequence (i.e. their underlying 'defining' chunk differs from sweep to sweep).
            positionchunkwindows.clear();           // [utterance position] -> [windowbegin, windowend) for controlling paging
            positionchunkwindows.reserve (numutterances);

            // positionchunkwindows should be consistent for all inputs (distinct feature streams), so just build based on feature[0]
            // contains pointer to chunk elements but only to compute index
            foreach_index (k, randomizedchunks[0]) // TODO: this really cries for iterating using iterators!
            {
                chunk & chunk = randomizedchunks[0][k];
                for (size_t i = chunk.utteranceposbegin; i < chunk.utteranceposend(); i++)  // loop over utterances in this chunk
                {
                    positionchunkwindows.push_back (randomizedchunks[0].begin() + k);
                }
                // to look up the chunk range in memory for a position, look up the defining chunk and its range
            }
            assert (positionchunkwindows.size() == numutterances);

            // build the randomized utterances array -> randomizedutterancerefs[]
            // start by assigning all utterance positions to utterances in non-random consecutive manner
            randomizedutterancerefs.clear();        // [pos] randomized utterance ids
            randomizedutterancerefs.reserve (numutterances);
            foreach_index (k, randomizedchunks[0])
            {
                chunk & chunk = randomizedchunks[0][k];
                for (size_t i = 0; i < chunk.numutterances(); i++)  // loop over utterances in this chunk
                    randomizedutterancerefs.push_back (utteranceref (k, i));
            }
            assert (randomizedutterancerefs.size() == numutterances);
            foreach_index (i, randomizedutterancerefs)
            {
                auto & uttref = randomizedutterancerefs[i];
                assert (positionchunkwindows[i].isvalidforthisposition(uttref)); uttref;
            }

            // check we got those setup right

            // we now randomly shuffle randomizedutterancerefs[pos], while considering the constraints of what chunk range needs to be in memory
            srand ((unsigned int) sweep + 1);
            for (size_t i = 0; i < randomizedutterancerefs.size(); i++)
            {
                // get valid randomization range, expressed in chunks
                const size_t windowbegin = positionchunkwindows[i].windowbegin();
                const size_t windowend =   positionchunkwindows[i].windowend();

                // get valid randomization range, expressed in utterance positions
                // Remember, utterance positions are defined by chunks.
                const size_t posbegin = randomizedchunks[0][windowbegin].utteranceposbegin;
                const size_t posend =   randomizedchunks[0][windowend-1].utteranceposend();

                // randomization range for this utterance position is [posbegin, posend)
                for(;;)
                {
                    // pick a random location
                    const size_t j = msra::dbn::rand (posbegin, posend);    // a random number within the window
                    if (i == j)
                        break;  // the random gods say "this one points to its original position"... nothing wrong about that, but better not try to swap

                    // We want to swap utterances at i and j, but need to make sure they remain in their allowed range.
                    // This is guaranteed for a so-far untouched utterance, but both i and j may have been touched by a previous swap.

                    // We want to use the utterance previously referenced at utterance position j at position i. Is that allowed?
                    if (!positionchunkwindows[i].isvalidforthisposition (randomizedutterancerefs[j]))
                        continue;   // nope --try another

                    // Likewise may we use the utterance previously referenced at utterance position i at position j?
                    if (!positionchunkwindows[j].isvalidforthisposition (randomizedutterancerefs[i]))
                        continue;   // nope --try another

                    // yep--swap them
                    randomizedutterancerefs[i].swap (randomizedutterancerefs[j]);
                    break;
                }
            }

            // place the randomized utterances on the global timeline so we can find them by globalts
            size_t t = sweepts;
            foreach_index (i, randomizedutterancerefs)
            {
                auto & uttref = randomizedutterancerefs[i];
                uttref.globalts = t;
                uttref.numframes = randomizedchunks[0][uttref.chunkindex].getchunkdata().numframes (uttref.utteranceindex);
                t = uttref.globalte();
            }
            assert (t == sweepts + _totalframes);

            // verify that we got it right (I got a knot in my head!)
            foreach_index (i, randomizedutterancerefs)
            {
                // get utterance referenced at this position
                const auto & uttref = randomizedutterancerefs[i];
                // check if it is valid for this position
                if (uttref.chunkindex < positionchunkwindows[i].windowbegin() || uttref.chunkindex >= positionchunkwindows[i].windowend())
                    throw std::logic_error ("lazyrandomization: randomization logic mangled!");
            }

            // create lookup table for (globalts values -> pos) -> randomizedutteranceposmap[]
            randomizedutteranceposmap.clear();      // [globalts] -> pos lookup table
            foreach_index (pos, randomizedutterancerefs)
            {
                auto & uttref = randomizedutterancerefs[pos];
                randomizedutteranceposmap[uttref.globalts] = (size_t) pos;
            }
        }
        else            // frame mode
        {
            // This sets up the following members:
            //  - randomizedframerefs

            srand ((unsigned int) sweep + 1);
            // An original timeline is established by the randomized chunks, denoted by 't'.
            // Returned frames are indexed by frame position j = (globalt - sweept), which have an associated underlying 't'.
            // It is guaranteed that utterance frame position j maps to an underlying frame within the corresponding chunk window.
            biggrowablevector<unsigned short> ttochunk; // randomized chunk index associated with frame position
            ttochunk.resize (_totalframes);
            size_t t = 0;
            frameref frameref;
            // enumerate chunks in their randomized order and assign frame indices in that order -> randomizedframerefs[t]
            // At this point, chunks are in randomized order, but utterances and frames within utterances are not randomized.
            // Later we will randomize those as well.
            foreach_index (i, randomizedchunks[0])
            {
                frameref.chunkindex = (unsigned short)i;
                checkoverflow (frameref.chunkindex, i, "frameref::chunkindex");
                const auto & chunk = randomizedchunks[0][i];
                const auto & chunkdata = chunk.getchunkdata();
                const size_t numutt = chunkdata.numutterances();
                for (size_t k = 0; k < numutt; k++)
                {
                    frameref.utteranceindex = (short)k;
                    checkoverflow (frameref.utteranceindex, k, "frameref::utteranceindex");
                    const size_t n = chunkdata.numframes (k);
                    for (size_t m = 0; m < n; m++)
                    {
                        frameref.frameindex = (unsigned short) m;
                        checkoverflow (frameref.frameindex, m, "frameref::frameindex");
                        randomizedframerefs[t] = frameref;  // hopefully this is a memory copy, not a bit-wise assignment! If not, then code it explicitly
                        ttochunk[t] = (unsigned short) i;
                        checkoverflow (ttochunk[t], i, "ttochunk[]");
                        t++;
                    }
                }
            }
            assert (t == _totalframes);

            // now randomize them --we use the nested loop again to avoid storing a backpointer
            // The condition is that a randomized frame may not be moved out of its associated chunk window.
            foreach_index (t, randomizedframerefs)
            {
                const size_t positionchunkindex = ttochunk[t];              // position 't' lies within this chunk (relationship is monotonic, not random)
                const auto & chunk = randomizedchunks[0][positionchunkindex];  // for window

                // get in-RAM chunk range for this frame position (shared across all frame positions within the same chunk)
                const size_t poswindowbegin = chunk.windowbegin;            // rolling window over chunks (which under the hood have been randomized)
                const size_t poswindowend =   chunk.windowend;
                // The chunk's window implies that if we are at position 't', we are guaranteed to have chunks [poswindowbegin, poswindowend) in RAM.
                // These chunks are associated with a range of frame positions.
                // It is implied that if we are at position 't', the frames covered by chunks [poswindowbegin, poswindowend) are in RAM.
                const size_t postbegin = randomizedchunks[0][poswindowbegin].globalts   - sweepts;
                const size_t postend =   randomizedchunks[0][poswindowend-1].globalte() - sweepts;
                // The position that this frame gets randomized to must be guaranteed to belong to a chunk within [postbegin, postend).

                for (;;)                                                    // (randomization retry loop)
                {
                    size_t tswap = msra::dbn::rand (postbegin, postend);               // random frame position within allowed range
                    // We want to swap 't' to 'tswap' and 'tswap' to 't'.
                    //  - Both may have been swapped before.
                    //  - Both must stay within the randomization window of their respective position.
                    // check admissibility of where the element at 'tswap' gets swapped to 't' (range = [windowbegin,windowend))
                    size_t tswapchunkindex = randomizedframerefs[tswap].chunkindex;
                    if (tswapchunkindex < poswindowbegin || tswapchunkindex >= poswindowend)
                        continue;
                    // check admissibility of where the element at t gets swapped to (which is frame position 'tswap')
                    const size_t sourcechunkindex = randomizedframerefs[t].chunkindex;
                    size_t targetchunkindex = ttochunk[tswap];      // chunk associated with this frame position defines value range
                    const auto & targetchunk = randomizedchunks[0][targetchunkindex];
                    const size_t targetwindowbegin = targetchunk.windowbegin;
                    const size_t targetwindowend =   targetchunk.windowend;
                    if (sourcechunkindex < targetwindowbegin || sourcechunkindex >= targetwindowend)
                        continue;
                    // admissible--swap the two
                    ::swap (randomizedframerefs[t], randomizedframerefs[tswap]);
#if 0
                    break;
#else               // post-check of the swap's validity (kept as a safety net; remove once known never to trigger)

                    // verify that both swapped positions still satisfy the chunk-window condition
                    if (isframepositionvalid (t, ttochunk) && isframepositionvalid (tswap, ttochunk))
                        break;
                    // not valid: swap back and retry  --if this branch is ever taken, the admissibility check above has a bug
                    ::swap (randomizedframerefs[t], randomizedframerefs[tswap]);
                    fprintf (stderr, "lazyrandomization: BUGBUG --invalid swapping condition detected\n");
#endif
                }
            }
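            // Invariant after the swapping pass (informal sketch): for every position t,
            //   randomizedframerefs[t].chunkindex lies in [windowbegin(ttochunk[t]), windowend(ttochunk[t]))
            // i.e. serving frames in position order only ever needs the rolling chunk window that is already in RAM.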

            // verify the above chunk-window invariant over the entire sweep
            t = 0;
            foreach_index (i, randomizedchunks[0])
            {
                const auto & chunk = randomizedchunks[0][i];       // for window and chunkdata
                const size_t poswindowbegin = chunk.windowbegin;
                const size_t poswindowend =   chunk.windowend;

                const auto & chunkdata = chunk.getchunkdata();  // for numutterances/numframes
                const size_t numutt = chunkdata.numutterances();
                for (size_t k = 0; k < numutt; k++)
                {
                    const size_t n = chunkdata.numframes (k);
                    for (size_t m = 0; m < n; m++)
                    {
                        const size_t randomizedchunkindex = randomizedframerefs[t].chunkindex;
                        if (randomizedchunkindex < poswindowbegin || randomizedchunkindex >= poswindowend)
                            throw std::logic_error ("lazyrandomization: nope, you got frame randomization wrong, dude");
                        t++;
                    }
                }
            }
            assert (t == _totalframes);
        }

        return sweep;
    }

    // helper to page out a chunk with log message
    void releaserandomizedchunk (size_t k)
    {
        size_t numreleased = 0;
        foreach_index (m, randomizedchunks)
        {
            auto & chunkdata = randomizedchunks[m][k].getchunkdata();
            if (chunkdata.isinram())
            {
                if (verbosity)
                    fprintf (stderr, "releaserandomizedchunk: paging out randomized chunk %zu (frame range [%zu..%zu]), %zu resident in RAM\n",
                             k, randomizedchunks[m][k].globalts, randomizedchunks[m][k].globalte()-1, chunksinram-1);
                chunkdata.releasedata();
                numreleased++;
            }
        }
        if (numreleased>0 && numreleased<randomizedchunks.size())
        {
            LogicError ("releaserandomizedchunk: inconsistency detected - some inputs have chunks in ram, some not");
        }
        else if (numreleased==randomizedchunks.size())
        {
            chunksinram--;
        }
        return;
    }

    // helper to page in a chunk for a given utterance
    // (window range passed in for checking only)
    // Returns true if we actually did read something.
    bool requirerandomizedchunk (const size_t chunkindex, const size_t windowbegin, const size_t windowend)
    {
        size_t numinram = 0;

        if (chunkindex < windowbegin || chunkindex >= windowend)
            throw std::logic_error ("requirerandomizedchunk: requested utterance outside in-memory chunk range");

        foreach_index(m, randomizedchunks)
        {
            auto & chunk = randomizedchunks[m][chunkindex];
            auto & chunkdata = chunk.getchunkdata();
            if (chunkdata.isinram())
                numinram++;
        }
        if (numinram == randomizedchunks.size())
        {
            return false;
        }
        else if (numinram == 0)
        {
            foreach_index (m, randomizedchunks)
            {
                auto & chunk = randomizedchunks[m][chunkindex];
                auto & chunkdata = chunk.getchunkdata();
                if (verbosity)
                    fprintf (stderr, "feature set %d: requirerandomizedchunk: paging in randomized chunk %zu (frame range [%zu..%zu]), %zu resident in RAM\n",
                             m, chunkindex, chunk.globalts, chunk.globalte()-1, chunksinram+1);
                msra::util::attempt (5, [&]()   // (reading from network)
                {
                    chunkdata.requiredata (featkind[m], featdim[m], sampperiod[m], this->lattices, verbosity);
                });
            }
            chunksinram++;
            return true;
        }
        else
        {
            LogicError ("requirerandomizedchunk: inconsistency detected - some inputs need chunks paged in, some not");
        }
    }
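    // Note on paging (sketch): the two helpers above treat the feature streams as an all-or-nothing unit; either
    // every stream has a chunk resident or none does, which the LogicError checks enforce. The caller-side pattern
    // (hypothetical 'numchunks' standing for randomizedchunks[0].size(); mirrors the loops in getbatch() below) is:
    //
    //   for (size_t k = 0; k < windowbegin; k++)           releaserandomizedchunk (k);  // free left of window
    //   for (size_t k = windowbegin; k < windowend; k++)   requirerandomizedchunk (k, windowbegin, windowend);
    //   for (size_t k = windowend; k < numchunks; k++)     releaserandomizedchunk (k);  // free right of window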

    class matrixasvectorofvectors  // wrapper around a matrix that views it as a vector of column vectors
    {
        void operator= (const matrixasvectorofvectors &);  // non-assignable
        msra::dbn::matrixbase & m;
    public:
        matrixasvectorofvectors (msra::dbn::matrixbase & m) : m (m) {}
        size_t size() const { return m.cols(); }
        const_array_ref<float> operator[] (size_t j) const { return array_ref<float> (&m(0,j), m.rows()); }
    };
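    // Rough usage sketch for matrixasvectorofvectors (illustrative only, not compiled here): it lets
    // augmentneighbors() treat an utterance's feature matrix as a sequence of per-frame vectors, e.g.
    //
    //   msra::dbn::matrix feats;                       // one column per frame
    //   feats.resize (featdim, numframes);             // resize(rows, cols), as used in getbatch() below
    //   matrixasvectorofvectors frames (feats);
    //   const_array_ref<float> frame0 = frames[0];     // frames.size() == numframes, frame0.size() == featdim
    //
    // augmentneighbors() then stacks frames [t-leftextent .. t+rightextent] into one output column of height
    // vdim[i]; when no explicit context is configured, augmentationextent() derives the extent from the raw
    // frame size and vdim[i] (see its call sites in getbatch()).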

    size_t chunkforframepos (const size_t t) const  // find chunk for a given frame position
    {
        //inspect chunk of first feature stream only
        auto iter = std::lower_bound (randomizedchunks[0].begin(), randomizedchunks[0].end(), t, [&] (const chunk & chunk, size_t t) { return chunk.globalte() <= t; });
        const size_t chunkindex = iter - randomizedchunks[0].begin();
        if (t < randomizedchunks[0][chunkindex].globalts || t >= randomizedchunks[0][chunkindex].globalte())
            throw std::logic_error ("chunkforframepos: dude, learn STL!");
        return chunkindex;
    }
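    // Worked example (illustrative): with chunks covering global frame ranges [0,100) and [100,250),
    // chunkforframepos(37) returns 0 and chunkforframepos(180) returns 1. The lower_bound predicate
    // "chunk.globalte() <= t" skips every chunk ending at or before t, so the iterator lands on the first
    // chunk whose end lies beyond t; the range check then guards against t falling outside all chunks.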

public:

    void setverbosity(int newverbosity){ verbosity = newverbosity; }

    // get the next minibatch
    // A minibatch is made up of one or more utterances.
    // We will return fewer than 'framesrequested' frames unless the first utterance alone is longer.
    // Note that this may return frames that are beyond the epoch end, but the first frame is always within the epoch.
    // We specify the utterance by its global start time (in the space of an infinitely repeated training set).
    // This is efficient since getbatch() is called with sequential 'globalts' except at epoch start.
    // Note that the start of an epoch does not necessarily fall onto an utterance boundary. The caller must use firstvalidglobalts() to find the first valid globalts at or after a given time.
    //
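    // A rough caller-side sketch (hypothetical names, not the actual reader code):
    //
    //   size_t globalts = source.firstvalidglobalts (epochstart);
    //   while (globalts < epochend)
    //   {
    //       source.getbatch (globalts, requestedmbframes, feat, uids, utteranceinfo, transcripts, latticepairs);
    //       // ... consume feat/uids ...
    //       globalts += feat[0].cols();   // advance by the number of frames actually returned
    //   }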
    /*implement*/ bool getbatch (const size_t globalts, const size_t framesrequested, std::vector<msra::dbn::matrix> & feat,
                                 std::vector<std::vector<size_t>> & uids, std::vector<std::pair<wstring, size_t>> & utteranceinfo,
                                 std::vector<const_array_ref<msra::lattices::lattice::htkmlfwordsequence::word>> & transcripts, 
                                 std::vector<shared_ptr<const latticesource::latticepair>> & latticepairs)
    {
        bool readfromdisk = false;  // return value: shall be 'true' if we paged in anything

        auto_timer timergetbatch;
        assert (_totalframes > 0);

        // Clears <utteranceinfo> vector.
        utteranceinfo.clear();

        // update randomization if a new sweep is entered  --this is a complex operation that updates many of the data members used below
        const size_t sweep = lazyrandomization (globalts);

        const std::vector<char> noboundaryflags;    // dummy
        if (!framemode)      // regular utterance mode
        {

            // find utterance position for globalts
            // There must be a precise match; it is not possible to specify frames that are not on boundaries.
            auto positer = randomizedutteranceposmap.find (globalts);
            if (positer == randomizedutteranceposmap.end())
                throw std::logic_error ("getbatch: invalid 'globalts' parameter; must match an existing utterance boundary");
            const size_t spos = positer->second;

            // determine how many utterances will fit into the requested minibatch size
            size_t mbframes = randomizedutterancerefs[spos].numframes;   // at least one utterance, even if too long
            size_t epos;
            for (epos = spos + 1; epos < numutterances && mbframes + randomizedutterancerefs[epos].numframes < framesrequested; epos++)  // add more utterances as long as they fit within requested minibatch size
                mbframes += randomizedutterancerefs[epos].numframes;

            // do some paging housekeeping
            // This will also set the feature-kind information if it's the first time.
            // Free all chunks left of the range.
            // Page-in all chunks right of the range.
            // We are a little more blunt for now: Free all outside the range, and page in only what is touched. We could save some loop iterations.
            const size_t windowbegin = positionchunkwindows[spos].windowbegin();
            const size_t windowend =   positionchunkwindows[epos-1].windowend();
            for (size_t k = 0; k < windowbegin; k++)
                releaserandomizedchunk (k);
            for (size_t k = windowend; k < randomizedchunks[0].size(); k++)
                releaserandomizedchunk (k);

            for (size_t pos = spos; pos < epos; pos++)
                readfromdisk |= requirerandomizedchunk (randomizedutterancerefs[pos].chunkindex, windowbegin, windowend); // (window range passed in for checking only)

            // resize feat and uids
            feat.resize(vdim.size());
            uids.resize(classids.size());
            assert(feat.size()==vdim.size());
            assert(feat.size()==randomizedchunks.size());
            foreach_index(i, feat)
            {
                feat[i].resize(vdim[i], mbframes);

                if (i==0)
                {
                    foreach_index(j, uids)
                    {
                        if (issupervised())             // empty means unsupervised training -> return empty uids
                            uids[j].resize (mbframes);
                        else
                            uids[j].clear();
                        latticepairs.clear();               // will push_back() below
                        transcripts.clear();
                    }
                }
            }
            // return these utterances
            //fprintf (stderr, "getbatch: getting utterances %d..%d (%d frames out of %d requested) in sweep %d\n", spos, epos -1, mbframes, framesrequested, sweep);
            size_t tspos = 0;   // relative start of utterance 'pos' within the returned minibatch
            for (size_t pos = spos; pos < epos; pos++)
            {
                const auto & uttref = randomizedutterancerefs[pos];
                size_t n=0;
                foreach_index(i, randomizedchunks)
                {
                    const auto & chunk = randomizedchunks[i][uttref.chunkindex];
                    const auto & chunkdata = chunk.getchunkdata();
                    assert (uttref.globalts == globalts + tspos);
                    auto uttframes = chunkdata.getutteranceframes (uttref.utteranceindex);
                    matrixasvectorofvectors uttframevectors (uttframes);    // (wrapper that allows m[j].size() and m[j][i] as required by augmentneighbors())
                    n = uttframevectors.size();
                    assert (n == uttframes.cols() && uttref.numframes == n && chunkdata.numframes (uttref.utteranceindex) == n);

                    // copy the frames and class labels
                    for (size_t t = 0; t < n; t++)          // t = time index into source utterance
                    {
                        size_t leftextent, rightextent;
                        // determine the augmentation context window (frames to the left/right of the target frame)
                        if (leftcontext[i] == 0 && rightcontext[i] == 0)
                        {
                            leftextent = rightextent = augmentationextent(uttframevectors[t].size(), vdim[i]);
                        }
                        else
                        {
                            leftextent = leftcontext[i];
                            rightextent = rightcontext[i];
                        }
                        augmentneighbors(uttframevectors, noboundaryflags, t, leftextent, rightextent, feat[i], t + tspos);
                        //augmentneighbors(uttframevectors, noboundaryflags, t, feat[i], t + tspos);
                    }

                    // copy the class labels (and lattices/transcripts where available)
                    if (i==0)
                    {
                        // Sets utterance ID information.
                        utteranceinfo.push_back(std::make_pair(chunkdata.utteranceset[uttref.utteranceindex].key(), uttref.numframes));

                        auto uttclassids = getclassids (uttref);
                        foreach_index(j, uttclassids)
                        {
                            for (size_t t = 0; t < n; t++)          // t = time index into source utterance
                            {
                                if (issupervised())
                                    uids[j][t + tspos] = uttclassids[j][t];
                            }

                            if (!this->lattices.empty())
                            {
                                auto latticepair = chunkdata.getutterancelattice (uttref.utteranceindex);
                                latticepairs.push_back (latticepair);
                                // look up reference
                                const auto & key = latticepair->getkey();
                                if (!allwordtranscripts.empty())
                                {
                                    const auto & transcript = allwordtranscripts.find (key)->second;
                                    transcripts.push_back (transcript.words);
                                }
                            }
                        }
                    }
                }
                tspos += n;
            }
            assert (tspos == mbframes);
        }
        else                // frame mode: return randomized individual frames (we don't ensure non-repetition at this point)
        {
            const size_t sweepts = sweep * _totalframes;         // first global frame index for this sweep
            const size_t sweepte = sweepts + _totalframes;       // and its end
            const size_t globalte = min (globalts + framesrequested, sweepte);  // we return as much as requested, but not exceeding sweep end
            const size_t mbframes = globalte - globalts;        // that's our mb size

            // determine window range
            // We enumerate all frames--can this be done more efficiently?
            const size_t firstchunk = chunkforframepos (globalts);
            const size_t lastchunk = chunkforframepos (globalte-1);
            const size_t windowbegin = randomizedchunks[0][firstchunk].windowbegin;
            const size_t windowend = randomizedchunks[0][lastchunk].windowend;
            if (verbosity)
                fprintf (stderr, "getbatch: getting randomized frames [%zu..%zu] (%zu frames out of %zu requested) in sweep %zu; chunks [%zu..%zu] -> chunk window [%zu..%zu)\n",
                         globalts, globalte, mbframes, framesrequested, sweep, firstchunk, lastchunk, windowbegin, windowend);
            // release all data outside, and page in all data inside
            for (size_t k = 0; k < windowbegin; k++)
                releaserandomizedchunk (k);
            for (size_t k = windowbegin; k < windowend; k++)
                readfromdisk |= requirerandomizedchunk (k, windowbegin, windowend); // (window range passed in for checking only, redundant here)
            for (size_t k = windowend; k < randomizedchunks[0].size(); k++)
                releaserandomizedchunk (k);

            // resize feat and uids
            feat.resize(vdim.size());
            uids.resize(classids.size());
            assert(feat.size()==vdim.size());
            assert(feat.size()==randomizedchunks.size());
            foreach_index(i, feat)
            {
                feat[i].resize(vdim[i], mbframes);

                if (i==0)
                {
                    foreach_index(j, uids)
                    {
                        if (issupervised())             // empty means unsupervised training -> return empty uids
                            uids[j].resize (mbframes);
                        else
                            uids[j].clear();
                        latticepairs.clear();               // will push_back() below
                        transcripts.clear();
                    }
                }
            }
            
            // return randomized frames for the time range of those utterances
            for (size_t j = 0; j < mbframes; j++)
            {
                // map to time index inside arrays
                const size_t framepos = (globalts + j) % _totalframes;  // using mod because we may actually run beyond the sweep for the last call
                const frameref & frameref = randomizedframerefs[framepos];

                // random utterance
                readfromdisk |= requirerandomizedchunk (frameref.chunkindex, windowbegin, windowend);    // (this is just a check; should not actually page in anything)
                
                foreach_index(i, randomizedchunks)
                {
                    const auto & chunk = randomizedchunks[i][frameref.chunkindex];
                    const auto & chunkdata = chunk.getchunkdata();
                    auto uttframes = chunkdata.getutteranceframes (frameref.utteranceindex);
                    matrixasvectorofvectors uttframevectors (uttframes);    // (wrapper that allows m[j].size() and m[j][i] as required by augmentneighbors())
                    const size_t n = uttframevectors.size();
                    assert (n == uttframes.cols() && chunkdata.numframes (frameref.utteranceindex) == n); n;

                    // copy frame and class labels
                    const size_t t = frameref.frameindex;
                    
                    size_t leftextent, rightextent;
                    // determine the augmentation context window (frames to the left/right of the target frame)
                    if (leftcontext[i] == 0 && rightcontext[i] == 0)
                    {
                        leftextent = rightextent = augmentationextent(uttframevectors[t].size(), vdim[i]);
                    }
                    else
                    {
                        leftextent = leftcontext[i];
                        rightextent = rightcontext[i];
                    }
                    augmentneighbors(uttframevectors, noboundaryflags, t, leftextent, rightextent, feat[i], j);
                    
                    //augmentneighbors(uttframevectors, noboundaryflags, t, feat[i], j);
                    if (issupervised() && i == 0)
                    {
                        auto frameclassids = getclassids(frameref);
                        foreach_index(k, uids)
                            uids[k][j] = frameclassids[k][t];
                    }
                }                
            }
        }
        timegetbatch = timergetbatch;
        return readfromdisk;
    }
    double gettimegetbatch() { return timegetbatch;}

    // single-input/single-output getbatch() -- not supported by this multi-stream source; use the vector-based overload above
    /*implement*/ bool getbatch (const size_t /*globalts*/, const size_t /*framesrequested*/, msra::dbn::matrix & /*feat*/,
                           std::vector<size_t> & /*uids*/, std::vector<std::pair<wstring, size_t>> & /*utterances*/,
                           std::vector<const_array_ref<msra::lattices::lattice::htkmlfwordsequence::word>> & /*transcripts*/,
                           std::vector<shared_ptr<const latticesource::latticepair>> & /*latticepairs*/)
    {
        // should never get here
        throw runtime_error("minibatchutterancesourcemulti: getbatch() called for a single input feature and single output; use minibatchutterancesource instead\n");

        // for a single input/output, one could resize to one stream and forward to the vector-based getbatch() above:
        //feat.resize(1);
        //uids.resize(1);
        //return getbatch(globalts, framesrequested, feat[0], uids[0], transcripts, latticepairs);
    }
    size_t totalframes() const { return _totalframes; }

    // return first valid globalts to ask getbatch() for
    // In utterance mode, the epoch start may fall in the middle of an utterance.
    // We return the end time of that utterance (which, in pathological cases, may in turn be outside the epoch; handle that).
    /*implement*/ size_t firstvalidglobalts (const size_t globalts)
    {
        // update randomization if a new sweep is entered  --this is a complex operation that updates many of the data members used below
        const size_t sweep = lazyrandomization (globalts);
        // frame mode: start at sweep boundary directly
        if (framemode)
            return globalts;
        // utterance mode
        assert (globalts >= sweep * _totalframes && globalts < (sweep + 1) * _totalframes); sweep;
        foreach_index (pos, randomizedutterancerefs)
            if (randomizedutterancerefs[pos].globalts >= globalts)
                return randomizedutterancerefs[pos].globalts;   // exact or inexact match
        return randomizedutterancerefs.back().globalte();       // boundary case: requested time falls within the last utterance
    }
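    // Example (illustrative): if the epoch start falls 10 frames into a 50-frame utterance that begins at
    // globalts 1000, firstvalidglobalts(1010) returns 1050, i.e. the start of the next randomized utterance
    // (equivalently, the end of the utterance that straddles the epoch boundary); in frame mode, the requested
    // globalts is returned unchanged.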

    const std::vector<size_t> & unitcounts() const { return counts[0]; }
    const std::vector<size_t> & unitcounts(size_t index) const { return counts[index]; }

};

};};