//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// utterancesourcemulti.h -- implementation of utterancesource.h that supports multiple feature and label sets
//

#pragma once

#include "basetypes.h"      // for attempt()
#include "htkfeatio.h"      // for htkmlfreader
#include "latticearchive.h" // for reading HTK phoneme lattices (MMI training)
#include "minibatchsourcehelpers.h"
#include "minibatchiterator.h"

namespace msra { namespace dbn {

// ---------------------------------------------------------------------------
// minibatchutterancesource -- feature source to provide randomized utterances
// This also implements a frame-wise mode, which is layered on top of the utterance-wise mode
// and thus benefits from its goodies such as corpus-wide high-level randomization and chunk paging.
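// Rough data flow, as implemented below: utterances are grouped into chunks of roughly 15 minutes of audio
// (allchunks); per sweep, chunks are brought into a randomized order (randomizedchunks), and utterances
// (or frames, in frame mode) are then shuffled only within a rolling window over those chunks, so that only
// a bounded number of chunks needs to be paged into RAM at any time.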
// ---------------------------------------------------------------------------
class minibatchutterancesourcemulti : public minibatchsource
{
    void operator=(const minibatchutterancesourcemulti &other); // non-assignable
    std::vector<size_t> vdim;                                   // feature dimension after augmenting neighbors
    std::vector<size_t> leftcontext;                            // number of frames to the left of the target frame in the context window
    std::vector<size_t> rightcontext;                           // number of frames to the right of the target frame in the context window
    std::vector<unsigned int> sampperiod;                       // (for reference and to check against model)
    std::vector<string> featkind;
    std::vector<size_t> featdim;
    const bool framemode;                    // true -> actually return frame-level randomized frames (not possible in lattice mode)
    std::vector<std::vector<size_t>> counts; // [s] occurrence count for all states (used for priors)
    int verbosity;
    // lattice reader
    // const std::vector<std::unique_ptr<latticesource>> &lattices;
    const latticesource &lattices;

    // std::vector<latticesource> lattices;
    // word-level transcripts (for MMI mode when adding best path to lattices)
    const std::map<std::wstring, msra::lattices::lattice::htkmlfwordsequence> &allwordtranscripts; // (used for getting word-level transcripts)
                                                                                         // std::vector<std::map<std::wstring,msra::lattices::lattice::htkmlfwordsequence>> allwordtranscripts;
    // data store (incl. paging in/out of features and lattices)
    struct utterancedesc // data descriptor for one utterance
    {
        msra::asr::htkfeatreader::parsedpath parsedpath; // archive filename and frame range in that file
        size_t classidsbegin;                            // index into allclassids[] array (first frame)

        utterancedesc(msra::asr::htkfeatreader::parsedpath &&ppath, size_t classidsbegin)
            : parsedpath(std::move(ppath)), classidsbegin(classidsbegin)
        {
        }

        const std::wstring &logicalpath() const
        {
            return parsedpath; /*type cast will return logical path*/
        }
        size_t numframes() const
        {
            return parsedpath.numframes();
        }
        const std::wstring key() const // key used for looking up lattice (not stored to save space)
        {
            // the logical path is the uttid, not a filename. don't need to remove extension
            return logicalpath();
        }
    };
    struct utterancechunkdata // data for a chunk of utterances
    {
        std::vector<utterancedesc> utteranceset; // utterances in this set
        size_t numutterances() const
        {
            return utteranceset.size();
        }

        std::vector<size_t> firstframes;                                            // [utteranceindex] first frame for given utterance
        mutable msra::dbn::matrix frames;                                           // stores all frames consecutively (mutable since this is a cache)
        size_t totalframes;                                                         // total #frames for all utterances in this chunk
        mutable std::vector<std::shared_ptr<const latticesource::latticepair>> lattices; // (may be empty if none)

        // construction
        utterancechunkdata()
            : totalframes(0)
        {
        }
        // utterancechunkdata (const utterancechunkdata& other) : utteranceset(other.utteranceset), firstframes(other.firstframes), frames (other.frames), totalframes (other.totalframes), lattices (other.lattices){};
        void push_back(utterancedesc && /*destructive*/ utt)
        {
            // printf ("start push %d %d\n",frames.rows(), frames.cols());

            if (isinram())
                throw std::logic_error("utterancechunkdata: frames already paged into RAM--too late to add data");
            firstframes.push_back(totalframes);
            totalframes += utt.numframes();
            utteranceset.push_back(std::move(utt)); // 'utt' was passed as an rvalue reference; move it in
        }

        // accessors to an utterance's data
        size_t numframes(size_t i) const
        {
            return utteranceset[i].numframes();
        }
        size_t getclassidsbegin(size_t i) const
        {
            return utteranceset[i].classidsbegin;
        }
        msra::dbn::matrixstripe getutteranceframes(size_t i) const // return the frame set for a given utterance
        {
            if (!isinram())
                throw std::logic_error("getutteranceframes: called when data have not been paged in");
            const size_t ts = firstframes[i];
            const size_t n = numframes(i);
            return msra::dbn::matrixstripe(frames, ts, n);
        }
        std::shared_ptr<const latticesource::latticepair> getutterancelattice(size_t i) const // return the lattice for a given utterance
        {
            if (!isinram())
                throw std::logic_error("getutterancelattice: called when data have not been paged in");
            return lattices[i];
        }

        // paging
        // test if data is in memory at the moment
        bool isinram() const
        {
            return !frames.empty();
        }
        // page in data for this chunk
        // We pass in the feature info variables by ref which will be filled lazily upon first read
        void requiredata(string &featkind, size_t &featdim, unsigned int &sampperiod, const latticesource &latticesource, int verbosity = 0) const
        {

            if (numutterances() == 0)
                throw std::logic_error("requiredata: cannot page in virgin block");
            if (isinram())
                throw std::logic_error("requiredata: called when data is already in memory");
            try // this function supports retrying since we read from the unreliable network, i.e. do not return in a broken state
            {
                msra::asr::htkfeatreader reader; // feature reader (we reinstantiate it for each block, i.e. we reopen the file actually)
                // if this is the first feature read ever, we explicitly open the first file to get the information such as feature dimension
                if (featdim == 0)
                {
                    reader.getinfo(utteranceset[0].parsedpath, featdim);
                    fprintf(stderr, "requiredata: determined feature kind as %zu-dimensional\n", featdim);
                }
                // read all utterances; if they are in the same archive, htkfeatreader will be efficient in not closing the file
                frames.resize(featdim, totalframes);
                if (!latticesource.empty())
                    lattices.resize(utteranceset.size());
                foreach_index (i, utteranceset)
                {
                    // fprintf (stderr, ".");
                    // read features for this file
                    auto uttframes = getutteranceframes(i);                                                           // matrix stripe for this utterance (currently unfilled)
                    reader.readNoAlloc(utteranceset[i].parsedpath, (const string &) featkind, sampperiod, uttframes); // note: file info here used for checking only
                    // page in lattice data
                    if (!latticesource.empty())
                        latticesource.getlattices(utteranceset[i].key(), lattices[i], uttframes.cols());
                }
                // fprintf (stderr, "\n");
                if (verbosity)
                {
                    fprintf(stderr, "requiredata: %zu utterances read\n", utteranceset.size());
                }
            }
            catch (...)
            {
                releasedata();
                throw;
            }
        }
        // page out data for this chunk
        void releasedata() const
        {
            if (numutterances() == 0)
                throw std::logic_error("releasedata: cannot page out virgin block");
            if (!isinram())
                throw std::logic_error("releasedata: called when data is not memory");
            // release frames
            frames.resize(0, 0);
            // release lattice data
            lattices.clear();
        }
    };
    std::vector<std::vector<utterancechunkdata>> allchunks;           // set of utterances organized in chunks, referred to by an iterator (not an index)
    std::vector<std::unique_ptr<biggrowablevector<CLASSIDTYPE>>> classids; // [classidsbegin+t] concatenation of all state sequences
    bool issupervised() const
    {
        return !classids.empty();
    }
    size_t numutterances; // total number of utterances
    size_t _totalframes;  // total frames (same as classids.size() if we have labels)
    double timegetbatch;  // [v-hansu] for time measurement
    // sequence in random order of actual use (randomized, where randomization is cached)
    const size_t randomizationrange; // parameter remembered; this is the full window (e.g. 48 hours), not the half window
    size_t currentsweep;             // randomization is currently cached for this sweep; if it changes, rebuild all below
    struct chunk                     // chunk as used in actual processing order (randomized sequence)
    {
        // the underlying chunk (as a non-indexed reference into the chunk set)
        std::vector<utterancechunkdata>::const_iterator uttchunkdata;
        const utterancechunkdata &getchunkdata() const
        {
            return *uttchunkdata;
        }
        size_t numutterances() const
        {
            return uttchunkdata->numutterances();
        }
        size_t numframes() const
        {
            return uttchunkdata->totalframes;
        }

        // position in utterance-position space
        size_t utteranceposbegin;
        size_t utteranceposend() const
        {
            return utteranceposbegin + numutterances();
        }

        // position on global time line
        size_t globalts; // start frame on global timeline (after randomization)
        size_t globalte() const
        {
            return globalts + numframes();
        }

        // randomization range limits
        size_t windowbegin; // randomizedchunk index of earliest chunk that utterances in here can be randomized with
        size_t windowend;   // and end index [windowbegin, windowend)
        chunk(std::vector<utterancechunkdata>::const_iterator uttchunkdata, size_t utteranceposbegin, size_t globalts)
            : uttchunkdata(uttchunkdata), utteranceposbegin(utteranceposbegin), globalts(globalts)
        {
        }
    };
    std::vector<std::vector<chunk>> randomizedchunks; // utterance chunks after being brought into random order (we randomize within a rolling window over them)
    size_t chunksinram;                               // (for diagnostics messages)
    struct utteranceref                               // describes the underlying random utterance associated with an utterance position
    {
        size_t chunkindex;     // lives in this chunk (index into randomizedchunks[])
        size_t utteranceindex; // utterance index in that chunk
        size_t numframes;      // (cached since we cannot directly access the underlying data from here)
        size_t globalts;       // start frame in global space after randomization (for mapping frame index to utterance position)
        size_t globalte() const
        {
            return globalts + numframes;
        } // end frame
        utteranceref(size_t chunkindex, size_t utteranceindex)
            : chunkindex(chunkindex), utteranceindex(utteranceindex), globalts(SIZE_MAX), numframes(0)
        {
        }
        void swap(utteranceref &other) // used in randomization
        {
            std::swap(chunkindex, other.chunkindex);
            std::swap(utteranceindex, other.utteranceindex);
            assert(globalts == SIZE_MAX && other.globalts == SIZE_MAX && numframes == 0 && other.numframes == 0); // can only swap before assigning these
        }
    };
    std::vector<utteranceref> randomizedutterancerefs;            // [pos] randomized utterance ids
    std::unordered_map<size_t, size_t> randomizedutteranceposmap; // [globalts] -> pos lookup table
    struct positionchunkwindow                                    // chunk window required in memory when at a certain position, for controlling paging
    {
        std::vector<chunk>::const_iterator definingchunk; // the chunk in randomizedchunks[] that defined the utterance position of this utterance
        size_t windowbegin() const
        {
            return definingchunk->windowbegin;
        }
        size_t windowend() const
        {
            return definingchunk->windowend;
        }
        bool isvalidforthisposition(const utteranceref &utt) const
        {
            return utt.chunkindex >= windowbegin() && utt.chunkindex < windowend(); // check if 'utt' lies within the allowed chunk range for this position
        }
        positionchunkwindow(std::vector<chunk>::iterator definingchunk)
            : definingchunk(definingchunk)
        {
        }
    };
    std::vector<positionchunkwindow> positionchunkwindows; // [utterance position] -> [windowbegin, windowend) for controlling paging

    // frame-level randomization layered on top of utterance chunking (randomized, where randomization is cached)
    struct frameref
    {
#ifdef _WIN64                          // (sadly, the compiler makes this 8 bytes, not 6)
        unsigned short chunkindex;     // lives in this chunk (index into randomizedchunks[])
        unsigned short utteranceindex; // utterance index in that chunk
        static const size_t maxutterancesperchunk = 65535;
        unsigned short frameindex; // frame index within the utterance
        static const size_t maxframesperutterance = 65535;
#elif __unix__ // (sadly, the compiler makes this 8 bytes, not 6)
        unsigned short chunkindex;     // lives in this chunk (index into randomizedchunks[])
        unsigned short utteranceindex; // utterance index in that chunk
        static const size_t maxutterancesperchunk = 65535;
        unsigned short frameindex; // frame index within the utterance
        static const size_t maxframesperutterance = 65535;
#else          // For Win32, we care to keep it inside 32 bits. We have already encountered setups where that's not enough.
        unsigned int chunkindex : 13;    // lives in this chunk (index into randomizedchunks[])
        unsigned int utteranceindex : 8; // utterance index in that chunk
        static const size_t maxutterancesperchunk = 255;
        unsigned int frameindex : 11; // frame index within the utterance
        static const size_t maxframesperutterance = 2047;
#endif
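        // Note: in the 32-bit layout the three bit fields pack into 13 + 8 + 11 = 32 bits, i.e. at most 8192 chunks
        // of up to 255 utterances with up to 2047 frames each (with ~15-minute chunks, roughly 2,000 hours of data).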
        frameref(size_t ci, size_t ui, size_t fi)
            : chunkindex((unsigned short) ci), utteranceindex((unsigned short) ui), frameindex((unsigned short) fi)
        {
#ifdef _WIN32
            static_assert(sizeof(frameref) == 4, "frameref: bit fields too large to fit into 32-bit integer");
#endif
            if (ci == chunkindex && ui == utteranceindex && fi == frameindex)
                return;
            throw std::logic_error("frameref: bit fields too small");
        }
        frameref()
            : chunkindex(0), utteranceindex(0), frameindex(0)
        {
        }
    };
    biggrowablevector<frameref> randomizedframerefs; // [globalt-sweepts] -> (chunk, utt, frame) lookup table for randomized frames  --this can be REALLY big!
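    // (At 100 frames/sec that is 360,000 entries per hour of data, i.e. roughly 1.5-3 GB per 1,000 hours,
    //  depending on whether frameref is 4 or 8 bytes on the given platform.)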

    // TODO: this may go away if we store classids directly in the utterance data
    template <class VECTOR>
    class shiftedvector // accessing a vector with a non-0 starting index
    {
        void operator=(const shiftedvector &);
        VECTOR &v;
        size_t first;
        size_t n;
        void check(size_t i) const
        {
            if (i >= n)
                throw std::logic_error("shiftedvector: index out of bounds");
        }

    public:
        shiftedvector(VECTOR &v, size_t first, size_t n)
            : v(v), first(first), n(n)
        {
        }
        // TODO: the following is not templated--do it if needed; also should return a const reference then
        size_t operator[](size_t i) const
        {
            check(i);
            return v[first + i];
        }
    };
    template <class UTTREF>
    std::vector<shiftedvector<biggrowablevector<CLASSIDTYPE>>> getclassids(const UTTREF &uttref) // return sub-vector of classids[] for a given utterance
    {
        std::vector<shiftedvector<biggrowablevector<CLASSIDTYPE>>> allclassids;

        if (!issupervised())
        {
            foreach_index (i, classids)
                allclassids.push_back(std::move(shiftedvector<biggrowablevector<CLASSIDTYPE>>((*classids[i]), 0, 0)));
            return allclassids; // nothing to return
        }
        const auto &chunk = randomizedchunks[0][uttref.chunkindex];
        const auto &chunkdata = chunk.getchunkdata();
        const size_t classidsbegin = chunkdata.getclassidsbegin(uttref.utteranceindex); // index of first state label in global concatenated classids[] array
        const size_t n = chunkdata.numframes(uttref.utteranceindex);
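        // classids[] layout: each utterance's n frame labels are stored consecutively, followed by a (CLASSIDTYPE) -1
        // boundary marker (appended in the constructor), which is what the check below verifies.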
        foreach_index (i, classids)
        {
            if ((*classids[i])[classidsbegin + n] != (CLASSIDTYPE) -1)
                throw std::logic_error("getclassids: expected boundary marker not found, internal data structure screwed up");
            allclassids.push_back(std::move(shiftedvector<biggrowablevector<CLASSIDTYPE>>((*classids[i]), classidsbegin, n)));
        }
        return allclassids; // one sub-vector of class ids per label stream
    }

public:
    // constructor
    // Pass empty labels to denote unsupervised training (so getbatch() will not return uids).
    // This mode requires utterances with time stamps.
    minibatchutterancesourcemulti(std::vector<msra::asr::FeatureSection *> &featuresections, const std::vector<std::vector<std::wstring>> &infiles, const std::vector<std::map<std::wstring, std::vector<msra::asr::htkmlfentry>>> &labels,
                                  std::vector<size_t> vdim, std::vector<size_t> udim, std::vector<size_t> leftcontext, std::vector<size_t> rightcontext, size_t randomizationrange,
                                  const latticesource &lattices, const std::map<std::wstring, msra::lattices::lattice::htkmlfwordsequence> &allwordtranscripts, const bool framemode)
        : vdim(vdim), leftcontext(leftcontext), rightcontext(rightcontext), sampperiod(0), featdim(0), randomizationrange(randomizationrange), currentsweep(SIZE_MAX), lattices(lattices), allwordtranscripts(allwordtranscripts), framemode(framemode), chunksinram(0), timegetbatch(0), verbosity(2)
    // [v-hansu] change framemode (lattices.empty()) into framemode (false) to run utterance mode without lattice
    // you also need to change another line, search : [v-hansu] comment out to run utterance mode without lattice
    {
        // process infiles to know dimensions of things (but not loading features)
        std::vector<utterancedesc> utteranceset; // read all utterances to here first; at the end, distribute to chunks
        utteranceset.reserve(infiles.size());
        size_t nomlf = 0;               // number of entries missing in MLF (diagnostics)
        size_t nolat = 0;               // number of entries missing in lattice archive (diagnostics)
        std::vector<size_t> numclasses; // number of output classes as found in the label file (diagnostics)
        _totalframes = 0;
        std::wstring key;
        size_t numutts = 0;

        std::vector<bool> uttisvalid;    // boolean flag to check that utterance is valid. valid means number of
                                         // frames is consistent across all feature and label streams
        std::vector<size_t> uttduration; // track utterance durations to determine utterance validity

        std::vector<size_t> classidsbegin;
        if (!lattices.empty())
        {
            LogicError("lattices not supported in minibatchutterancesourcemulti");
        }

        allchunks = std::vector<std::vector<utterancechunkdata>>(infiles.size(), std::vector<utterancechunkdata>());
        featdim = std::vector<size_t>(infiles.size(), 0);
        sampperiod = std::vector<unsigned int>(infiles.size(), 0);
        featkind = std::vector<string>(infiles.size(), "");

        numclasses = std::vector<size_t>(labels.size(), 0);
        counts = std::vector<std::vector<size_t>>(labels.size(), std::vector<size_t>());
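        // counts[j][c] will accumulate, for label stream j, the number of frames labeled with class c (used for estimating class priors)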
        foreach_index (i, labels)
        {
            classids.push_back(std::unique_ptr<biggrowablevector<CLASSIDTYPE>>(new biggrowablevector<CLASSIDTYPE>()));
            // std::pair<std::vector<std::wstring>,std::vector<std::wstring>> latticetocs;
            // std::unordered_map<std::string,size_t> modelsymmap;
            // lattices.push_back(std::shared_ptr<latticesource>(new latticesource(latticetocs, modelsymmap)));
        }

        // first check consistency across feature streams
        // We'll go through the SCP files for each stream to make sure the duration is consistent
        // If not, we'll plan to ignore the utterance, and inform the user
        // m indexes the feature stream
    // i indexes the files within a stream (i.e., the entries in the SCP file)
        foreach_index (m, infiles)
        {
            if (m == 0)
            {
                numutts = infiles[m].size();
                uttisvalid = std::vector<bool>(numutts, true);
                uttduration = std::vector<size_t>(numutts, 0);
            }
            else if (infiles[m].size() != numutts)
                throw std::runtime_error("minibatchutterancesourcemulti: all feature files must have same number of utterances");

            foreach_index (i, infiles[m])
            {
                utterancedesc utterance(msra::asr::htkfeatreader::parsedpath(infiles[m][i], featuresections[m]), 0); // mseltzer - is this foolproof for multiio? is classids always non-empty?
                const size_t uttframes = utterance.numframes();                                                      // will throw if frame bounds not given --required to be given in this mode
                // we need at least 2 frames for boundary markers to work
                if (uttframes < 2)
                {
                    // throw std::runtime_error("minibatchutterancesource: utterances < 2 frames not supported");
                    fprintf(stderr, "minibatchutterancesource: skipping %d-th file (%zu frames) because it has fewer than %d frames (needed for the boundary markers): %S\n",
                            i, uttframes, 2, utterance.key().c_str());
                    uttduration[i] = 0;
                    uttisvalid[i] = false;
                }
                else if (uttframes > frameref::maxframesperutterance)
                {
                    fprintf(stderr, "minibatchutterancesource: skipping %d-th file (%zu frames) because it exceeds max. frames (%zu) for frameref bit field: %S\n",
                            i, uttframes, frameref::maxframesperutterance, utterance.key().c_str());
                    uttduration[i] = 0;
                    uttisvalid[i] = false;
                }
                else
                {
                    if (m == 0)
                    {
                        uttduration[i] = uttframes;
                        uttisvalid[i] = true;
                    }
                    else if (uttduration[i] != uttframes)
                    {
                        fprintf(stderr, "minibatchutterancesource: skipping %d-th file due to inconsistent durations across feature streams (%zu vs. %zu frames)\n",
                                i, uttduration[i], uttframes);
                        uttduration[i] = 0;
                        uttisvalid[i] = false;
                    }
                }
            }
        }
        size_t invalidutts = 0;
        foreach_index (i, uttisvalid)
        {
            if (!uttisvalid[i])
                invalidutts++;
        }
        if (invalidutts > uttisvalid.size() / 2)
            throw std::runtime_error("minibatchutterancesource: too many files skipped (too short, too long, or inconsistent durations)--assuming broken configuration\n");
        else if (invalidutts > 0)
            fprintf(stderr, "Found inconsistent durations across feature streams in %zu out of %zu files.\n", invalidutts, uttisvalid.size());

        // now process the features and labels
        size_t utterancesetsize = 0;
        foreach_index (m, infiles)
        {
            utteranceset.clear();
            // if (m==0)
            //    numutts = infiles[m].size();
            // else
            //    if (infiles[m].size()!=numutts)
            //        throw std::runtime_error("minibatchutterancesourcemulti: all feature files must have same number of utterances\n");
            if (m == 0)
                classidsbegin.clear();

            foreach_index (i, infiles[m])
            {
                if (i % (infiles[m].size() / 100 + 1) == 0)
                {
                    fprintf(stderr, ".");
                    fflush(stderr);
                }
                // build utterance descriptor
                if (m == 0 && !labels.empty())
                    classidsbegin.push_back(classids[0]->size());

                if (uttisvalid[i])
                {
                    utterancedesc utterance(msra::asr::htkfeatreader::parsedpath(infiles[m][i], featuresections[m]), labels.empty() ? 0 : classidsbegin[i]); // mseltzer - is this foolproof for multiio? is classids always non-empty?
                    const size_t uttframes = utterance.numframes();                                                                                          // will throw if frame bounds not given --required to be given in this mode
                    assert(uttframes == uttduration[i]);                                                                                                     // ensure nothing funky happened

                    // already performed these checks above
                    // we need at least 2 frames for boundary markers to work
                    // if (uttframes < 2)
                    //    throw std::runtime_error ("minibatchutterancesource: utterances < 2 frames not supported");
                    // if (uttframes > frameref::maxframesperutterance)
                    // {
                    //    fprintf (stderr, "minibatchutterancesource: skipping %d-th file (%d frames) because it exceeds max. frames (%d) for frameref bit field: %S", i, uttframes, frameref::maxframesperutterance, key.c_str());
                    //    continue;
                    // }

                    // check whether we have the ref transcript
                    // auto labelsiter = labels[0].end();
                    bool lacksmlf = true;
                    if (!labels.empty()) // empty means unsupervised mode (don't load any)
                    {
                        key = utterance.key();
                        // check if labels are available (if not, it normally means that no path was found in realignment)
                        auto labelsiter = labels[0].find(key);
                        // const bool lacksmlf = (labelsiter == labels[0].end());
                        lacksmlf = (labelsiter == labels[0].end());
                        if (lacksmlf)
                            if (nomlf++ < 5)
                                fprintf(stderr, " [no labels for  %S]", key.c_str());
                        // check if lattice is available (when in lattice mode)
                        // TODO: also check the #frames here; requires a design change of the TOC format & a rerun
                        const bool lackslat = !lattices.empty() && !lattices.haslattice(key); // ('true' if we have no lattices)
                        if (lackslat)
                            if (nolat++ < 5)
                                fprintf(stderr, " [no lattice for %S]", key.c_str());
                        // skip if either one is missing
                        if (lacksmlf || lackslat)
                        {
                            uttisvalid[i] = false;
                            continue; // skip this utterance altogether
                        }
                    }
                    // push the label sequence into classids[], since we already looked it up
                    // TODO: we can store labels more efficiently now since we don't do frame-wise random access anymore.

                    // OK, utterance has all we need --remember it

                    if (m == 0)
                    {
                        if (!labels.empty() && !lacksmlf)
                        // if (!labels.empty() && labelsiter != labels[0].end())
                        {
                            // first verify that all the label files have the proper duration
                            foreach_index (j, labels)
                            {
                                const auto &labseq = labels[j].find(key)->second;
                                // check if durations match; skip if not
                                size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size() - 1].firstframe + labseq[labseq.size() - 1].numframes);
                                if (labframes != uttframes)
                                {
                                    fprintf(stderr, " [duration mismatch (%zu in label vs. %zu in feat file), skipping %S]", labframes, uttframes, key.c_str());
                                    nomlf++;
                                    uttisvalid[i] = false;
                                    break; // continue;   // skip this utterance at all
                                }
                            }
                            if (uttisvalid[i])
                            {
                                utteranceset.push_back(std::move(utterance));
                                _totalframes += uttframes;
                                // then parse each mlf if the durations are consistent
                                foreach_index (j, labels)
                                {
                                    const auto &labseq = labels[j].find(key)->second;

                                    // expand classid sequence into flat array
                                    foreach_index (k, labseq)
                                    {
                                        const auto &e = labseq[k];
                                        if ((k > 0 && labseq[k - 1].firstframe + labseq[k - 1].numframes != e.firstframe) || (k == 0 && e.firstframe != 0))
                                            throw std::runtime_error(msra::strfun::strprintf("minibatchutterancesource: labels not in consecutive order in MLF label set: %S", key.c_str()));
                                        if (e.classid >= udim[j])
                                        {
                                            throw std::runtime_error(msra::strfun::strprintf("minibatchutterancesource: class id exceeds model output dimension"));
                                        }
                                        if (e.classid != (CLASSIDTYPE) e.classid)
                                            throw std::runtime_error("CLASSIDTYPE has too few bits");
                                        for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++)
                                            classids[j]->push_back((CLASSIDTYPE) e.classid);
                                        numclasses[j] = std::max(numclasses[j], (size_t)(1u + e.classid));
                                        counts[j].resize(numclasses[j], 0);
                                        counts[j][e.classid] += e.numframes;
                                    }
                                    classids[j]->push_back((CLASSIDTYPE) -1); // append a boundary marker for checking

                                    if (!labels[j].empty() && classids[j]->size() != _totalframes + utteranceset.size())
                                        throw std::logic_error(msra::strfun::strprintf("minibatchutterancesource: label duration inconsistent with feature file in MLF label set: %S", key.c_str()));
                                    assert(labels[j].empty() || classids[j]->size() == _totalframes + utteranceset.size());
                                }
                            }
                        }
                        else
                        {
                            assert(classids.empty() && labels.empty());
                            utteranceset.push_back(std::move(utterance));
                            _totalframes += uttframes;
                        }
                    }
                    else
                    {
                        utteranceset.push_back(std::move(utterance));
                    }
                }
            }
            if (m == 0)
                utterancesetsize = utteranceset.size();
            else
                assert(utteranceset.size() == utterancesetsize);
            fprintf(stderr, "feature set %d: %zu frames in %zu out of %zu utterances\n", m, _totalframes, utteranceset.size(), infiles[m].size());

            if (!labels.empty())
            {
                foreach_index (j, labels)
                {
                    biggrowablevector<CLASSIDTYPE> &cid = *classids[j];
                    foreach_index (i, utteranceset)
                    {
                        // if ((*classids[j])[utteranceset[i].classidsbegin + utteranceset[i].numframes()] != (CLASSIDTYPE) -1)
                        // printf("index = %d\n",utteranceset[i].classidsbegin + utteranceset[i].numframes());
                        // printf("cid[index] = %d\n",cid[utteranceset[i].classidsbegin + utteranceset[i].numframes()]);
                        // printf("CLASSIDTYPE(-1) = %d\n",(CLASSIDTYPE) -1);
                        if (cid[utteranceset[i].classidsbegin + utteranceset[i].numframes()] != (CLASSIDTYPE) -1)
                            throw std::logic_error("minibatchutterancesource: classids[] out of sync");
                    }
                }
            }
            if (nomlf + nolat > 0)
            {
                fprintf(stderr, "minibatchutterancesource: out of %zu files, %zu files not found in label set and %zu have no lattice\n", infiles.size(), nomlf, nolat);
                if (nomlf + nolat > infiles[m].size() / 2)
                    throw std::runtime_error("minibatchutterancesource: too many files not found in label set--assuming broken configuration\n");
            }

            if (m == 0)
            {
                foreach_index (j, numclasses)
                {
                    fprintf(stderr, "label set %d: %zu classes\n", j, numclasses[j]);
                }
            }
            // distribute them over chunks
            // We simply count off frames until we reach the chunk size.
            // Note that we first randomize the chunks, i.e. when used, chunks are non-consecutive and thus cause the disk head to seek for each chunk.
            const size_t framespersec = 100;                   // we just assume this; our efficiency calculation is based on this
            const size_t chunkframes = 15 * 60 * framespersec; // number of frames to target for each chunk
            // Loading an initial 24-hour range will involve 96 disk seeks, acceptable.
            // When paging chunk by chunk, chunk size ~14 MB.
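            // (e.g. 90,000 frames of ~40-dimensional float features come to about 14 MB per chunk)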
            std::vector<utterancechunkdata> &thisallchunks = allchunks[m];
            // std::vector<utterancechunkdata>  thisallchunks;

            thisallchunks.resize(0);
            thisallchunks.reserve(_totalframes / chunkframes);
            foreach_index (i, utteranceset)
            {
                // if exceeding current entry--create a new one
                // I.e. our chunks end up a little larger than targeted (on average, by half the average utterance length).
                if (thisallchunks.empty() || thisallchunks.back().totalframes > chunkframes || thisallchunks.back().numutterances() >= frameref::maxutterancesperchunk)
                {
                    thisallchunks.push_back(utterancechunkdata());
                }
                // append utterance to last chunk
                utterancechunkdata &currentchunk = thisallchunks.back();

                currentchunk.push_back(std::move(utteranceset[i])); // move it out from our temp array into the chunk
            }

            numutterances = utteranceset.size();
            fprintf(stderr, "minibatchutterancesource: %zu utterances grouped into %zu chunks, av. chunk size: %.1f utterances, %.1f frames\n",
                    numutterances, thisallchunks.size(), numutterances / (double) thisallchunks.size(), _totalframes / (double) thisallchunks.size());
            // Now utterances are stored exclusively in allchunks[]. They are never referred to by a sequential utterance id at this point, only by chunk/within-chunk index.
        }
        // preliminary mem allocation for frame references (if in frame mode)
        if (framemode)
            randomizedframerefs.resize(_totalframes);
    }

private:
    // shuffle a vector into random order by randomly swapping elements

    template <typename VECTOR>
    static void randomshuffle(VECTOR &v, size_t randomseed)
    {
        if (v.size() > RAND_MAX * (size_t) RAND_MAX)
            throw std::runtime_error("randomshuffle: too large set: need to change to different random generator!");
        srand((unsigned int) randomseed);
        foreach_index (i, v)
        {
            // pick a random location
            const size_t irand = Microsoft::MSR::CNTK::rand(0, v.size());

            // swap element i with it
            if (irand == (size_t) i)
                continue;
            std::swap(v[i], v[irand]);
        }
    }
#if 0
    template<typename VECTOR> static void randomshuffle(std::vector<VECTOR &> v, size_t randomseed)
    {
        foreach_index(j, v)
        {
           if (v[j].size() > RAND_MAX * (size_t) RAND_MAX)
            throw std::runtime_error ("randomshuffle: too large set: need to change to different random generator!");
        }
        srand ((unsigned int) randomseed);

        foreach_index (i, v[0])
        {
           // pick a random location
            const size_t irand = msra::dbn::rand (0, v[0].size());

            foreach_index(j, v){
            // swap element i with it
                if (irand == (size_t) i)
                    continue;
                std::swap (v[j][i], v[j][irand]);
            }
        }
    }
#endif // 0
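    // verify that a value written into a (possibly narrower) bit field, read back as 'fieldval', still equals the original 'targetval'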
    static void checkoverflow(size_t fieldval, size_t targetval, const char *fieldname)
    {
        if (fieldval != targetval)
            throw std::runtime_error(msra::strfun::strprintf("checkoverflow: bit field %s too small for value 0x%zx (cut to 0x%zx)", fieldname, targetval, fieldval));
    }

    // helper for testing whether a swapped frame position is valid (w.r.t. being in RAM when at position 't')
    bool isframepositionvalid(const size_t t, const biggrowablevector<unsigned short> &ttochunk) const
    {
        // look up valid range for time position
        const size_t positionchunkindex = ttochunk[t]; // position 't' lies within this original chunk (relationship is monotonic, not random)
        const auto &chunk = randomizedchunks[0][positionchunkindex];
        // get in-RAM chunk range for this frame position (shared across all frame positions within the same chunk)
        const size_t poswindowbegin = chunk.windowbegin; // rolling window over chunks (which under the hood have been randomized)
        const size_t poswindowend = chunk.windowend;
        // Chunk implies that if we are at position 't', we are guaranteed to have chunks [poswindowbegin, poswindowend) in RAM.

        // now see if the randomized location is within that window
        const size_t actualchunkindexforpos = randomizedframerefs[t].chunkindex; // where this frame pos has been mapped to
        return actualchunkindexforpos >= poswindowbegin && actualchunkindexforpos < poswindowend;
        // We only need to test the chunk index. Utterance and frame can be randomized within a chunk as we want, as long as it is in RAM.
    }

    // big long helper to update all cached randomization information
    // This is a rather complex process since we randomize on two levels:
    //  - chunks of consecutive data in the feature archive
    //  - within a range of chunks that is paged into RAM
    //     - utterances (in utt mode), or
    //     - frames (in frame mode)
    // The 'globalts' parameter is the start time that triggered the rerandomization; it is NOT the base time of the randomized area.
    size_t lazyrandomization(const size_t globalts)
    {
        const size_t sweep = globalts / _totalframes; // which sweep (this determines randomization)
        if (sweep == currentsweep)                    // already got this one--nothing to do
            return sweep;

        currentsweep = sweep;
        if (verbosity > 0)
            fprintf(stderr, "lazyrandomization: re-randomizing for sweep %zu in %s mode\n", currentsweep, framemode ? "frame" : "utterance");

        const size_t sweepts = sweep * _totalframes; // first global frame index for this sweep

        // first randomize chunks
        std::vector<std::vector<std::vector<utterancechunkdata>::const_iterator>> randomizedchunkrefs;
        foreach_index (i, allchunks)
            randomizedchunkrefs.push_back(std::vector<std::vector<utterancechunkdata>::const_iterator>());

        foreach_index (i, allchunks)
            randomizedchunkrefs[i].reserve(allchunks[i].size());

        foreach_index (i, allchunks) // TODO: this cries for iterating using the iterator!
        {
            foreach_index (j, allchunks[i])
                randomizedchunkrefs[i].push_back(allchunks[i].begin() + j);
            assert(randomizedchunkrefs[i].size() == allchunks[i].size());

            // note that since randomshuffle() uses sweep as seed, this will keep the randomization common across all feature streams
            randomshuffle(randomizedchunkrefs[i], sweep); // bring into random order (with random seed depending on sweep)
        }

        // place them onto the global timeline -> randomizedchunks[]
        // We are processing with randomization within a rolling window over this chunk sequence.
        // Paging will happen on a chunk-by-chunk basis.
        // The global time stamp is needed to determine the paging window.
        randomizedchunks.clear(); // data chunks after being brought into random order (we randomize within a rolling window over them)

        foreach_index (i, allchunks)
            randomizedchunks.push_back(std::vector<chunk>());

        foreach_index (i, allchunks)
        {
            randomizedchunks[i].reserve(randomizedchunkrefs[i].size());
            foreach_index (k, randomizedchunkrefs[i])
                randomizedchunks[i].push_back(chunk(randomizedchunkrefs[i][k], randomizedchunks[i].empty() ? 0 : randomizedchunks[i].back().utteranceposend(), randomizedchunks[i].empty() ? sweepts : randomizedchunks[i].back().globalte()));
            assert(randomizedchunks[i].size() == allchunks[i].size());

            assert(randomizedchunks[i].empty() || (randomizedchunks[i].back().utteranceposend() == numutterances && randomizedchunks[i].back().globalte() == sweepts + _totalframes));
        }
        // for each chunk, compute the randomization range (w.r.t. the randomized chunk sequence)
        foreach_index (i, randomizedchunks)
        {
            foreach_index (k, randomizedchunks[i])
            {
                chunk &chunk = randomizedchunks[i][k];
                // start with the range of left neighbor
                if (k == 0)
                {
                    chunk.windowbegin = 0;
                    chunk.windowend = 1;
                }
                else
                {
                    chunk.windowbegin = randomizedchunks[i][k - 1].windowbegin; // might be too early
                    chunk.windowend = randomizedchunks[i][k - 1].windowend;     // might have more space
                }
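                // now trim/extend the window: drop leading chunks that start more than randomizationrange/2 frames
                // before this chunk's start, and append trailing chunks that end within randomizationrange/2 frames after it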
                while (chunk.globalts - randomizedchunks[i][chunk.windowbegin].globalts > randomizationrange / 2)
                    chunk.windowbegin++; // too early
                while (chunk.windowend < randomizedchunks[i].size() && randomizedchunks[i][chunk.windowend].globalte() - chunk.globalts < randomizationrange / 2)
                    chunk.windowend++; // got more space
            }
        }
        if (!framemode) // utterance mode
        {
            // This sets up the following members:
            //  - positionchunkwindows
            //  - randomizedutterancerefs
            //  - randomizedutteranceposmap

            // We will now introduce the concept of utterance *position*.
            // During processing, utterances will be indexed by position (which is in turn derived from a frame index in getbatch()),
            // and it is assumed (required) that positions are requested consecutively.
            // Each utterance position has an underlying associated utterance, which is represented as (chunkid, within-chunk index) and randomly assigned.
            // Each utterance position also has an associated range of chunks that are kept in memory,
            // and the associated underlying utterance is guaranteed to be found within that associated range of chunks.
            // That allows data to be paged out/in while processing utterance positions in a consecutive manner.

            // compute chunk windows for every utterance position -> positionchunkwindows[]
            // Utterance positions can only reference underlying utterance data within the chunk window.
            // Utterance positions are defined by the randomized chunk sequence (i.e. their underlying 'defining' chunk differs from sweep to sweep).
            positionchunkwindows.clear(); // [utterance position] -> [windowbegin, windowend) for controlling paging
            positionchunkwindows.reserve(numutterances);

            // positionchunkwindows should be consistent for all inputs (distinct feature streams), so just build based on feature[0]
            // contains pointer to chunk elements but only to compute index
            foreach_index (k, randomizedchunks[0]) // TODO: this really cries for iterating using iterators!
            {
                chunk &chunk = randomizedchunks[0][k];
                for (size_t i = chunk.utteranceposbegin; i < chunk.utteranceposend(); i++) // loop over utterances in this chunk
                {
                    positionchunkwindows.push_back(randomizedchunks[0].begin() + k);
                }
                // to look up the chunk range in memory for a position, look up the defining chunk and its range
            }
            assert(positionchunkwindows.size() == numutterances);

            // build the randomized utterances array -> randomizedutterancerefs[]
            // start by assigning all utterance positions to utterances in non-random consecutive manner
            randomizedutterancerefs.clear(); // [pos] randomized utterance ids
            randomizedutterancerefs.reserve(numutterances);
            foreach_index (k, randomizedchunks[0])
            {
                chunk &chunk = randomizedchunks[0][k];
                for (size_t i = 0; i < chunk.numutterances(); i++) // loop over utterances in this chunk
                    randomizedutterancerefs.push_back(utteranceref(k, i));
            }
            assert(randomizedutterancerefs.size() == numutterances);
            foreach_index (i, randomizedutterancerefs)
            {
                auto &uttref = randomizedutterancerefs[i];
                assert(positionchunkwindows[i].isvalidforthisposition(uttref));
                (void) uttref; // only used by the assert above; avoids an unused-variable warning in release builds
            }

            // check we got those setup right

            // we now randomly shuffle randomizedutterancerefs[pos], while considering the constraints of what chunk range needs to be in memory
            srand((unsigned int) sweep + 1);
            for (size_t i = 0; i < randomizedutterancerefs.size(); i++)
            {
                // get valid randomization range, expressed in chunks
                const size_t windowbegin = positionchunkwindows[i].windowbegin();
                const size_t windowend = positionchunkwindows[i].windowend();

                // get valid randomization range, expressed in utterance positions
                // Remember, utterance positions are defined by chunks.
                const size_t posbegin = randomizedchunks[0][windowbegin].utteranceposbegin;
                const size_t posend = randomizedchunks[0][windowend - 1].utteranceposend();

                // randomization range for this utterance position is [posbegin, posend)
                for (;;)
                {
                    // pick a random location
                    const size_t j = Microsoft::MSR::CNTK::rand(posbegin, posend); // a random number within the window
                    if (i == j)
                        break; // the random gods say "this one points to its original position"... nothing wrong about that, but better not try to swap

                    // We want to swap utterances at i and j, but need to make sure they remain in their allowed range.
                    // This is guaranteed for a so-far untouched utterance, but both i and j may have been touched by a previous swap.

                    // We want to use the utterance previously referenced at utterance position j at position i. Is that allowed?
                    if (!positionchunkwindows[i].isvalidforthisposition(randomizedutterancerefs[j]))
                        continue; // nope --try another

                    // Likewise may we use the utterance previously referenced at utterance position i at position j?
                    if (!positionchunkwindows[j].isvalidforthisposition(randomizedutterancerefs[i]))
                        continue; // nope --try another

                    // yep--swap them
                    randomizedutterancerefs[i].swap(randomizedutterancerefs[j]);
                    break;
                }
            }

            // place the randomized utterances on the global timeline so we can find them by globalts
            size_t t = sweepts;
            foreach_index (i, randomizedutterancerefs)
            {
                auto &uttref = randomizedutterancerefs[i];
                uttref.globalts = t;
                uttref.numframes = randomizedchunks[0][uttref.chunkindex].getchunkdata().numframes(uttref.utteranceindex);
                t = uttref.globalte();
            }
            assert(t == sweepts + _totalframes);

            // verify that we got it right (I got a knot in my head!)
            foreach_index (i, randomizedutterancerefs)
            {
                // get utterance referenced at this position
                const auto &uttref = randomizedutterancerefs[i];
                // check if it is valid for this position
                if (uttref.chunkindex < positionchunkwindows[i].windowbegin() || uttref.chunkindex >= positionchunkwindows[i].windowend())
                    throw std::logic_error("lazyrandomization: randomization logic mangled!");
            }

            // create lookup table for (globalts values -> pos) -> randomizedutteranceposmap[]
            randomizedutteranceposmap.clear(); // [globalts] -> pos lookup table
            foreach_index (pos, randomizedutterancerefs)
            {
                auto &uttref = randomizedutterancerefs[pos];
                randomizedutteranceposmap[uttref.globalts] = (size_t) pos;
            }
        }
        else // frame mode
        {
            // This sets up the following members:
            //  - randomizedframerefs

            srand((unsigned int) sweep + 1);
            // An original timeline is established by the randomized chunks, denoted by 't'.
            // Returned frames are indexed by frame position j = (globalt - sweept), which have an associated underlying 't'.
            // It is guaranteed that utterance frame position j maps to an underlying frame within the corresponding chunk window.
            biggrowablevector<unsigned short> ttochunk; // randomized chunk index associated with frame position
            ttochunk.resize(_totalframes);
            size_t t = 0;
            frameref frameref;
            // enumerate chunks in their randomized order and assign frame indices in that order -> randomizedframerefs[t]
            // At this point, chunks are in randomized order, but utterances and frames within utterances are not randomized.
            // Later we will randomize those as well.
            foreach_index (i, randomizedchunks[0])
            {
                frameref.chunkindex = (unsigned short) i;
                checkoverflow(frameref.chunkindex, i, "frameref::chunkindex");
                const auto &chunk = randomizedchunks[0][i];
                const auto &chunkdata = chunk.getchunkdata();
                const size_t numutt = chunkdata.numutterances();
                for (size_t k = 0; k < numutt; k++)
                {
                    frameref.utteranceindex = (unsigned short) k;
                    checkoverflow(frameref.utteranceindex, k, "frameref::utteranceindex");
                    const size_t n = chunkdata.numframes(k);
                    for (size_t m = 0; m < n; m++)
                    {
                        frameref.frameindex = (unsigned short) m;
                        checkoverflow(frameref.frameindex, m, "frameref::frameindex");
                        randomizedframerefs[t] = frameref; // hopefully this is a memory copy, not a bit-wise assignment! If not, then code it explicitly
                        ttochunk[t] = (unsigned short) i;
                        checkoverflow(ttochunk[t], i, "ttochunk[]");
                        t++;
                    }
                }
            }
            assert(t == _totalframes);

            // now randomize them --we use the nested loop again to avoid storing a backpointer
            // The condition is that a randomized frame may not be moved out of its associated chunk window.
            foreach_index (t, randomizedframerefs)
            {
                const size_t positionchunkindex = ttochunk[t];               // position 't' lies within this chunk (relationship is monotonic, not random)
                const auto &chunk = randomizedchunks[0][positionchunkindex]; // for window

                // get in-RAM chunk range for this frame position (shared across all frame positions within the same chunk)
                const size_t poswindowbegin = chunk.windowbegin; // rolling window over chunks (which under the hood have been randomized)
                const size_t poswindowend = chunk.windowend;
                // Chunk implies that if we are at position 't', we are guaranteed to have chunks [poswindowbegin, poswindowend) in RAM.
                // These chunks are associated with a range of frame positions.
                // It is implied that if we are at position 't', the frames covered by chunks [poswindowbegin, poswindowend) are in RAM.
                const size_t postbegin = randomizedchunks[0][poswindowbegin].globalts - sweepts;
                const size_t postend = randomizedchunks[0][poswindowend - 1].globalte() - sweepts;
                // The frame position that this frame gets swapped to must lie within [postbegin, postend).

                for (;;) // (randomization retry loop)
                {
                    size_t tswap = Microsoft::MSR::CNTK::rand(postbegin, postend); // random frame position within allowed range
                    // We want to swap 't' to 'tswap' and 'tswap' to 't'.
                    //  - Both may have been swapped before.
                    //  - Both must stay within the randomization window of their respective position.
                    // check admissibility of where the element at 'tswap' gets swapped to 't' (range = [windowbegin,windowend))
                    size_t tswapchunkindex = randomizedframerefs[tswap].chunkindex;
                    if (tswapchunkindex < poswindowbegin || tswapchunkindex >= poswindowend)
                        continue;
                    // check admissibility of where the element at t gets swapped to (which is frame position 'tswap')
                    const size_t sourcechunkindex = randomizedframerefs[t].chunkindex;
                    size_t targetchunkindex = ttochunk[tswap]; // chunk associated with this frame position defines value range
                    const auto &targetchunk = randomizedchunks[0][targetchunkindex];
                    const size_t targetwindowbegin = targetchunk.windowbegin;
                    const size_t targetwindowend = targetchunk.windowend;
                    if (sourcechunkindex < targetwindowbegin || sourcechunkindex >= targetwindowend)
                        continue;
                    // admissible--swap the two
                    std::swap(randomizedframerefs[t], randomizedframerefs[tswap]);
#if 0
                    break;
#else // post-check  --has not triggered so far and can probably be removed

                    // post-check that the swap left both frame positions in a valid state
                    if (isframepositionvalid(t, ttochunk) && isframepositionvalid(tswap, ttochunk))
                        break;
                    // not valid: swap back and retry  --if this ever triggers, the admissibility check above has a bug
                    std::swap(randomizedframerefs[t], randomizedframerefs[tswap]);
                    fprintf(stderr, "lazyrandomization: BUGBUG --invalid swapping condition detected\n");
#endif
                }
            }
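
            // The retry loop above enforces the following admissibility test; a minimal equivalent sketch
            // (under the same member layout; 'swapadmissible' is a hypothetical name, not an existing member):
#if 0
            // swapping frame positions p and q is admissible iff each frame lands inside the chunk window
            // of the position it moves to
            auto swapadmissible = [&](size_t p, size_t q)
            {
                const auto &pwin = randomizedchunks[0][ttochunk[p]];     // chunk window of position p
                const auto &qwin = randomizedchunks[0][ttochunk[q]];     // chunk window of position q
                const size_t pchunk = randomizedframerefs[q].chunkindex; // chunk of the frame moving to p
                const size_t qchunk = randomizedframerefs[p].chunkindex; // chunk of the frame moving to q
                return pchunk >= pwin.windowbegin && pchunk < pwin.windowend &&
                       qchunk >= qwin.windowbegin && qchunk < qwin.windowend;
            };
#endif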

            // verify the result --every frame must still lie within the chunk window of its frame position
            t = 0;
            foreach_index (i, randomizedchunks[0])
            {
                const auto &chunk = randomizedchunks[0][i]; // for window and chunkdata
                const size_t poswindowbegin = chunk.windowbegin;
                const size_t poswindowend = chunk.windowend;

                const auto &chunkdata = chunk.getchunkdata(); // for numutterances/numframes
                const size_t numutt = chunkdata.numutterances();
                for (size_t k = 0; k < numutt; k++)
                {
                    const size_t n = chunkdata.numframes(k);
                    for (size_t m = 0; m < n; m++)
                    {
                        const size_t randomizedchunkindex = randomizedframerefs[t].chunkindex;
                        if (randomizedchunkindex < poswindowbegin || randomizedchunkindex >= poswindowend)
                            throw std::logic_error("lazyrandomization: nope, you got frame randomization wrong, dude");
                        t++;
                    }
                }
            }
            assert(t == _totalframes);
        }

        return sweep;
    }

    // helper to page out a chunk with log message
    void releaserandomizedchunk(size_t k)
    {
        size_t numreleased = 0;
        foreach_index (m, randomizedchunks)
        {
            auto &chunkdata = randomizedchunks[m][k].getchunkdata();
            if (chunkdata.isinram())
            {
                if (verbosity)
                    fprintf(stderr, "releaserandomizedchunk: paging out randomized chunk %zu (frame range [%zu..%zu]), %zu resident in RAM\n",
                            k, randomizedchunks[m][k].globalts, randomizedchunks[m][k].globalte() - 1, chunksinram - 1);
                chunkdata.releasedata();
                numreleased++;
            }
        }
        if (numreleased > 0 && numreleased < randomizedchunks.size())
        {
            LogicError("releaserandomizedchunk: inconsistency detected - some inputs have chunks in ram, some not");
        }
        else if (numreleased == randomizedchunks.size())
        {
            chunksinram--;
        }
        return;
    }

    // helper to page in a chunk for a given utterance
    // (window range passed in for checking only)
    // Returns true if we actually did read something.
    bool requirerandomizedchunk(const size_t chunkindex, const size_t windowbegin, const size_t windowend)
    {
        size_t numinram = 0;

        if (chunkindex < windowbegin || chunkindex >= windowend)
            throw std::logic_error("requirerandomizedchunk: requested utterance outside in-memory chunk range");

        foreach_index (m, randomizedchunks)
        {
            auto &chunk = randomizedchunks[m][chunkindex];
            auto &chunkdata = chunk.getchunkdata();
            if (chunkdata.isinram())
                numinram++;
        }
        if (numinram == randomizedchunks.size())
        {

            return false;
        }
        else if (numinram == 0)
        {
            foreach_index (m, randomizedchunks)
            {
                auto &chunk = randomizedchunks[m][chunkindex];
                auto &chunkdata = chunk.getchunkdata();
                if (verbosity)
                    fprintf(stderr, "feature set %d: requirerandomizedchunk: paging in randomized chunk %zu (frame range [%zu..%zu]), %zu resident in RAM\n", m, chunkindex, chunk.globalts, chunk.globalte() - 1, chunksinram + 1);
                msra::util::attempt(5, [&]() // (reading from network)
                                    {
                                        chunkdata.requiredata(featkind[m], featdim[m], sampperiod[m], this->lattices, verbosity);
                                    });
            }
            chunksinram++;
            return true;
        }
        else
        {
            LogicError("requirerandomizedchunk: inconsistency detected - some inputs need chunks paged in, some not");
        }
    }
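
    // Typical calling pattern (as used by getbatch() below): evict all chunks outside the active chunk
    // window, then page in the chunks the minibatch actually touches, e.g.
    //     for (size_t k = 0; k < windowbegin; k++)                        releaserandomizedchunk(k);
    //     for (size_t k = windowend; k < randomizedchunks[0].size(); k++) releaserandomizedchunk(k);
    //     readfromdisk |= requirerandomizedchunk(chunkindex, windowbegin, windowend);
    // ('windowbegin', 'windowend' and 'chunkindex' here stand for the chunk window of the current
    // minibatch and the chunk being touched, respectively.)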

    class matrixasvectorofvectors // wrapper around a matrix that views it as a vector of column vectors
    {
        void operator=(const matrixasvectorofvectors &); // non-assignable
        msra::dbn::matrixbase &m;

    public:
        matrixasvectorofvectors(msra::dbn::matrixbase &m)
            : m(m)
        {
        }
        size_t size() const
        {
            return m.cols();
        }
        const_array_ref<float> operator[](size_t j) const
        {
            return array_ref<float>(&m(0, j), m.rows());
        }
    };
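
    // Usage sketch (hypothetical feature matrix 'm', one column per frame): augmentneighbors() below only
    // needs frames[j].size() and frames[j][i], which this adapter provides as zero-copy column views, e.g.
    //     matrixasvectorofvectors frames(m);
    //     for (size_t j = 0; j < frames.size(); j++)
    //         const_array_ref<float> frame = frames[j]; // column j of m, length m.rows()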

    size_t chunkforframepos(const size_t t) const // find chunk for a given frame position
    {
        // inspect chunk of first feature stream only
        auto iter = std::lower_bound(randomizedchunks[0].begin(), randomizedchunks[0].end(), t, [&](const chunk &chunk, size_t t)
                                     {
                                         return chunk.globalte() <= t;
                                     });
        const size_t chunkindex = iter - randomizedchunks[0].begin();
        if (t < randomizedchunks[0][chunkindex].globalts || t >= randomizedchunks[0][chunkindex].globalte())
            throw std::logic_error("chunkforframepos: dude, learn STL!");
        return chunkindex;
    }
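
    // Worked example (hypothetical chunk sizes): with chunks covering global frame positions
    // [0..99], [100..249] and [250..399] in the current sweep, chunkforframepos(130) returns 1,
    // the first chunk whose globalte() exceeds 130; the subsequent range check then passes since
    // 100 <= 130 < 250.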

public:
    void setverbosity(int newverbosity)
    {
        verbosity = newverbosity;
    }

    // get the next minibatch
    // A minibatch is made up of one or more utterances.
    // We will return less than 'framesrequested' unless the first utterance is too long.
    // Note that this may return frames that are beyond the epoch end, but the first frame is always within the epoch.
    // We specify the utterance by its global start time (in the space of an infinitely repeated training set).
    // This is efficient since getbatch() is called with sequential 'globalts' except at epoch start.
    // Note that the start of an epoch does not necessarily fall onto an utterance boundary. The caller must use firstvalidglobalts() to find the first valid globalts at or after a given time.
    //
    //
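    // A minimal caller sketch (hypothetical names 'source', 'epochstart', 'epochend', 'requestedmbsize';
    // in practice the minibatch iterator drives this): start each epoch at firstvalidglobalts() and advance
    // 'globalts' by the number of frames actually delivered:
    //     size_t globalts = source.firstvalidglobalts(epochstart); // epoch start may fall inside an utterance
    //     while (globalts < epochend)
    //     {
    //         source.getbatch(globalts, requestedmbsize, feat, uids, utteranceinfo, transcripts, latticepairs);
    //         globalts += feat[0].cols(); // frames actually returned (may be fewer than requested)
    //     }
    // (feat, uids, utteranceinfo, transcripts and latticepairs are output containers declared as in the signature below.)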
    /*implement*/ bool getbatch(const size_t globalts, const size_t framesrequested, std::vector<msra::dbn::matrix> &feat,
                                std::vector<std::vector<size_t>> &uids, std::vector<std::pair<std::wstring, size_t>> &utteranceinfo,
                                std::vector<const_array_ref<msra::lattices::lattice::htkmlfwordsequence::word>> &transcripts,
                                std::vector<std::shared_ptr<const latticesource::latticepair>> &latticepairs)
    {
        bool readfromdisk = false; // return value: shall be 'true' if we paged in anything

        auto_timer timergetbatch;
        assert(_totalframes > 0);

        // Clears <utteranceinfo> vector.
        utteranceinfo.clear();

        // update randomization if a new sweep is entered  --this is a complex operation that updates many of the data members used below
        const size_t sweep = lazyrandomization(globalts);

        const std::vector<char> noboundaryflags; // dummy
        if (!framemode)                          // regular utterance mode
        {

            // find utterance position for globalts
            // There must be a precise match; it is not possible to specify frames that are not on boundaries.
            auto positer = randomizedutteranceposmap.find(globalts);
            if (positer == randomizedutteranceposmap.end())
                throw std::logic_error("getbatch: invalid 'globalts' parameter; must match an existing utterance boundary");
            const size_t spos = positer->second;

            // determine how many utterances will fit into the requested minibatch size
            size_t mbframes = randomizedutterancerefs[spos].numframes; // at least one utterance, even if too long
            size_t epos;
            for (epos = spos + 1; epos < numutterances && mbframes + randomizedutterancerefs[epos].numframes < framesrequested; epos++) // add more utterances as long as they fit within requested minibatch size
                mbframes += randomizedutterancerefs[epos].numframes;

            // do some paging housekeeping
            // This will also set the feature-kind information if it's the first time.
            // Free all chunks left of the range.
            // Page-in all chunks right of the range.
            // We are a little more blunt for now: Free all outside the range, and page in only what is touched. We could save some loop iterations.
            const size_t windowbegin = positionchunkwindows[spos].windowbegin();
            const size_t windowend = positionchunkwindows[epos - 1].windowend();
            for (size_t k = 0; k < windowbegin; k++)
                releaserandomizedchunk(k);
            for (size_t k = windowend; k < randomizedchunks[0].size(); k++)
                releaserandomizedchunk(k);

            for (size_t pos = spos; pos < epos; pos++)
                readfromdisk |= requirerandomizedchunk(randomizedutterancerefs[pos].chunkindex, windowbegin, windowend); // (window range passed in for checking only)

            // resize feat and uids
            feat.resize(vdim.size());
            uids.resize(classids.size());
            assert(feat.size() == vdim.size());
            assert(feat.size() == randomizedchunks.size());
            foreach_index (i, feat)
            {
                feat[i].resize(vdim[i], mbframes);

                if (i == 0)
                {
                    foreach_index (j, uids)
                    {
                        if (issupervised()) // empty means unsupervised training -> return empty uids
                            uids[j].resize(mbframes);
                        else
                            uids[j].clear();
                        latticepairs.clear(); // will push_back() below
                        transcripts.clear();
                    }
                }
            }
            // return these utterances
            // fprintf (stderr, "getbatch: getting utterances %d..%d (%d frames out of %d requested) in sweep %d\n", spos, epos -1, mbframes, framesrequested, sweep);
            size_t tspos = 0; // relative start of utterance 'pos' within the returned minibatch
            for (size_t pos = spos; pos < epos; pos++)
            {
                const auto &uttref = randomizedutterancerefs[pos];
                size_t n = 0;
                foreach_index (i, randomizedchunks)
                {
                    const auto &chunk = randomizedchunks[i][uttref.chunkindex];
                    const auto &chunkdata = chunk.getchunkdata();
                    assert(uttref.globalts == globalts + tspos);
                    auto uttframes = chunkdata.getutteranceframes(uttref.utteranceindex);
                    matrixasvectorofvectors uttframevectors(uttframes); // (wrapper that allows m[j].size() and m[j][i] as required by augmentneighbors())
                    n = uttframevectors.size();
                    assert(n == uttframes.cols() && uttref.numframes == n && chunkdata.numframes(uttref.utteranceindex) == n);

                    // copy the frames (with context-window augmentation)
                    for (size_t t = 0; t < n; t++) // t = time index into source utterance
                    {
                        size_t leftextent, rightextent;
                        // page in the needed range of frames
                        if (leftcontext[i] == 0 && rightcontext[i] == 0)
                        {
                            leftextent = rightextent = augmentationextent(uttframevectors[t].size(), vdim[i]);
                        }
                        else
                        {
                            leftextent = leftcontext[i];
                            rightextent = rightcontext[i];
                        }
                        augmentneighbors(uttframevectors, noboundaryflags, t, leftextent, rightextent, feat[i], t + tspos);
                        // augmentneighbors(uttframevectors, noboundaryflags, t, feat[i], t + tspos);
                    }

                    // copy the class labels, lattices and word transcripts (first feature stream only)
                    if (i == 0)
                    {
                        // Sets utterance ID information.
                        utteranceinfo.push_back(std::make_pair(chunkdata.utteranceset[uttref.utteranceindex].key(), uttref.numframes));

                        auto uttclassids = getclassids(uttref);
                        foreach_index (j, uttclassids)
                        {
                            for (size_t t = 0; t < n; t++) // t = time index into source utterance
                            {
                                if (issupervised())
                                    uids[j][t + tspos] = uttclassids[j][t];
                            }

                            if (!this->lattices.empty())
                            {
                                auto latticepair = chunkdata.getutterancelattice(uttref.utteranceindex);
                                latticepairs.push_back(latticepair);
                                // look up reference
                                const auto &key = latticepair->getkey();
                                if (!allwordtranscripts.empty())
                                {
                                    const auto &transcript = allwordtranscripts.find(key)->second;
                                    transcripts.push_back(transcript.words);
                                }
                            }
                        }
                    }
                }
                tspos += n;
            }
            assert(tspos == mbframes);
        }
        else // frame mode: return randomized frames, originally added to see whether convergence is better (we don't ensure non-repetition at this point)
        {
            const size_t sweepts = sweep * _totalframes;                      // first global frame index for this sweep
            const size_t sweepte = sweepts + _totalframes;                    // and its end
            const size_t globalte = std::min(globalts + framesrequested, sweepte); // we return as much as requested, but not exceeding sweep end
            const size_t mbframes = globalte - globalts;                      // that's our mb size
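            // Worked example (hypothetical numbers): with _totalframes = 1000, sweep = 2, globalts = 2990
            // and framesrequested = 256, we get sweepts = 2000, sweepte = 3000, globalte = min(3246, 3000) = 3000
            // and hence mbframes = 10 -- the minibatch is clipped at the sweep end rather than crossing into
            // the next sweep.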

            // determine window range
            // We enumerate all frames--can this be done more efficiently?
            const size_t firstchunk = chunkforframepos(globalts);
            const size_t lastchunk = chunkforframepos(globalte - 1);
            const size_t windowbegin = randomizedchunks[0][firstchunk].windowbegin;
            const size_t windowend = randomizedchunks[0][lastchunk].windowend;
            if (verbosity)
                fprintf(stderr, "getbatch: getting randomized frames [%zu..%zu] (%zu frames out of %zu requested) in sweep %zu; chunks [%zu..%zu] -> chunk window [%zu..%zu)\n",
                        globalts, globalte, mbframes, framesrequested, sweep, firstchunk, lastchunk, windowbegin, windowend);
            // release all data outside, and page in all data inside
            for (size_t k = 0; k < windowbegin; k++)
                releaserandomizedchunk(k);
            for (size_t k = windowbegin; k < windowend; k++)
                readfromdisk |= requirerandomizedchunk(k, windowbegin, windowend); // (window range passed in for checking only, redundant here)
            for (size_t k = windowend; k < randomizedchunks[0].size(); k++)
                releaserandomizedchunk(k);

            // resize feat and uids
            feat.resize(vdim.size());
            uids.resize(classids.size());
            assert(feat.size() == vdim.size());
            assert(feat.size() == randomizedchunks.size());
            foreach_index (i, feat)
            {
                feat[i].resize(vdim[i], mbframes);

                if (i == 0)
                {
                    foreach_index (j, uids)
                    {
                        if (issupervised()) // empty means unsupervised training -> return empty uids
                            uids[j].resize(mbframes);
                        else
                            uids[j].clear();
                        latticepairs.clear(); // will push_back() below
                        transcripts.clear();
                    }
                }
            }

            // return randomized frames for the time range of those utterances
            for (size_t j = 0; j < mbframes; j++)
            {
                // map to time index inside arrays
                const size_t framepos = (globalts + j) % _totalframes; // using mod because we may actually run beyond the sweep for the last call
                const frameref &frameref = randomizedframerefs[framepos];

                // random utterance
                readfromdisk |= requirerandomizedchunk(frameref.chunkindex, windowbegin, windowend); // (this is just a check; should not actually page in anything)

                foreach_index (i, randomizedchunks)
                {
                    const auto &chunk = randomizedchunks[i][frameref.chunkindex];
                    const auto &chunkdata = chunk.getchunkdata();
                    auto uttframes = chunkdata.getutteranceframes(frameref.utteranceindex);
                    matrixasvectorofvectors uttframevectors(uttframes); // (wrapper that allows m[j].size() and m[j][i] as required by augmentneighbors())
                    const size_t n = uttframevectors.size();
                    assert(n == uttframes.cols() && chunkdata.numframes(frameref.utteranceindex) == n);
                    (void) n; // reference n to avoid an unused-variable warning when asserts are compiled out

                    // copy frame and class labels
                    const size_t t = frameref.frameindex;

                    size_t leftextent, rightextent;
                    // page in the needed range of frames
                    if (leftcontext[i] == 0 && rightcontext[i] == 0)
                    {
                        leftextent = rightextent = augmentationextent(uttframevectors[t].size(), vdim[i]);
                    }
                    else
                    {
                        leftextent = leftcontext[i];
                        rightextent = rightcontext[i];
                    }
                    augmentneighbors(uttframevectors, noboundaryflags, t, leftextent, rightextent, feat[i], j);

                    // augmentneighbors(uttframevectors, noboundaryflags, t, feat[i], j);
                    if (issupervised() && i == 0)
                    {
                        auto frameclassids = getclassids(frameref);
                        foreach_index (k, uids)
                            uids[k][j] = frameclassids[k][t];
                    }
                }
            }
        }
        timegetbatch = timergetbatch;
        return readfromdisk;
    }
    double gettimegetbatch()
    {
        return timegetbatch;
    }

    // overload taking a single feature matrix and a single uid vector -- not supported by this multi-input/multi-output source
    /*implement*/ bool getbatch(const size_t /*globalts*/, const size_t /*framesrequested*/, msra::dbn::matrix & /*feat*/,
                                std::vector<size_t> & /*uids*/, std::vector<std::pair<std::wstring, size_t>> & /*utterances*/,
                                std::vector<const_array_ref<msra::lattices::lattice::htkmlfwordsequence::word>> & /*transcripts*/,
                                std::vector<std::shared_ptr<const latticesource::latticepair>> & /*latticepairs*/)
    {
        // should never get here
        throw std::runtime_error("minibatchframesourcemulti: getbatch() being called for single input feature and single output feature, should use minibatchutterancesource instead\n");

        // for single input/output set size to be 1 and run old getbatch
        // feat.resize(1);
        // uids.resize(1);
        // return getbatch(globalts, framesrequested, feat[0], uids[0], transcripts, latticepairs);
    }
    size_t totalframes() const
    {
        return _totalframes;
    }

    // return first valid globalts to ask getbatch() for
    // In utterance mode, the epoch start may fall in the middle of an utterance.
    // We return the end time of that utterance (which, in pathological cases, may itself lie beyond the epoch end; the caller must handle that case).
    /*implement*/ size_t firstvalidglobalts(const size_t globalts)
    {
        // update randomization if a new sweep is entered  --this is a complex operation that updates many of the data members used below
        const size_t sweep = lazyrandomization(globalts);
        // frame mode: start at sweep boundary directly
        if (framemode)
            return globalts;
        // utterance mode
        assert(globalts >= sweep * _totalframes && globalts < (sweep + 1) * _totalframes);
        (void) sweep; // referenced only by the assert above; avoids an unused-variable warning in release builds
        foreach_index (pos, randomizedutterancerefs)
            if (randomizedutterancerefs[pos].globalts >= globalts)
                return randomizedutterancerefs[pos].globalts; // exact or inexact match
        return randomizedutterancerefs.back().globalte();     // boundary case: requested time falls within the last utterance
    }
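
    // Worked example (hypothetical utterance layout): if the randomized utterances of the current sweep
    // start at global positions 0, 37 and 81, then firstvalidglobalts(40) returns 81 -- the first utterance
    // boundary at or after 40 -- while a request past the last start position returns the sweep end via
    // globalte() of the final utterance.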

    const std::vector<size_t> &unitcounts() const
    {
        return counts[0];
    }
    const std::vector<size_t> &unitcounts(size_t index) const
    {
        return counts[index];
    }
};
} // namespace dbn
} // namespace msra