Content - 0e1c4370fbe1a4843a488fb596679a0d88180468 - 89490fe/Source/Readers/Kaldi2Reader/latticearchive.cpp

visit type:
Tip revision: e28e66abad9b51b4b56f7e0da730e215e549dacd authored by Emad Barsoum on 12 January 2018, 02:09:28 UTC
Merge branch 'swish' of https://github.com/lakshayg/CNTK into lakshayg-swish
Tip revision: e28e66a
latticearchive.cpp
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#pragma once

#include "stdafx.h"
#include "basetypes.h"
#include "fileutil.h"
#include "htkfeatio.h" // for MLF reading for numer lattices
#include "latticearchive.h"
#include "msra_mgram.h" // for MLF reading for numer lattices
#include <stdio.h>
#include <stdint.h>
#include <vector>
#include <string>
#include <set>
#include <hash_map>
#include <regex>

#pragma warning(disable : 4996)
namespace msra { namespace lattices {

// helper to write a symbol hash (string -> int) to a file
// File has two sections:
//  - physicalunitname     // line number is mapping, starting with 0
//  - logunitname physicalunitname   // establishes a mapping; logunitname will get the same numeric index as physicalunitname
template <class UNITMAP>
static void writeunitmap(const wstring &symlistpath, const UNITMAP &unitmap)
{
    std::vector<std::string> units;
    units.reserve(unitmap.size());
    std::vector<std::string> mappings;
    mappings.reserve(unitmap.size());
    for (auto iter = unitmap.cbegin(); iter != unitmap.cend(); iter++) // why would 'for (auto iter : unitmap)' not work?
    {
        const std::string label = iter->first;
        const size_t unitid = iter->second;
        if (units.size() <= unitid)
            units.resize(unitid + 1); // we grow it on demand; the result must be compact (all entries filled), we check that later
        if (!units[unitid].empty())   // many-to-one mapping: remember the unit; look it up while writing
            mappings.push_back(label);
        else
            units[unitid] = label;
    }

    auto_file_ptr flist = fopenOrDie(symlistpath, L"wb");
    // write (physical) units
    foreach_index (k, units)
    {
        if (units[k].empty())
            throw std::logic_error("build: unitmap has gaps");
        fprintfOrDie(flist, "%s\n", units[k].c_str());
    }
    // write log-phys mappings
    foreach_index (k, mappings)
    {
        const std::string unit = mappings[k];             // logical name
        const size_t unitid = unitmap.find(unit)->second; // get its unit id; this indexes the units array
        const std::string tounit = units[unitid];         // and get the name from tehre
        fprintfOrDie(flist, "%s %s\n", unit.c_str(), tounit.c_str());
    }
    fflushOrDie(flist);
}

// (little helper to do a map::find() with default value)
template <typename MAPTYPE, typename KEYTYPE, typename VALTYPE>
static size_t tryfind(const MAPTYPE &map, const KEYTYPE &key, VALTYPE deflt)
{
    auto iter = map.find(key);
    if (iter == map.end())
        return deflt;
    else
        return iter->second;
}

// archive format:
//  - output files of build():
//     - OUTPATH                --the resulting archive (a huge file), simple concatenation of binary blocks
//     - OUTPATH.toc            --contains keys and offsets; this is how content in archive is found
//       KEY=ARCHIVE[BYTEOFFSET]        // where ARCHIVE can be empty, meaning same as previous
//     - OUTPATH.symlist    --list of all unit names encountered, in order of numeric index used in archive (first = index 0)
//                                This file is suitable as an input to HHEd's AU command.
//  - in actual use,
//     - .toc files can be concatenated
//     - .symlist files must remain paired with the archive file
//  - for actual training, user also needs to provide, typically from an HHEd AU run:
//     - OUTPATH.tying          --map from triphone units to senone sequence by name; get full phone set from .symlist above
//       UNITNAME SENONE[2] SENONE[3] SENONE[4]
/*static*/ void archive::build(const std::vector<std::wstring> &infiles, const std::wstring &outpath,
                               const std::unordered_map<std::string, size_t> &modelsymmap,
                               const msra::asr::htkmlfreader<msra::asr::htkmlfentry, msra::lattices::lattice::htkmlfwordsequence> &labels, // non-empty: build numer lattices
                               const msra::lm::CMGramLM &unigram, const msra::lm::CSymbolSet &unigramsymbols)                              // for numer lattices
{
#if 0 // little unit test helper for testing the read function
    bool test = true;
    if (test)
    {
        archive a;
        a.open (outpath + L".toc");
        lattice L;
        std::hash_map<string,size_t> symmap;
        a.getlattice (L"sw2001_A_1263622500_1374610000", L, symmap);
        a.getlattice (L"sw2001_A_1391162500_1409287500", L, symmap);
        return;
    }
#endif

    const bool numermode = !labels.empty(); // if labels are passed then we shall convert the MLFs to lattices, and 'infiles' are regular keys

    const std::wstring tocpath = outpath + L".toc";
    const std::wstring symlistpath = outpath + L".symlist";

    // process all files
    std::set<std::wstring> seenkeys; // (keep track of seen keys; throw error for duplicate keys)
    msra::files::make_intermediate_dirs(outpath);

    auto_file_ptr f = fopenOrDie(outpath, L"wb");
    auto_file_ptr ftoc = fopenOrDie(tocpath, L"wb");
    size_t brokeninputfiles = 0;
    foreach_index (i, infiles)
    {
        const std::wstring &inlatpath = infiles[i];
        fprintf(stderr, "build: processing lattice '%S'\n", inlatpath.c_str());

        // get key
        std::wstring key = regex_replace(inlatpath, wregex(L"=.*"), wstring()); // delete mapping
        key = regex_replace(key, wregex(L".*[\\\\/]"), wstring());              // delete path
        key = regex_replace(key, wregex(L"\\.[^\\.\\\\/:]*$"), wstring());      // delete extension (or not if none)
        if (!seenkeys.insert(key).second)
            throw std::runtime_error(msra::strfun::strprintf("build: duplicate key for lattice '%S'", inlatpath.c_str()));

        // we fail all the time due to totally broken HDecode/copy process, OK if not too many files are missing
        bool latticeread = false;
        try
        {
            // fetch lattice
            lattice L;
            if (!numermode)
                L.fromhtklattice(inlatpath, modelsymmap); // read HTK lattice
            else
                L.frommlf(key, modelsymmap, labels, unigram, unigramsymbols); // read MLF into a numerator lattice
            latticeread = true;

            // write to archive
            uint64_t offset = fgetpos(f);
            L.fwrite(f);
            fflushOrDie(f);

            // write reference to TOC file   --note: TOC file is a headerless UTF8 file; so don't use fprintf %S format (default code page)
            fprintfOrDie(ftoc, "%s=%s[%llu]\n", msra::strfun::utf8(key).c_str(), ((i - brokeninputfiles) == 0) ? msra::strfun::utf8(outpath).c_str() : "", offset);
            fflushOrDie(ftoc);

            fprintf(stderr, "written lattice to offset %llu as '%S'\n", offset, key.c_str());
        }
        catch (const exception &e)
        {
            if (latticeread)
                throw; // write failure
            // we ignore read failures
            fprintf(stderr, "ERROR: skipping unreadable lattice '%S': %s\n", inlatpath.c_str(), e.what());
            brokeninputfiles++;
        }
    }

    // write out the unit map
    // TODO: This is sort of redundant now--it gets the symmap from the HMM, i.e. always the same for all archives.
    writeunitmap(symlistpath, modelsymmap);

    fprintf(stderr, "completed %lu out of %lu lattices (%lu read failures, %.1f%%)\n", infiles.size(), infiles.size() - brokeninputfiles, brokeninputfiles, 100.0f * brokeninputfiles / infiles.size());
}

// helper to set a context value (left, right) with checking of uniqueness
void lattice::nodecontext::setcontext(int &lr, int val)
{
    if (lr == unknown)
        lr = val;
    else if (lr != val)
        lr = (signed short) ambiguous;
}

// helper for merge() to determine the unique node contexts
vector<lattice::nodecontext> lattice::determinenodecontexts(const msra::asr::simplesenonehmm &hset) const
{
    const size_t spunit = tryfind(hset.getsymmap(), "sp", SIZE_MAX);
    const size_t silunit = tryfind(hset.getsymmap(), "sil", SIZE_MAX);
    vector<lattice::nodecontext> nodecontexts(nodes.size());
    nodecontexts.front().left = nodecontext::start;
    nodecontexts.front().right = nodecontext::ambiguous; // (should not happen, but won't harm either)
    nodecontexts.back().right = nodecontext::end;
    nodecontexts.back().left = nodecontext::ambiguous; // (should not happen--we require !sent_end; but who knows)
    size_t multispseen = 0;                            // bad entries with multi-sp
    foreach_index (j, edges)
    {
        const auto &e = edges[j];
        const size_t S = e.S;
        const size_t E = e.E;
        auto a = getaligninfo(j);
        if (a.size() == 0) // !NULL edge
            throw std::logic_error("determinenodecontexts: !NULL edges not allowed in merging, should be removed before");
        size_t A = a[0].unit;
        size_t Z = a[a.size() - 1].unit;
        if (Z == spunit)
        {
            if (a.size() < 2)
                throw std::runtime_error("determinenodecontexts: context-free unit (/sp/) found as a single-phone word");
            else
            {
                Z = a[a.size() - 2].unit;
                if (Z == spunit) // a bugg lattice --I got this from HVite, to be tracked down
                {
                    // search from end once again, to print a warning
                    int n;
                    for (n = (int) a.size() - 1; n >= 0; n--)
                        if (a[n].unit != spunit)
                            break;
                    // ends with n = position of furthest non-sp
                    if (n < 0) // only sp?
                        throw std::runtime_error("determinenodecontexts: word consists only of /sp/");
                    fprintf(stderr, "determinenodecontexts: word with %lu /sp/ at the end found, edge %d\n", a.size() - 1 - n, j);
                    multispseen++;
                    Z = a[n].unit;
                }
            }
        }
        if (A == spunit || Z == spunit)
        {
#if 0
            fprintf (stderr, "A=%d   Z=%d   fa=%d   j=%d/N=%d    L=%d  n=%d   totalalign=%d  ts/te=%d/%d\n", (int) A, (int) Z, (int) e.firstalign,(int) j, (int) edges.size(), (int) nodes.size(), (int) a.size(), (int) align.size(),
                    nodes[S].t, nodes[E].t);
            foreach_index (kk, a)
                fprintf (stderr, "a[%d] = %d\n", kk, a[kk].unit);
            dump (stderr, [&] (size_t i) { return hset.gethmm (i).getname(); });
#endif
            throw std::runtime_error("determinenodecontexts: context-free unit (/sp/) found as a start phone or second last phone");
        }
        const auto &Ahmm = hset.gethmm(A);
        const auto &Zhmm = hset.gethmm(Z);
        int Aid = (int) Ahmm.gettransPindex();
        int Zid = (int) Zhmm.gettransPindex();
        nodecontexts[S].setright(Aid);
        nodecontexts[E].setleft(Zid);
    }
    if (multispseen > 0)
        fprintf(stderr, "determinenodecontexts: %lu broken edges in %lu with multiple /sp/ at the end seen\n", multispseen, edges.size());
    // check CI conditions and put in 't'
    // We make the hard assumption that there is only one CI phone, /sil/.
    const auto &silhmm = hset.gethmm(silunit);
    int silid = silhmm.gettransPindex();
    foreach_index (i, nodecontexts)
    {
        auto &nc = nodecontexts[i];
        if ((nc.left == nodecontext::unknown) ^ (nc.right == nodecontext::unknown))
            throw std::runtime_error("determinenodecontexts: invalid dead-end node in lattice");
        if (nc.left == nodecontext::ambiguous && nc.right != silid && nc.right != nodecontext::end)
            throw std::runtime_error("determinenodecontexts: invalid ambiguous left context (right context is not CI)");
        if (nc.right == nodecontext::ambiguous && nc.left != silid && nc.left != nodecontext::start)
            throw std::runtime_error("determinenodecontexts: invalid ambiguous right context (left context is not CI)");
        nc.t = nodes[i].t;
    }
    return nodecontexts; // (will this use a move constructor??)
}

// compar function for sorting and merging
bool lattice::nodecontext::operator<(const nodecontext &other) const
{
    // sort by t, left, right, i  --sort by i to make i appear before iother, as assumed in merge function
    int diff = (int) t - (int) other.t;
    if (diff == 0)
    {
        diff = left - other.left;
        if (diff == 0)
        {
            diff = right - other.right;
            if (diff == 0)
                return i < other.i; // (cannot use 'diff=' pattern since unsigned but may be SIZE_MAX)
        }
    }
    return diff < 0;
}

// remove that final !NULL edge
// We have that in HAPI lattices, but there can be only one at the end.
void lattice::removefinalnull()
{
    const auto &lastedge = edges.back();
    // last edge can be !NULL, recognized as having 0 alignment records
    if (lastedge.firstalign < align.size()) // has alignment records --not !NULL
        return;
    if (lastedge.S != nodes.size() - 2 || lastedge.E != nodes.size() - 1)
        throw std::runtime_error("removefinalnull: malformed final !NULL edge");
    edges.resize(edges.size() - 1); // remove it
    nodes.resize(nodes.size() - 1); // its start node is now the new end node
    foreach_index (j, edges)
        if (edges[j].E >= nodes.size())
            throw std::runtime_error("removefinalnull: cannot have final !NULL edge and other edges connecting to end node at the same time");
}

// merge a secondary lattice into the first
// With lots of caveats:
//  - this optimizes lattices to true unigram lattices where the only unique node condition is acoustic context
//  - no !NULL edge at the end, call removefinalnull() before
//  - this function returns an unsorted edges[] array, i.e. invalid. We sort in uniq'ed representation, which is easier.
// This function is not elegant at all, just hard labor!
void lattice::merge(const lattice &other, const msra::asr::simplesenonehmm &hset)
{
    if (!edges2.empty() || !other.edges2.empty())
        throw std::logic_error("merge: lattice(s) must be in non-uniq'ed format (V1)");
    if (!info.numframes || !other.info.numframes)
        throw std::logic_error("merge: lattice(s) must have identical number of frames");

    // establish node contexts
    auto contexts = determinenodecontexts(hset);
    auto othercontexts = other.determinenodecontexts(hset);

    // create joint node space and node mapping
    // This also collapses non-unique nodes.
    // Note the edge case sil-sil in one lattice which may be sil-ambiguous or ambiguous-sil on the other.
    // We ignore this, keeping such nodes unmerged. That's OK since middle /sil/ words have zero LM, and thus it's OK to keep them non-connected.
    foreach_index (i, contexts)
        contexts[i].i = i;
    foreach_index (i, othercontexts)
        othercontexts[i].iother = i;
    contexts.insert(contexts.end(), othercontexts.begin(), othercontexts.end()); // append othercontext
    sort(contexts.begin(), contexts.end());
    vector<size_t> nodemap(nodes.size(), SIZE_MAX);
    vector<size_t> othernodemap(other.nodes.size(), SIZE_MAX);
    int j = 0;
    foreach_index (i, contexts) // merge identical nodes  --this is the critical step
    {
        if (j == 0 || contexts[j - 1].t != contexts[i].t || contexts[j - 1].left != contexts[i].left || contexts[j - 1].right != contexts[i].right)
            contexts[j++] = contexts[i]; // entered a new one
        // node map
        if (contexts[i].i != SIZE_MAX)
            nodemap[contexts[i].i] = j - 1;
        if (contexts[i].iother != SIZE_MAX)
            othernodemap[contexts[i].iother] = j - 1;
    }
    fprintf(stderr, "merge: joint node space uniq'ed to %d from %d\n", j, contexts.size());
    contexts.resize(j);

    // create a new node array (just copy the contexts[].t fields)
    nodes.resize(contexts.size());
    foreach_index (inew, nodes)
        nodes[inew].t = (unsigned short) contexts[inew].t;
    info.numnodes = nodes.size();

    // incorporate the alignment records
    const size_t alignoffset = align.size();
    align.insert(align.end(), other.align.begin(), other.align.end());

    // map existing edges' S and E fields, and also 'firstalign'
    foreach_index (j, edges)
    {
        edges[j].S = nodemap[edges[j].S];
        edges[j].E = nodemap[edges[j].E];
    }
    auto otheredges = other.edges;
    foreach_index (j, otheredges)
    {
        otheredges[j].S = othernodemap[otheredges[j].S];
        otheredges[j].E = othernodemap[otheredges[j].E];
        otheredges[j].firstalign += alignoffset; // that's where they are now
    }

    // at this point, a new 'nodes' array exists, and the edges already are w.r.t. the new node space and align space

    // now we are read to merge 'other' edges into this, simply by concatenation
    edges.insert(edges.end(), otheredges.begin(), otheredges.end());

    // remove acoustic scores --they are likely not identical if they come from different decoders
    // If we don't do that, this will break the sorting in builduniquealignments()
    info.hasacscores = 0;
    foreach_index (j, edges)
        edges[j].a = 0.0f;

    // Note: we have NOT sorted or de-duplicated yet. That is best done after conversion to the uniq'ed format.
}

// remove duplicates
// This must be called in uniq'ed format.
void lattice::dedup()
{
    if (edges2.empty())
        throw std::logic_error("dedup: lattice must be in uniq'ed format (V2)");

    size_t k = 0;
    foreach_index (j, edges2)
    {
        if (k > 0 && edges2[k - 1].S == edges2[j].S && edges2[k - 1].E == edges2[j].E && edges2[k - 1].firstalign == edges2[j].firstalign)
        {
            if (edges2[k - 1].implysp != edges2[j].implysp)
                throw std::logic_error("dedup: inconsistent 'implysp' flag for otherwise identical edges");
            continue;
        }
        edges2[k++] = edges2[j];
    }
    fprintf(stderr, "dedup: edges reduced to %d from %d\n", k, edges2.size());
    edges2.resize(k);
    info.numedges = edges2.size();
    edges.clear(); // (should already be, but isn't; make sure we no longer use it)
}

// load all lattices from a TOC file and write them to a new archive
// Use this to
//  - upgrade the file format to latest in case of format changes
//  - check consistency (read only; don't write out)
//  - dump to stdout
//  - merge two lattices (for merging numer into denom lattices)
// Input path is an actual TOC path, output is the stem (.TOC will be added). --yes, not nice, maybe fix it later
// Example command:
// convertlatticearchive --latticetocs dummy c:\smbrdebug\sw20_small.den.lats.toc.10 -w c:\smbrdebug\sw20_small.den.lats.converted --cdphonetying c:\smbrdebug\combined.tying --statelist c:\smbrdebug\swb300h.9304.aligned.statelist --transprobs c:\smbrdebug\MMF.9304.transprobs
// How to regenerate from my test lattices:
// buildlatticearchive c:\smbrdebug\sw20_small.den.lats.regenerated c:\smbrdebug\hvitelat\*lat
// We support two special output path syntaxs:
//  - empty ("") -> don't output, just check the format
//  - dash ("-") -> dump lattice to stdout instead
/*static*/ void archive::convert(const std::wstring &intocpath, const std::wstring &intocpath2, const std::wstring &outpath,
                                 const msra::asr::simplesenonehmm &hset)
{
    const auto &modelsymmap = hset.getsymmap();

    const std::wstring tocpath = outpath + L".toc";
    const std::wstring symlistpath = outpath + L".symlist";

    // open input archive
    // TODO: I find that HVite emits redundant physical triphones, and even HHEd seems so (in .tying file).
    //  Thus, we should uniq the units before sorting. We can do that here if we have the .tying file.
    //  And then use the modelsymmap to map them down.
    //  Do this directly in the hset module (it will be transparent).
    std::vector<std::wstring> intocpaths(1, intocpath); // set of paths consisting of 1
    msra::lattices::archive archive(intocpaths, modelsymmap);

    // secondary archive for optional merging operation
    const bool mergemode = !intocpath2.empty(); // true if merging two lattices
    std::vector<std::wstring> intocpaths2;
    if (mergemode)
        intocpaths2.push_back(intocpath2);
    msra::lattices::archive archive2(intocpaths2, modelsymmap); // (if no merging then this archive2 is empty)

    // read the intocpath file once again to get the keys in original order
    std::vector<char> textbuffer;
    auto toclines = msra::files::fgetfilelines(intocpath, textbuffer);

    auto_file_ptr f = NULL;
    auto_file_ptr ftoc = NULL;

    // process all files
    if (outpath != L"" && outpath != L"-") // test for special syntaxes that bypass to actually create an output archive
    {
        msra::files::make_intermediate_dirs(outpath);
        f = fopenOrDie(outpath, L"wb");
        ftoc = fopenOrDie(tocpath, L"wb");
    }
    vector<const char *> invmodelsymmap; // only used for dump() mode

    // we must parse the toc file once again to get the keys in original order
    size_t skippedmerges = 0;
    foreach_index (i, toclines)
    {
        const char *line = toclines[i];
        const char *p = strchr(line, '=');
        if (p == NULL)
            throw std::runtime_error("open: invalid TOC line (no = sign): " + std::string(line));
        const std::wstring key = msra::strfun::utf16(std::string(line, p - line));

        fprintf(stderr, "convert: processing lattice '%S'\n", key.c_str());

        // fetch lattice  --this performs any necessary format conversions already
        lattice L;
        archive.getlattice(key, L);

        lattice L2;
        if (mergemode)
        {
            if (!archive2.haslattice(key))
            {
                fprintf(stderr, "convert: cannot merge because lattice '%S' missing in secondary archive; skipping\n", key.c_str());
                skippedmerges++;
                continue;
            }
            archive2.getlattice(key, L2);

            // merge it in
            // This will connect each node with matching 1-phone context conditions; aimed at merging numer lattices.
            L.removefinalnull(); // get rid of that final !NULL headache
            L2.removefinalnull();
            L.merge(L2, hset);
            // note: we are left with dups due to true unigram merging (HTK lattices cannot represent true unigram lattices since id is on the nodes)
        }
        // L.removefinalnull();
        // L.determinenodecontexts (hset);

        // convert it  --TODO: once we permanently use the new format, do this in fread() for V1
        // Note: Merging may have left this in unsorted format; we need to be robust against that.
        const size_t spunit = tryfind(modelsymmap, "sp", SIZE_MAX);
        L.builduniquealignments(spunit);

        if (mergemode)
            L.dedup();

        if (f && ftoc)
        {
            // write to archive
            uint64_t offset = fgetpos(f);
            L.fwrite(f);
            fflushOrDie(f);

            // write reference to TOC file   --note: TOC file is a headerless UTF8 file; so don't use fprintf %S format (default code page)
            fprintfOrDie(ftoc, "%s=%s[%llu]\n", msra::strfun::utf8(key).c_str(), (i == 0) ? msra::strfun::utf8(outpath).c_str() : "", offset);
            fflushOrDie(ftoc);

            fprintf(stderr, "written converted lattice to offset %llu as '%S'\n", offset, key.c_str());
        }
        else if (outpath == L"-")
        {
            if (invmodelsymmap.empty()) // build this lazily
            {
                invmodelsymmap.resize(modelsymmap.size());
                for (auto iter = modelsymmap.begin(); iter != modelsymmap.end(); iter++)
                    invmodelsymmap[iter->second] = iter->first.c_str();
            }
            L.rebuildedges(false);
            L.dump(stdout, [&](size_t i)
                   {
                       return invmodelsymmap[i];
                   });
        }
    } // end for (toclines)
    if (skippedmerges > 0)
        fprintf(stderr, "convert: %d out of %d merge operations skipped due to secondary lattice missing\n", skippedmerges, toclines.size());

    // write out the updated unit map
    if (f && ftoc)
        writeunitmap(symlistpath, modelsymmap);

    fprintf(stderr, "converted %d lattices\n", toclines.size());
}

// ---------------------------------------------------------------------------
// reading lattices from external formats (HTK lat, MLF)
// ---------------------------------------------------------------------------

// read an HTK lattice
// The lattice is expected to be freshly constructed (I did not bother to check).
void lattice::fromhtklattice(const wstring &path, const std::unordered_map<std::string, size_t> &unitmap)
{
    vector<char> textbuffer;
    auto lines = msra::files::fgetfilelines(path, textbuffer);
    if (lines.empty())
        throw std::runtime_error("lattice: mal-formed lattice--empty input file (or all-zeroes)");
    auto iter = lines.begin();
    // parse out LMF and WP
    char dummychar = 0; // dummy for sscanf() end checking
    for (; iter != lines.end() && strncmp(*iter, "N=", 2); iter++)
    {
        if (strncmp(*iter, "lmscale=", 8) == 0) // note: HTK sometimes generates extra garbage space at the end of this line
            if (sscanf_s(*iter, "lmscale=%f wdpenalty=%f%c", &info.lmf, &info.wp, &dummychar, sizeof(dummychar)) != 2 && dummychar != ' ')
                throw std::runtime_error("lattice: mal-formed lmscale/wdpenalty line in lattice: " + string(*iter));
    }

    // parse N and L
    if (iter != lines.end())
    {
        unsigned long N, L;
        if (sscanf_s(*iter, "N=%lu L=%lu %c", &N, &L, &dummychar, sizeof(dummychar)) != 2)
            throw std::runtime_error("lattice: mal-formed N=/L= line in lattice: " + string(*iter));
        info.numnodes = N;
        info.numedges = L;
        iter++;
    }
    else
        throw std::runtime_error("lattice: mal-formed before parse N=/L= line in lattice.");

    assert(info.numnodes > 0);
    nodes.reserve(info.numnodes);
    // parse the nodes
    for (size_t i = 0; i < info.numnodes; i++, iter++)
    {
        if (iter == lines.end())
            throw std::runtime_error("lattice: not enough I lines in lattice");
        unsigned long itest;
        float t;
        if (sscanf_s(*iter, "I=%lu t=%f%c", &itest, &t, &dummychar, sizeof(dummychar)) < 2)
            throw std::runtime_error("lattice: mal-formed node line in lattice: " + string(*iter));
        if (i != (size_t) itest)
            throw std::runtime_error("lattice: out-of-sequence node line in lattice: " + string(*iter));
        nodes.push_back(nodeinfo((unsigned int) (t / info.frameduration + 0.5)));
        info.numframes = max(info.numframes, (size_t) nodes.back().t);
    }
    // parse the edges
    assert(info.numedges > 0);
    edges.reserve(info.numedges);
    align.reserve(info.numedges * 10); // 10 phones per word on av. should be enough
    std::string label;
    for (size_t j = 0; j < info.numedges; j++, iter++)
    {
        if (iter == lines.end())
            throw std::runtime_error("lattice: not enough J lines in lattice");
        unsigned long jtest;
        unsigned long S, E;
        float a, l;
        char d[1024];
        // example:
        // J=12    S=1    E=13   a=-326.81   l=-5.090  d=:sil-t:s+k:e,0.03:dh:m-ax:m+sil,0.03:sil,0.02:
        int nvals = sscanf_s(*iter, "J=%lu S=%lu E=%lu a=%f l=%f d=%s", &jtest, &S, &E, &a, &l, &d, sizeof(d));
        if (nvals == 5 && j == info.numedges - 1) // special case: last edge is a !NULL and thus may have the d= record missing
            strcpy(d, ":");
        else if (nvals != 6)
            throw std::runtime_error("lattice: mal-formed edge line in lattice: " + string(*iter));
        if (j != (size_t) jtest)
            throw std::runtime_error("lattice: out-of-sequence edge line in lattice: " + string(*iter));
        edges.push_back(edgeinfowithscores(S, E, a, l, align.size()));
        // build align array
        size_t edgeframes = 0; // (for checking whether the alignment sums up right)
        const char *p = d;
        if (p[0] != ':' || (p[1] == 0 && j < info.numedges - 1)) // last edge may be empty
            throw std::runtime_error("lattice: alignment info must start with a colon and must have at least one entry: " + string(*iter));
        p++;
        while (*p)
        {
            // p points to an entry of the form TRIPHONE,DURATION
            const char *q = strchr(p, ',');
            if (q == NULL)
                throw std::runtime_error("lattice: alignment entry lacking a comma: " + string(*iter));
            if (q == p)
                throw std::runtime_error("lattice: alignment entry label empty: " + string(*iter));
            label.assign(p, q - p); // the triphone label
            q++;
            char *ep;
            double duration = strtod(q, &ep); // (weird--returns a non-const ptr in ep to a const object)
            p = ep;
            if (*p != ':')
                throw std::runtime_error("lattice: alignment entry not ending with a colon: " + string(*iter));
            p++;
            // create the alignment entry
            const size_t frames = (unsigned int) (duration / info.frameduration + 0.5);
            auto it = unitmap.find(label);
            if (it == unitmap.end())
                throw std::runtime_error("lattice: unit in alignment that is not in model: " + label);
            const size_t unitid = it->second;
            // const size_t unitid = unitmap.insert (make_pair (label, unitmap.size())).first->second;  // may create a new entry with index = #entries
            align.push_back(aligninfo(unitid, frames));
            edgeframes += frames;
        }
        if (edgeframes != nodes[E].t - (size_t) nodes[S].t)
        {
            char msg[128];
            sprintf(msg, "\n-- where edgeframes=%d != (nodes[E].t - nodes[S].t=%d), the gap is %d.", edgeframes, nodes[E].t - (size_t) nodes[S].t, edgeframes + nodes[S].t - nodes[E].t);
            throw std::runtime_error("lattice: alignment info duration mismatches edge duration: " + string(*iter) + msg);
        }
    }
    if (iter != lines.end())
        throw std::runtime_error("lattice: unexpected garbage at end of lattice: " + string(*iter));
    checklattice();

    // create more efficient storage for alignments
    const size_t spunit = tryfind(unitmap, "sp", SIZE_MAX);
    builduniquealignments(spunit);

    showstats();
}

// construct a numerator lattice from an MLF entry
// The lattice is expected to be freshly constructed (I did not bother to check).
void lattice::frommlf(const wstring &key, const std::unordered_map<std::string, size_t> &unitmap,
                      const msra::asr::htkmlfreader<msra::asr::htkmlfentry, lattice::htkmlfwordsequence> &labels,
                      const msra::lm::CMGramLM &unigram, const msra::lm::CSymbolSet &unigramsymbols)
{
    const auto &transcripts = labels.allwordtranscripts(); // (TODO: we could just pass the transcripts map--does not really matter)

    // get the labels (state and word)
    auto iter = transcripts.find(key);
    if (iter == transcripts.end())
        throw std::runtime_error("frommlf: no reference word sequence in MLF for lattice with key " + strfun::utf8(key));
    const auto &transcript = iter->second;
    if (transcript.words.size() == 0)
        throw std::runtime_error("frommlf: empty reference word sequence for lattice with key " + strfun::utf8(key));

    // determine unigram scores for all words
    vector<float> lmscores(transcript.words.size());
    size_t silence = unigramsymbols["!silence"];
    size_t lmend = unigramsymbols["</s>"];
    size_t sentstart = unigramsymbols["!sent_start"];
    size_t sentend = unigramsymbols["!sent_end"];

    // create the lattice
    nodes.resize(transcript.words.size() + 1);
    edges.resize(transcript.words.size());
    align.reserve(transcript.align.size());
    size_t numframes = 0;
    foreach_index (j, transcript.words)
    {
        const auto &w = transcript.words[j];
        nodes[j].t = w.firstframe;
        auto &e = edges[j];
        e.unused = 0;
        e.S = j;
        e.E = j + 1;
        if (e.E != j + 1)
            throw std::runtime_error(msra::strfun::strprintf("frommlf: too many tokens to be represented as edgeinfo::E in label set: %S", key.c_str()));
        e.a = 0.0f; // no ac score

        // LM score
        // !sent_start and !silence are patched to LM score 0
        size_t wid = w.wordindex;
        if (wid == sentstart)
        {
            if (j != 0)
                throw std::logic_error("frommlf: found an !sent_start token not at the first position");
        }
        else if (wid == sentend)
        {
            if (j != (int) transcript.words.size() - 1)
                throw std::logic_error("frommlf: found an !sent_end token not at the end position");
            wid = lmend; // use </s> for score lookup
        }
        const int iwid = (int) wid;
        e.l = (wid != sentstart && wid != silence) ? (float) unigram.score(&iwid, 1) : 0.0f;

        // alignment
        e.implysp = 0;
        e.firstalign = align.size();
        auto a = transcript.getaligninfo(j);
        align.insert(align.end(), a.begin(), a.end());
        foreach_index (k, a)
            numframes += a[k].frames;
    }
    nodes[transcript.words.size()].t = (unsigned short) numframes;
    if (nodes[transcript.words.size()].t != numframes)
        throw std::runtime_error(msra::strfun::strprintf("frommlf: too many frames to be represented as nodeinfo::t in label set: %S", key.c_str()));
    info.lmf = -1.0f; // indicates not set
    info.wp = 0.0f;   // not set indicated by lmf < 0
    info.numedges = edges.size();
    info.numnodes = nodes.size();
    info.numframes = numframes;
    checklattice();

    // create more efficient storage for alignments
    const size_t spunit = tryfind(unitmap, "sp", SIZE_MAX);
    builduniquealignments(spunit);

    showstats();
}
};
};
Browse the archive

https://github.com/Microsoft/CNTK