Revision 54a6b1d2ecd1efc0ee666f5696d717c77ae0f4e1 authored by Frank Seide on 06 September 2015, 16:20:28 UTC, committed by Frank Seide on 06 September 2015, 16:20:28 UTC
Makefile adapted to new paths, but not yet building Network and SGD as separate libs
1 parent 0a67d0c
latticearchive.h
//
// <copyright file="latticearchive.h" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//
// latticearchive.h -- managing lattice archives
#if 0
#endif
#pragma once
#undef HACK_IN_SILENCE // [v-hansu] hack to simulate DEL in the lattice
#define SILENCE_PENALTY // give penalty to added silence
#include "basetypes.h"
#include "latticestorage.h"
#include "simple_checked_arrays.h"
#include "fileutil.h"
#include <stdint.h>
#include <vector>
#include <string>
#include <hash_map>
#include <algorithm> // for find()
#include "simplesenonehmm.h"
namespace msra { namespace math { class ssematrixbase; template<class ssematrixbase> class ssematrix; template<class ssematrixbase> class ssematrixstriperef; };};
namespace msra { namespace lm { class CMGramLM; class CSymbolSet; };}; // for numer-lattice building
namespace msra { namespace asr { template<typename A, typename B> class htkmlfreader; struct htkmlfentry; };}; // for numer lattice building
namespace msra { namespace lattices {
typedef msra::math::ssematrixbase matrixbase;
typedef msra::math::ssematrix<matrixbase> matrix;
typedef msra::math::ssematrixstriperef<matrixbase> matrixstripe;
class littlematrixheap;
enum mbrclassdefinition // used to identify definition of class in minimum bayesian risk
{
senone = 1, // senone is default, which means no mapping; sMBR
//monophonestate = 2,
monophone = 3, // pMBR?
};
// ===========================================================================
// lattice -- one lattice in memory
// ===========================================================================
class lattice
{
struct header_v1_v2
{
size_t numnodes : 32;
size_t numedges : 32;
float lmf;
float wp;
double frameduration; // in seconds
size_t numframes : 32; // number of frames
size_t impliedspunitid : 31; // id of implied last unit (intended as /sp/); only used in V2
size_t hasacscores : 1; // if 1 then ac scores are embedded
header_v1_v2() : numnodes (0), numedges (0), lmf (1.0f), wp (0.0f), frameduration (0.01/*assumption*/), numframes (0), impliedspunitid (SIZE_MAX), hasacscores (1) { }
};
header_v1_v2 info; // information about the lattice
static const unsigned int NOEDGE = 0xffffff; // 24 bits
//static_assert (sizeof (nodeinfo) == 8, "unexpected size of nodeeinfo"); // note: int64_t required to allow going across 32-bit boundary
// ensure type size as these are expected to be of this size in the files we read
static_assert (sizeof (nodeinfo) == 2, "unexpected size of nodeeinfo"); // note: int64_t required to allow going across 32-bit boundary
static_assert (sizeof (edgeinfowithscores) == 16, "unexpected size of edgeinfowithscores");
static_assert (sizeof (aligninfo) == 4, "unexpected size of aligninfo");
std::vector<nodeinfo> nodes;
std::vector<edgeinfowithscores> edges;
std::vector<aligninfo> align;
// V2 lattices --for a while, we will store both in RAM, until all code is updated
static int fsgn (float f) { if (f > 0) return 1; else if (f < 0) return -1; else return 0; } // what's this function called??
int comparealign (size_t j1, size_t j2, bool sortbyfinalsp) const // strcmp()-like function for comparing alignments
{
// sortbyfinalsp: This is for dealing with edges that only differ in a final zero-frame /sp/
// These should be considered equal in merging, such that the one without /sp/ (MLFs don't have final /sp/)
// gets merged away (since it is inconsistent with decoding).
// - sortbyfinalsp = true: use in sorting (the longer edge with /sp/ will go FIRST so that it is the one to survive uniq-ing)
// - sortbyfinalsp = false: use in uniq-ing; the edges will just be reported as identical
if (edges[j1].implysp || edges[j2].implysp)
throw std::logic_error ("comparealign: must not operate on edges with implysp flag set");
const auto a1 = getaligninfo (j1);
const auto a2 = getaligninfo (j2);
// sort by unit sequence first
for (size_t k = 0; k < a1.size() && k < a2.size(); k++)
{
int diff = (int) a1[k].unit - (int) a2[k].unit;
if (diff != 0)
return diff;
}
// then by the alignment --we want to keep similar alignments of the same sequence close by
for (size_t k = 0; k < a1.size() && k < a2.size(); k++)
{
int diff = (int) a1[k].frames - (int) a2[k].frames;
if (diff != 0)
return diff;
}
// identical sequence up to here --check if they only differ in a final 0-frame /sp/
// This is for merging of MLFs with lattices, where MLFs don't have /sp/.
if ((a2.size() == a1.size() + 1 && a2.back().frames == 0) // a2 has extra 0-frame /sp/
|| (a1.size() == a2.size() + 1 && a1.back().frames == 0)) // a1 has extra 0-frame /sp/
{
if (!sortbyfinalsp) // 'false' -> report them equal (used in final merge)
return 0;
// 'true' -> the longer one (with /sp/) comes first, i.e. the sorting order is reverse
return (int) a2.size() - (int) a1.size(); // (note a1 and a2 swapped)
}
// all identical--if length same, then identical; else length determines ordering
return (int) a1.size() - (int) a2.size();
}
// sort order that is useful for uniq'ing alignments
int uniqueorder (const edgeinfo & e1, const edgeinfo & e2) const
{
// first sort by start and end time (required for the scoring functions)
int diff = (int) nodes[e1.S].t - (int) nodes[e2.S].t;
if (diff != 0)
return diff;
diff = (int) nodes[e1.E].t - (int) nodes[e2.E].t;
if (diff != 0)
return diff;
// now sort by alignment (and also a and l, which must be identical--but likely they are anyway)
size_t j1 = e1.firstalign; // temporarily: these are the indices to the original edges
size_t j2 = e2.firstalign;
// now compare by alignment
diff = comparealign (j1, j2, true);
if (diff != 0)
return diff;
// With the above, we are sorted properly to detect alignment dups.
// a and l are also stored in the uniq'ed storage
// Note: When merging lattices, 'l' may have different precision.
// We did sort by alignment first (above), so we can still detect dups if from this sort order if later we are lenient.
diff = fsgn (edges[j1].l - edges[j2].l);
if (diff != 0)
return diff;
diff = fsgn (edges[j1].a - edges[j2].a);
if (diff != 0)
return diff;
// identical--these can be grouped
// and sort identical edges by start and end node again
// This is not really used, since we later sort once again according to 'latticeorder()'
diff = (int) e1.S - (int) e2.S;
if (diff != 0)
return diff;
diff = (int) e1.E - (int) e2.E;
//if (diff != 0)
return diff;
}
// lattice sort order --algorithms assume lattices are sorted by E, then by S
int latticeorder (const edgeinfo & e1, const edgeinfo & e2) const
{
// sort identical edges by start and end node again
int diff = (int) e1.E - (int) e2.E;
if (diff != 0)
return diff;
diff = (int) e1.S - (int) e2.S;
if (diff != 0)
return diff;
// within same S/E pair, sort by firstalign
// Since end nodes represent word identities in HAPI, this should only ever happen when merging lattices, but let's not rely on HAPI's assumptions.
// builduniquealignments() only dedups the alignment records, but in case of merging, we want to dedup edges altogether.
// For that, it is necessary that within a given S/E pair, where we now may have several different words, these edges are sorted
// to be able to dedup based on firstalign.
diff = (int) e1.firstalign - (int) e2.firstalign;
return diff;
}
// more compact lattice storage
std::vector<edgeinfo> edges2; // TODO: rename these
std::vector<aligninfo> uniquededgedatatokens; // [-1]: LM score; [-2]: ac score; [0..]: actual aligninfo records
float & uniqueedgelmscore (size_t firstalign) { return *(float*) &uniquededgedatatokens.data()[firstalign-1]; }
float & uniqueedgeacscore (size_t firstalign) { if (info.hasacscores) return *(float*) &uniquededgedatatokens.data()[firstalign-2]; else throw std::logic_error ("uniqueedgeacscore: no ac scores stored in this lattice"); }
public: // TODO: make private again once
// construct from edges/align
// This is also used for merging, where the edges[] array is not correctly sorted. So don't assume this here.
void builduniquealignments (size_t spunit = SIZE_MAX/*fix this later*/)
{
// infer /sp/ unit if not given
// BUGBUG: This sometimes leads to incorrect results. We currently post-fix it.
if (spunit == SIZE_MAX)
{
// Using a very simple heuristics; take the last unit of the first non-silence edge. We know it works for our current setup, but otherwise it's tricky.
foreach_index (j, edges)
{
const auto ai = getaligninfo (j);
if (ai.size() < 2) // less than 2--must be /sil/
continue;
spunit = ai[ai.size() - 1].unit;
fprintf (stderr, "builduniquealignments: /sp/ unit inferred through heuristics as %d\n", spunit);
break;
}
}
info.impliedspunitid = spunit;
// edges2 array gets sorted to group edges with identical alignments together
info.hasacscores = 0; // if we got any score != 0.0, we will set this
edges2.resize (edges.size());
foreach_index (j, edges)
{
if (edges[j].implysp)
throw std::logic_error ("builduniquealignments: original edges[] array must not have implied /sp/");
edges2[j].S = edges[j].S;
edges2[j].E = edges[j].E;
edges2[j].unused = 0;
edges2[j].implysp = 0;
edges2[j].firstalign = j; // index into the original edges[] array before sorting, temporarily stored here to survive sorting
checkoverflow (edges2[j].S, edges[j].S, "edgeinfo2::S");
checkoverflow (edges2[j].E, edges[j].E, "edgeinfo2::E");
checkoverflow (edges2[j].firstalign, j, "edgeinfo2::firstalign (j for sorting)");
if (edges[j].a != 0.0f)
info.hasacscores = 1;
}
// sort edges
sort (edges2.begin(), edges2.end(), [&] (const edgeinfo & e1, const edgeinfo & e2) { return uniqueorder (e1, e2) < 0; });
// create a uniq'ed version of the align[] array, into uniquededgedatatokens[]
uniquededgedatatokens.resize (0);
uniquededgedatatokens.reserve (align.size());
size_t numuniquealignments = 0; // number of unique alignments (=number of edges with unique alignments)
size_t prevj = SIZE_MAX; // this is an index into the original edges[] array before sorting
size_t numimpliedsp = 0; // (statistics)
foreach_index (j2, edges2)
{
size_t j = edges2[j2].firstalign; // index into the original edges[] array before sorting (was temporarily stored here)
// allocate a new edge group if this edge differs from the previous
const float lmargin = 1e-3f; // if merging then the same LM score may come from different ASCII sources with different precision. HTK lattices store 3 digits after the period.
#if 1 // diagnostics on the merging of MLF and HTK inputs
if (prevj != SIZE_MAX && fabs (edges[prevj].l - edges[j].l) <= lmargin && comparealign (prevj, j, false) == 0
&& nodes[edges[prevj].S].t == nodes[edges[j].S].t
&& nodes[edges[prevj].E].t == nodes[edges[j].E].t
&& edges[prevj].l != edges[j].l) // some diagnostics
fprintf (stderr, "build: merging edges %d and %d despite slightly different LM scores %.8f vs. %.8f, ts/te=%.2f/%.2f\n",
prevj, j, edges[prevj].l, edges[j].l, nodes[edges[prevj].S].t * 0.01f, nodes[edges[prevj].E].t * 0.01f);
#endif
if (prevj == SIZE_MAX || fabs (edges[prevj].l - edges[j].l) > lmargin || (info.hasacscores && edges[prevj].a != edges[j].a) || comparealign (prevj, j, false) != 0)
{
// allocate a new alignment
size_t currentfirstalign = uniquededgedatatokens.size() + 1;
if (info.hasacscores)
currentfirstalign++;
// inject the lm and ac scores
uniquededgedatatokens.resize (currentfirstalign);
uniqueedgelmscore (currentfirstalign) = edges[j].l;
if (info.hasacscores)
uniqueedgeacscore (currentfirstalign) = edges[j].a;
// and copy it
edges2[j2].firstalign = currentfirstalign; // this is where it starts
checkoverflow (edges2[j2].firstalign, currentfirstalign, "firstalign"); // this is also a sequence check
const auto ai = getaligninfo (j);
size_t nalign = ai.size();
if (nalign == 0 && (size_t) j2 != edges.size() -1)
throw std::runtime_error ("builduniquealignments: !NULL edges forbidden except for the very last edge");
// special optimization: we do not store the /sp/ unit at the end
if (nalign > 1/*be robust against 1-unit edges that consist of spunit*/ && ai[nalign-1].unit == spunit)
{
nalign--;
edges2[j2].implysp = 1;
numimpliedsp++; // (diagnostics only)
}
else
edges2[j2].implysp = 0;
// copy the tokens
for (size_t k = 0; k < nalign; k++)
{
auto a = ai[k];
if (a.last)
throw std::logic_error ("builduniquealignments: unexpected 'last' flag already set in input aligns (numeric overflow in old format?)");
if (k == nalign -1)
a.last = 1;
uniquededgedatatokens.push_back (a);
}
numuniquealignments++;
}
else // duplicate from previous
{
edges2[j2].firstalign = edges2[j2-1].firstalign;
edges2[j2].implysp = edges2[j2-1].implysp;
}
prevj = j;
}
const size_t uniquealigntokens = uniquededgedatatokens.size() - (numuniquealignments * (info.hasacscores ? 2 : 1));
const size_t nonuniquenonsptokens = align.size() - numimpliedsp;
fprintf (stderr, "builduniquealignments: %d edges: %d unique alignments (%.2f%%); %d align tokens - %d implied /sp/ units = %d, uniqued to %d (%.2f%%)\n",
edges.size(), numuniquealignments, 100.0f * numuniquealignments / edges.size(),
align.size(), numimpliedsp, nonuniquenonsptokens, uniquealigntokens, 100.0f * uniquealigntokens / nonuniquenonsptokens);
// sort it back into original order (sorted by E, then by S)
sort (edges2.begin(), edges2.end(), [&] (const edgeinfo & e1, const edgeinfo & e2) { return latticeorder (e1, e2) < 0; });
// TODO: be more consistent--we should clear out edges[] at this point!
}
private:
// infer ends in case of broken lattices with zero-token edges
// This happened when /sp/ was wrongly inferred, and 0-token edges were generated, for which we have no 'last' flag.
// Such lattices can no longer be generated, but we are stuck with old ones that have this problem.
void inferends (std::vector<bool> & isend) const
{
isend.resize (uniquededgedatatokens.size() +1, false);
isend.back() = true;
foreach_index (j, edges2)
{
size_t end = edges2[j].firstalign;
end--; // LM score
if (info.hasacscores)
end--; // ac score
// end is now the index of a unit right after a align sequence
isend[end] = true;
}
}
public:
// hack function to add a single-/sil/ edge, as well as a single /sp/, with LM score 0 to every unique (S,E) pair that doesn't already have a /sil/ and/or /sp/
// This is to simulate DELetions. We observe a massive DEL problem. Hypothesis: Caused by too many strong positive obs for /sil/ and/or /sp/, and too few strong counter-weights.
// Note: Adding /sil/ lowers the objective function a little (unexpected; maybe due to the hack), adding /sp/ lowers it more (really unexpected since no hack here).
void hackinsilencesubstitutionedges (size_t silunit, size_t spunit, bool addsp)
{
std::vector<edgeinfowithscores> newedges;
newedges.reserve (edges.size() * 5 / 2); // avoid realloc , this saves time :)
std::vector<aligninfo> newalign;
newalign.reserve (align.size() * 5 / 2); // avoid realloc
// loop over all edges, and duplicate them, inserting /sil/ and /sp/ edges
// Note that we assume that single-sil edges only (and always) exist at start and end, to avoid checking.
// This is a hack anyway.
// We exploit sortedness of the edges array by (S,E).
foreach_index (j, edges)
{
auto e = edges[j];
assert (e.unused == 0);
e.unused = 0;
#ifdef SILENCE_PENALTY
const float penaltyforsil = float(-1.8/12); // estimated on training MLF; assuming LMF=12
#else
const float penaltyforsil = 0.0f;
#endif
if (j > 0 && (e.S != edges[j-1].S || e.E != edges[j-1].E)) // new block entered
{
if (e.S != 0 && e.E != nodes.size() -1) // and it's not a block that has !sent_start/end
{
// create a new silence edge
// To make it perfectly clear: For /sil/, this is a HACK--the acoustic contexts are WRONG. Only a quick test. (For /sp/ there is no such problem though.)
const size_t numframes = nodes[e.E].t - nodes[e.S].t;
if (numframes > 0)
{
edgeinfowithscores sile = e;
sile.unused = 1; // indicate that this is an added edge
sile.l = penaltyforsil; // sil is penalized
sile.implysp = 0;
sile.a = LOGZERO; // must not have this anyway
sile.firstalign = newalign.size(); // we create a new entry for this
if (sile.firstalign != newalign.size())
throw std::runtime_error ("hackinsilencesubstitutionedges: numeric bit-field overflow of .firstalign");
newedges.push_back (sile);
// create a new align entry
aligninfo asil (silunit, numframes);
newalign.push_back (asil);
if (addsp)
{
edgeinfowithscores spe = sile;
spe.firstalign = newalign.size();
if (spe.firstalign != newalign.size())
throw std::runtime_error ("hackinsilencesubstitutionedges: numeric bit-field overflow of .firstalign");
newedges.push_back (spe);
aligninfo asp (spunit, numframes);
newalign.push_back (asp);
}
}
}
}
if (e.S != 0 && e.E != nodes.size() - 1) // add penalty to sil that appears by the end in a word edge
{
auto a = getaligninfo (j);
if (a.back().unit == silunit && a.size() > 1)
e.l += penaltyforsil;
}
// copy the edge
e.firstalign = newalign.size();
if (e.firstalign != newalign.size())
throw std::runtime_error ("hackinsilencesubstitutionedges: numeric bit-field overflow of .firstalign");
newedges.push_back (e);
// copy the align records
auto a = getaligninfo (j);
foreach_index (k, a)
newalign.push_back (a[k]);
}
static int count = 0;
if (count++ < 10) // (limit the log spam)
{
fprintf (stderr, "hackinsilencesubstitutionedges: added %d DEL (/sil/-%ssubstitution) edges (from %d to %d; align from %d to %d)\n",
(int) (newedges.size() - edges.size()),
addsp ? " and /sp/-" : "",
(int) edges.size(), (int) newedges.size(),
(int) align.size(), (int) newalign.size());
}
edges.swap (newedges);
info.numedges = edges.size();
align.swap (newalign);
edges.shrink_to_fit(); // [v-hansu] might be useful when RAM is out of use
align.shrink_to_fit();
#if 0 // [v-hansu] to dump lattice for checking
static size_t countdump = 0;
FILE *f = fopen ("lattice", "a");
foreach_index (j, edges)
fprintf (f, "S=%d\tE=%d\tunused=%d\n", edges[j].S, edges[j].E, edges[j].unused);
countdump++;
fflush(f);
fclose(f);
if (countdump == 10)
exit(0);
#endif
}
// go back from V2 format to edges and align, so old code can still run
// This will go away one we updated all code to use the new data structures.
void rebuildedges (bool haszerotokenedges/*pass true for broken spunit that may have reduced edges to 0 entries*/)
{
// deal with broken (zero-token) edges
std::vector<bool> isendworkaround;
if (haszerotokenedges)
inferends (isendworkaround);
edges.resize (edges2.size());
align.resize (0);
align.reserve (uniquededgedatatokens.size() * 10); // should be enough
foreach_index (j, edges)
{
edges[j].S = edges2[j].S;
edges[j].E = edges2[j].E;
edges[j].unused = 0;
edges[j].implysp = 0;
const size_t firstalign = edges2[j].firstalign;
edges[j].a = info.hasacscores ? uniqueedgeacscore (firstalign) : -1e30f/*LOGZERO*/; // cannot reconstruct; not available
edges[j].l = uniqueedgelmscore (firstalign);
// expand the alignment tokens
edges[j].firstalign = align.size();
const size_t edgedur = nodes[edges2[j].E].t - nodes[edges2[j].S].t; // for checking and back-filling the implied /sp/
size_t aligndur = 0;
if (firstalign == uniquededgedatatokens.size() && (size_t) j != edges.size() -1)
throw std::runtime_error ("rebuildedges: !NULL edges forbidden except for the last edge");
for (size_t k = firstalign; k < uniquededgedatatokens.size(); k++)
{
if (!isendworkaround.empty() && isendworkaround[k]) // secondary criterion to detect ends in broken lattices
break;
aligninfo ai = uniquededgedatatokens[k];
if (ai.unused != 0)
throw std::runtime_error ("rebuildedges: mal-formed uniquededgedatatokens[] array: 'unused' field must be 0");
bool islast = ai.last != 0;
ai.last = 0; // old format does not support this
align.push_back (ai);
aligndur += ai.frames;
if (aligndur > edgedur)
throw std::runtime_error ("rebuildedges: mal-formed uniquededgedatatokens[] array: aligment longer than edge");
if (islast)
break;
if (k == uniquededgedatatokens.size() -1)
throw std::runtime_error ("rebuildedges: mal-formed uniquededgedatatokens[] array: missing 'last' flag in last entry");
}
if (edges2[j].implysp)
{
if (info.impliedspunitid == SIZE_MAX)
throw std::runtime_error ("rebuildedges: edge requests implied /sp/ but none specified in lattice header");
if (aligndur > edgedur)
throw std::runtime_error ("rebuildedges: edge alignment longer than edge duration");
aligninfo ai (info.impliedspunitid, edgedur - aligndur/*frames: remaining frames are /sp/ */);
align.push_back (ai);
}
}
//fprintf (stderr, "rebuildedges: %d edges reconstructed to %d alignment tokens\n", edges.size(), align.size()); // [v-hansu] comment out because it takes up most of the log
align.shrink_to_fit(); // free up unused memory (since we need it!!)
// now get rid of the V2 data altogether
uniquededgedatatokens.clear();
uniquededgedatatokens.shrink_to_fit();
edges2.clear();
edges2.shrink_to_fit();
}
public:
class parallelstate;
// a word sequence read from an MLF file
struct htkmlfwordsequence
{
// a word entry read from an MLF file
struct word // word info we are reading from the MLF file (if we want to add the ground-truth path)
{
static const unsigned int unknownwordindex = 0xfffff; // max value storable in 'wordindex'
unsigned int wordindex : 20; // per mapping table; unknownwordindex denotes unknown word
unsigned int firstalign : 12; // index into align record to first phoneme entry
unsigned int firstframe : 16; // TODO: obsolete; once removed, we are back at 32 bits--yay
word() {} // to keep compiler happy
word (size_t wid, size_t ts, size_t as)
{
wordindex = (unsigned int) wid;
firstframe = (unsigned int) ts;
firstalign = (unsigned int) as;
if (wordindex != wid)
throw std::runtime_error ("htkmlfwordentry: vocabulary size too large for bit field 'wordindex'");
if (firstframe != ts)
throw std::runtime_error ("htkmlfwordentry: start frame too large for bit field 'firstframe'");
if (firstalign != as)
throw std::runtime_error ("htkmlfwordentry: first-align index too large for bit field 'firstframe'");
}
};
typedef aligninfo aligninfo; // now we can access it as htkmlfwordsequence::aligninfo although it comes from some totally other corner of the system
std::vector<word> words;
std::vector<aligninfo> align;
// get aligninfo array for a word
const_array_ref<aligninfo> getaligninfo (size_t j) const { size_t begin = (size_t) words[j].firstalign; size_t end = j+1 < words.size() ? (size_t) words[j+1].firstalign : align.size(); return const_array_ref<aligninfo> (align.data() + begin, end - begin); }
};
private:
struct edgealignments // struct to return alignments using an efficient long-vector storage
{
std::vector<unsigned int> alignoffsets; // [j] index of first alignment in allalignments; one extra element for length of last entry
std::vector<unsigned short> allalignments; // all alignments concatenated
public:
edgealignments (const lattice & L)
{
size_t alignbufsize = 0;
alignoffsets.resize (L.edges.size() + 1); // one extra element so we can determine the length of last entry
foreach_index (j, L.edges)
{
alignoffsets[j] = (unsigned int) alignbufsize;
size_t edgenumframes = L.nodes[L.edges[j].E].t - L.nodes[L.edges[j].S].t;
alignbufsize += edgenumframes;
}
alignoffsets[L.edges.size()] = (unsigned int) alignbufsize; // (TODO: remove if not actually needed)
}
// edgealignments[j][t] is the senone at frame offset t in edge j
array_ref<unsigned short> operator[] (size_t j)
{
if (allalignments.size() == 0)
allalignments.resize(alignoffsets.back()) ;
size_t offset = alignoffsets[j];
size_t numframes = alignoffsets[j+1] - alignoffsets[j];
if (numframes == 0)
return array_ref<unsigned short> ();
return array_ref<unsigned short> (&allalignments[offset], numframes);
}
const_array_ref<unsigned short> operator[] (size_t j) const
{
size_t offset = alignoffsets[j];
size_t numframes = alignoffsets[j+1] - alignoffsets[j];
return const_array_ref<unsigned short> (&allalignments[offset], numframes);
}
// CUDA support
const std::vector<unsigned int> & getalignoffsets() const { return alignoffsets; }
std::vector<unsigned short> & getalignmentsbuffer() { allalignments.resize(alignoffsets.back()) ; return allalignments; } // for retrieving it from the GPU
const std::vector<unsigned short> & getalignmentsbuffer() const {
if(allalignments.size() != alignoffsets.back() )
throw::runtime_error("getalignmentsbuffer: allalignments not allocated!\n");
return allalignments; } // for retrieving it from the GPU
size_t getalignbuffersize() const {return alignoffsets.back();}
};
struct backpointers
{
std::vector<size_t> backptroffsets; // TODO: we could change this to 'unsigned int' to save some transfer time
std::vector<unsigned short> backptrstorage; // CPU-side versions use this as the traceback buffer; CUDA code has its CUDA-side buffer
size_t numofstates; // per sil hmm
public:
backpointers (const lattice & L, const msra::asr::simplesenonehmm & hset) : numofstates(0)
{
size_t edgeswithsilence = 0; // (diagnostics only: number of edges with at least one /sil/)
size_t backptrbufsize = 0; // number of entries in buffer for silence backpointer array, used as cursor as we build it
backptroffsets.resize (L.edges.size() + 1); // +1, so that the final entry determines the overall size of the allocated buffer
const size_t silUnitId = hset.gethmmid ("sil");
numofstates = hset.gethmm (silUnitId).getnumstates();
foreach_index (j, L.edges)
{
// for each edge, determine if it needs a backpointer buffer for silence
// Multiple /sil/ in the same edge will share the same buffer, so we need to know the max length.
const auto & aligntokens = L.getaligninfo (j); // get alignment tokens
backptroffsets[j] = backptrbufsize; // buffer for this edge begins here
size_t maxsilframes = 0; // max #frames--we allocate this many for this edge
size_t numsilunits = 0; // number of /sil/ units in this edge
foreach_index (a, aligntokens)
{
if (aligntokens[a].unit == silUnitId)
{
numsilunits++; // count
if (aligntokens[a].frames > maxsilframes) // determine max #frames
maxsilframes = aligntokens[a].frames;
}
}
#if 1 // multiple /sil/ -> log this (as we are not sure whether this is actually proper--probably it is)
if (numsilunits > 1)
{
fprintf (stderr, "backpointers: lattice '%S', edge %d has %d /sil/ phonemes\n", L.getkey(), j, numsilunits);
fprintf (stderr, "alignments: :");
foreach_index (a, aligntokens)
{
const auto & unit = aligntokens[a];
const auto & hmm = hset.gethmm (unit.unit);
fprintf (stderr, "%s,%.2f:", hmm.getname(), unit.frames / 100.0f);
}
fprintf (stderr, "\n");
}
#endif
if (numsilunits > 0)
edgeswithsilence++; // (for diagnostics message only)
backptrbufsize += maxsilframes * numofstates;
}
backptroffsets[L.edges.size()] = backptrbufsize; // (TODO: remove if not actually needed)
fprintf (stderr, "backpointers: %.1f%% edges have at least one /sil/ unit inside\n", 100.0f * ((float) edgeswithsilence / L.edges.size()));
}
// CUDA support
const std::vector<size_t> & getbackptroffsets() const { return backptroffsets; }
std::vector<unsigned short> & getbackptrbuffer() { backptrstorage.resize(backptroffsets.back()) ; return backptrstorage; } // for retrieving it from the GPU
size_t getbackptrstoragesize() const {return backptroffsets.back();}
};
void forwardbackwardalign (parallelstate & parallelstate,
const msra::asr::simplesenonehmm & hset, const bool softalignstates,
const double minlogpp, const std::vector<double> & origlogpps,
std::vector<msra::math::ssematrixbase *> & abcs, littlematrixheap & matrixheap,
const bool returnsenoneids,
std::vector<float> & edgeacscores, const msra::math::ssematrixbase & logLLs,
edgealignments & thisedgealignments, backpointers & thisbackpointers) const;
double forwardbackwardlatticesMBR (const std::vector<float> & edgeacscores, const msra::asr::simplesenonehmm & hset,
const std::vector<double> & logalphas, const std::vector<double> & logbetas,
const float lmf, const float wp, const float amf, const_array_ref<size_t> & uids,
const edgealignments & thisedgealignments, std::vector<double> & Eframescorrect) const;
void sMBRerrorsignal (parallelstate & parallelstate,
msra::math::ssematrixbase & errorsignal, msra::math::ssematrixbase & errorsignalneg,
const std::vector<double> & logpps, const float amf, double minlogpp,
const std::vector<double> & origlogpps, const std::vector<double> & logEframescorrect,
const double logEframescorrecttotal, const edgealignments & thisedgealignments) const;
void mmierrorsignal (parallelstate & parallelstate, double minlogpp, const std::vector<double> & origlogpps,
std::vector<msra::math::ssematrixbase *> & abcs, const bool softalignstates,
const std::vector<double> & logpps, const msra::asr::simplesenonehmm & hset,
const edgealignments & thisedgealignments, msra::math::ssematrixbase & errorsignal) const;
double bestpathlattice (const std::vector<float> & edgeacscores, std::vector<double> & logpps,
const float lmf, const float wp, const float amf) const;
static float lattice::alignedge (const_array_ref<aligninfo> units, const msra::asr::simplesenonehmm & hset,
const msra::math::ssematrixbase & logLLs, msra::math::ssematrixbase & gammas,
size_t edgeindex, const bool returnsenoneids, array_ref<unsigned short> thisedgealignments);
const_array_ref<aligninfo> getaligninfo (size_t j) const { size_t begin = (size_t) edges[j].firstalign; size_t end = j+1 < edges.size() ? (size_t) edges[j+1].firstalign : align.size(); return const_array_ref<aligninfo> (align.data() + begin, end - begin); }
static std::string gettranscript (const_array_ref<aligninfo> units, const msra::asr::simplesenonehmm & hset);
void parallelforwardbackwardalign (parallelstate & parallelstate,
const msra::asr::simplesenonehmm & hset, const msra::math::ssematrixbase & logLLs,
std::vector<float> & edgeacscores, edgealignments & edgealignments, backpointers & backpointers) const;
void parallelsMBRerrorsignal (parallelstate & parallelstate, const edgealignments & thisedgealignments,
const std::vector<double> & logpps, const float amf,
const std::vector<double> & logEframescorrect, const double logEframescorrecttotal,
msra::math::ssematrixbase & errorsignal, msra::math::ssematrixbase & errorsignalneg) const;
void parallelmmierrorsignal (parallelstate & parallelstate, const edgealignments & thisedgealignments,
const std::vector<double> & logpps, msra::math::ssematrixbase & errorsignal) const;
double parallelforwardbackwardlattice (parallelstate & parallelstate, const std::vector<float> & edgeacscores,
const edgealignments & thisedgealignments, const float lmf, const float wp,
const float amf, const float boostingfactor, std::vector<double> & logpps, std::vector<double> & logalphas,
std::vector<double> & logbetas, const bool returnEframescorrect,
const_array_ref<size_t> & uids, std::vector<double> & logEframescorrect,
std::vector<double> & Eframescorrectbuf, double & logEframescorrecttotal) const;
static double scoregroundtruth (const_array_ref<size_t> uids, const_array_ref<htkmlfwordsequence::word> transcript,
const std::vector<float> & transcriptunigrams, const msra::math::ssematrixbase & logLLs,
const msra::asr::simplesenonehmm & hset, const float lmf, const float wp, const float amf);
static float lattice::forwardbackwardedge (const_array_ref<aligninfo> units, const msra::asr::simplesenonehmm & hset,
const msra::math::ssematrixbase & logLLs, msra::math::ssematrixbase & gammas,
size_t edgeindex);
double forwardbackwardlattice (const std::vector<float> & edgeacscores, parallelstate & parallelstate,
std::vector<double> & logpps, std::vector<double> & logalphas, std::vector<double> & logbetas,
const float lmf, const float wp, const float amf, const float boostingfactor, const bool sMBRmode,
const_array_ref<size_t> & uids, const edgealignments & thisedgealignments,
std::vector<double> & logEframescorrect, std::vector<double> & Eframescorrectbuf,
double & logEframescorrecttotal) const;
public:
// construct from a HTK lattice file
void fromhtklattice (const wstring & path, const std::unordered_map<std::string,size_t> & unitmap);
// construct from an MLF file (numerator lattice)
void frommlf (const wstring & key, const std::unordered_map<std::string,size_t> & unitmap, const msra::asr::htkmlfreader<msra::asr::htkmlfentry,lattice::htkmlfwordsequence> & labels,
const msra::lm::CMGramLM & lm, const msra::lm::CSymbolSet & unigramsymbols);
// check consistency
// - only one end node
// - only forward edges
// - nodes are sorted by time
// - edges are sorted by end node (they happen to come like this; so we can capitalize on it)
void checklattice() const
{
// in/out counts to detect orphan nodes
std::vector<size_t> numin (info.numnodes, 0), numout (info.numnodes, 0);
// check edges' sortedness and count in/out
for (size_t j = 0; j < info.numedges; j++)
{
const auto & e = edges[j];
if (e.E <= e.S)
throw std::runtime_error ("checklattice: lattice is not topologically sorted");
if (nodes[e.E].t < nodes[e.S].t)
throw std::runtime_error ("checklattice: lattice edge has negative time range");
if (nodes[e.E].t == nodes[e.S].t && j < info.numedges-1)
throw std::runtime_error ("checklattice: 0-frame edges forbidden except for very last edge");
if (j != (info.numedges - 1) && nodes[e.E].t == nodes[e.S].t)// last arc can be zero time range
throw std::runtime_error ("checklattice: lattice edge has zero time range");
if (j > 0 && e.E < edges[j-1].E)
throw std::runtime_error ("checklattice: lattice is not sorted by end node");
if (j > 0 && e.E == edges[j-1].E && e.S < edges[j-1].S) // == also not allowed except for terminal edges
throw std::runtime_error ("checklattice: lattice is not sorted by start node within the same end node");
if (j > 0 && e.E == edges[j-1].E && e.S == edges[j-1].S)
{ // Note: same E means identical word on the edge, due to word id stored on node. Thus, the edge is redundant = forbidden.
if (e.E != info.numnodes-1)
throw std::runtime_error ("checklattice: lattice has duplicate edges");
else // Exception: end of lattice, which happens rarely (2 examples found) and won't cause dramatic error, none in typical cases.
fprintf (stderr, "checklattice: WARNING: duplicate edge J=%d (S=%d -> E=%d) at end of lattice\n", (int) j, (int) e.S, (int) e.E);
}
numin[e.E]++;
numout[e.S]++;
}
// check nodes and in/out counts
if (nodes[0].t != 0.0f)
throw std::runtime_error ("checklattice: lattice does not begin with time 0");
for (size_t i = 0; i < info.numnodes; i++)
{
if (i > 0 && nodes[i].t < nodes[i-1].t)
throw std::runtime_error ("checklattice: lattice nodes not sorted by time");
if ((numin[i] > 0) ^ (i > 0))
throw std::runtime_error ("checklattice: found an orphaned start node");
if ((numout[i] > 0) ^ (i < info.numnodes-1))
throw std::runtime_error ("checklattice: found an orphaned end node");
}
}
void showstats() const // display stats info for a lattice
{
size_t totaledgeframes = 0;
for (size_t j = 0; j < info.numedges; j++)
totaledgeframes += nodes[edges[j].E].t - (size_t) nodes[edges[j].S].t;
fprintf (stderr, "lattice: read %d nodes, %d edges, %d units, %d frames, %.1f edges/node, %.1f units/edge, %.1f frames/edge, density %.1f\n",
info.numnodes, info.numedges, align.size(), info.numframes,
info.numedges / (double) info.numnodes, align.size() / (double) info.numedges, totaledgeframes / (double) info.numedges, totaledgeframes / (double) info.numframes);
}
// merge a second lattice in --for use by convert()
private:
// helper for merge()
struct nodecontext
{
int left, right;
static const signed short unknown = -1; // not set yet
static const signed short ambiguous = -2; // multiple --this is allowed if the other context is /sil/
static const signed short start = -3; // lattice start node
static const signed short end = -4; // lattice end node
nodecontext() { left = unknown; right = unknown; t = SIZE_MAX; i = SIZE_MAX; iother = SIZE_MAX; }
// helpers to set the values with uniq checks
private:
void setcontext (int & lr, int val);
public:
void setleft (int val) { setcontext (left, val); }
void setright (int val) { setcontext (right, val); }
// for building joint node space
size_t t; // frame index
size_t i; // original node index
size_t iother; // original node index in 'other' lattice
bool operator< (const nodecontext & other) const;
};
std::vector<nodecontext> determinenodecontexts (const msra::asr::simplesenonehmm & hset) const;
public:
void removefinalnull(); // call this before merge on both lattices
void merge (const lattice & other, const msra::asr::simplesenonehmm & hset);
void dedup(); // call this after merge() after conversion to uniq'ed format
template<typename HMMLOOKUPFUNCTION>
void dump (FILE * f, const HMMLOOKUPFUNCTION & gethmmname) const // dump a lattice in HTK-like format
{
fprintf (f, "N=%lu L=%lu\n", nodes.size(), edges.size());
//foreach_index (i, nodes)
// fprintf (f, "I=%d\tt=%.2f\n", i, nodes[i].t * 0.01f);
foreach_index (j, edges)
{
const auto & e = edges[j];
fprintf (f, "J=%d\tS=%d\tE=%d\tts=%.2f\tte=%.2f\ta=%.3f\tl=%.8f\td=:",
(int) j, (int) e.S, (int) e.E, (float) nodes[e.S].t * 0.01f, (float) nodes[e.E].t * 0.01f, (float) e.a, (float) e.l);
const auto align = getaligninfo (j);
foreach_index (k, align) // e.g. d=:aa:m-ih:s+t:e,0.03:ow:e-t:m+sil,0.03:sil,0.21:
fprintf (f, "%s,%.2f:", gethmmname (align[k].unit), align[k].frames * 0.01f);
fprintf (f, "\n");
}
}
size_t getnumframes () const { return info.numframes; }
size_t getnumnodes () const { return info.numnodes; }
size_t getnumedges () const { return info.numedges; }
// write a tag, followed by an integer
void fwritetag (FILE * f, const char * tag, size_t n)
{
fputTag (f, tag);
fputint (f, (int) n);
}
template<class VECTOR> void fwritevector (FILE * f, const char * tag, const VECTOR & v)
{
fwritetag (f, tag, v.size());
fwriteOrDie (v, f);
}
void fwrite (FILE * f)
{
#if 1
const size_t version = 2; // format version
fwritetag (f, "LAT ", version);
fwriteOrDie (&info, sizeof (info), 1, f);
fwritevector (f, "NODS", nodes);
fwritevector (f, "EDGS", edges2); // uniqued edges
fwritevector (f, "ALNS", uniquededgedatatokens); // uniqued alignments and scores
fputTag (f, "END ");
#else
const size_t version = 1; // format version
fwritetag (f, "LAT ", version);
fwriteOrDie (&info, sizeof (info), 1, f);
fwritevector (f, "NODE", nodes);
fwritevector (f, "EDGE", edges);
fwritevector (f, "ALIG", align);
fputTag (f, "END ");
#endif
}
// empty constructor, e.g. for use in minibatch source
lattice() { }
size_t freadtag (FILE * f, const char * tag)
{
fcheckTag (f, tag);
return (unsigned int) fgetint (f);
}
template<class VECTOR> void freadvector (FILE * f, const char * tag, VECTOR & v, size_t expectedsize = SIZE_MAX)
{
const size_t sz = freadtag (f, tag);
if (expectedsize != SIZE_MAX && sz != expectedsize)
throw std::runtime_error (std::string ("freadvector: malformed file, number of vector elements differs from head, for tag ") + tag);
freadOrDie (v, sz, f);
}
// read from a stream
// This can be used on an existing structure and will replace its content. May be useful to avoid memory allocations (resize() will not shrink memory).
// For efficiency, we will not check the inner consistency of the file here, but rather when we further process it.
// (We already use the tag mechanism to check the rough structure.)
// If this fails, the lattice is in unusable state, but it is OK to call fread() again to regain a usable object. I.e. this is safe to be used in retry loops.
// This will also map the aligninfo entries to the new symbol table, through idmap.
// V1 lattices will be converted. 'spsenoneid' is used in that process.
template<class IDMAP> void fread (FILE * f, const IDMAP & idmap, size_t spunit)
{
size_t version = freadtag (f, "LAT ");
if (version == 1)
{
freadOrDie (&info, sizeof (info), 1, f);
freadvector (f, "NODE", nodes, info.numnodes);
if (nodes.back().t != info.numframes)
throw std::runtime_error ("fread: mismatch between info.numframes and last node's time");
freadvector (f, "EDGE", edges, info.numedges);
freadvector (f, "ALIG", align);
fcheckTag (f, "END ");
// map align ids to user's symmap --the lattice gets updated in place here
foreach_index (k, align)
align[k].updateunit (idmap); // updates itself
#if 0 // TODO: this is not complete. Enable once we move to more compact5 data structure.
//showstats();
// version 1 is outdated --we build the compact version now
// TODO: once all is converted, edges() will become a local variable here
buildedgegroupstorage();
#endif
}
else if (version == 2)
{
freadOrDie (&info, sizeof (info), 1, f);
freadvector (f, "NODS", nodes, info.numnodes);
if (nodes.back().t != info.numframes)
throw std::runtime_error ("fread: mismatch between info.numframes and last node's time");
freadvector (f, "EDGS", edges2, info.numedges); // uniqued edges
freadvector (f, "ALNS", uniquededgedatatokens); // uniqued alignments
fcheckTag (f, "END ");
// check if we need to map
#if 1 // post-bugfix for incorrect inference of spunit
if (info.impliedspunitid != SIZE_MAX && info.impliedspunitid >= idmap.size()) // we have buggy lattices like that--what do they mean??
{
fprintf (stderr, "fread: detected buggy spunit id %d which is out of range (%d entries in map)\n", info.impliedspunitid, idmap.size());
throw std::runtime_error ("fread: out of bounds spunitid");
}
#endif
// This is critical--we have a buggy lattice set that requires no mapping where mapping would fail
bool needsmapping = false;
foreach_index (k, idmap)
{
if (idmap[k] != (size_t) k
#if 1
&& (k != (int) idmap.size() -1 || idmap[k] != spunit) // that HACK that we add one more /sp/ entry at the end...
#endif
)
{
needsmapping = true;
break;
}
}
// map align ids to user's symmap --the lattice gets updated in place here
if (needsmapping)
{
if (info.impliedspunitid != SIZE_MAX)
info.impliedspunitid = idmap[info.impliedspunitid];
// deal with broken (zero-token) edges
std::vector<bool> isendworkaround;
if (info.impliedspunitid != spunit)
{
fprintf (stderr, "fread: lattice with broken spunit, using workaround to handle potentially broken zero-token edges\n");
inferends (isendworkaround);
}
size_t uniquealignments = 1;
const size_t skipscoretokens = info.hasacscores ? 2 : 1;
for (size_t k = skipscoretokens; k < uniquededgedatatokens.size(); k++)
{
auto & ai = uniquededgedatatokens[k];
if (!isendworkaround.empty() && isendworkaround[k]) // secondary criterion to detect ends in broken lattices
{
k--; // don't advance, since nothing to advance over
goto skipscores;
}
// this is a regular token: update it in-place
if (ai.unit >= idmap.size())
throw std::runtime_error ("fread: broken-file heuristics failed");
ai.updateunit (idmap); // updates itself
if (!ai.last)
continue;
skipscores:
// if last then skip over the lm and ac scores
k += skipscoretokens;
uniquealignments++;
}
fprintf (stderr, "fread: mapped %d unique alignments\n", uniquealignments);
}
if (info.impliedspunitid != spunit)
{
// fprintf (stderr, "fread: inconsistent spunit id in file %d vs. expected %d; due to erroneous heuristic\n", info.impliedspunitid, spunit); // [v-hansu] comment out becaues it takes up most of the log
// it's actually OK, we can live with this, since we only decompress and then move on without any assumptions
//throw std::runtime_error ("fread: mismatching /sp/ units");
}
// reconstruct old lattice format from this --TODO: remove once we change to new data representation
rebuildedges (info.impliedspunitid != spunit/*to be able to read somewhat broken V2 lattice archives*/);
}
else
throw std::runtime_error ("fread: unsupported lattice format version");
}
// parallel versions (defined in parallelforwardbackward.cpp)
class parallelstate
{
struct parallelstateimpl * pimpl;
bool cpumode;
public:
parallelstate();
~parallelstate();
bool enabled() const { return pimpl != NULL; }; // true if functions in here are available or not
void copyalignments(edgealignments & edgealignments);
void entercomputation (const class msra::asr::simplesenonehmm & hmms, const mbrclassdefinition mbrclassdef); // pass models in (to GPU)
// no exitcomputation(); tear down the object instead
struct parallelstateimpl * operator->() { return pimpl; } // to access the actual state (which are declared inside parallelstateimpl class)
const struct parallelstateimpl * operator->() const { return pimpl; } // to access the actual state (which are declared inside parallelstateimpl class)
const size_t getsilunitid ();
void getedgeacscores (std::vector<float> & edgeacscores);
void getedgealignments (std::vector<unsigned short> & edgealignments);
};
// forward-backward function
// Note: logLLs and posteriors may be the same matrix (aliased).
double forwardbackward (parallelstate & parallelstate, const class msra::math::ssematrixbase & logLLs, const class msra::asr::simplesenonehmm & hmms,
class msra::math::ssematrixbase & result, class msra::math::ssematrixbase & errorsignalbuf,
const float lmf, const float wp, const float amf, const float boostingfactor, const bool sMBRmode, const_array_ref<size_t> uids = const_array_ref<size_t>(),
const_array_ref<htkmlfwordsequence::word> transcript = const_array_ref<htkmlfwordsequence::word>(), const std::vector<float> & transcriptunigrams = std::vector<float>()) const;
wstring key; // (keep our own name (key) so we can identify ourselves for diagnostics messages)
const wchar_t * getkey() const { return key.c_str(); }
};
// ===========================================================================
// archive -- a disk-based archive of lattices
// Optimized for sequentially retrieving lattices in order of original archive
// building process.
// ===========================================================================
class archive
{
const std::unordered_map<std::string,size_t> & modelsymmap; // [triphone name] -> index used in model
// set of lattice archive files referenced
// Note that .toc files can be concatenated, i.e. one .toc file can reference multiple archive files.
std::vector<std::wstring> archivepaths; // [archiveindex] -> archive path
size_t getarchiveindex (const std::wstring & path) // get index of a path in archivepaths[]; create new entry if needed
{
auto iter = std::find (archivepaths.begin(), archivepaths.end(), path);
size_t i = iter - archivepaths.begin();
if (i == archivepaths.size())
archivepaths.push_back (path);
return i;
}
// set of phoneme mappings
// Each archive file has its associated .symlist that defines the symbol mappings
typedef std::vector<unsigned int> symbolidmapping;
mutable std::vector<symbolidmapping> symmaps; // [archiveindex][unit] -> global unit map
template<class SYMMAP> static size_t getid (const SYMMAP & symmap, const std::string & key)
{
auto iter = symmap.find (key);
if (iter == symmap.end())
throw std::runtime_error (std::string ("getcachedidmap: symbol not found in user-supplied symbol map: ") + key);
return iter->second;
}
template<class SYMMAP> const symbolidmapping & getcachedidmap (size_t archiveindex, const SYMMAP & symmap/*[string] -> numeric id*/) const
{
symbolidmapping & idmap = symmaps[archiveindex];
if (idmap.empty()) // TODO: delete this: && !modelsymmap.empty()/*no mapping; used in conversion*/)
{ // need to read the map and establish the mapping
// get the symlist file
const std::wstring symlistpath = archivepaths[archiveindex] + L".symlist";
fprintf (stderr, "getcachedidmap: reading '%S'\n", symlistpath.c_str());
vector<char> textbuffer;
auto lines = msra::files::fgetfilelines (symlistpath, textbuffer);
// establish mapping of each entry to the corresponding id in 'symmap'; this should fail if the symbol is not found
idmap.reserve (lines.size() +1); // last entry is a fake entry to return the /sp/ unit
string symstring, tosymstring;
symstring.reserve (100); tosymstring.reserve (100);
foreach_index (i, lines)
{
char * line = lines[i];
char * sym = line;
// parse out a mapping (log SPC phys)
char * p = strchr (sym, ' ');
if (p != NULL) // mapping: just verify that the supplied symmap has the same mapping
{
*p = 0;
const char * tosym = p + 1;
symstring = sym; // (reusing existing object to avoid malloc)
tosymstring = tosym;
if (getid (symmap, symstring) != getid (symmap, tosymstring))
throw std::runtime_error (std::string ("getcachedidmap: mismatching symbol id for ") + sym + " vs. " + tosym);
}
else
{
if ((size_t) i != idmap.size()) // non-mappings must come first (this is to ensure compatibility with pre-mapping files)
throw std::runtime_error ("getcachedidmap: mixed up symlist file");
symstring = sym; // (reusing existing object to avoid malloc)
idmap.push_back ((unsigned int) getid (symmap, symstring));
}
}
// append a fixed-position entry: last entry means /sp/
idmap.push_back ((unsigned int) getid (symmap, "sp"));
}
return idmap;
}
// all lattices read so far
struct latticeref
{
uint64_t offset : 48;
uint64_t archiveindex : 16;
latticeref (uint64_t offset, size_t archiveindex) : offset (offset), archiveindex (archiveindex) {}
};
static_assert (sizeof (latticeref) == 8, "unexpected byte size of struct latticeref");
mutable size_t currentarchiveindex; // which archive is open
mutable auto_file_ptr f; // cached archive file handle of currentarchiveindex
hash_map<std::wstring,latticeref> toc; // [key] -> (file, offset) --table of content (.toc file)
public:
// construct = open the archive
//archive() : currentarchiveindex (SIZE_MAX) {}
// test if this object is loaded with anything (if not, an empty set of TOC paths was passed--meaning disable lattice mode)
bool empty() const { return archivepaths.empty(); }
// construct from a list of TOC files
archive (const std::vector<std::wstring> & tocpaths, const std::unordered_map<std::string,size_t> & modelsymmap) : currentarchiveindex (SIZE_MAX), modelsymmap (modelsymmap)
{
if (tocpaths.empty()) // nothing to read--keep silent
return;
fprintf (stderr, "archive: opening %d lattice-archive TOC files ('%S' etc.)..", tocpaths.size(), tocpaths[0].c_str());
foreach_index (i, tocpaths)
{
fprintf (stderr, ".");
open (tocpaths[i]);
}
fprintf (stderr, " %d total lattices referenced in %d archive files\n", toc.size(), archivepaths.size());
}
// open an archive
// Can be called for multiple archives.
// BUGBUG: NOT YET. We only really support one archive file at this point. Important to do that though.
void open (const std::wstring & tocpath)
{
// BUGBUG: we only really support one archive file at this point
// read the TOC in one swoop
std::vector<char> textbuffer;
auto toclines = msra::files::fgetfilelines (tocpath, textbuffer);
// parse it one by one
size_t archiveindex = SIZE_MAX; // its index
foreach_index (i, toclines)
{
const char * line = toclines[i];
const char * p = strchr (line, '=');
if (p == NULL)
throw std::runtime_error ("open: invalid TOC line (no = sign): " + std::string (line));
const std::wstring key = msra::strfun::utf16 (std::string (line, p - line));
p++;
const char * q = strchr (p, '[');
if (q == NULL)
throw std::runtime_error ("open: invalid TOC line (no [): " + std::string (line));
if (q != p)
{
const std::wstring archivepath = msra::strfun::utf16 (std::string (p, q - p));
// TODO: should we allow paths relative to TOC file?
archiveindex = getarchiveindex (archivepath);
}
if (archiveindex == SIZE_MAX)
throw std::runtime_error ("open: invalid TOC line (empty archive pathname): " + std::string (line));
char c;
uint64_t offset;
if (sscanf_s (q, "[%I64u]%c", &offset, &c, sizeof (c)) != 1)
throw std::runtime_error ("open: invalid TOC line (bad [] expression): " + std::string (line));
if (!toc.insert (make_pair (key, latticeref (offset, archiveindex))).second)
throw std::runtime_error ("open: TOC entry leads to duplicate key: " + std::string (line));
}
// initialize symmaps --alloc the array, but actually read the symmap on demand
symmaps.resize (archivepaths.size());
}
// check if a lattice for a given key is available --do this during initial check ideally
bool haslattice (const std::wstring & key) const { return toc.find (key) != toc.end(); }
#if 0 // TODO: change design to keep the #frames in the TOC, so we can check for mismatches before entering the training iteration
// return # frames for a key, or 0 if lattice not found (this combines the function of haslattice(), we save one lookup)
size_t getlatticeframes (const std::wstring & key) const
{
auto iter = toc.find (key);
if (iter == toc.end())
return 0;
else
return iter->second->xyz; // oops!
}
#endif
// get a lattice
// This function is designed to be called from a retry loop due to the realistic chance of server disconnects or other server failures.
// 'key' is supposed to be known to exist. Use haslattice() to ensure. This is because this function is called from a retry loop.
// Lattices will have unit ids updated according to the modelsymmap.
// V1 lattices will be converted. 'spsenoneid' is used in the conversion for optimizing storing 0-frame /sp/ aligns.
void getlattice (const std::wstring & key, lattice & L,
size_t expectedframes = SIZE_MAX /*if unknown*/) const
{
auto iter = toc.find (key);
if (iter == toc.end())
throw std::logic_error ("getlattice: requested lattice for non-existent key; haslattice() should have been used to check availability");
// get the archive that the lattice lives in and its byte offset
const size_t archiveindex = iter->second.archiveindex;
const auto offset = iter->second.offset;
// get id map (used below); this may lazily load a .symlist file. We do it here rather than later w.r.t. an outer retry loop.
auto & idmap = getcachedidmap (archiveindex, modelsymmap); // at first time, this will load the .symlist file and create a mapping to the user SYMMAP
const size_t spunit = idmap.back(); // ugh--getcachedidmap() just appends it to the end
#if 1 // prep for fixing the pushing of /sp/ at the end --we actually can just look it up! Duh
const size_t spunit2 = getid (modelsymmap, "sp");
if (spunit2 != spunit)
throw std::logic_error ("getlattice: huh? same lookup of /sp/ gives different result?");
#endif
// open archive file in case it is not the current one
if (archiveindex != currentarchiveindex)
{
f = fopenOrDie (archivepaths[archiveindex], L"rbS"); // or throw (will close old 'f' iff succeeded)
currentarchiveindex = archiveindex;
}
try // (for read operation)
{
// seek to start
fsetpos (f, offset);
// get it
L.fread (f, idmap, spunit);
#ifdef HACK_IN_SILENCE // hack to simulate DEL in the lattice
const size_t silunit = getid (modelsymmap, "sil");
const bool addsp = true;
L.hackinsilencesubstitutionedges (silunit, spunit, addsp);
#endif
}
catch (...) // to retry a read error due to a disconnected file handle, we need to reopen the file
{
currentarchiveindex = SIZE_MAX;
f = NULL; // this closes the file handle
throw;
}
// check if number of frames is as expected
if (expectedframes != SIZE_MAX && L.getnumframes() != expectedframes)
throw std::logic_error ("getlattice: number of frames mismatch between numerator lattice and features");
// remember the latice key for diagnostics messages
L.key = key;
};
// static method for building an archive
static void build (const std::vector<std::wstring> & infiles, const std::wstring & outpath,
const std::unordered_map<std::string,size_t> & modelsymmap,
const msra::asr::htkmlfreader<msra::asr::htkmlfentry,msra::lattices::lattice::htkmlfwordsequence> & labels,
const msra::lm::CMGramLM & lm, const msra::lm::CSymbolSet & unigramsymbols);
// static method for converting an archive to a new format
// Extended features:
// - check consistency (don't write out)
// - dump to stdout
// - merge two lattices (for merging numer into denom lattices)
static void convert (const std::wstring & intocpath, const std::wstring & intocpath2, const std::wstring & outpath,
const msra::asr::simplesenonehmm & hset);
};
};};
![swh spinner](/static/img/swh-spinner.gif)
Computing file changes ...