swh:1:snp:f50ab94432af916b5fb8b4ad831e8dddded77084
Tip revision: a02752ef96873c7fae1e02bf78caa448d10a40d0 authored by Sergii Dymchenko on 09 May 2018, 00:45:58 UTC
Use O_TRUNC when saving ONNX models to prevent possible file corruption.
Use O_TRUNC when saving ONNX models to prevent possible file corruption.
Tip revision: a02752e
htkfeatio.h
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// htkfeatio.h -- helper for I/O of HTK feature files
//
#pragma once
#include "Basics.h"
#include "basetypes.h"
#include "fileutil.h"
#include "simple_checked_arrays.h"
#include <string>
#include <regex>
#include <set>
#include <unordered_map>
#include <stdint.h>
#include <limits.h>
#include <wchar.h>
#include "simplesenonehmm.h"
#include <array>
#include "minibatchsourcehelpers.h"
namespace msra { namespace asr {
using namespace std;
// ===========================================================================
// htkfeatio -- common base class for reading and writing HTK feature files
// ===========================================================================
class htkfeatio
{
protected:
auto_file_ptr f;
wstring physicalpath; // path of this file
bool needbyteswapping; // need to swap the bytes?
string featkind; // HTK feature-kind string
size_t featdim; // feature dimension
unsigned int featperiod; // sampling period
// note that by default we assume byte swapping (seems to be HTK default)
htkfeatio()
: needbyteswapping(true), featdim(0), featperiod(0)
{
}
// set the feature kind variables --if already set then validate that they are the same
// Path is only for error message.
void setkind(string kind, size_t dim, unsigned int period, const wstring& path)
{
    // Nothing memorized so far: remember the description and we are done.
    if (featkind.empty())
    {
        assert(featdim == 0 && featperiod == 0);
        featkind = kind;
        featdim = dim;
        featperiod = period;
        return;
    }
    // Already memorized: the new description must match in all three fields.
    const bool consistent = (featkind == kind) && (featdim == dim) && (featperiod == period);
    if (!consistent)
        RuntimeError("setkind: inconsistent feature kind for file '%ls'", path.c_str());
}
// Reassemble the two bytes of 'v' with the byte at the lower address as the high-order byte.
static short swapshort(short v) throw()
{
    const unsigned char* bytes = (const unsigned char*) &v;
    const int high = bytes[0] << 8;
    const int low = bytes[1];
    return (short) (high + low);
}
// Unsigned variant: the byte at the lower address of 'v' becomes the high-order byte of the result.
static unsigned short swapunsignedshort(unsigned short v) throw()
{
    const unsigned char* bytes = (const unsigned char*) &v;
    const int combined = (bytes[0] << 8) + bytes[1];
    return (unsigned short) combined;
}
// Reassemble the four bytes of 'v' with the byte at the lowest address as the most significant byte.
// The bytes are combined in unsigned arithmetic so that a first byte >= 0x80 does not shift
// a signed int into its sign bit; only the final result is converted back to int.
// (Like the original, this reads exactly four bytes, i.e. it assumes a 4-byte int.)
static int swapint(int v) throw()
{
    const unsigned char* b = (const unsigned char*) &v;
    const unsigned int swapped = ((unsigned int) b[0] << 24) |
                                 ((unsigned int) b[1] << 16) |
                                 ((unsigned int) b[2] << 8) |
                                 (unsigned int) b[3];
    return (int) swapped;
}
// In-memory image of a feature-file header (four fields), with raw read/write and byte-swap helpers.
struct fileheader
{
    int nsamples;            // number of samples; htkfeatreader uses it as the frame count of the file
    int sampperiod;          // sample period as stored in the file
    unsigned short sampsize; // size of one sample in bytes
    short sampkind;          // base kind code combined with qualifier flag bits

    // Read the four fields in file order. No byte swapping is applied here; see byteswap().
    void read(FILE* f)
    {
        nsamples = fgetint(f);
        sampperiod = fgetint(f);
        sampsize = (unsigned short) fgetshort(f);
        sampkind = fgetshort(f);
    }
    // read header of idx feature cache
    // Every field read from the file goes through swapint(). The period is set to 0 and the
    // kind to 9 (user type); the sample size is rows * columns, one byte per value.
    void idxRead(FILE* f)
    {
        int magic = swapint(fgetint(f));
        if (magic != 2051)
            RuntimeError("reading idx feature cache header: invalid magic");
        nsamples = swapint(fgetint(f));
        sampperiod = 0;
        sampkind = (short) 9; // user type
        int nRows = swapint(fgetint(f));
        int nCols = swapint(fgetint(f));
        // Multiply in 64 bits: an int product could overflow and wrap around to a value that
        // happens to fit into 'sampsize', slipping past the check below.
        const int64_t rawsampsize = (int64_t) nRows * (int64_t) nCols;
        sampsize = (unsigned short) rawsampsize; // features are stored as bytes;
        // negative dimensions are rejected as well (two negatives would give a positive product)
        if (nRows < 0 || nCols < 0 || sampsize != rawsampsize)
            RuntimeError("reading idx feature cache header: sample size overflow");
    }
    // Write the four fields in the same order read() expects them.
    void write(FILE* f)
    {
        fputint(f, nsamples);
        fputint(f, sampperiod);
        fputshort(f, (short) sampsize);
        fputshort(f, sampkind);
    }
    // Byte-swap every field in place.
    void byteswap()
    {
        nsamples = swapint(nsamples);
        sampperiod = swapint(sampperiod);
        sampsize = swapunsignedshort(sampsize);
        sampkind = swapshort(sampkind);
    }
};
// Codes for the 'sampkind' header field (all values are octal literals).
// The low bits selected by BASEMASK carry the base kind; the HAS* values are qualifier flag bits.
static const int BASEMASK = 077; // mask that extracts the base kind from sampkind
static const int PLP = 11; // base kind reported as "PLP"
static const int MFCC = 6; // base kind reported as "MFCC"
static const int FBANK = 7; // base kind reported as "FBANK"
static const int USER = 9; // base kind reported as "USER"
static const int FESTREAM = 12; // base kind that the reader also reports as "USER"
static const int HASENERGY = 0100; // _E log energy included
static const int HASNULLE = 0200; // _N absolute energy suppressed
static const int HASDELTA = 0400; // _D delta coef appended
static const int HASACCS = 01000; // _A acceleration coefs appended
static const int HASCOMPX = 02000; // _C is compressed
static const int HASZEROM = 04000; // _Z zero meaned
static const int HASCRCC = 010000; // _K has CRC check
static const int HASZEROC = 020000; // _0 0'th Cepstra included
static const int HASVQ = 040000; // _V has VQ index attached
static const int HASTHIRD = 0100000; // _T has Delta-Delta-Delta index attached
};
// ===========================================================================
// htkfeatwriter -- write HTK feature file
// This is designed to write a single file only (no archive mode support).
// ===========================================================================
class htkfeatwriter : protected htkfeatio
{
size_t curframe;
vector<float> tmp;
public:
// Convert a feature-kind string such as "MFCC_E_D" into the numeric 'sampkind' header code.
// Only the part before the first ';' is looked at. It is split at '_': the first token is the
// base kind (PLP, MFCC, FBANK or USER), every further token must be a single-character qualifier
// out of E, D, N, A, T, Z, 0. Anything else is reported through RuntimeError.
short parsekind(const string& str)
{
    vector<string> params = msra::strfun::split(str, ";");
    if (params.empty())
        RuntimeError("parsekind: invalid param kind string");
    vector<string> parts = msra::strfun::split(params[0], "_");
    // guard parts[0] below, in case the split produced no token at all
    // (how split() treats delimiter-only input is not visible from here)
    if (parts.empty())
        RuntimeError("parsekind: invalid param kind string");
    // map base kind
    short sampkind = 0;
    const string& basekind = parts[0];
    if (basekind == "PLP")
        sampkind = PLP;
    else if (basekind == "MFCC")
        sampkind = MFCC;
    else if (basekind == "FBANK")
        sampkind = FBANK;
    else if (basekind == "USER")
        sampkind = USER;
    else
        RuntimeError("parsekind: unsupported param base kind");
    // map qualifiers
    for (size_t i = 1; i < parts.size(); i++)
    {
        const string& opt = parts[i];
        if (opt.length() != 1)
            RuntimeError("parsekind: invalid param kind string");
        switch (opt[0])
        {
        case 'E':
            sampkind |= HASENERGY;
            break;
        case 'D':
            sampkind |= HASDELTA;
            break;
        case 'N':
            sampkind |= HASNULLE;
            break;
        case 'A':
            sampkind |= HASACCS;
            break;
        case 'T':
            sampkind |= HASTHIRD;
            break;
        case 'Z':
            sampkind |= HASZEROM;
            break;
        case '0':
            sampkind |= HASZEROC;
            break;
        default:
            RuntimeError("parsekind: invalid qualifier in param kind string");
        }
    }
    return sampkind;
}
public:
// open the file for writing
htkfeatwriter(wstring path, string kind, size_t dim, unsigned int period)
{
    setkind(kind, dim, period, path);
    // assemble the header
    const size_t bytesPerValue = sizeof(float); // we do not support compression for now
    const size_t bytesPerFrame = featdim * bytesPerValue;
    fileheader header;
    header.nsamples = 0; // not known yet; close() patches the real count in
    header.sampperiod = period;
    header.sampsize = (unsigned short) bytesPerFrame;
    if (header.sampsize != bytesPerFrame) // did not survive the narrowing to 16 bits
        RuntimeError("htkfeatwriter: sample size overflow");
    header.sampkind = parsekind(kind);
    if (needbyteswapping)
        header.byteswap();
    // create the file and emit the header; frames follow through write()
    f = fopenOrDie(path, L"wbS");
    header.write(f);
    curframe = 0;
}
// write a frame
void write(const vector<float>& v)
{
    if (v.size() != featdim)
        LogicError("htkfeatwriter: inconsistent feature dimension");
    if (!needbyteswapping)
    {
        // values can go out exactly as they are
        fwriteOrDie(v, f);
    }
    else
    {
        // swap a private copy so that the caller's vector stays untouched
        tmp.assign(v.begin(), v.end());
        msra::util::byteswap(tmp);
        fwriteOrDie(tmp, f);
    }
    curframe++;
}
// finish
// This updates the header.
// BUGBUG: need to implement safe-save semantics! Otherwise won't work reliably with -make mode.
// ... e.g. set DeleteOnClose temporarily, and clear at the end?
void close(size_t numframes)
{
    // the caller's idea of the frame count must agree with what write() has counted
    if (curframe != numframes)
        LogicError("htkfeatwriter: inconsistent number of frames passed to close()");
    fflushOrDie(f);
    // patch the sample count into the header; that field sits at offset 0
    const int frameCount = (int) numframes;
    const int headerValue = needbyteswapping ? swapint(frameCount) : frameCount;
    fseekOrDie(f, 0);
    fputint(f, headerValue);
    fflushOrDie(f);
    f = NULL; // this triggers an fclose() on auto_file_ptr
}
// write an entire utterance from a matrix (one column per frame) to a feature file
// Matrix type needs to have operator(i,j), rows() and cols().
// We write to a tmp file first to ensure we don't leave broken files that would confuse make mode.
template <class MATRIX>
static void write(const wstring& path, const string& kindstr, unsigned int period, const MATRIX& feat)
{
    // everything goes to a temporary name first and is renamed only after a complete write
    const wstring tmppath = path + L"$$"; // tmp path for make-mode compliant
    unlinkOrDie(path);                    // delete if old file is already there
    const size_t featdim = feat.rows();
    const size_t numframes = feat.cols();
    vector<float> v(featdim); // one frame at a time
    htkfeatwriter W(tmppath, kindstr, featdim, period);
#ifdef SAMPLING_EXPERIMENT
    // experimental path: log-add groups of SAMPLING_EXPERIMENT frames into one output frame
    for (size_t i = 0; i < numframes; i++)
    {
        foreach_index (k, v)
        {
            float val = feat(k, i) - logf((float) SAMPLING_EXPERIMENT);
            if (i % SAMPLING_EXPERIMENT == 0)
                v[k] = val;
            else
                v[k] += (float) (log(1 + exp(val - v[k]))); // log add
        }
        if (i % SAMPLING_EXPERIMENT == SAMPLING_EXPERIMENT - 1)
            W.write(v);
    }
    W.close(numframes / SAMPLING_EXPERIMENT);
#else
    // regular path: copy each matrix column into the frame buffer and write it
    for (size_t i = 0; i < numframes; i++)
    {
        foreach_index (k, v)
            v[k] = feat(k, i);
        W.write(v);
    }
    W.close(numframes);
#endif
    // rename to final destination
    // (This would only fail in strange circumstances such as accidental multiple processes writing to the same file.)
    renameOrDie(tmppath, path);
}
};
// ===========================================================================
// htkfeatreader -- read HTK feature file, with archive support
//
// To support archives, one instance of this can (and is supposed to) be used
// repeatedly. All feat files read on the same instance are validated to have
// the same feature kind.
//
// For archives, this caches the last used file handle, in expectation that most reads
// are sequential anyway. In conjunction with a big buffer, this makes a huge difference.
// ===========================================================================
class htkfeatreader : protected htkfeatio
{
// information on current file
// File handle and feature type information is stored in the underlying htkfeatio object.
size_t physicalframes; // total number of frames in physical file
// TODO make this nicer
bool isidxformat; // support reading of features in idxformat as well (it's a hack, but different format's are not supported yet)
uint64_t physicaldatastart; // byte offset of first data byte
size_t vecbytesize; // size of one vector in bytes
bool addEnergy; // add in energy as data is read (will all have zero values)
bool compressed; // is compressed to 16-bit values
bool hascrcc; // need to skip crcc
vector<float> a, b; // for decompression
vector<short> tmp; // for decompression
vector<unsigned char> tmpByteVector; // for decompression of idx files
size_t curframe; // current # samples read so far
size_t numframes; // number of samples for current logical file
size_t energyElements; // how many energy elements to add if addEnergy is true
public:
// parser for complex a=b[s,e] syntax
struct parsedpath
{
// Note: This is not thread-safe
static std::unordered_map<std::wstring, unsigned int> archivePathStringMap;
static std::vector<std::wstring> archivePathStringVector;
protected:
friend class htkfeatreader;
msra::strfun::cstring logicalpath; // virtual path that this file should be understood to belong to
private:
unsigned int archivePathIdx;
protected:
// physical path of archive file
wstring archivepath() const
{
    // only an index is stored per object; the string itself lives in the shared table
    const wstring& stored = archivePathStringVector[archivePathIdx];
    return stored;
}
bool isarchive; // true if archive (range specified)
bool isidxformat; // support reading of features in idxformat as well (it's a hack, but different format's are not supported yet)
size_t s, e; // first and last frame inside the archive file; (0, INT_MAX) if not given
void malformed(const wstring& path) const
{
    // report the offending path string through RuntimeError
    const wchar_t* offending = path.c_str();
    RuntimeError("parsedpath: malformed path '%ls'", offending);
}
// consume and return up to 'delim'; remove from 'input' (we try to avoid C++0x here for VS 2008 compat)
// Split 'input' at 'delim' and return the first piece. 'input' is replaced by the second piece,
// or cleared when there is only one piece (delimiter not found).
// If the split yields no piece at all (e.g. 'input' is empty), 'input' is cleared and an empty
// string is returned rather than indexing into an empty vector.
static wstring consume(wstring& input, const wchar_t* delim)
{
    vector<wstring> parts = msra::strfun::split(input, delim); // (not very efficient, but does not matter here)
    if (parts.empty())
    {
        input.clear();
        return wstring();
    }
    if (parts.size() == 1)
        input.clear(); // not found: consume to end
    else
        input = parts[1]; // found: break at delimiter
    return parts[0];
}
public:
// constructor parses a=b[s,e] syntax and fills in the file
// Can be used implicitly e.g. by passing a string to open().
parsedpath(const wstring& pathParam)
: logicalpath("")
{
wstring xpath(pathParam);
wstring archivepath;
// parse out logical path
wstring localLogicalpath = consume(xpath, L"=");
isidxformat = false;
if (xpath.empty()) // no '=' detected: pass entire file (it's not an archive)
{
archivepath = localLogicalpath;
s = 0;
e = INT_MAX;
isarchive = false;
// check for "-ubyte" suffix in path name => it is an idx file
wstring ubyte(L"-ubyte");
size_t pos = archivepath.size() >= ubyte.size() ? archivepath.size() - ubyte.size() : 0;
wstring suffix = archivepath.substr(pos, ubyte.size());
isidxformat = ubyte == suffix;
}
else // a=b[s,e] syntax detected
{
archivepath = consume(xpath, L"[");
if (xpath.empty()) // actually it's only a=b
{
s = 0;
e = INT_MAX;
isarchive = false;
}
else
{
s = msra::strfun::toint(consume(xpath, L","));
if (xpath.empty())
malformed(pathParam);
e = msra::strfun::toint(consume(xpath, L"]"));
// TODO \r should be handled elsewhere; refine this
if (!xpath.empty() && xpath != L"\r")
malformed(pathParam);
isarchive = true;
}
}
auto iter = archivePathStringMap.find(archivepath);
if (iter != archivePathStringMap.end())
{
archivePathIdx = iter->second;
}
else
{
archivePathIdx = (unsigned int)archivePathStringMap.size();
archivePathStringMap[archivepath] = archivePathIdx;
archivePathStringVector.push_back(archivepath);
}
logicalpath = msra::strfun::utf8(localLogicalpath);
}
// get the physical path for 'make' test
wstring physicallocation() const
{
    // the physical location is simply the stored archive path
    wstring location = archivepath();
    return location;
}
// Gets logical path of the utterance.
string GetLogicalPath() const
{
    assert(!logicalpath.empty());
    // keep everything in front of the last '.'
    const auto lastDot = logicalpath.find_last_of(".");
    return logicalpath.substr(0, lastDot);
}
// Clears logical path after parsing, in order not to duplicate it
// with the one stored in the corpus descriptor.
void ClearLogicalPath()
{
    // empty the string first, then ask it to give its storage back
    auto& path = logicalpath;
    path.clear();
    path.shrink_to_fit();
}
// casting to wstring yields the logical path
operator wstring() const
{
    // the logical path is kept as a narrow string; convert it on demand
    wstring wide = msra::strfun::utf16(logicalpath);
    return wide;
}
// get duration in frames
size_t numframes() const
{
    // a frame count is only available when the path carried an explicit [s,e] range
    if (!isarchive)
        RuntimeError("parsedpath: this mode requires an input script with start and end frames given");
    const size_t count = e - s + 1; // both ends are inclusive
    return count;
}
};
// Make sure 'parsedpath' type has a move constructor
static_assert(std::is_move_constructible<parsedpath>::value, "Type 'parsedpath' should be move constructible!");
private:
// open the physical HTK file
// This is different from the logical (virtual) path name in the case of an archive.
// Open the physical file behind 'ppath', parse its header and make it the current file.
// Everything is first read into locals; the object's state is only replaced at the very end
// ("swap it in"), so an error while parsing leaves the previously open file and all of its
// describing fields, including 'isidxformat', untouched.
void openphysical(const parsedpath& ppath)
{
    wstring physpath = ppath.physicallocation();
    // auto_file_ptr f2 = fopenOrDie (physpath, L"rbS");
    auto_file_ptr f2(fopenOrDie(physpath, L"rb")); // removed 'S' for now, as we mostly run local anyway, and this will speed up debugging
    // read the header (12 bytes for htk feature files)
    fileheader H;
    const bool isidxformat2 = ppath.isidxformat; // committed to the object only once parsing succeeded
    if (!isidxformat2)
        H.read(f2);
    else // read header of idxfile
        H.idxRead(f2);
    // take a guess as to whether we need byte swapping or not
    bool needbyteswapping2 = ((unsigned int) swapint(H.sampperiod) < (unsigned int) H.sampperiod);
    if (needbyteswapping2)
        H.byteswap();
    // interpret sampkind
    int basekind = H.sampkind & BASEMASK;
    string kind;
    switch (basekind)
    {
    case PLP:
        kind = "PLP";
        break;
    case MFCC:
        kind = "MFCC";
        break;
    case FBANK:
        kind = "FBANK";
        break;
    case USER:
        kind = "USER";
        break;
    case FESTREAM:
        kind = "USER";
        break; // we return this as USER type (with guid)
    default:
        RuntimeError("htkfeatreader:unsupported feature kind");
    }
    // add qualifiers
    if (H.sampkind & HASENERGY)
        kind += "_E";
    if (H.sampkind & HASDELTA)
        kind += "_D";
    if (H.sampkind & HASNULLE)
        kind += "_N";
    if (H.sampkind & HASACCS)
        kind += "_A";
    if (H.sampkind & HASTHIRD)
        kind += "_T";
    bool compressed2 = (H.sampkind & HASCOMPX) != 0;
    bool hascrcc2 = (H.sampkind & HASCRCC) != 0;
    if (H.sampkind & HASZEROM)
        kind += "_Z";
    if (H.sampkind & HASZEROC)
        kind += "_0";
    if (H.sampkind & HASVQ)
        RuntimeError("htkfeatreader:we do not support VQ");
    // skip additional GUID in FESTREAM features
    // NOTE(review): this compares the full sampkind, so it only triggers when no qualifier bits
    // are set, whereas the switch above matches FESTREAM on the masked base kind -- confirm intent.
    if (H.sampkind == FESTREAM)
    { // ... note: untested
        unsigned char guid[16];
        freadOrDie(&guid, sizeof(guid), 1, f2);
        kind += ";guid=";
        for (size_t i = 0; i < sizeof(guid) / sizeof(*guid); i++)
            kind += msra::strfun::strprintf("%02x", guid[i]);
    }
    // other checks
    size_t bytesPerValue = isidxformat2 ? 1 : (compressed2 ? sizeof(short) : sizeof(float));
    if (H.sampsize % bytesPerValue != 0)
        RuntimeError("htkfeatreader:sample size not multiple of dimension");
    size_t dim = H.sampsize / bytesPerValue;
    // read the values for decompressing
    vector<float> a2, b2;
    if (compressed2)
    {
        freadOrDie(a2, dim, f2);
        freadOrDie(b2, dim, f2);
        H.nsamples -= 4; // these are counted as 4 frames--that's the space they use
        if (needbyteswapping2)
        {
            msra::util::byteswap(a2);
            msra::util::byteswap(b2);
        }
    }
    // done: swap it in
    int64_t bytepos = fgetpos(f2); // the data starts right behind what has been read so far
    auto location = ((std::wstring)ppath).empty() ? ppath.physicallocation() : (std::wstring)ppath;
    setkind(kind, dim, H.sampperiod, location); // this checks consistency
    this->physicalpath.swap(physpath);
    this->physicaldatastart = bytepos;
    this->physicalframes = H.nsamples;
    this->f.swap(f2); // note: this will get the previous f2 auto-closed at the end of this function
    this->needbyteswapping = needbyteswapping2;
    this->isidxformat = isidxformat2;
    this->compressed = compressed2;
    this->a.swap(a2);
    this->b.swap(b2);
    this->vecbytesize = H.sampsize;
    this->hascrcc = hascrcc2;
}
void close() // force close the open file --use this in case of read failure
{
    // forget the path so that the next open() does not mistake the file for still being open
    physicalpath.clear();
    f = NULL; // assigning a new FILE* to f will close the old FILE* if any
}
public:
htkfeatreader()
{
addEnergy = false;
energyElements = 0;
}
// helper to create a parsed-path object
// const auto path = parse (xpath)
parsedpath parse(const wstring& xpath)
{
    // all of the parsing happens in the parsedpath constructor
    parsedpath parsed(xpath);
    return parsed;
}
// read a feature file
// Returns number of frames in that file.
// This understands the more complex syntax a=b[s,e] and optimizes a little
// Position the reader at the first frame of the (logical) file described by 'ppath' and return
// its number of frames. The physical file is only opened when it differs from the one that is
// currently open; otherwise the cached handle is repositioned.
size_t open(const parsedpath& ppath)
{
    // do not reopen the file if it is the same; use fsetpos() instead
    if (f == NULL || ppath.physicallocation() != physicalpath)
        openphysical(ppath);
    if (ppath.isarchive) // reading a sub-range from an archive
    {
        if (ppath.s > ppath.e)
            RuntimeError("open: start frame %d > end frame %d in '%ls'", (int)ppath.s, (int)ppath.e, ((wstring)ppath).c_str());
        if (ppath.e >= physicalframes)
            RuntimeError("open: end frame exceeds archive's total number of frames %d in '%ls'", (int)physicalframes, ((wstring)ppath).c_str());
        int64_t dataoffset = physicaldatastart + ppath.s * vecbytesize;
        fsetpos(f, dataoffset); // we assume fsetpos(), which is our own, is smart to not flush the read buffer
        curframe = 0;
        numframes = ppath.e + 1 - ppath.s;
    }
    else // reading a full file
    {
        // The handle may be a cached one that an earlier read has already advanced, so seek
        // back to the first data byte instead of relying on the file having just been opened.
        int64_t dataoffset = physicaldatastart;
        fsetpos(f, dataoffset);
        curframe = 0;
        numframes = physicalframes;
        assert(fgetpos(f) == physicaldatastart);
    }
    return numframes;
}
// get dimension and type information for a feature file
// This will alter the state of this object in that it opens the file. It is efficient to read it right afterwards
void getinfo(const parsedpath& ppath, string& featkind2, size_t& featdim2, unsigned int& featperiod2)
{
open(ppath);
featkind2 = this->featkind;
featdim2 = this->featdim;
featperiod2 = this->featperiod;
}
// called to add energy as we read
void AddEnergy(size_t energyElements2)
{
this->energyElements = energyElements2;
this->addEnergy = energyElements2 != 0;
}
const string& getfeattype() const
{
    // the feature-kind string memorized in the base class
    return this->featkind;
}
operator bool() const
{
    // true while frames of the current logical file are left to read
    return numframes > curframe;
}
// read a vector from the open file
void read(std::vector<float>& v)
{
if (curframe >= numframes)
RuntimeError("htkfeatreader:attempted to read beyond end");
if (!compressed && !isidxformat) // not compressed--the easy one
{
freadOrDie(v, featdim, f);
if (needbyteswapping)
msra::util::byteswap(v);
}
else if (isidxformat)
{
// read into temp vector
freadOrDie(tmpByteVector, featdim, f);
v.resize(featdim);
foreach_index (k, v)
v[k] = (float) tmpByteVector[k];
}
else // need to decompress
{
// read into temp vector
freadOrDie(tmp, featdim, f);
if (needbyteswapping)
msra::util::byteswap(tmp);
// 'decompress' it
v.resize(tmp.size());
foreach_index (k, v)
v[k] = (tmp[k] + b[k]) / a[k];
}
curframe++;
}
// read a sequence of vectors from the open file into a range of frames [ts,te)
template <class MATRIX>
void read(MATRIX& feat, size_t ts, size_t te)
{
// read vectors from file and push to our target structure
vector<float> v(featdim + energyElements);
for (size_t t = ts; t < te; t++)
{
read(v);
// add the energy elements (all zero) if needed
if (addEnergy)
{
// we add the energy elements at the end of each section of features, (features, delta, delta-delta)
size_t posIncrement = featdim / energyElements;
size_t pos = posIncrement;
for (size_t i = 0; i < energyElements; i++, pos += posIncrement)
{
auto iter = v.begin() + pos + i;
v.insert(iter, 0.0f);
}
}
foreach_index(k, v)
feat(k, t) = v[k];
}
}
// read an entire utterance into an already allocated matrix
// Matrix type needs to have operator(i,j)
template <class MATRIX>
void read(const parsedpath& ppath, const string& kindstr, const unsigned int period, MATRIX& feat, bool needsExpansion=false)
{
    // open the file, then make sure the target has the right shape
    size_t numframes2 = open(ppath);
    if (!needsExpansion)
    {
        if (feat.cols() != numframes2 || feat.rows() != featdim)
            LogicError("read: stripe read called with wrong dimensions");
    }
    else
    {
        // expansion mode: a single frame in the file gets replicated over all columns
        if (numframes2 != 1)
            throw std::logic_error("read: if doing utterance-based expansion of features (e.g. ivectors), utterance must contain 1 frame only");
        if (feat.rows() != featdim)
            throw std::logic_error("read: stripe read called with wrong dimensions");
    }
    // the file must be of the kind and period the caller expects
    if (kindstr != featkind || period != featperiod)
        LogicError("read: attempting to mixing different feature kinds");
    // pull the frames in; on any failure drop the file handle before passing the error on
    try
    {
        read(feat, 0, numframes2);
        if (needsExpansion) // copy first frame to all the frames in the stripe
        {
            for (int t = 1; t < feat.cols(); t++)
                for (int k = 0; k < feat.rows(); k++)
                    feat(k, t) = feat(k, 0);
        }
    }
    catch (...)
    {
        close();
        throw;
    }
}
// read an entire utterance into a virgin (not yet allocated), resizable matrix
// Matrix type needs to have operator(i,j) and resize(n,m)
template <class MATRIX>
void read(const parsedpath& ppath, string& kindstr, unsigned int& period, MATRIX& feat)
{
    // open the file and size the result: one column per frame, one row per feature
    // (plus the rows for any energy elements that get added while reading)
    const size_t frameCount = open(ppath);
    feat.resize(featdim + energyElements, frameCount);
    // pull the frames in; on any failure drop the file handle before passing the error on
    try
    {
        read(feat, 0, frameCount);
    }
    catch (...)
    {
        close();
        throw;
    }
    // hand the file's kind and period back to the caller
    period = featperiod;
    kindstr = featkind;
}
};
// One labeled segment of an MLF transcription: a range of frames plus a numeric class (state) id.
struct htkmlfentry
{
    unsigned int firstframe; // range [firstframe,firstframe+numframes)
    unsigned int numframes;
    msra::dbn::CLASSIDTYPE classid; // numeric state id
private:
    // verify and save data
    // Fails if the range is reversed or if a value does not fit into the narrower member types.
    void setdata(size_t ts, size_t te, size_t uid)
    {
        if (te < ts)
            RuntimeError("htkmlfentry: end time below start time??");
        // save
        firstframe = (unsigned int) ts;
        numframes = (unsigned int) (te - ts);
        classid = (msra::dbn::CLASSIDTYPE) uid;
        // check for numeric overflow
        if (firstframe != ts || firstframe + numframes != te || classid != uid)
            RuntimeError("htkmlfentry: not enough bits for one of the values");
    }
    // parse the time range
    // There are two formats:
    // - original HTK
    // - Dong's hacked format: ts te senonename senoneid
    // We distinguish the two by the size of the span: if end minus start is at least
    // htkTimeToFrame - 1, the values are taken as times and divided by htkTimeToFrame (rounding
    // to the nearest frame); otherwise they are taken as frame indices as they are.
    // The caller must have verified that toks holds at least two tokens.
    static void parseframerange(const vector<char*>& toks, size_t& ts, size_t& te, const double htkTimeToFrame)
    {
        double rts = msra::strfun::todouble(toks[0]);
        double rte = msra::strfun::todouble(toks[1]);
        // if the difference between two frames is more than htkTimeToFrame, we expect conversion to time
        if (rte - rts >= htkTimeToFrame - 1) // convert time to frame
        {
            ts = (size_t)(rts / htkTimeToFrame + 0.5); // get start frame
            te = (size_t)(rte / htkTimeToFrame + 0.5); // get end frame
        }
        else
        {
            ts = (size_t)(rts);
            te = (size_t)(rte);
        }
    }
public:
    // parse format with original HTK state align MLF format and state list
    // The third token is the state name, which is mapped to its index through 'statelisthash'.
    void parsewithstatelist(const vector<char*>& toks, const unordered_map<std::string, size_t>& statelisthash, const double htkTimeToFrame)
    {
        // start, end and state name are all needed; do not index past a short line
        if (toks.size() < 3)
            RuntimeError("htkmlfentry: malformed entry, expected at least 3 columns");
        size_t ts, te;
        parseframerange(toks, ts, te, htkTimeToFrame);
        auto iter = statelisthash.find(toks[2]);
        if (iter == statelisthash.end())
            RuntimeError("htkmlfentry: state %s not found in statelist", toks[2]);
        const size_t uid = iter->second; // get state index
        setdata(ts, te, uid);
    }
    // ... note: this will be too simplistic for parsing more complex MLF formats. Fix when needed.
    // add support so that it can handle conditions where time instead of frame number is used.
    // Expects exactly four tokens; the fourth is the numeric id.
    void parse(const vector<char*>& toks, const double htkTimeToFrame)
    {
        if (toks.size() != 4)
            RuntimeError("htkmlfentry: currently we only support 4-column format");
        size_t ts, te;
        parseframerange(toks, ts, te, htkTimeToFrame);
        size_t uid = msra::strfun::toint(toks[3]);
        setdata(ts, te, uid);
    }
};
template <class ENTRY, class WORDSEQUENCE>
class htkmlfreader : public map<wstring, vector<ENTRY>> // [key][i] the data
{
wstring curpath; // for error messages
unordered_map<std::string, size_t> statelistmap; // for state <=> index
map<wstring, WORDSEQUENCE> wordsequences; // [key] word sequences (if we are building word entries as well, for MMI)
void strtok(char* s, const char* delim, vector<char*>& toks)
{
    // split 's' in place; 'toks' receives pointers to the individual tokens inside 's'
    toks.resize(0);
    char* context = nullptr;
    char* token = strtok_s(s, delim, &context);
    while (token)
    {
        toks.push_back(token);
        token = strtok_s(NULL, delim, &context);
    }
}
void malformed(string what)
{
    // report 'what' together with the path of the file currently being read
    const char* reason = what.c_str();
    const wchar_t* file = curpath.c_str();
    RuntimeError("htkmlfreader: %s in '%ls'", reason, file);
}
vector<char*> readlines(const wstring& path, vector<char>& buffer)
{
    // pull the whole file into 'buffer' in one go
    auto_file_ptr file(fopenOrDie(path, L"rb"));
    const size_t numBytes = filesize(file);
    buffer.reserve(numBytes + 1);
    freadOrDie(buffer, numBytes, file);
    buffer.push_back(0); // this makes it a proper C string
    // cut it into lines; the returned pointers refer into 'buffer'
    vector<char*> result;
    result.reserve(numBytes / 20);
    strtok(&buffer[0], "\r\n", result);
    return result;
}
template <typename WORDSYMBOLTABLE, typename UNITSYMBOLTABLE>
void parseentry(const vector<std::string>& lines, size_t line, const set<wstring>& restricttokeys,
const WORDSYMBOLTABLE* wordmap, const UNITSYMBOLTABLE* unitmap,
vector<typename WORDSEQUENCE::word>& wordseqbuffer, vector<typename WORDSEQUENCE::aligninfo>& alignseqbuffer,
const double htkTimeToFrame)
{
size_t idx = 0;
string filename = lines[idx++];
while (filename == "#!MLF!#") // skip embedded duplicate MLF headers (so user can 'cat' MLFs)
filename = lines[idx++];
// some mlf file have write errors, so skip malformed entry
if (filename.length() < 3 || filename[0] != '"' || filename[filename.length() - 1] != '"')
{
fprintf(stderr, "warning: filename entry (%s)\n", filename.c_str());
fprintf(stderr, "skip current mlf entry from line (%lu) until line (%lu).\n", (unsigned long)(line + idx), (unsigned long)(line + lines.size()));
return;
}
filename = filename.substr(1, filename.length() - 2); // strip quotes
if (filename.find("*/") == 0)
filename = filename.substr(2);
#ifdef _MSC_VER
wstring key = msra::strfun::utf16(regex_replace(filename, regex("\\.[^\\.\\\\/:]*$"), string())); // delete extension (or not if none)
#else
wstring key = msra::strfun::utf16(msra::dbn::removeExtension(filename)); // note that c++ 4.8 is incomplete for supporting regex
#endif
// determine lines range
size_t s = idx;
size_t e = lines.size() - 1;
// lines range: [s,e)
// don't parse unused entries (this is supposed to be used for very small debugging setups with huge MLFs)
if (!restricttokeys.empty() && restricttokeys.find(key) == restricttokeys.end())
return;
vector<ENTRY>& entries = (*this)[key]; // this creates a new entry
if (!entries.empty())
malformed(msra::strfun::strprintf("duplicate entry '%ls'", key.c_str()));
entries.resize(e - s);
wordseqbuffer.resize(0);
alignseqbuffer.resize(0);
vector<char*> toks;
for (size_t i = s; i < e; i++)
{
// We can mutate the original string as it is no longer needed after tokenization
strtok(const_cast<char*>(lines[i].c_str()), " \t", toks);
if (statelistmap.size() == 0)
entries[i - s].parse(toks, htkTimeToFrame);
else
entries[i - s].parsewithstatelist(toks, statelistmap, htkTimeToFrame);
// if we also read word entries, do it here
if (wordmap)
{
if (toks.size() > 6 /*word entry are in this column*/)
{
const char* w = toks[6]; // the word name
int wid = (*wordmap)[w]; // map to word id --may be -1 for unseen words in the transcript (word list typically comes from a test LM)
size_t wordindex = (wid == -1) ? WORDSEQUENCE::word::unknownwordindex : (size_t) wid;
wordseqbuffer.push_back(typename WORDSEQUENCE::word(wordindex, entries[i - s].firstframe, alignseqbuffer.size()));
}
if (unitmap)
{
if (toks.size() > 4)
{
const char* u = toks[4]; // the triphone name
auto iter = unitmap->find(u); // map to unit id
if (iter == unitmap->end())
RuntimeError("parseentry: unknown unit %s in utterance %ls", u, key.c_str());
const size_t uid = iter->second;
alignseqbuffer.push_back(typename WORDSEQUENCE::aligninfo(uid, 0 /*#frames--we accumulate*/));
}
if (alignseqbuffer.empty())
RuntimeError("parseentry: lonely senone entry at start without phone/word entry found, for utterance %ls", key.c_str());
alignseqbuffer.back().frames += entries[i - s].numframes; // (we do not have an overflow check here, but should...)
}
}
}
if (wordmap) // if reading word sequences as well (for MMI), then record it (in a separate map)
{
if (!entries.empty() && wordseqbuffer.empty())
RuntimeError("parseentry: got state alignment but no word-level info, although being requested, for utterance %ls", key.c_str());
// post-process silence
// - first !silence -> !sent_start
// - last !silence -> !sent_end
int silence = (*wordmap)["!silence"];
if (silence >= 0)
{
int sentstart = (*wordmap)["!sent_start"]; // these must have been created
int sentend = (*wordmap)["!sent_end"];
// map first and last !silence to !sent_start and !sent_end, respectively
if (sentstart >= 0 && wordseqbuffer.front().wordindex == (size_t) silence)
wordseqbuffer.front().wordindex = sentstart;
if (sentend >= 0 && wordseqbuffer.back().wordindex == (size_t) silence)
wordseqbuffer.back().wordindex = sentend;
}
// if (sentstart < 0 || sentend < 0 || silence < 0)
// LogicError("parseentry: word map must contain !silence, !sent_start, and !sent_end");
// implant
auto& wordsequence = wordsequences[key]; // this creates the map entry
wordsequence.words = wordseqbuffer; // makes a copy
wordsequence.align = alignseqbuffer;
}
}
public:
// return if input statename is sil state (hard code to compared first 3 chars with "sil")
bool issilstate(const string& statename) const // (later use some configuration table)
{
return (statename.size() > 3 && statename.at(0) == 's' && statename.at(1) == 'i' && statename.at(2) == 'l');
}
vector<bool> issilstatetable; // [state index] => true if is sil state (cached)
// return if input stateid represent sil state (by table lookup)
bool issilstate(const size_t id) const
{
    // plain lookup in the cached per-state table; the index must be in range
    assert(id < issilstatetable.size());
    const bool isSil = issilstatetable[id];
    return isSil;
}
// Placeholder symbol table: it exists only so that a template has a type to be instantiated
// with; looking anything up in it is reported as a logic error.
struct nullmap
{
    int operator[](const char* /*symbol*/) const
    {
        LogicError("nullmap: should never be used");
    }
}; // to satisfy a template, never used... :(
// alternate constructor that optionally also reads word alignments (for MMI training); triggered by providing a 'wordmap'
// (We cannot use an optional arg in the constructor above because it interferes with the template resolution.)
template <typename WORDSYMBOLTABLE, typename UNITSYMBOLTABLE>
htkmlfreader(const vector<wstring>& paths, const set<wstring>& restricttokeys, const wstring& stateListPath, const WORDSYMBOLTABLE* wordmap, const UNITSYMBOLTABLE* unitmap, const double htkTimeToFrame)
{
// read state list
if (stateListPath != L"")
readstatelist(stateListPath);
// read MLF(s) --note: there can be multiple, so this is a loop
foreach_index (i, paths)
read(paths[i], restricttokeys, wordmap, unitmap, htkTimeToFrame);
}
// note: this function is not designed to be pretty but to be fast
template <typename WORDSYMBOLTABLE, typename UNITSYMBOLTABLE>
void read(const wstring& path, const set<wstring>& restricttokeys, const WORDSYMBOLTABLE* wordmap, const UNITSYMBOLTABLE* unitmap, const double htkTimeToFrame)
{
    // Reads one MLF file block by block and hands every utterance (a run of lines
    // terminated by a line holding a single '.') to parseentry().
    //  - path:           MLF file to read; also kept in 'curpath' while reading
    //  - restricttokeys: if non-empty, no further entries are parsed once this->size()
    //                    has reached restricttokeys.size()
    //  - wordmap, unitmap, htkTimeToFrame: forwarded unchanged to parseentry()
    // A missing "#!MLF!#" header, a line that does not fit into one read block, and a
    // file that ends in mid-utterance are reported through malformed(); read errors
    // are reported through RuntimeError().
    if (!restricttokeys.empty() && this->size() >= restricttokeys.size()) // no need to even read the file if we are there (we support multiple files)
        return;

    fprintf(stderr, "htkmlfreader: reading MLF file %ls ...", path.c_str());
    curpath = path; // for error messages only

    auto_file_ptr f(fopenOrDie(path, L"rb"));
    std::string headerLine = fgetline(f);
    if (headerLine != "#!MLF!#")
        malformed("header missing");

    // Read the file in blocks and parse MLF entries
    std::vector<typename WORDSEQUENCE::word> wordsequencebuffer;
    std::vector<typename WORDSEQUENCE::aligninfo> alignsequencebuffer;
    const size_t readBlockSize = 1000000;
    std::vector<char> currBlockBuf(readBlockSize + 1); // +1 for the terminating 0 we append
    size_t currLineNum = 1;
    std::vector<string> currMLFLines;

    // Collects one line; when the line is the utterance end delimiter (a single dot),
    // the collected lines are parsed (unless the key restriction is already satisfied)
    // and the collection is reset.
    auto consumeMLFLine = [&](const char* mlfLine)
    {
        currLineNum++;
        currMLFLines.push_back(mlfLine);
        if ((mlfLine[0] == '.') && (mlfLine[1] == 0)) // utterance end delimiter: a single dot on a line
        {
            if (restricttokeys.empty() || (this->size() < restricttokeys.size()))
            {
                parseentry(currMLFLines, currLineNum - currMLFLines.size(), restricttokeys, wordmap, unitmap, wordsequencebuffer, alignsequencebuffer, htkTimeToFrame);
            }
            currMLFLines.clear();
        }
    };

    const char* delim = "\r\n";
    bool reachedEOF = (feof(f) != 0);
    char* nextReadPtr = currBlockBuf.data();
    size_t nextReadSize = readBlockSize;
    while (!reachedEOF)
    {
        size_t numBytesRead = fread(nextReadPtr, sizeof(char), nextReadSize, f);
        reachedEOF = (numBytesRead != nextReadSize);
        if (ferror(f))
            RuntimeError("error reading from file: %s", strerror(errno));

        // Add 0 at the end to make it a proper C string
        char* const blockEnd = nextReadPtr + numBytesRead;
        *blockEnd = 0;

        // Now extract lines from the currBlockBuf and parse MLF entries.
        // The buffer may start with a partial line carried over from the previous block.
        char* context = nullptr;
        char* prevLine = strtok_s(currBlockBuf.data(), delim, &context);
        if (prevLine == nullptr)
        {
            // The block is empty or holds nothing but line terminators (e.g. the file
            // ended exactly at a block boundary): there is no line to process.
            nextReadPtr = currBlockBuf.data();
            nextReadSize = readBlockSize;
            continue;
        }
        for (char* currLine = strtok_s(nullptr, delim, &context); currLine; currLine = strtok_s(nullptr, delim, &context))
        {
            consumeMLFLine(prevLine);
            prevLine = currLine;
        }

        // The last line read from the block may be a full line or part of a line.
        // It is only a part if its terminating 0 is the one we appended after the
        // data AND more data may follow; at the end of the file the last line is
        // complete even when it has no trailing line terminator.
        const size_t prevLineLen = strlen(prevLine);
        const bool isPartialLine = ((prevLine + prevLineLen) == blockEnd) && !reachedEOF;
        if (isPartialLine)
        {
            // A line filling the entire block would leave no room to read more data,
            // and the loop could never make progress.
            if (prevLineLen >= readBlockSize)
                malformed("line exceeds the maximum supported length");

            // Move the partial line to the start of the buffer and continue reading
            // right behind it. Source and destination may overlap, hence memmove.
            memmove(currBlockBuf.data(), prevLine, prevLineLen + 1);
            nextReadPtr = currBlockBuf.data() + prevLineLen;
            nextReadSize = readBlockSize - prevLineLen;
        }
        else
        {
            // A full line
            consumeMLFLine(prevLine);
            nextReadPtr = currBlockBuf.data();
            nextReadSize = readBlockSize;
        }
    }

    // Lines left over here belong to an utterance that never saw its '.' terminator.
    if (!currMLFLines.empty())
        malformed("unexpected end in mid-utterance");

    curpath.clear();
    fprintf(stderr, " total %lu entries\n", (unsigned long)this->size());
}
// read state list, index is from 0
void readstatelist(const wstring& stateListPath = L"")
{
    // Without a path there is nothing to load.
    if (stateListPath == L"")
        return;

    vector<char> buffer; // buffer owns the characters--don't release until done
    vector<char*> lines = readlines(stateListPath, buffer);
    const size_t numLines = lines.size();

    // Each line is one state name; its position in the file (from 0) becomes its index.
    // The sil flag of every state is cached alongside.
    issilstatetable.reserve(numLines);
    for (size_t index = 0; index < numLines; index++)
    {
        char* const stateName = lines[index];
        statelistmap[stateName] = index;
        issilstatetable.push_back(issilstate(stateName));
    }

    // Consistency checks between the line count, the name map and the sil table.
    if (numLines != statelistmap.size())
        RuntimeError("readstatelist: lines (%d) not equal to statelistmap size (%d)", (int) numLines, (int) statelistmap.size());
    if (statelistmap.size() != issilstatetable.size())
        RuntimeError("readstatelist: size of statelookuparray (%d) not equal to statelistmap size (%d)", (int) issilstatetable.size(), (int) statelistmap.size());

    fprintf(stderr, "total %lu state names in state list %ls\n", (unsigned long)statelistmap.size(), stateListPath.c_str());
}
// return state num: verify the fine-tune layer dim
size_t getstatenum() const
{
    // The number of states is the number of entries in the state-name map.
    const size_t numStates = statelistmap.size();
    return numStates;
}
size_t getstateid(string statename) // added by Hang Su adaptation
{
    // Looks up the index stored for 'statename' in the state-name map.
    // A pure lookup is used on purpose: operator[] would silently insert an
    // unknown name with index 0, returning a valid-looking state id and growing
    // the map (and with it the value reported by getstatenum()).
    // An unknown name is reported through RuntimeError().
    const auto iter = statelistmap.find(statename);
    if (iter == statelistmap.end())
        RuntimeError("getstateid: state %s not found in state list", statename.c_str());
    return iter->second;
}
// access to word sequences
const map<wstring, WORDSEQUENCE>& allwordtranscripts() const
{
    // Read-only view of the stored word sequences; nothing is copied.
    const map<wstring, WORDSEQUENCE>& transcripts = wordsequences;
    return transcripts;
}
};
};
}; // namespaces