https://github.com/Microsoft/CNTK
Tip revision: 66d8088c279cb656f3b1af6dd5685a94e890491e authored by Lingfeng Wu on 05 June 2017, 20:02:04 UTC
Merge remote-tracking branch 'origin/master' into chenzhehuai/lfbmmi-negstream
Merge remote-tracking branch 'origin/master' into chenzhehuai/lfbmmi-negstream
Tip revision: 66d8088
MLFIndexer.cpp
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#define _CRT_SECURE_NO_WARNINGS
#define _SCL_SECURE_NO_WARNINGS
#include "MLFIndexer.h"
#include "MLFUtils.h"
#include "ReaderUtil.h"
namespace Microsoft { namespace MSR { namespace CNTK {
using namespace std;
MLFIndexer::MLFIndexer(FILE* file, bool frameMode, size_t chunkSize, size_t bufferSize) :
m_maxBufferSize(bufferSize),
m_file(file),
m_fileOffsetStart(0),
m_done(false),
m_index(chunkSize, true, frameMode)
{
if (!m_file)
RuntimeError("Input file not open for reading");
}
void MLFIndexer::RefillBuffer()
{
if (m_done)
return;
// Move start forward according to the already read buffer.
m_fileOffsetStart += m_buffer.size();
// Read the new portion of data into the buffer.
m_buffer.resize(m_maxBufferSize);
// Copy last partial line if it was left during the last read.
memcpy(&m_buffer[0], m_lastPartialLineInBuffer.data(), m_lastPartialLineInBuffer.size());
size_t bytesRead = fread(&m_buffer[0] + m_lastPartialLineInBuffer.size(), 1, m_buffer.size() - m_lastPartialLineInBuffer.size(), m_file);
if (bytesRead == (size_t)-1)
RuntimeError("Could not read from the input file.");
if (bytesRead == 0) // End of file reached.
{
boost::trim(m_lastPartialLineInBuffer);
if (!m_lastPartialLineInBuffer.empty()) // It seems like a corrupted file at the end.
RuntimeError("Unexpected line at the end of the file '%s'", m_lastPartialLineInBuffer.c_str());
m_buffer.clear();
m_lastPartialLineInBuffer.clear();
m_done = true;
return;
}
size_t readBufferSize = m_lastPartialLineInBuffer.size() + bytesRead;
// Let's find the last LF.
int lastLF = 0;
{
// Let's find the latest \n if exists.
for (lastLF = (int)readBufferSize - 1; lastLF >= 0; lastLF--)
{
if (m_buffer[lastLF] == '\n')
break;
}
if (lastLF < 0)
RuntimeError("Length of MLF sequence cannot exceed '%zu' bytes.", readBufferSize);
}
// Let's cut the buffer at the last EOL and save partial string
// in m_lastPartialLineInBuffer.
auto logicalBufferSize = lastLF + 1;
auto lastPartialLineSize = readBufferSize - logicalBufferSize;
// Remember the last parital line.
m_lastPartialLineInBuffer.resize(lastPartialLineSize);
if (lastPartialLineSize)
memcpy(&m_lastPartialLineInBuffer[0], m_buffer.data() + logicalBufferSize, lastPartialLineSize);
m_buffer.resize(logicalBufferSize);
}
static bool SingleDot(const boost::iterator_range<char*>& line)
{
return distance(line.begin(), line.end()) == 1 && *line.begin() == '.';
}
// Building an index of the MLF file:
// MLF file -> MLF Header [MLF Utterance]+
// MLF Utterance -> Key EOL [Frame Range EOL]+ "." EOL
// MLF file should start with the MLF header (State::Header -> State:UtteranceKey).
// Each utterance starts with an utterance key (State::UtteranceKey -> State::UtteranceFrames).
// End of utterance is indicated by a single dot on a line (State::UtteranceFrames -> State::UtteranceKey)
void MLFIndexer::Build(CorpusDescriptorPtr corpus)
{
if (!m_index.IsEmpty())
return;
m_index.Reserve(filesize(m_file));
RefillBuffer(); // read the first block of data
if (m_done)
RuntimeError("Input file is empty");
size_t id = 0;
State currentState = State::Header;
vector<boost::iterator_range<char*>> lines, tokens;
bool isValid = true; // Flag indicating whether the current sequence is valid.
size_t lastNonEmptyString = 0; // Needed to parse information about last frame
size_t sequenceStartOffset = 0; // Offset in file where current sequence starts.
while (!m_done)
{
lines.clear();
ReadLines(m_buffer, lines);
lastNonEmptyString = SIZE_MAX;
for (size_t i = 0; i < lines.size(); i++)
{
if (lines[i].begin() == lines[i].end()) // Skip all empty lines.
continue;
switch (currentState)
{
case State::Header:
{
if (string(lines[i].begin(), lines[i].end()) != "#!MLF!#")
RuntimeError("Expected MLF header was not found.");
currentState = State::UtteranceKey;
}
break;
case State::UtteranceKey:
{
// When several files are appended to a big mlf, there can be
// an MLF header between the utterances.
if (string(lines[i].begin(), lines[i].end()) == "#!MLF!#")
continue;
sequenceStartOffset = m_fileOffsetStart + lines[i].begin() - m_buffer.data();
isValid = TryParseSequenceKey(lines[i], id, corpus->KeyToId);
currentState = State::UtteranceFrames;
}
break;
case State::UtteranceFrames:
{
if (!SingleDot(lines[i]))
break; // Still current utterance.
// Ok, a single . on a line means we found the end of the utterance.
auto sequenceEndOffset = m_fileOffsetStart + lines[i].end() - m_buffer.data();
// Let's find last non empty string and parse information about frames out of it.
// Here we assume that the sequence is correct, if not - it will be invalidated later
// when the actual data is read.
if (lastNonEmptyString != SIZE_MAX)
m_lastNonEmptyLine = string(lines[lastNonEmptyString].begin(), lines[lastNonEmptyString].end());
uint32_t numberOfSamples = 0;
if (m_lastNonEmptyLine.empty())
isValid = false;
else
{
tokens.clear();
auto container = boost::make_iterator_range(&m_lastNonEmptyLine[0], &m_lastNonEmptyLine[0] + m_lastNonEmptyLine.size());
const static std::vector<bool> delim = DelimiterHash({ ' ' });
Split(container.begin(), container.end(), delim, tokens);
auto range = MLFFrameRange::ParseFrameRange(tokens, sequenceEndOffset);
numberOfSamples = static_cast<uint32_t>(range.second);
}
if (isValid)
m_index.AddSequence(SequenceDescriptor{ id, numberOfSamples }, sequenceStartOffset, sequenceEndOffset);
else
fprintf(stderr, "WARNING: Cannot parse the utterance '%s' at offset (%" PRIu64 ")\n", corpus->IdToKey(id).c_str(), sequenceStartOffset);
currentState = State::UtteranceKey; // Let's try the next one.
}
break;
default:
LogicError("Unexpected MLF state.");
}
lastNonEmptyString = i;
}
// Remembering last non empty string to be able to retrieve time frame information
// when the dot is just at the beginning of the next buffer.
if (lastNonEmptyString != SIZE_MAX)
m_lastNonEmptyLine = string(lines[lastNonEmptyString].begin(), lines[lastNonEmptyString].end());
else
m_lastNonEmptyLine.clear();
RefillBuffer();
}
// Clean the buffer.
std::vector<char> tmp;
m_buffer.swap(tmp);
}
void MLFIndexer::ReadLines(vector<char>& buffer, vector<boost::iterator_range<char*>>& lines)
{
lines.clear();
const static std::vector<bool> delim = DelimiterHash({ '\r', '\n' });
Split(buffer.data(), buffer.data() + buffer.size(), delim, lines);
}
// Tries to parse sequence key
// In MLF a sequence key should be in quotes. During parsing the extension should be removed.
bool MLFIndexer::TryParseSequenceKey(const boost::iterator_range<char*>& line, size_t& id, function<size_t(const string&)> keyToId)
{
id = 0;
string key(line.begin(), line.end());
boost::trim_right(key);
if (key.size() <= 2 || key.front() != '"' || key.back() != '"')
return false;
key = key.substr(1, key.size() - 2);
if (key.size() > 2 && key[0] == '*' && key[1] == '/') // Preserving the old behavior
key = key.substr(2);
// Remove extension if specified.
key = key.substr(0, key.find_last_of("."));
id = keyToId(key);
return true;
}
}}}