https://github.com/Microsoft/CNTK
Tip revision: ac729f0b744c2675963dda1e68cb6c2ab9c55995 authored by Thiago Crepaldi on 30 January 2019, 04:24:27 UTC
Upgrade protobuf to 3.6.0
Upgrade protobuf to 3.6.0
Tip revision: ac729f0
HTKMLFReader.cpp
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include "HTKMLFReader.h"
#include "Config.h"
#include "HTKDeserializer.h"
#include "MLFDeserializer.h"
#include "ConfigHelper.h"
#include "Bundler.h"
#include "StringUtil.h"
#include "FramePacker.h"
#include "SequencePacker.h"
#include "TruncatedBpttPacker.h"
#include "BlockRandomizer.h"
#include "NoRandomizer.h"
namespace CNTK {
using namespace Microsoft::MSR::CNTK;
std::vector<DataDeserializerPtr> CreateDeserializers(const ConfigParameters& readerConfig, CorpusDescriptorPtr corpus)
{
std::vector<std::wstring> featureNames;
std::vector<std::wstring> labelNames;
std::vector<std::wstring> notused;
ConfigHelper config(readerConfig);
config.GetDataNamesFromConfig(featureNames, labelNames, notused, notused);
if (featureNames.size() < 1)
{
InvalidArgument("Network needs at least 1 feature specified.");
}
std::vector<DataDeserializerPtr> featureDeserializers;
std::vector<DataDeserializerPtr> labelDeserializers;
bool primary = true;
// The first deserializer is the driving one, it defines chunking.
// TODO: should we make this explicit configuration parameter
for (const auto& featureName : featureNames)
{
auto deserializer = std::make_shared<HTKDeserializer>(corpus, readerConfig(featureName), featureName, primary);
primary = false;
featureDeserializers.push_back(deserializer);
}
for (const auto& labelName : labelNames)
{
auto deserializer = std::make_shared<MLFDeserializer>(corpus, readerConfig(labelName), labelName);
labelDeserializers.push_back(deserializer);
}
std::vector<DataDeserializerPtr> deserializers;
deserializers.insert(deserializers.end(), featureDeserializers.begin(), featureDeserializers.end());
deserializers.insert(deserializers.end(), labelDeserializers.begin(), labelDeserializers.end());
return deserializers;
}
// Builds the complete reading pipeline from the reader configuration:
// deserializers -> bundler -> sequence enumerator (randomizer) -> packer.
//
// Configuration keys read directly here: frameMode, truncated, truncationLength,
// nbruttsineachrecurrentiter, useNumericSequenceKeys, checkData and verbosity.
// The randomization window and the read method are obtained through ConfigHelper.
//
// LogicError is raised when frameMode and truncated are both set or when no
// deserializer could be created; RuntimeError is raised for an unknown read method.
HTKMLFReader::HTKMLFReader(const ConfigParameters& readerConfig)
    : m_seed(0)
{
    // TODO: deserializers and transformers will be dynamically loaded
    // from external libraries based on the configuration/brain script.
    const bool frameMode = readerConfig(L"frameMode", true);
    const bool truncated = readerConfig(L"truncated", false);
    if (frameMode && truncated)
    {
        LogicError("frameMode and truncated BPTT are mutually exclusive.");
    }

    // frameMode takes precedence, then truncated; whole sequences are the fallback.
    m_packingMode = frameMode ? PackingMode::sample
                  : truncated ? PackingMode::truncated
                              : PackingMode::sequence;

    // nbruttsineachrecurrentiter is old reader configuration, truncationLength is the new one.
    // If truncation length is specified we estimate
    // the number of parallel sequences we have to pack as max(1, (mbsize/truncationLength))
    // If nbruttsineachrecurrentiter is specified we assume that the truncation size is mbSize
    // and the real minibatch size is mbSize * nbruttsineachrecurrentiter[epochIndex]
    m_truncationLength = readerConfig(L"truncationLength", 0);
    m_numParallelSequencesForAllEpochs =
        readerConfig(L"nbruttsineachrecurrentiter", ConfigParameters::Array(intargvector(vector<int> { 1 })));

    const bool numericSequenceKeys = readerConfig(L"useNumericSequenceKeys", false);
    CorpusDescriptorPtr corpus = std::make_shared<CorpusDescriptor>(numericSequenceKeys);

    ConfigHelper helper(readerConfig);
    size_t randomizationWindow = helper.GetRandomizationWindow();

    auto deserializers = CreateDeserializers(readerConfig, corpus);
    if (deserializers.empty())
    {
        LogicError("Please specify at least a single input stream.");
    }

    // The first deserializer is passed to the bundler separately from the full list.
    const bool cleanseData = readerConfig(L"checkData", true);
    auto bundler = std::make_shared<Bundler>(readerConfig, corpus, deserializers.front(), deserializers, cleanseData);

    int verbosity = readerConfig(L"verbosity", 0);
    const std::wstring readMethod = helper.GetRandomizer();

    // TODO: this should be bool. Change when config per deserializer is allowed.
    if (AreEqualIgnoreCase(readMethod, std::wstring(L"none")))
    {
        m_sequenceEnumerator = std::make_shared<NoRandomizer>(bundler);
    }
    else if (AreEqualIgnoreCase(readMethod, std::wstring(L"blockRandomize")))
    {
        m_sequenceEnumerator = std::make_shared<BlockRandomizer>(verbosity, randomizationWindow, bundler,
            /*shouldPrefetch =*/ true,
            /*multithreadedGetNextSequences =*/ false, // default
            /*maxNumberOfInvalidSequences =*/ 0, // default
            /*sampleBasedRandomizationWindow =*/ true, // default
            GetRandomSeed(readerConfig));
    }
    else
    {
        RuntimeError("readMethod must be 'blockRandomize' or 'none'.");
    }

    // Create the output stream descriptions. Stream ids are assigned consecutively
    // in the order the deserializers expose their streams.
    // TODO: Currently BPTT does not support sparse format as output.
    // In truncated mode we always require dense.
    const bool forceDense = (m_packingMode == PackingMode::truncated);
    for (const auto& deserializer : deserializers)
    {
        // Each description is taken by value because it is adjusted before being stored.
        for (auto description : deserializer->StreamInfos())
        {
            if (forceDense)
            {
                description.m_storageFormat = StorageFormat::Dense;
            }
            description.m_id = m_streams.size();
            m_streams.push_back(description);
        }
    }

    // TODO: should we unify sample and sequence mode packers into a single one.
    // TODO: functionally they are the same, the only difference is how we handle
    // TODO: MBlayout and what is the perf hit for iterating/copying sequences.
    // TODO: Should do more perf tests before unifying these two.
    // TODO: As the next step the packers will be moved out of the readers into the
    // TODO: core CNTK. They are format agnostic and can be used with any type of
    // TODO: deserializers.
    switch (m_packingMode)
    {
    case PackingMode::sample:
        m_packer = std::make_shared<FramePacker>(m_sequenceEnumerator, m_streams);
        break;
    case PackingMode::sequence:
        m_packer = std::make_shared<SequencePacker>(m_sequenceEnumerator, m_streams);
        break;
    case PackingMode::truncated:
        m_packer = std::make_shared<TruncatedBPTTPacker>(m_sequenceEnumerator, m_streams);
        break;
    default:
        LogicError("Unsupported type of packer '%d'.", static_cast<int>(m_packingMode));
    }
}
// Returns a copy of the stream descriptions collected by the constructor.
std::vector<StreamInformation> HTKMLFReader::GetStreamDescriptions()
{
    // The constructor appends one entry per stream of every deserializer,
    // so the list is expected to be non-empty by the time this is called.
    assert(m_streams.size() > 0);

    return m_streams;
}
void HTKMLFReader::StartEpoch(const EpochConfiguration& config, const std::map<std::wstring, int>& requiredStreams)
{
EpochConfiguration cfg = config;
if (m_packingMode == PackingMode::truncated)
{
size_t minibatchSize = config.m_minibatchSizeInSamples;
size_t truncationLength = m_truncationLength;
if (truncationLength == 0)
{
// Old config, the truncation length is specified as the minibatch size.
// In this case the truncation size is mbSize
// and the real minibatch size is truncation size * nbruttsineachrecurrentiter
fprintf(stderr, "Legacy configuration is used for truncated BPTT mode, please adapt the config to explicitly specify truncationLength.\n");
truncationLength = minibatchSize;
size_t numParallelSequences = m_numParallelSequencesForAllEpochs[config.m_epochIndex];
minibatchSize = numParallelSequences * truncationLength;
}
cfg.m_minibatchSizeInSamples = minibatchSize;
cfg.m_truncationSize = truncationLength;
}
ReaderBase::StartEpoch(cfg, requiredStreams);
}
}