// Raw File
// Tip revision: c18212eeac755c586f2dbf29fac00eee47bba1f4 authored by Peyman Manikashani on 02 August 2018, 23:13:08 UTC
// BN FP16 fix
// Tip revision: c18212e
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See file in the project root for full license information.

#include "stdafx.h"
#include "LatticeDeserializer.h"
#include "LatticeIndexBuilder.h"
#include "ConfigHelper.h"
#include "Basics.h"
#include "MLFUtils.h"

namespace CNTK {

using namespace Microsoft::MSR::CNTK;

using namespace std;

// This class stores sequence data for HTK for floats.
struct LatticeFloatSequenceData : DenseSequenceData
    LatticeFloatSequenceData(void* data, unsigned int bufferSize, const NDShape& frameShape, shared_ptr<vector<char>> pParentBuffer) :DenseSequenceData(bufferSize,true),
        m_buffer(data), m_frameShape(frameShape), m_pParentBuffer(pParentBuffer)

    const void* GetDataBuffer() override
        return m_buffer;

    const NDShape& GetSampleShape() override
        return m_frameShape;

    const NDShape& m_frameShape;
    void* m_buffer;
    shared_ptr<vector<char>> m_pParentBuffer;

// Base class for chunks in frame and sequence mode.
// The lifetime is always less than the lifetime of the parent deserializer.
// Construction reads the chunk's byte range from the lattice file into an in-memory buffer.
// NOTE(review): the constructor's initializer list and the destination argument of ReadOrDie were
// absent from the text under review and are reconstructed here (the reference members have to be
// bound to the same-named parameters; 'buffer' is the only destination in scope) - confirm upstream.
class LatticeDeserializer::ChunkBase : public Chunk
{
protected:
    shared_ptr<vector<char>> m_pBuffer; // Ptr to the buffer for the whole chunk
    vector<bool> m_valid;               // Bit mask whether the parsed sequence is valid.

    const LatticeDeserializer& m_deserializer;
    const ChunkDescriptor& m_descriptor; // Current chunk descriptor.
    int m_verbosity;

    // deserializer - parent deserializer; used by KeyOf to reach the corpus.
    // descriptor   - describes the chunk: sequence count, size in bytes and start offset in the file.
    // fileName     - lattice file the chunk is read from.
    // verbosity    - a value of 1 enables progress messages on stderr.
    // Raises LogicError for a descriptor with no sequences or no bytes.
    ChunkBase(const LatticeDeserializer& deserializer, const ChunkDescriptor& descriptor, const wstring& fileName, int verbosity)
        : m_deserializer(deserializer),
          m_descriptor(descriptor),
          m_verbosity(verbosity)
    {
        if (descriptor.NumberOfSequences() == 0 || descriptor.SizeInBytes() == 0)
            LogicError("Empty chunks are not supported.");

        if (m_verbosity == 1)
            fprintf(stderr, "Reading lattice from file '%ls'\n", fileName.c_str());

        auto f = FileWrapper::OpenOrDie(fileName, L"rbS");
        const size_t sizeInBytes = descriptor.SizeInBytes();

        // Make sure we always have 3 at the end for buffer overrun, i.e. 4 byte alignment
        // This is required because currently lattices are exposed as an array of floats, because CPUMatrix does not support chars.
        // TODO: switch to char when possible.
        // The vector value-initializes its elements, so the padding bytes are already zero.
        vector<char> buffer(sizeInBytes + sizeof(float) - 1);

        // Seek and read chunk into memory.
        f.SeekOrDie(descriptor.StartOffset(), SEEK_SET);
        f.ReadOrDie(buffer.data(), sizeInBytes, 1);

        m_pBuffer = make_shared<vector<char>>(std::move(buffer));

        // all sequences are valid by default.
        m_valid.resize(m_descriptor.NumberOfSequences(), true);
    }

    // Maps the numeric key of a sequence back to its string key through the parent's corpus.
    string KeyOf(const SequenceDescriptor& s)
    {
        return m_deserializer.m_corpus->IdToKey(s.m_key);
    }
};

// Lattice chunk when operating in sequence mode.
// Each sequence is exposed as a LatticeFloatSequenceData that points into the chunk buffer.
class LatticeDeserializer::SequenceChunk : public LatticeDeserializer::ChunkBase
{
public:
    // Arguments are forwarded to ChunkBase; the sample shape of every sequence is { 1 }.
    SequenceChunk(const LatticeDeserializer& parent, const ChunkDescriptor& descriptor, const wstring& fileName, int verbosity)
        : ChunkBase(parent, descriptor, fileName, verbosity), m_ndShape({ 1 })
    {
    }

    // Appends the data of the sequence at 'sequenceIndex' (index inside this chunk) to 'result'.
    void GetSequence(size_t sequenceIndex, vector<SequenceDataPtr>& result) override
    {
        GetSequence<float>(sequenceIndex, result);
    }

    // Typed implementation behind the virtual GetSequence.
    template<class ElementType>
    void GetSequence(size_t sequenceIndex, vector<SequenceDataPtr>& result)
    {
        const auto& sequence = m_descriptor.Sequences()[sequenceIndex];
        if (m_verbosity == 1)
            fprintf(stderr, "Reading sequence '%s'...\n", KeyOf(sequence).c_str());

        // The sequence bytes are referenced in place; sharing m_pBuffer keeps them alive.
        SequenceDataPtr s = make_shared<LatticeFloatSequenceData>(m_pBuffer->data() + sequence.OffsetInChunk(), sequence.NumberOfSamples(), m_ndShape, m_pBuffer);

        // Without this the caller would never receive the sequence that was just created.
        result.push_back(s);
    }

private:
    const NDShape m_ndShape;
};

    // Parameter list and body of the lattice deserializer constructor, as far as visible here.
    // NOTE(review): the line that opens this definition, the braces around the body and the
    // continuation of the initializer list (it ends in a dangling comma) are not present in the
    // text under review - presumably lost during extraction; confirm against the upstream file.
    //   corpus  - passed on to InitializeChunkInfos.
    //   cfg     - configuration; the "verbosity", "chunkSizeInBytes" and "input" entries are read.
    //   primary - forwarded to DataDeserializerBase; a true value is rejected with LogicError below.
    CorpusDescriptorPtr corpus,
    const ConfigParameters& cfg,
    bool primary)
    : DataDeserializerBase(primary),
    // This deserializer cannot drive chunking, so being configured as primary is a usage error.
    if (primary)
        LogicError("Lattice deserializer does not support primary mode, it cannot control chunking. "
            "Please specify HTK deserializer as the first deserializer in your config file.");

    // Optional settings; the second argument is presumably the value used when the key is absent - TODO confirm.
    m_verbosity = cfg(L"verbosity", 0);
    m_chunkSizeBytes = cfg(L"chunkSizeInBytes", g_64MB);

    // Only the first member id of the "input" section is used to select the stream configuration.
    ConfigParameters input = cfg(L"input");
    auto inputName = input.GetMemberIds().front();

    ConfigParameters streamConfig = input(inputName);

    // Build the chunk and sequence bookkeeping from the selected stream configuration.
    ConfigHelper config(streamConfig);
    InitializeChunkInfos(corpus, config);

// Registers the chunks of the index at m_indices.back() with the deserializer and returns how
// many sequences those chunks contain.
//   latticePath    - path of the lattice file; widened to a wstring for FileWrapper.
//   tocLines       - passed to LatticeIndexBuilder.
//   corpus         - passed to LatticeIndexBuilder.
//   enableCaching  - captured by the lambda; not used by any statement visible here.
//   lastChunkInTOC - passed to LatticeIndexBuilder.
// NOTE(review): the braces of this function, of the lambda given to attempt() and of both loops
// are absent from the text under review, and no visible statement appends to m_indices,
// m_latticeFiles or m_chunks - presumably lines were lost during extraction; confirm upstream.
size_t LatticeDeserializer::RecordChunk(const string& latticePath, const vector<string>& tocLines, CorpusDescriptorPtr corpus, bool enableCaching, bool lastChunkInTOC)
    size_t totalNumSequences = 0;
    // Element-wise copy of the narrow path into a wide string (no character-set conversion).
    wstring latticePathW;
    latticePathW.assign(latticePath.begin(), latticePath.end());
    // attempt(5, ...) presumably retries the callable up to five times - TODO confirm.
    attempt(5, [this, latticePathW, tocLines, enableCaching, corpus, lastChunkInTOC]()
        LatticeIndexBuilder builder(FileWrapper(latticePathW, L"rbS"), tocLines, corpus, lastChunkInTOC);


    auto& index = m_indices.back();
    // Build auxiliary for GetSequenceByKey.
    for (const auto& chunk : index->Chunks())
        // Preparing chunk info that will be exposed to the outside.
        // The id of a chunk is the position it takes in m_chunks.
        auto chunkId = static_cast<ChunkIdType>(m_chunks.size());
        for (uint32_t i = 0; i < chunk.NumberOfSequences(); ++i)
            const auto& sequence = chunk[i];
            auto sequenceIndex = i;
            // Record (sequence key, chunk id, index inside the chunk) for later lookup by key.
            m_keyToChunkLocation.push_back(std::make_tuple(sequence.m_key, chunkId, sequenceIndex));

        totalNumSequences += chunk.NumberOfSequences();
        // The chunk is associated with the last entry of m_latticeFiles.
        m_chunkToFileIndex.insert(make_pair(&chunk, m_latticeFiles.size() - 1));
        // Guard against running out of representable chunk ids.
        if (m_chunks.size() >= numeric_limits<ChunkIdType>::max())
            RuntimeError("Number of chunks exceeded overflow limit.");

    return totalNumSequences;

// Orders two location tuples by their first element only; the remaining elements are ignored.
// Returns true when the first element of 'a' is smaller than the first element of 'b'.
static inline bool LessByFirstItem(const std::tuple<size_t, size_t, size_t>& a, const std::tuple<size_t, size_t, size_t>& b)
{
    return std::get<0>(a) < std::get<0>(b);
}

// Initializes chunks based on the configuration and utterance descriptions.
// Reads the lattice index file named by the configuration line by line; each line is treated as
// the path of a TOC file, whose lines are scanned for a lattice path located between '=' and '['.
// Sequence counts returned by RecordChunk are accumulated and reported on stderr, and
// m_keyToChunkLocation is finally sorted by its first tuple element (it is searched with
// std::lower_bound in GetSequenceInfoByKey).
// NOTE(review): braces are absent or unbalanced in the text under review and nothing visible ever
// adds to tocLines - presumably lines were lost during extraction; confirm against the upstream file.
void LatticeDeserializer::InitializeChunkInfos(CorpusDescriptorPtr corpus, ConfigHelper& config)
    std::string latticeIndexPath = config.GetLatticeIndexFilePath();

    fprintf(stderr, "Reading lattice index file '%s' ...\n", latticeIndexPath.c_str());
    ifstream latticeIndexStream(latticeIndexPath.c_str());
    if (!(latticeIndexStream && latticeIndexStream.good()))
        RuntimeError("Failed to open input file: '%s'", latticeIndexPath.c_str());

    // Caching is requested only when the corpus has hashing enabled and the configuration asks for it.
    bool enableCaching = corpus->IsHashingEnabled() && config.GetCacheIndex();
    size_t totalNumSequences = 0;
    vector<string> tocLines;
    string tocPath;
    while (getline(latticeIndexStream, tocPath))
        // Drop trailing blanks and line-ending characters from the path.
        tocPath.erase(tocPath.find_last_not_of(" \n\r\t") + 1);
        std::ifstream tocFileStream(tocPath);
        // NOTE(review): a TOC file that cannot be opened is only reported; no visible statement aborts or skips.
        if (!(tocFileStream && tocFileStream.good())) 
            fprintf(stderr, "Failed to open input file: %s", tocPath.c_str());

        std::string tocLine;
        bool firstIndex = true;
        string prevLatticePath;
        while (std::getline(tocFileStream, tocLine))
            // The lattice path, if any, sits between the first '=' and the first '['.
            // Without '=' find() yields npos and the +1 wraps to 0; without '[' substr runs to the end of the line.
            size_t start = tocLine.find("=") + 1;
            size_t end = tocLine.find("[");
            string latticePath = tocLine.substr(start, end - start);
            if (latticePath.size() > 0) {
                // A non-empty path starts a new lattice file: record the previous one first
                // (there is no previous file for the very first path).
                if (firstIndex)
                    firstIndex = false;
                else {
                    totalNumSequences += RecordChunk(prevLatticePath, tocLines, corpus, enableCaching, false);

                prevLatticePath = latticePath;
        // Record the remaining lattice file, flagged as the last chunk in the TOC.
        totalNumSequences += RecordChunk(prevLatticePath, tocLines, corpus, enableCaching, true);

    // Order the lookup table by sequence key.
    std::sort(m_keyToChunkLocation.begin(), m_keyToChunkLocation.end(), LessByFirstItem);

    fprintf(stderr, "LatticeDeserializer: '%zu' sequences\n", totalNumSequences);

// Describes exposed stream - a single stream of htk features.
// Fills a StreamInformation with id 0, the given name, sample layout { 1 }, dense storage,
// float element type and the binary flag set.
//   featureName - name assigned to the stream.
// NOTE(review): the function braces are absent and the local 'stream' is not stored anywhere in
// the text under review - presumably the statement that publishes it was lost; confirm upstream.
void LatticeDeserializer::InitializeStreams(const wstring& featureName)
    StreamInformation stream;
    stream.m_id = 0;
    stream.m_name = featureName;
    stream.m_sampleLayout = NDShape({ 1 });
    stream.m_storageFormat = StorageFormat::Dense;
    stream.m_elementType = DataType::Float;
    stream.m_isBinary = true;

// Gets information about available chunks.
std::vector<ChunkInfo> LatticeDeserializer::ChunkInfos()
    std::vector<ChunkInfo> chunks;
    for (size_t i = 0; i < m_chunks.size(); ++i)
        ChunkInfo cd;
        cd.m_id = static_cast<ChunkIdType>(i);
        if (cd.m_id != i)
            RuntimeError("ChunkIdType overflow during creation of a chunk description.");

        cd.m_numberOfSequences = m_chunks[i]->NumberOfSequences();
        cd.m_numberOfSamples = m_chunks[i]->NumberOfSamples();
    return chunks;

// Gets sequences for a particular chunk.
// This information is used by the randomizer to fill in current windows of sequences.
void LatticeDeserializer::SequenceInfosForChunk(ChunkIdType, vector<SequenceInfo>& result)
    LogicError("Lattice deserializer does not support primary mode, it cannot control chunking. "
        "Please specify HTK deserializer as the first deserializer in your config file.");


// Creates the chunk with the given id by reading it from the lattice file it belongs to.
//   chunkId - position of the chunk descriptor in m_chunks.
// Returns the newly created SequenceChunk. Raises LogicError for an id outside m_chunks.
ChunkPtr LatticeDeserializer::GetChunk(ChunkIdType chunkId)
{
    // Checked once up front so that an invalid id is reported instead of indexing past the end.
    if (chunkId >= m_chunks.size())
        LogicError("Lattice chunk id is out of range.");

    ChunkPtr result;
    // attempt(5, ...) presumably retries the callable up to five times - TODO confirm.
    attempt(5, [this, &result, chunkId]()
    {
        auto chunk = m_chunks[chunkId];
        auto& fileName = m_latticeFiles[m_chunkToFileIndex[chunk]];

        result = make_shared<SequenceChunk>(*this, *chunk, fileName, m_verbosity);
    });

    return result;
}

bool LatticeDeserializer::GetSequenceInfoByKey(const SequenceKey& key, SequenceInfo& result)
    auto found = std::lower_bound(m_keyToChunkLocation.begin(), m_keyToChunkLocation.end(), std::make_tuple(key.m_sequence, 0, 0),

    if (found == m_keyToChunkLocation.end() || std::get<0>(*found) != key.m_sequence)
        return false;

    auto chunkId = std::get<1>(*found);
    auto sequenceIndexInChunk = std::get<2>(*found);

    result.m_chunkId = std::get<1>(*found);
    result.m_key = key;

    assert(result.m_key.m_sample == 0);

    const auto* chunk = m_chunks[chunkId];
    const auto& sequence = chunk->Sequences()[sequenceIndexInChunk];
    result.m_indexInChunk = sequenceIndexInChunk;
    result.m_numberOfSamples = sequence.m_numberOfSamples;

    return true;

// back to top